author     Mark Brown <broonie@kernel.org>   2015-10-12 13:09:27 -0400
committer  Mark Brown <broonie@kernel.org>   2015-10-12 13:09:27 -0400
commit     79828b4fa835f73cdaf4bffa48696abdcbea9d02 (patch)
tree       5e0fa7156acb75ba603022bc807df8f2fedb97a8 /kernel
parent     721b51fcf91898299d96f4b72cb9434cda29dce6 (diff)
parent     8c1a9d6323abf0fb1e5dad96cf3f1c783505ea5a (diff)
Merge remote-tracking branch 'asoc/fix/rt5645' into asoc-fix-rt5645
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 105
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/audit.h | 18
-rw-r--r--  kernel/audit_fsnotify.c | 216
-rw-r--r--  kernel/audit_tree.c | 2
-rw-r--r--  kernel/audit_watch.c | 56
-rw-r--r--  kernel/auditfilter.c | 83
-rw-r--r--  kernel/auditsc.c | 9
-rw-r--r--  kernel/bpf/arraymap.c | 137
-rw-r--r--  kernel/bpf/core.c | 9
-rw-r--r--  kernel/bpf/syscall.c | 14
-rw-r--r--  kernel/bpf/verifier.c | 58
-rw-r--r--  kernel/cgroup.c | 131
-rw-r--r--  kernel/cgroup_freezer.c | 2
-rw-r--r--  kernel/cgroup_pids.c | 355
-rw-r--r--  kernel/cpu.c | 61
-rw-r--r--  kernel/cpu_pm.c | 2
-rw-r--r--  kernel/cpuset.c | 2
-rw-r--r--  kernel/cred.c | 13
-rw-r--r--  kernel/events/core.c | 280
-rw-r--r--  kernel/events/ring_buffer.c | 15
-rw-r--r--  kernel/events/uprobes.c | 228
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/extable.c | 1
-rw-r--r--  kernel/fork.c | 70
-rw-r--r--  kernel/futex.c | 100
-rw-r--r--  kernel/irq/chip.c | 43
-rw-r--r--  kernel/irq/generic-chip.c | 6
-rw-r--r--  kernel/irq/handle.c | 4
-rw-r--r--  kernel/irq/internals.h | 11
-rw-r--r--  kernel/irq/irqdesc.c | 2
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 64
-rw-r--r--  kernel/irq/msi.c | 17
-rw-r--r--  kernel/irq/pm.c | 12
-rw-r--r--  kernel/irq/resend.c | 22
-rw-r--r--  kernel/irq/spurious.c | 26
-rw-r--r--  kernel/jump_label.c | 158
-rw-r--r--  kernel/kexec.c | 2531
-rw-r--r--  kernel/kexec_core.c | 1534
-rw-r--r--  kernel/kexec_file.c | 1045
-rw-r--r--  kernel/kexec_internal.h | 22
-rw-r--r--  kernel/kmod.c | 100
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/ksysfs.c | 6
-rw-r--r--  kernel/kthread.c | 31
-rw-r--r--  kernel/livepatch/core.c | 6
-rw-r--r--  kernel/locking/Makefile | 4
-rw-r--r--  kernel/locking/percpu-rwsem.c | 13
-rw-r--r--  kernel/locking/qrwlock.c | 47
-rw-r--r--  kernel/locking/qspinlock.c | 6
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 101
-rw-r--r--  kernel/locking/rtmutex-tester.c | 420
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rtmutex_common.h | 22
-rw-r--r--  kernel/membarrier.c | 66
-rw-r--r--  kernel/memremap.c | 190
-rw-r--r--  kernel/module.c | 8
-rw-r--r--  kernel/module_signing.c | 213
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/pid.c | 5
-rw-r--r--  kernel/power/Kconfig | 10
-rw-r--r--  kernel/power/suspend.c | 2
-rw-r--r--  kernel/power/swap.c | 12
-rw-r--r--  kernel/power/wakelock.c | 18
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/profile.c | 8
-rw-r--r--  kernel/ptrace.c | 13
-rw-r--r--  kernel/rcu/rcutorture.c | 42
-rw-r--r--  kernel/rcu/srcu.c | 15
-rw-r--r--  kernel/rcu/tiny.c | 8
-rw-r--r--  kernel/rcu/tree.c | 681
-rw-r--r--  kernel/rcu/tree.h | 96
-rw-r--r--  kernel/rcu/tree_plugin.h | 130
-rw-r--r--  kernel/rcu/tree_trace.c | 19
-rw-r--r--  kernel/rcu/update.c | 90
-rw-r--r--  kernel/reboot.c | 2
-rw-r--r--  kernel/resource.c | 61
-rw-r--r--  kernel/sched/core.c | 127
-rw-r--r--  kernel/sched/cputime.c | 101
-rw-r--r--  kernel/sched/deadline.c | 40
-rw-r--r--  kernel/sched/debug.c | 48
-rw-r--r--  kernel/sched/fair.c | 939
-rw-r--r--  kernel/sched/features.h | 18
-rw-r--r--  kernel/sched/idle.c | 14
-rw-r--r--  kernel/sched/idle_task.c | 1
-rw-r--r--  kernel/sched/rt.c | 42
-rw-r--r--  kernel/sched/sched.h | 39
-rw-r--r--  kernel/sched/stop_task.c | 1
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 17
-rw-r--r--  kernel/signal.c | 13
-rw-r--r--  kernel/smpboot.c | 27
-rw-r--r--  kernel/stop_machine.c | 44
-rw-r--r--  kernel/sys.c | 3
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 12
-rw-r--r--  kernel/system_certificates.S | 20
-rw-r--r--  kernel/system_keyring.c | 106
-rw-r--r--  kernel/task_work.c | 12
-rw-r--r--  kernel/time/Kconfig | 2
-rw-r--r--  kernel/time/hrtimer.c | 36
-rw-r--r--  kernel/time/ntp.c | 5
-rw-r--r--  kernel/time/tick-broadcast-hrtimer.c | 49
-rw-r--r--  kernel/time/tick-broadcast.c | 1
-rw-r--r--  kernel/time/tick-common.c | 4
-rw-r--r--  kernel/time/tick-sched.c | 72
-rw-r--r--  kernel/time/time.c | 53
-rw-r--r--  kernel/time/timekeeping.c | 19
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/time/timer_list.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/blktrace.c | 10
-rw-r--r--  kernel/trace/bpf_trace.c | 63
-rw-r--r--  kernel/trace/ftrace.c | 61
-rw-r--r--  kernel/trace/ring_buffer.c | 764
-rw-r--r--  kernel/trace/trace.c | 4
-rw-r--r--  kernel/trace/trace.h | 1
-rw-r--r--  kernel/trace/trace_branch.c | 17
-rw-r--r--  kernel/trace/trace_events.c | 25
-rw-r--r--  kernel/trace/trace_events_filter.c | 54
-rw-r--r--  kernel/trace/trace_functions_graph.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 20
-rw-r--r--  kernel/trace/trace_output.c | 4
-rw-r--r--  kernel/trace/trace_sched_switch.c | 2
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 2
-rw-r--r--  kernel/trace/trace_stack.c | 68
-rw-r--r--  kernel/trace/trace_uprobe.c | 22
-rw-r--r--  kernel/user_namespace.c | 5
-rw-r--r--  kernel/watchdog.c | 189
-rw-r--r--  kernel/workqueue.c | 28
131 files changed, 7153 insertions, 6014 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 43c4c920f30a..53abf008ecb3 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,16 +45,18 @@ ifneq ($(CONFIG_SMP),y)
45obj-y += up.o 45obj-y += up.o
46endif 46endif
47obj-$(CONFIG_UID16) += uid16.o 47obj-$(CONFIG_UID16) += uid16.o
48obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
49obj-$(CONFIG_MODULES) += module.o 48obj-$(CONFIG_MODULES) += module.o
50obj-$(CONFIG_MODULE_SIG) += module_signing.o 49obj-$(CONFIG_MODULE_SIG) += module_signing.o
51obj-$(CONFIG_KALLSYMS) += kallsyms.o 50obj-$(CONFIG_KALLSYMS) += kallsyms.o
52obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 51obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
52obj-$(CONFIG_KEXEC_CORE) += kexec_core.o
53obj-$(CONFIG_KEXEC) += kexec.o 53obj-$(CONFIG_KEXEC) += kexec.o
54obj-$(CONFIG_KEXEC_FILE) += kexec_file.o
54obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o 55obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
55obj-$(CONFIG_COMPAT) += compat.o 56obj-$(CONFIG_COMPAT) += compat.o
56obj-$(CONFIG_CGROUPS) += cgroup.o 57obj-$(CONFIG_CGROUPS) += cgroup.o
57obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o 58obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
59obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o
58obj-$(CONFIG_CPUSETS) += cpuset.o 60obj-$(CONFIG_CPUSETS) += cpuset.o
59obj-$(CONFIG_UTS_NS) += utsname.o 61obj-$(CONFIG_UTS_NS) += utsname.o
60obj-$(CONFIG_USER_NS) += user_namespace.o 62obj-$(CONFIG_USER_NS) += user_namespace.o
@@ -64,7 +66,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
64obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 66obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
65obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 67obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
66obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 68obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
67obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o 69obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
68obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 70obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
69obj-$(CONFIG_GCOV_KERNEL) += gcov/ 71obj-$(CONFIG_GCOV_KERNEL) += gcov/
70obj-$(CONFIG_KPROBES) += kprobes.o 72obj-$(CONFIG_KPROBES) += kprobes.o
@@ -98,6 +100,9 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
98obj-$(CONFIG_JUMP_LABEL) += jump_label.o 100obj-$(CONFIG_JUMP_LABEL) += jump_label.o
99obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 101obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
100obj-$(CONFIG_TORTURE_TEST) += torture.o 102obj-$(CONFIG_TORTURE_TEST) += torture.o
103obj-$(CONFIG_MEMBARRIER) += membarrier.o
104
105obj-$(CONFIG_HAS_IOMEM) += memremap.o
101 106
102$(obj)/configs.o: $(obj)/config_data.h 107$(obj)/configs.o: $(obj)/config_data.h
103 108
@@ -111,99 +116,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
111targets += config_data.h 116targets += config_data.h
112$(obj)/config_data.h: $(obj)/config_data.gz FORCE 117$(obj)/config_data.h: $(obj)/config_data.gz FORCE
113 $(call filechk,ikconfiggz) 118 $(call filechk,ikconfiggz)
114
115###############################################################################
116#
117# Roll all the X.509 certificates that we can find together and pull them into
118# the kernel so that they get loaded into the system trusted keyring during
119# boot.
120#
121# We look in the source root and the build root for all files whose name ends
122# in ".x509". Unfortunately, this will generate duplicate filenames, so we
123# have make canonicalise the pathnames and then sort them to discard the
124# duplicates.
125#
126###############################################################################
127ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
128X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
129X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
130X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
131 $(or $(realpath $(CERT)),$(CERT))))
132X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
133
134ifeq ($(X509_CERTIFICATES),)
135$(warning *** No X.509 certificates found ***)
136endif
137
138ifneq ($(wildcard $(obj)/.x509.list),)
139ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
140$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
141$(shell rm $(obj)/.x509.list)
142endif
143endif
144
145kernel/system_certificates.o: $(obj)/x509_certificate_list
146
147quiet_cmd_x509certs = CERTS $@
148 cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
149
150targets += $(obj)/x509_certificate_list
151$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
152 $(call if_changed,x509certs)
153
154targets += $(obj)/.x509.list
155$(obj)/.x509.list:
156 @echo $(X509_CERTIFICATES) >$@
157endif
158
159clean-files := x509_certificate_list .x509.list
160
161ifeq ($(CONFIG_MODULE_SIG),y)
162###############################################################################
163#
164# If module signing is requested, say by allyesconfig, but a key has not been
165# supplied, then one will need to be generated to make sure the build does not
166# fail and that the kernel may be used afterwards.
167#
168###############################################################################
169ifndef CONFIG_MODULE_SIG_HASH
170$(error Could not determine digest type to use from kernel config)
171endif
172
173signing_key.priv signing_key.x509: x509.genkey
174 @echo "###"
175 @echo "### Now generating an X.509 key pair to be used for signing modules."
176 @echo "###"
177 @echo "### If this takes a long time, you might wish to run rngd in the"
178 @echo "### background to keep the supply of entropy topped up. It"
179 @echo "### needs to be run as root, and uses a hardware random"
180 @echo "### number generator if one is available."
181 @echo "###"
182 openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
183 -batch -x509 -config x509.genkey \
184 -outform DER -out signing_key.x509 \
185 -keyout signing_key.priv 2>&1
186 @echo "###"
187 @echo "### Key pair generated."
188 @echo "###"
189
190x509.genkey:
191 @echo Generating X.509 key generation config
192 @echo >x509.genkey "[ req ]"
193 @echo >>x509.genkey "default_bits = 4096"
194 @echo >>x509.genkey "distinguished_name = req_distinguished_name"
195 @echo >>x509.genkey "prompt = no"
196 @echo >>x509.genkey "string_mask = utf8only"
197 @echo >>x509.genkey "x509_extensions = myexts"
198 @echo >>x509.genkey
199 @echo >>x509.genkey "[ req_distinguished_name ]"
200 @echo >>x509.genkey "#O = Unspecified company"
201 @echo >>x509.genkey "CN = Build time autogenerated kernel key"
202 @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
203 @echo >>x509.genkey
204 @echo >>x509.genkey "[ myexts ]"
205 @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
206 @echo >>x509.genkey "keyUsage=digitalSignature"
207 @echo >>x509.genkey "subjectKeyIdentifier=hash"
208 @echo >>x509.genkey "authorityKeyIdentifier=keyid"
209endif
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
1761 } else 1761 } else
1762 audit_log_format(ab, " name=(null)"); 1762 audit_log_format(ab, " name=(null)");
1763 1763
1764 if (n->ino != (unsigned long)-1) 1764 if (n->ino != AUDIT_INO_UNSET)
1765 audit_log_format(ab, " inode=%lu" 1765 audit_log_format(ab, " inode=%lu"
1766 " dev=%02x:%02x mode=%#ho" 1766 " dev=%02x:%02x mode=%#ho"
1767 " ouid=%u ogid=%u rdev=%02x:%02x", 1767 " ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state {
50 50
51/* Rule lists */ 51/* Rule lists */
52struct audit_watch; 52struct audit_watch;
53struct audit_fsnotify_mark;
53struct audit_tree; 54struct audit_tree;
54struct audit_chunk; 55struct audit_chunk;
55 56
@@ -252,6 +253,7 @@ struct audit_net {
252extern int selinux_audit_rule_update(void); 253extern int selinux_audit_rule_update(void);
253 254
254extern struct mutex audit_filter_mutex; 255extern struct mutex audit_filter_mutex;
256extern int audit_del_rule(struct audit_entry *);
255extern void audit_free_rule_rcu(struct rcu_head *); 257extern void audit_free_rule_rcu(struct rcu_head *);
256extern struct list_head audit_filter_list[]; 258extern struct list_head audit_filter_list[];
257 259
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
269extern void audit_remove_watch_rule(struct audit_krule *krule); 271extern void audit_remove_watch_rule(struct audit_krule *krule);
270extern char *audit_watch_path(struct audit_watch *watch); 272extern char *audit_watch_path(struct audit_watch *watch);
271extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev); 273extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
274
275extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
276extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
277extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
278extern void audit_remove_mark_rule(struct audit_krule *krule);
279extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
280extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
281extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
282
272#else 283#else
273#define audit_put_watch(w) {} 284#define audit_put_watch(w) {}
274#define audit_get_watch(w) {} 285#define audit_get_watch(w) {}
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
278#define audit_watch_path(w) "" 289#define audit_watch_path(w) ""
279#define audit_watch_compare(w, i, d) 0 290#define audit_watch_compare(w, i, d) 0
280 291
292#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
293#define audit_mark_path(m) ""
294#define audit_remove_mark(m)
295#define audit_remove_mark_rule(k)
296#define audit_mark_compare(m, i, d) 0
297#define audit_exe_compare(t, m) (-EINVAL)
298#define audit_dupe_exe(n, o) (-EINVAL)
281#endif /* CONFIG_AUDIT_WATCH */ 299#endif /* CONFIG_AUDIT_WATCH */
282 300
283#ifdef CONFIG_AUDIT_TREE 301#ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@
1/* audit_fsnotify.c -- tracking inodes
2 *
3 * Copyright 2003-2009,2014-2015 Red Hat, Inc.
4 * Copyright 2005 Hewlett-Packard Development Company, L.P.
5 * Copyright 2005 IBM Corporation
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 */
17
18#include <linux/kernel.h>
19#include <linux/audit.h>
20#include <linux/kthread.h>
21#include <linux/mutex.h>
22#include <linux/fs.h>
23#include <linux/fsnotify_backend.h>
24#include <linux/namei.h>
25#include <linux/netlink.h>
26#include <linux/sched.h>
27#include <linux/slab.h>
28#include <linux/security.h>
29#include "audit.h"
30
31/*
32 * this mark lives on the parent directory of the inode in question.
33 * but dev, ino, and path are about the child
34 */
35struct audit_fsnotify_mark {
36 dev_t dev; /* associated superblock device */
37 unsigned long ino; /* associated inode number */
38 char *path; /* insertion path */
39 struct fsnotify_mark mark; /* fsnotify mark on the inode */
40 struct audit_krule *rule;
41};
42
43/* fsnotify handle. */
44static struct fsnotify_group *audit_fsnotify_group;
45
46/* fsnotify events we care about. */
47#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
48 FS_MOVE_SELF | FS_EVENT_ON_CHILD)
49
50static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
51{
52 kfree(audit_mark->path);
53 kfree(audit_mark);
54}
55
56static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
57{
58 struct audit_fsnotify_mark *audit_mark;
59
60 audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
61 audit_fsnotify_mark_free(audit_mark);
62}
63
64char *audit_mark_path(struct audit_fsnotify_mark *mark)
65{
66 return mark->path;
67}
68
69int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
70{
71 if (mark->ino == AUDIT_INO_UNSET)
72 return 0;
73 return (mark->ino == ino) && (mark->dev == dev);
74}
75
76static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
77 struct inode *inode)
78{
79 audit_mark->dev = inode ? inode->i_sb->s_dev : AUDIT_DEV_UNSET;
80 audit_mark->ino = inode ? inode->i_ino : AUDIT_INO_UNSET;
81}
82
83struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
84{
85 struct audit_fsnotify_mark *audit_mark;
86 struct path path;
87 struct dentry *dentry;
88 struct inode *inode;
89 int ret;
90
91 if (pathname[0] != '/' || pathname[len-1] == '/')
92 return ERR_PTR(-EINVAL);
93
94 dentry = kern_path_locked(pathname, &path);
95 if (IS_ERR(dentry))
96 return (void *)dentry; /* returning an error */
97 inode = path.dentry->d_inode;
98 mutex_unlock(&inode->i_mutex);
99
100 audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
101 if (unlikely(!audit_mark)) {
102 audit_mark = ERR_PTR(-ENOMEM);
103 goto out;
104 }
105
106 fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
107 audit_mark->mark.mask = AUDIT_FS_EVENTS;
108 audit_mark->path = pathname;
109 audit_update_mark(audit_mark, dentry->d_inode);
110 audit_mark->rule = krule;
111
112 ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
113 if (ret < 0) {
114 audit_fsnotify_mark_free(audit_mark);
115 audit_mark = ERR_PTR(ret);
116 }
117out:
118 dput(dentry);
119 path_put(&path);
120 return audit_mark;
121}
122
123static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
124{
125 struct audit_buffer *ab;
126 struct audit_krule *rule = audit_mark->rule;
127
128 if (!audit_enabled)
129 return;
130 ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
131 if (unlikely(!ab))
132 return;
133 audit_log_format(ab, "auid=%u ses=%u op=",
134 from_kuid(&init_user_ns, audit_get_loginuid(current)),
135 audit_get_sessionid(current));
136 audit_log_string(ab, op);
137 audit_log_format(ab, " path=");
138 audit_log_untrustedstring(ab, audit_mark->path);
139 audit_log_key(ab, rule->filterkey);
140 audit_log_format(ab, " list=%d res=1", rule->listnr);
141 audit_log_end(ab);
142}
143
144void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
145{
146 fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
147 fsnotify_put_mark(&audit_mark->mark);
148}
149
150void audit_remove_mark_rule(struct audit_krule *krule)
151{
152 struct audit_fsnotify_mark *mark = krule->exe;
153
154 audit_remove_mark(mark);
155}
156
157static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
158{
159 struct audit_krule *rule = audit_mark->rule;
160 struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
161
162 audit_mark_log_rule_change(audit_mark, "autoremove_rule");
163 audit_del_rule(entry);
164}
165
166/* Update mark data in audit rules based on fsnotify events. */
167static int audit_mark_handle_event(struct fsnotify_group *group,
168 struct inode *to_tell,
169 struct fsnotify_mark *inode_mark,
170 struct fsnotify_mark *vfsmount_mark,
171 u32 mask, void *data, int data_type,
172 const unsigned char *dname, u32 cookie)
173{
174 struct audit_fsnotify_mark *audit_mark;
175 struct inode *inode = NULL;
176
177 audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
178
179 BUG_ON(group != audit_fsnotify_group);
180
181 switch (data_type) {
182 case (FSNOTIFY_EVENT_PATH):
183 inode = ((struct path *)data)->dentry->d_inode;
184 break;
185 case (FSNOTIFY_EVENT_INODE):
186 inode = (struct inode *)data;
187 break;
188 default:
189 BUG();
190 return 0;
191 };
192
193 if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
194 if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
195 return 0;
196 audit_update_mark(audit_mark, inode);
197 } else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
198 audit_autoremove_mark_rule(audit_mark);
199
200 return 0;
201}
202
203static const struct fsnotify_ops audit_mark_fsnotify_ops = {
204 .handle_event = audit_mark_handle_event,
205};
206
207static int __init audit_fsnotify_init(void)
208{
209 audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
210 if (IS_ERR(audit_fsnotify_group)) {
211 audit_fsnotify_group = NULL;
212 audit_panic("cannot create audit fsnotify group");
213 }
214 return 0;
215}
216device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
479 if (rule->tree) { 479 if (rule->tree) {
480 /* not a half-baked one */ 480 /* not a half-baked one */
481 audit_tree_log_remove_rule(rule); 481 audit_tree_log_remove_rule(rule);
482 if (entry->rule.exe)
483 audit_remove_mark(entry->rule.exe);
482 rule->tree = NULL; 484 rule->tree = NULL;
483 list_del_rcu(&entry->list); 485 list_del_rcu(&entry->list);
484 list_del(&entry->rule.list); 486 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
138 138
139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev) 139int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
140{ 140{
141 return (watch->ino != (unsigned long)-1) && 141 return (watch->ino != AUDIT_INO_UNSET) &&
142 (watch->ino == ino) && 142 (watch->ino == ino) &&
143 (watch->dev == dev); 143 (watch->dev == dev);
144} 144}
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
179 INIT_LIST_HEAD(&watch->rules); 179 INIT_LIST_HEAD(&watch->rules);
180 atomic_set(&watch->count, 1); 180 atomic_set(&watch->count, 1);
181 watch->path = path; 181 watch->path = path;
182 watch->dev = (dev_t)-1; 182 watch->dev = AUDIT_DEV_UNSET;
183 watch->ino = (unsigned long)-1; 183 watch->ino = AUDIT_INO_UNSET;
184 184
185 return watch; 185 return watch;
186} 186}
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
203 if (IS_ERR(watch)) 203 if (IS_ERR(watch))
204 return PTR_ERR(watch); 204 return PTR_ERR(watch);
205 205
206 audit_get_watch(watch);
207 krule->watch = watch; 206 krule->watch = watch;
208 207
209 return 0; 208 return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
313 list_replace(&oentry->rule.list, 312 list_replace(&oentry->rule.list,
314 &nentry->rule.list); 313 &nentry->rule.list);
315 } 314 }
315 if (oentry->rule.exe)
316 audit_remove_mark(oentry->rule.exe);
316 317
317 audit_watch_log_rule_change(r, owatch, "updated_rules"); 318 audit_watch_log_rule_change(r, owatch, "updated_rules");
318 319
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 344 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
344 e = container_of(r, struct audit_entry, rule); 345 e = container_of(r, struct audit_entry, rule);
345 audit_watch_log_rule_change(r, w, "remove_rule"); 346 audit_watch_log_rule_change(r, w, "remove_rule");
347 if (e->rule.exe)
348 audit_remove_mark(e->rule.exe);
346 list_del(&r->rlist); 349 list_del(&r->rlist);
347 list_del(&r->list); 350 list_del(&r->list);
348 list_del_rcu(&e->list); 351 list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
387 390
388 watch_found = 1; 391 watch_found = 1;
389 392
390 /* put krule's and initial refs to temporary watch */ 393 /* put krule's ref to temporary watch */
391 audit_put_watch(watch);
392 audit_put_watch(watch); 394 audit_put_watch(watch);
393 395
394 audit_get_watch(w); 396 audit_get_watch(w);
395 krule->watch = watch = w; 397 krule->watch = watch = w;
398
399 audit_put_parent(parent);
396 break; 400 break;
397 } 401 }
398 402
399 if (!watch_found) { 403 if (!watch_found) {
400 audit_get_parent(parent);
401 watch->parent = parent; 404 watch->parent = parent;
402 405
406 audit_get_watch(watch);
403 list_add(&watch->wlist, &parent->watches); 407 list_add(&watch->wlist, &parent->watches);
404 } 408 }
405 list_add(&krule->rlist, &watch->rules); 409 list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
437 441
438 audit_add_to_parent(krule, parent); 442 audit_add_to_parent(krule, parent);
439 443
440 /* match get in audit_find_parent or audit_init_parent */
441 audit_put_parent(parent);
442
443 h = audit_hash_ino((u32)watch->ino); 444 h = audit_hash_ino((u32)watch->ino);
444 *list = &audit_inode_hash[h]; 445 *list = &audit_inode_hash[h];
445error: 446error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
496 if (mask & (FS_CREATE|FS_MOVED_TO) && inode) 497 if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
497 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); 498 audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
498 else if (mask & (FS_DELETE|FS_MOVED_FROM)) 499 else if (mask & (FS_DELETE|FS_MOVED_FROM))
499 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1); 500 audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
500 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) 501 else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
501 audit_remove_parent_watches(parent); 502 audit_remove_parent_watches(parent);
502 503
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
517 return 0; 518 return 0;
518} 519}
519device_initcall(audit_watch_init); 520device_initcall(audit_watch_init);
521
522int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
523{
524 struct audit_fsnotify_mark *audit_mark;
525 char *pathname;
526
527 pathname = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
528 if (!pathname)
529 return -ENOMEM;
530
531 audit_mark = audit_alloc_mark(new, pathname, strlen(pathname));
532 if (IS_ERR(audit_mark)) {
533 kfree(pathname);
534 return PTR_ERR(audit_mark);
535 }
536 new->exe = audit_mark;
537
538 return 0;
539}
540
541int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
542{
543 struct file *exe_file;
544 unsigned long ino;
545 dev_t dev;
546
547 rcu_read_lock();
548 exe_file = rcu_dereference(tsk->mm->exe_file);
549 ino = exe_file->f_inode->i_ino;
550 dev = exe_file->f_inode->i_sb->s_dev;
551 rcu_read_unlock();
552 return audit_mark_compare(mark, ino, dev);
553}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
405 if (f->val > AUDIT_MAX_FIELD_COMPARE) 405 if (f->val > AUDIT_MAX_FIELD_COMPARE)
406 return -EINVAL; 406 return -EINVAL;
407 break; 407 break;
408 case AUDIT_EXE:
409 if (f->op != Audit_equal)
410 return -EINVAL;
411 if (entry->rule.listnr != AUDIT_FILTER_EXIT)
412 return -EINVAL;
413 break;
408 }; 414 };
409 return 0; 415 return 0;
410} 416}
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
419 size_t remain = datasz - sizeof(struct audit_rule_data); 425 size_t remain = datasz - sizeof(struct audit_rule_data);
420 int i; 426 int i;
421 char *str; 427 char *str;
428 struct audit_fsnotify_mark *audit_mark;
422 429
423 entry = audit_to_entry_common(data); 430 entry = audit_to_entry_common(data);
424 if (IS_ERR(entry)) 431 if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
539 entry->rule.buflen += f->val; 546 entry->rule.buflen += f->val;
540 entry->rule.filterkey = str; 547 entry->rule.filterkey = str;
541 break; 548 break;
549 case AUDIT_EXE:
550 if (entry->rule.exe || f->val > PATH_MAX)
551 goto exit_free;
552 str = audit_unpack_string(&bufp, &remain, f->val);
553 if (IS_ERR(str)) {
554 err = PTR_ERR(str);
555 goto exit_free;
556 }
557 entry->rule.buflen += f->val;
558
559 audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
560 if (IS_ERR(audit_mark)) {
561 kfree(str);
562 err = PTR_ERR(audit_mark);
563 goto exit_free;
564 }
565 entry->rule.exe = audit_mark;
566 break;
542 } 567 }
543 } 568 }
544 569
@@ -549,10 +574,10 @@ exit_nofree:
549 return entry; 574 return entry;
550 575
551exit_free: 576exit_free:
552 if (entry->rule.watch)
553 audit_put_watch(entry->rule.watch); /* matches initial get */
554 if (entry->rule.tree) 577 if (entry->rule.tree)
555 audit_put_tree(entry->rule.tree); /* that's the temporary one */ 578 audit_put_tree(entry->rule.tree); /* that's the temporary one */
579 if (entry->rule.exe)
580 audit_remove_mark(entry->rule.exe); /* that's the template one */
556 audit_free_rule(entry); 581 audit_free_rule(entry);
557 return ERR_PTR(err); 582 return ERR_PTR(err);
558} 583}
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
617 data->buflen += data->values[i] = 642 data->buflen += data->values[i] =
618 audit_pack_string(&bufp, krule->filterkey); 643 audit_pack_string(&bufp, krule->filterkey);
619 break; 644 break;
645 case AUDIT_EXE:
646 data->buflen += data->values[i] =
647 audit_pack_string(&bufp, audit_mark_path(krule->exe));
648 break;
620 case AUDIT_LOGINUID_SET: 649 case AUDIT_LOGINUID_SET:
621 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) { 650 if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
622 data->fields[i] = AUDIT_LOGINUID; 651 data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
680 if (strcmp(a->filterkey, b->filterkey)) 709 if (strcmp(a->filterkey, b->filterkey))
681 return 1; 710 return 1;
682 break; 711 break;
712 case AUDIT_EXE:
713 /* both paths exist based on above type compare */
714 if (strcmp(audit_mark_path(a->exe),
715 audit_mark_path(b->exe)))
716 return 1;
717 break;
683 case AUDIT_UID: 718 case AUDIT_UID:
684 case AUDIT_EUID: 719 case AUDIT_EUID:
685 case AUDIT_SUID: 720 case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
801 err = -ENOMEM; 836 err = -ENOMEM;
802 else 837 else
803 new->filterkey = fk; 838 new->filterkey = fk;
839 break;
840 case AUDIT_EXE:
841 err = audit_dupe_exe(new, old);
842 break;
804 } 843 }
805 if (err) { 844 if (err) {
845 if (new->exe)
846 audit_remove_mark(new->exe);
806 audit_free_rule(entry); 847 audit_free_rule(entry);
807 return ERR_PTR(err); 848 return ERR_PTR(err);
808 } 849 }
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
863 struct audit_watch *watch = entry->rule.watch; 904 struct audit_watch *watch = entry->rule.watch;
864 struct audit_tree *tree = entry->rule.tree; 905 struct audit_tree *tree = entry->rule.tree;
865 struct list_head *list; 906 struct list_head *list;
866 int err; 907 int err = 0;
867#ifdef CONFIG_AUDITSYSCALL 908#ifdef CONFIG_AUDITSYSCALL
868 int dont_count = 0; 909 int dont_count = 0;
869 910
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
881 /* normally audit_add_tree_rule() will free it on failure */ 922 /* normally audit_add_tree_rule() will free it on failure */
882 if (tree) 923 if (tree)
883 audit_put_tree(tree); 924 audit_put_tree(tree);
884 goto error; 925 return err;
885 } 926 }
886 927
887 if (watch) { 928 if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
895 */ 936 */
896 if (tree) 937 if (tree)
897 audit_put_tree(tree); 938 audit_put_tree(tree);
898 goto error; 939 return err;
899 } 940 }
900 } 941 }
901 if (tree) { 942 if (tree) {
902 err = audit_add_tree_rule(&entry->rule); 943 err = audit_add_tree_rule(&entry->rule);
903 if (err) { 944 if (err) {
904 mutex_unlock(&audit_filter_mutex); 945 mutex_unlock(&audit_filter_mutex);
905 goto error; 946 return err;
906 } 947 }
907 } 948 }
908 949
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
933#endif 974#endif
934 mutex_unlock(&audit_filter_mutex); 975 mutex_unlock(&audit_filter_mutex);
935 976
936 return 0;
937
938error:
939 if (watch)
940 audit_put_watch(watch); /* tmp watch, matches initial get */
941 return err; 977 return err;
942} 978}
943 979
944/* Remove an existing rule from filterlist. */ 980/* Remove an existing rule from filterlist. */
945static inline int audit_del_rule(struct audit_entry *entry) 981int audit_del_rule(struct audit_entry *entry)
946{ 982{
947 struct audit_entry *e; 983 struct audit_entry *e;
948 struct audit_watch *watch = entry->rule.watch;
949 struct audit_tree *tree = entry->rule.tree; 984 struct audit_tree *tree = entry->rule.tree;
950 struct list_head *list; 985 struct list_head *list;
951 int ret = 0; 986 int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
961 mutex_lock(&audit_filter_mutex); 996 mutex_lock(&audit_filter_mutex);
962 e = audit_find_rule(entry, &list); 997 e = audit_find_rule(entry, &list);
963 if (!e) { 998 if (!e) {
964 mutex_unlock(&audit_filter_mutex);
965 ret = -ENOENT; 999 ret = -ENOENT;
966 goto out; 1000 goto out;
967 } 1001 }
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
972 if (e->rule.tree) 1006 if (e->rule.tree)
973 audit_remove_tree_rule(&e->rule); 1007 audit_remove_tree_rule(&e->rule);
974 1008
975 list_del_rcu(&e->list); 1009 if (e->rule.exe)
976 list_del(&e->rule.list); 1010 audit_remove_mark_rule(&e->rule);
977 call_rcu(&e->rcu, audit_free_rule_rcu);
978 1011
979#ifdef CONFIG_AUDITSYSCALL 1012#ifdef CONFIG_AUDITSYSCALL
980 if (!dont_count) 1013 if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
983 if (!audit_match_signal(entry)) 1016 if (!audit_match_signal(entry))
984 audit_signals--; 1017 audit_signals--;
985#endif 1018#endif
986 mutex_unlock(&audit_filter_mutex); 1019
1020 list_del_rcu(&e->list);
1021 list_del(&e->rule.list);
1022 call_rcu(&e->rcu, audit_free_rule_rcu);
987 1023
988out: 1024out:
989 if (watch) 1025 mutex_unlock(&audit_filter_mutex);
990 audit_put_watch(watch); /* match initial get */ 1026
991 if (tree) 1027 if (tree)
992 audit_put_tree(tree); /* that's the temporary one */ 1028 audit_put_tree(tree); /* that's the temporary one */
993 1029
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
1077 WARN_ON(1); 1113 WARN_ON(1);
1078 } 1114 }
1079 1115
1080 if (err || type == AUDIT_DEL_RULE) 1116 if (err || type == AUDIT_DEL_RULE) {
1117 if (entry->rule.exe)
1118 audit_remove_mark(entry->rule.exe);
1081 audit_free_rule(entry); 1119 audit_free_rule(entry);
1120 }
1082 1121
1083 return err; 1122 return err;
1084} 1123}
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
1370 return 0; 1409 return 0;
1371 1410
1372 nentry = audit_dupe_rule(r); 1411 nentry = audit_dupe_rule(r);
1412 if (entry->rule.exe)
1413 audit_remove_mark(entry->rule.exe);
1373 if (IS_ERR(nentry)) { 1414 if (IS_ERR(nentry)) {
1374 /* save the first error encountered for the 1415 /* save the first error encountered for the
1375 * return value */ 1416 * return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
180 return 0; 180 return 0;
181 181
182 list_for_each_entry(n, &ctx->names_list, list) { 182 list_for_each_entry(n, &ctx->names_list, list) {
183 if ((n->ino != -1) && 183 if ((n->ino != AUDIT_INO_UNSET) &&
184 ((n->mode & S_IFMT) == mode)) 184 ((n->mode & S_IFMT) == mode))
185 return 1; 185 return 1;
186 } 186 }
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
466 result = audit_comparator(ctx->ppid, f->op, f->val); 466 result = audit_comparator(ctx->ppid, f->op, f->val);
467 } 467 }
468 break; 468 break;
469 case AUDIT_EXE:
470 result = audit_exe_compare(tsk, rule->exe);
471 break;
469 case AUDIT_UID: 472 case AUDIT_UID:
470 result = audit_uid_comparator(cred->uid, f->op, f->uid); 473 result = audit_uid_comparator(cred->uid, f->op, f->uid);
471 break; 474 break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
1680 aname->should_free = true; 1683 aname->should_free = true;
1681 } 1684 }
1682 1685
1683 aname->ino = (unsigned long)-1; 1686 aname->ino = AUDIT_INO_UNSET;
1684 aname->type = type; 1687 aname->type = type;
1685 list_add_tail(&aname->list, &context->names_list); 1688 list_add_tail(&aname->list, &context->names_list);
1686 1689
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
1922 if (inode) 1925 if (inode)
1923 audit_copy_inode(found_child, dentry, inode); 1926 audit_copy_inode(found_child, dentry, inode);
1924 else 1927 else
1925 found_child->ino = (unsigned long)-1; 1928 found_child->ino = AUDIT_INO_UNSET;
1926} 1929}
1927EXPORT_SYMBOL_GPL(__audit_inode_child); 1930EXPORT_SYMBOL_GPL(__audit_inode_child);
1928 1931
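
Taken together, the audit changes above introduce the AUDIT_EXE rule field: an fsnotify mark tracks the named executable and syscall-exit events are matched against it via audit_exe_compare(). For orientation only, a userspace sketch of installing such a rule follows. It assumes libaudit's rule-building helpers (audit_rule_syscallbyname_data(), audit_rule_fieldpair_data(), audit_add_rule_data()) behave as auditctl uses them, an audit userspace new enough to recognise the "exe" field, and an arbitrary example path; verify the signatures against your libaudit before relying on it.

/* Hedged sketch: roughly "-a always,exit -S all -F exe=/usr/bin/passwd". */
#include <stdlib.h>
#include <libaudit.h>

int main(void)
{
	struct audit_rule_data *rule;
	int fd, rc;

	fd = audit_open();			/* NETLINK_AUDIT socket */
	if (fd < 0)
		return 1;

	rule = calloc(1, sizeof(*rule));
	if (!rule) {
		audit_close(fd);
		return 1;
	}

	audit_rule_syscallbyname_data(rule, "all");	/* -S all */
	/* AUDIT_EXE only accepts '==' and the exit filter list; see
	 * audit_field_valid() in the auditfilter.c hunk above. */
	rc = audit_rule_fieldpair_data(&rule, "exe=/usr/bin/passwd",
				       AUDIT_FILTER_EXIT);
	if (rc == 0)
		rc = audit_add_rule_data(fd, rule, AUDIT_FILTER_EXIT,
					 AUDIT_ALWAYS);

	free(rule);
	audit_close(fd);
	return rc < 0 ? 1 : 0;
}
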
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index cb31229a6fa4..29ace107f236 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -150,15 +150,15 @@ static int __init register_array_map(void)
150} 150}
151late_initcall(register_array_map); 151late_initcall(register_array_map);
152 152
153static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) 153static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
154{ 154{
155 /* only bpf_prog file descriptors can be stored in prog_array map */ 155 /* only file descriptors can be stored in this type of map */
156 if (attr->value_size != sizeof(u32)) 156 if (attr->value_size != sizeof(u32))
157 return ERR_PTR(-EINVAL); 157 return ERR_PTR(-EINVAL);
158 return array_map_alloc(attr); 158 return array_map_alloc(attr);
159} 159}
160 160
161static void prog_array_map_free(struct bpf_map *map) 161static void fd_array_map_free(struct bpf_map *map)
162{ 162{
163 struct bpf_array *array = container_of(map, struct bpf_array, map); 163 struct bpf_array *array = container_of(map, struct bpf_array, map);
164 int i; 164 int i;
@@ -167,21 +167,21 @@ static void prog_array_map_free(struct bpf_map *map)
167 167
168 /* make sure it's empty */ 168 /* make sure it's empty */
169 for (i = 0; i < array->map.max_entries; i++) 169 for (i = 0; i < array->map.max_entries; i++)
170 BUG_ON(array->prog[i] != NULL); 170 BUG_ON(array->ptrs[i] != NULL);
171 kvfree(array); 171 kvfree(array);
172} 172}
173 173
174static void *prog_array_map_lookup_elem(struct bpf_map *map, void *key) 174static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
175{ 175{
176 return NULL; 176 return NULL;
177} 177}
178 178
179/* only called from syscall */ 179/* only called from syscall */
180static int prog_array_map_update_elem(struct bpf_map *map, void *key, 180static int fd_array_map_update_elem(struct bpf_map *map, void *key,
181 void *value, u64 map_flags) 181 void *value, u64 map_flags)
182{ 182{
183 struct bpf_array *array = container_of(map, struct bpf_array, map); 183 struct bpf_array *array = container_of(map, struct bpf_array, map);
184 struct bpf_prog *prog, *old_prog; 184 void *new_ptr, *old_ptr;
185 u32 index = *(u32 *)key, ufd; 185 u32 index = *(u32 *)key, ufd;
186 186
187 if (map_flags != BPF_ANY) 187 if (map_flags != BPF_ANY)
@@ -191,57 +191,75 @@ static int prog_array_map_update_elem(struct bpf_map *map, void *key,
191 return -E2BIG; 191 return -E2BIG;
192 192
193 ufd = *(u32 *)value; 193 ufd = *(u32 *)value;
194 prog = bpf_prog_get(ufd); 194 new_ptr = map->ops->map_fd_get_ptr(map, ufd);
195 if (IS_ERR(prog)) 195 if (IS_ERR(new_ptr))
196 return PTR_ERR(prog); 196 return PTR_ERR(new_ptr);
197
198 if (!bpf_prog_array_compatible(array, prog)) {
199 bpf_prog_put(prog);
200 return -EINVAL;
201 }
202 197
203 old_prog = xchg(array->prog + index, prog); 198 old_ptr = xchg(array->ptrs + index, new_ptr);
204 if (old_prog) 199 if (old_ptr)
205 bpf_prog_put_rcu(old_prog); 200 map->ops->map_fd_put_ptr(old_ptr);
206 201
207 return 0; 202 return 0;
208} 203}
209 204
210static int prog_array_map_delete_elem(struct bpf_map *map, void *key) 205static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
211{ 206{
212 struct bpf_array *array = container_of(map, struct bpf_array, map); 207 struct bpf_array *array = container_of(map, struct bpf_array, map);
213 struct bpf_prog *old_prog; 208 void *old_ptr;
214 u32 index = *(u32 *)key; 209 u32 index = *(u32 *)key;
215 210
216 if (index >= array->map.max_entries) 211 if (index >= array->map.max_entries)
217 return -E2BIG; 212 return -E2BIG;
218 213
219 old_prog = xchg(array->prog + index, NULL); 214 old_ptr = xchg(array->ptrs + index, NULL);
220 if (old_prog) { 215 if (old_ptr) {
221 bpf_prog_put_rcu(old_prog); 216 map->ops->map_fd_put_ptr(old_ptr);
222 return 0; 217 return 0;
223 } else { 218 } else {
224 return -ENOENT; 219 return -ENOENT;
225 } 220 }
226} 221}
227 222
223static void *prog_fd_array_get_ptr(struct bpf_map *map, int fd)
224{
225 struct bpf_array *array = container_of(map, struct bpf_array, map);
226 struct bpf_prog *prog = bpf_prog_get(fd);
227 if (IS_ERR(prog))
228 return prog;
229
230 if (!bpf_prog_array_compatible(array, prog)) {
231 bpf_prog_put(prog);
232 return ERR_PTR(-EINVAL);
233 }
234 return prog;
235}
236
237static void prog_fd_array_put_ptr(void *ptr)
238{
239 struct bpf_prog *prog = ptr;
240
241 bpf_prog_put_rcu(prog);
242}
243
228/* decrement refcnt of all bpf_progs that are stored in this map */ 244/* decrement refcnt of all bpf_progs that are stored in this map */
229void bpf_prog_array_map_clear(struct bpf_map *map) 245void bpf_fd_array_map_clear(struct bpf_map *map)
230{ 246{
231 struct bpf_array *array = container_of(map, struct bpf_array, map); 247 struct bpf_array *array = container_of(map, struct bpf_array, map);
232 int i; 248 int i;
233 249
234 for (i = 0; i < array->map.max_entries; i++) 250 for (i = 0; i < array->map.max_entries; i++)
235 prog_array_map_delete_elem(map, &i); 251 fd_array_map_delete_elem(map, &i);
236} 252}
237 253
238static const struct bpf_map_ops prog_array_ops = { 254static const struct bpf_map_ops prog_array_ops = {
239 .map_alloc = prog_array_map_alloc, 255 .map_alloc = fd_array_map_alloc,
240 .map_free = prog_array_map_free, 256 .map_free = fd_array_map_free,
241 .map_get_next_key = array_map_get_next_key, 257 .map_get_next_key = array_map_get_next_key,
242 .map_lookup_elem = prog_array_map_lookup_elem, 258 .map_lookup_elem = fd_array_map_lookup_elem,
243 .map_update_elem = prog_array_map_update_elem, 259 .map_update_elem = fd_array_map_update_elem,
244 .map_delete_elem = prog_array_map_delete_elem, 260 .map_delete_elem = fd_array_map_delete_elem,
261 .map_fd_get_ptr = prog_fd_array_get_ptr,
262 .map_fd_put_ptr = prog_fd_array_put_ptr,
245}; 263};
246 264
247static struct bpf_map_type_list prog_array_type __read_mostly = { 265static struct bpf_map_type_list prog_array_type __read_mostly = {
@@ -255,3 +273,60 @@ static int __init register_prog_array_map(void)
255 return 0; 273 return 0;
256} 274}
257late_initcall(register_prog_array_map); 275late_initcall(register_prog_array_map);
276
277static void perf_event_array_map_free(struct bpf_map *map)
278{
279 bpf_fd_array_map_clear(map);
280 fd_array_map_free(map);
281}
282
283static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
284{
285 struct perf_event *event;
286 const struct perf_event_attr *attr;
287
288 event = perf_event_get(fd);
289 if (IS_ERR(event))
290 return event;
291
292 attr = perf_event_attrs(event);
293 if (IS_ERR(attr))
294 return (void *)attr;
295
296 if (attr->type != PERF_TYPE_RAW &&
297 attr->type != PERF_TYPE_HARDWARE) {
298 perf_event_release_kernel(event);
299 return ERR_PTR(-EINVAL);
300 }
301 return event;
302}
303
304static void perf_event_fd_array_put_ptr(void *ptr)
305{
306 struct perf_event *event = ptr;
307
308 perf_event_release_kernel(event);
309}
310
311static const struct bpf_map_ops perf_event_array_ops = {
312 .map_alloc = fd_array_map_alloc,
313 .map_free = perf_event_array_map_free,
314 .map_get_next_key = array_map_get_next_key,
315 .map_lookup_elem = fd_array_map_lookup_elem,
316 .map_update_elem = fd_array_map_update_elem,
317 .map_delete_elem = fd_array_map_delete_elem,
318 .map_fd_get_ptr = perf_event_fd_array_get_ptr,
319 .map_fd_put_ptr = perf_event_fd_array_put_ptr,
320};
321
322static struct bpf_map_type_list perf_event_array_type __read_mostly = {
323 .ops = &perf_event_array_ops,
324 .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
325};
326
327static int __init register_perf_event_array_map(void)
328{
329 bpf_register_map_type(&perf_event_array_type);
330 return 0;
331}
332late_initcall(register_perf_event_array_map);
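
The new BPF_MAP_TYPE_PERF_EVENT_ARRAY reuses the fd-array infrastructure above to hold perf event file descriptors, which a program can then sample with the bpf_perf_event_read() helper. A rough kernel-side sketch in the samples/bpf style is below; the map name, section name and sizes are illustrative, and the declarations from samples/bpf bpf_helpers.h are assumed. The loader is expected to open one hardware/raw perf event per CPU and store each fd at index == cpu.

#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") counter_map = {
	.type        = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size    = sizeof(int),
	.value_size  = sizeof(u32),
	.max_entries = 32,		/* >= number of possible CPUs */
};

SEC("kprobe/sys_write")
int count_sys_write(struct pt_regs *ctx)
{
	char fmt[] = "cpu %d counter %llu\n";
	u32 cpu = bpf_get_smp_processor_id();
	u64 count;

	/* Only legal because counter_map is a PERF_EVENT_ARRAY; the verifier
	 * rejects this helper for any other map type (see verifier.c below). */
	count = bpf_perf_event_read(&counter_map, cpu);
	bpf_trace_printk(fmt, sizeof(fmt), cpu, count);

	return 0;
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
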
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c5bedc82bc1c..67c380cfa9ca 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -177,6 +177,7 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
177{ 177{
178 return 0; 178 return 0;
179} 179}
180EXPORT_SYMBOL_GPL(__bpf_call_base);
180 181
181/** 182/**
182 * __bpf_prog_run - run eBPF program on a given context 183 * __bpf_prog_run - run eBPF program on a given context
@@ -449,11 +450,15 @@ select_insn:
449 450
450 tail_call_cnt++; 451 tail_call_cnt++;
451 452
452 prog = READ_ONCE(array->prog[index]); 453 prog = READ_ONCE(array->ptrs[index]);
453 if (unlikely(!prog)) 454 if (unlikely(!prog))
454 goto out; 455 goto out;
455 456
456 ARG1 = BPF_R1; 457 /* ARG1 at this point is guaranteed to point to CTX from
458 * the verifier side due to the fact that the tail call is
 459 * handled like a helper, that is, bpf_tail_call_proto,
460 * where arg1_type is ARG_PTR_TO_CTX.
461 */
457 insn = prog->insnsi; 462 insn = prog->insnsi;
458 goto select_insn; 463 goto select_insn;
459out: 464out:
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index a1b14d197a4f..35bac8e8b071 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -72,7 +72,7 @@ static int bpf_map_release(struct inode *inode, struct file *filp)
72 /* prog_array stores refcnt-ed bpf_prog pointers 72 /* prog_array stores refcnt-ed bpf_prog pointers
73 * release them all when user space closes prog_array_fd 73 * release them all when user space closes prog_array_fd
74 */ 74 */
75 bpf_prog_array_map_clear(map); 75 bpf_fd_array_map_clear(map);
76 76
77 bpf_map_put(map); 77 bpf_map_put(map);
78 return 0; 78 return 0;
@@ -155,14 +155,15 @@ static int map_lookup_elem(union bpf_attr *attr)
155 void __user *ukey = u64_to_ptr(attr->key); 155 void __user *ukey = u64_to_ptr(attr->key);
156 void __user *uvalue = u64_to_ptr(attr->value); 156 void __user *uvalue = u64_to_ptr(attr->value);
157 int ufd = attr->map_fd; 157 int ufd = attr->map_fd;
158 struct fd f = fdget(ufd);
159 struct bpf_map *map; 158 struct bpf_map *map;
160 void *key, *value, *ptr; 159 void *key, *value, *ptr;
160 struct fd f;
161 int err; 161 int err;
162 162
163 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) 163 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
164 return -EINVAL; 164 return -EINVAL;
165 165
166 f = fdget(ufd);
166 map = bpf_map_get(f); 167 map = bpf_map_get(f);
167 if (IS_ERR(map)) 168 if (IS_ERR(map))
168 return PTR_ERR(map); 169 return PTR_ERR(map);
@@ -213,14 +214,15 @@ static int map_update_elem(union bpf_attr *attr)
213 void __user *ukey = u64_to_ptr(attr->key); 214 void __user *ukey = u64_to_ptr(attr->key);
214 void __user *uvalue = u64_to_ptr(attr->value); 215 void __user *uvalue = u64_to_ptr(attr->value);
215 int ufd = attr->map_fd; 216 int ufd = attr->map_fd;
216 struct fd f = fdget(ufd);
217 struct bpf_map *map; 217 struct bpf_map *map;
218 void *key, *value; 218 void *key, *value;
219 struct fd f;
219 int err; 220 int err;
220 221
221 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM)) 222 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
222 return -EINVAL; 223 return -EINVAL;
223 224
225 f = fdget(ufd);
224 map = bpf_map_get(f); 226 map = bpf_map_get(f);
225 if (IS_ERR(map)) 227 if (IS_ERR(map))
226 return PTR_ERR(map); 228 return PTR_ERR(map);
@@ -265,14 +267,15 @@ static int map_delete_elem(union bpf_attr *attr)
265{ 267{
266 void __user *ukey = u64_to_ptr(attr->key); 268 void __user *ukey = u64_to_ptr(attr->key);
267 int ufd = attr->map_fd; 269 int ufd = attr->map_fd;
268 struct fd f = fdget(ufd);
269 struct bpf_map *map; 270 struct bpf_map *map;
271 struct fd f;
270 void *key; 272 void *key;
271 int err; 273 int err;
272 274
273 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM)) 275 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
274 return -EINVAL; 276 return -EINVAL;
275 277
278 f = fdget(ufd);
276 map = bpf_map_get(f); 279 map = bpf_map_get(f);
277 if (IS_ERR(map)) 280 if (IS_ERR(map))
278 return PTR_ERR(map); 281 return PTR_ERR(map);
@@ -305,14 +308,15 @@ static int map_get_next_key(union bpf_attr *attr)
305 void __user *ukey = u64_to_ptr(attr->key); 308 void __user *ukey = u64_to_ptr(attr->key);
306 void __user *unext_key = u64_to_ptr(attr->next_key); 309 void __user *unext_key = u64_to_ptr(attr->next_key);
307 int ufd = attr->map_fd; 310 int ufd = attr->map_fd;
308 struct fd f = fdget(ufd);
309 struct bpf_map *map; 311 struct bpf_map *map;
310 void *key, *next_key; 312 void *key, *next_key;
313 struct fd f;
311 int err; 314 int err;
312 315
313 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY)) 316 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
314 return -EINVAL; 317 return -EINVAL;
315 318
319 f = fdget(ufd);
316 map = bpf_map_get(f); 320 map = bpf_map_get(f);
317 if (IS_ERR(map)) 321 if (IS_ERR(map))
318 return PTR_ERR(map); 322 return PTR_ERR(map);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 039d866fd36a..b074b23000d6 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -238,6 +238,14 @@ static const char * const reg_type_str[] = {
238 [CONST_IMM] = "imm", 238 [CONST_IMM] = "imm",
239}; 239};
240 240
241static const struct {
242 int map_type;
243 int func_id;
244} func_limit[] = {
245 {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call},
246 {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read},
247};
248
241static void print_verifier_state(struct verifier_env *env) 249static void print_verifier_state(struct verifier_env *env)
242{ 250{
243 enum bpf_reg_type t; 251 enum bpf_reg_type t;
@@ -275,7 +283,7 @@ static const char *const bpf_class_string[] = {
275 [BPF_ALU64] = "alu64", 283 [BPF_ALU64] = "alu64",
276}; 284};
277 285
278static const char *const bpf_alu_string[] = { 286static const char *const bpf_alu_string[16] = {
279 [BPF_ADD >> 4] = "+=", 287 [BPF_ADD >> 4] = "+=",
280 [BPF_SUB >> 4] = "-=", 288 [BPF_SUB >> 4] = "-=",
281 [BPF_MUL >> 4] = "*=", 289 [BPF_MUL >> 4] = "*=",
@@ -299,7 +307,7 @@ static const char *const bpf_ldst_string[] = {
299 [BPF_DW >> 3] = "u64", 307 [BPF_DW >> 3] = "u64",
300}; 308};
301 309
302static const char *const bpf_jmp_string[] = { 310static const char *const bpf_jmp_string[16] = {
303 [BPF_JA >> 4] = "jmp", 311 [BPF_JA >> 4] = "jmp",
304 [BPF_JEQ >> 4] = "==", 312 [BPF_JEQ >> 4] = "==",
305 [BPF_JGT >> 4] = ">", 313 [BPF_JGT >> 4] = ">",
@@ -648,6 +656,9 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
648 struct verifier_state *state = &env->cur_state; 656 struct verifier_state *state = &env->cur_state;
649 int size, err = 0; 657 int size, err = 0;
650 658
659 if (state->regs[regno].type == PTR_TO_STACK)
660 off += state->regs[regno].imm;
661
651 size = bpf_size_to_bytes(bpf_size); 662 size = bpf_size_to_bytes(bpf_size);
652 if (size < 0) 663 if (size < 0)
653 return size; 664 return size;
@@ -667,7 +678,8 @@ static int check_mem_access(struct verifier_env *env, u32 regno, int off,
667 if (!err && t == BPF_READ && value_regno >= 0) 678 if (!err && t == BPF_READ && value_regno >= 0)
668 mark_reg_unknown_value(state->regs, value_regno); 679 mark_reg_unknown_value(state->regs, value_regno);
669 680
670 } else if (state->regs[regno].type == FRAME_PTR) { 681 } else if (state->regs[regno].type == FRAME_PTR ||
682 state->regs[regno].type == PTR_TO_STACK) {
671 if (off >= 0 || off < -MAX_BPF_STACK) { 683 if (off >= 0 || off < -MAX_BPF_STACK) {
672 verbose("invalid stack off=%d size=%d\n", off, size); 684 verbose("invalid stack off=%d size=%d\n", off, size);
673 return -EACCES; 685 return -EACCES;
@@ -833,6 +845,28 @@ static int check_func_arg(struct verifier_env *env, u32 regno,
833 return err; 845 return err;
834} 846}
835 847
848static int check_map_func_compatibility(struct bpf_map *map, int func_id)
849{
850 bool bool_map, bool_func;
851 int i;
852
853 if (!map)
854 return 0;
855
856 for (i = 0; i < ARRAY_SIZE(func_limit); i++) {
857 bool_map = (map->map_type == func_limit[i].map_type);
858 bool_func = (func_id == func_limit[i].func_id);
 859 /* only when the map and func pair match can it continue.
860 * don't allow any other map type to be passed into
861 * the special func;
862 */
863 if (bool_map != bool_func)
864 return -EINVAL;
865 }
866
867 return 0;
868}
869
836static int check_call(struct verifier_env *env, int func_id) 870static int check_call(struct verifier_env *env, int func_id)
837{ 871{
838 struct verifier_state *state = &env->cur_state; 872 struct verifier_state *state = &env->cur_state;
@@ -908,21 +942,9 @@ static int check_call(struct verifier_env *env, int func_id)
908 return -EINVAL; 942 return -EINVAL;
909 } 943 }
910 944
911 if (map && map->map_type == BPF_MAP_TYPE_PROG_ARRAY && 945 err = check_map_func_compatibility(map, func_id);
912 func_id != BPF_FUNC_tail_call) 946 if (err)
913 /* prog_array map type needs extra care: 947 return err;
914 * only allow to pass it into bpf_tail_call() for now.
915 * bpf_map_delete_elem() can be allowed in the future,
916 * while bpf_map_update_elem() must only be done via syscall
917 */
918 return -EINVAL;
919
920 if (func_id == BPF_FUNC_tail_call &&
921 map->map_type != BPF_MAP_TYPE_PROG_ARRAY)
922 /* don't allow any other map type to be passed into
923 * bpf_tail_call()
924 */
925 return -EINVAL;
926 948
927 return 0; 949 return 0;
928} 950}
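
The func_limit[] table generalises the old prog-array restriction: a map type listed there may only be passed to its paired helper, and that helper only accepts that map type. A short sketch of the effect, again in the samples/bpf style with illustrative names and the bpf_helpers.h declarations assumed:

#include <linux/ptrace.h>
#include <linux/version.h>
#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") jmp_table = {
	.type        = BPF_MAP_TYPE_PROG_ARRAY,
	.key_size    = sizeof(u32),
	.value_size  = sizeof(u32),
	.max_entries = 8,
};

SEC("kprobe/sys_open")
int dispatch(struct pt_regs *ctx)
{
	/* Allowed pairing: PROG_ARRAY with bpf_tail_call(). */
	bpf_tail_call(ctx, &jmp_table, 1);

	/* By contrast, bpf_perf_event_read(&jmp_table, 1) -- or passing a
	 * PERF_EVENT_ARRAY to bpf_tail_call() -- now fails verification in
	 * check_map_func_compatibility(). */
	return 0;
}

char _license[] SEC("license") = "GPL";
u32 _version SEC("version") = LINUX_VERSION_CODE;
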
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f89d9292eee6..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -107,8 +107,8 @@ static DEFINE_SPINLOCK(release_agent_path_lock);
107struct percpu_rw_semaphore cgroup_threadgroup_rwsem; 107struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
108 108
109#define cgroup_assert_mutex_or_rcu_locked() \ 109#define cgroup_assert_mutex_or_rcu_locked() \
110 rcu_lockdep_assert(rcu_read_lock_held() || \ 110 RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \
111 lockdep_is_held(&cgroup_mutex), \ 111 !lockdep_is_held(&cgroup_mutex), \
112 "cgroup_mutex or RCU read lock required"); 112 "cgroup_mutex or RCU read lock required");
113 113
114/* 114/*
@@ -145,6 +145,7 @@ static const char *cgroup_subsys_name[] = {
145 * part of that cgroup. 145 * part of that cgroup.
146 */ 146 */
147struct cgroup_root cgrp_dfl_root; 147struct cgroup_root cgrp_dfl_root;
148EXPORT_SYMBOL_GPL(cgrp_dfl_root);
148 149
149/* 150/*
150 * The default hierarchy always exists but is hidden until mounted for the 151 * The default hierarchy always exists but is hidden until mounted for the
@@ -186,6 +187,9 @@ static u64 css_serial_nr_next = 1;
186static unsigned long have_fork_callback __read_mostly; 187static unsigned long have_fork_callback __read_mostly;
187static unsigned long have_exit_callback __read_mostly; 188static unsigned long have_exit_callback __read_mostly;
188 189
190/* Ditto for the can_fork callback. */
191static unsigned long have_canfork_callback __read_mostly;
192
189static struct cftype cgroup_dfl_base_files[]; 193static struct cftype cgroup_dfl_base_files[];
190static struct cftype cgroup_legacy_base_files[]; 194static struct cftype cgroup_legacy_base_files[];
191 195
@@ -207,7 +211,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
207 211
208 idr_preload(gfp_mask); 212 idr_preload(gfp_mask);
209 spin_lock_bh(&cgroup_idr_lock); 213 spin_lock_bh(&cgroup_idr_lock);
210 ret = idr_alloc(idr, ptr, start, end, gfp_mask); 214 ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT);
211 spin_unlock_bh(&cgroup_idr_lock); 215 spin_unlock_bh(&cgroup_idr_lock);
212 idr_preload_end(); 216 idr_preload_end();
213 return ret; 217 return ret;
@@ -1027,10 +1031,13 @@ static const struct file_operations proc_cgroupstats_operations;
1027static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, 1031static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft,
1028 char *buf) 1032 char *buf)
1029{ 1033{
1034 struct cgroup_subsys *ss = cft->ss;
1035
1030 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && 1036 if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) &&
1031 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) 1037 !(cgrp->root->flags & CGRP_ROOT_NOPREFIX))
1032 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s", 1038 snprintf(buf, CGROUP_FILE_NAME_MAX, "%s.%s",
1033 cft->ss->name, cft->name); 1039 cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name,
1040 cft->name);
1034 else 1041 else
1035 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); 1042 strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX);
1036 return buf; 1043 return buf;
@@ -1332,9 +1339,10 @@ static int cgroup_show_options(struct seq_file *seq,
1332 struct cgroup_subsys *ss; 1339 struct cgroup_subsys *ss;
1333 int ssid; 1340 int ssid;
1334 1341
1335 for_each_subsys(ss, ssid) 1342 if (root != &cgrp_dfl_root)
1336 if (root->subsys_mask & (1 << ssid)) 1343 for_each_subsys(ss, ssid)
1337 seq_printf(seq, ",%s", ss->name); 1344 if (root->subsys_mask & (1 << ssid))
1345 seq_show_option(seq, ss->legacy_name, NULL);
1338 if (root->flags & CGRP_ROOT_NOPREFIX) 1346 if (root->flags & CGRP_ROOT_NOPREFIX)
1339 seq_puts(seq, ",noprefix"); 1347 seq_puts(seq, ",noprefix");
1340 if (root->flags & CGRP_ROOT_XATTR) 1348 if (root->flags & CGRP_ROOT_XATTR)
@@ -1342,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
1342 1350
1343 spin_lock(&release_agent_path_lock); 1351 spin_lock(&release_agent_path_lock);
1344 if (strlen(root->release_agent_path)) 1352 if (strlen(root->release_agent_path))
1345 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1353 seq_show_option(seq, "release_agent",
1354 root->release_agent_path);
1346 spin_unlock(&release_agent_path_lock); 1355 spin_unlock(&release_agent_path_lock);
1347 1356
1348 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags)) 1357 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
1349 seq_puts(seq, ",clone_children"); 1358 seq_puts(seq, ",clone_children");
1350 if (strlen(root->name)) 1359 if (strlen(root->name))
1351 seq_printf(seq, ",name=%s", root->name); 1360 seq_show_option(seq, "name", root->name);
1352 return 0; 1361 return 0;
1353} 1362}
1354 1363
@@ -1447,7 +1456,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
1447 } 1456 }
1448 1457
1449 for_each_subsys(ss, i) { 1458 for_each_subsys(ss, i) {
1450 if (strcmp(token, ss->name)) 1459 if (strcmp(token, ss->legacy_name))
1451 continue; 1460 continue;
1452 if (ss->disabled) 1461 if (ss->disabled)
1453 continue; 1462 continue;
@@ -1666,7 +1675,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask)
1666 1675
1667 lockdep_assert_held(&cgroup_mutex); 1676 lockdep_assert_held(&cgroup_mutex);
1668 1677
1669 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_NOWAIT); 1678 ret = cgroup_idr_alloc(&root->cgroup_idr, root_cgrp, 1, 2, GFP_KERNEL);
1670 if (ret < 0) 1679 if (ret < 0)
1671 goto out; 1680 goto out;
1672 root_cgrp->id = ret; 1681 root_cgrp->id = ret;
@@ -4579,7 +4588,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4579 if (err) 4588 if (err)
4580 goto err_free_css; 4589 goto err_free_css;
4581 4590
4582 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT); 4591 err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_KERNEL);
4583 if (err < 0) 4592 if (err < 0)
4584 goto err_free_percpu_ref; 4593 goto err_free_percpu_ref;
4585 css->id = err; 4594 css->id = err;
@@ -4656,7 +4665,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4656 * Temporarily set the pointer to NULL, so idr_find() won't return 4665 * Temporarily set the pointer to NULL, so idr_find() won't return
4657 * a half-baked cgroup. 4666 * a half-baked cgroup.
4658 */ 4667 */
4659 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_NOWAIT); 4668 cgrp->id = cgroup_idr_alloc(&root->cgroup_idr, NULL, 2, 0, GFP_KERNEL);
4660 if (cgrp->id < 0) { 4669 if (cgrp->id < 0) {
4661 ret = -ENOMEM; 4670 ret = -ENOMEM;
4662 goto out_cancel_ref; 4671 goto out_cancel_ref;
@@ -4955,6 +4964,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
4955 4964
4956 have_fork_callback |= (bool)ss->fork << ss->id; 4965 have_fork_callback |= (bool)ss->fork << ss->id;
4957 have_exit_callback |= (bool)ss->exit << ss->id; 4966 have_exit_callback |= (bool)ss->exit << ss->id;
4967 have_canfork_callback |= (bool)ss->can_fork << ss->id;
4958 4968
4959 /* At system boot, before all subsystems have been 4969 /* At system boot, before all subsystems have been
4960 * registered, no tasks have been forked, so we don't 4970 * registered, no tasks have been forked, so we don't
@@ -4993,6 +5003,8 @@ int __init cgroup_init_early(void)
4993 5003
4994 ss->id = i; 5004 ss->id = i;
4995 ss->name = cgroup_subsys_name[i]; 5005 ss->name = cgroup_subsys_name[i];
5006 if (!ss->legacy_name)
5007 ss->legacy_name = cgroup_subsys_name[i];
4996 5008
4997 if (ss->early_init) 5009 if (ss->early_init)
4998 cgroup_init_subsys(ss, true); 5010 cgroup_init_subsys(ss, true);
@@ -5136,9 +5148,11 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5136 continue; 5148 continue;
5137 5149
5138 seq_printf(m, "%d:", root->hierarchy_id); 5150 seq_printf(m, "%d:", root->hierarchy_id);
5139 for_each_subsys(ss, ssid) 5151 if (root != &cgrp_dfl_root)
5140 if (root->subsys_mask & (1 << ssid)) 5152 for_each_subsys(ss, ssid)
5141 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 5153 if (root->subsys_mask & (1 << ssid))
5154 seq_printf(m, "%s%s", count++ ? "," : "",
5155 ss->legacy_name);
5142 if (strlen(root->name)) 5156 if (strlen(root->name))
5143 seq_printf(m, "%sname=%s", count ? "," : "", 5157 seq_printf(m, "%sname=%s", count ? "," : "",
5144 root->name); 5158 root->name);
@@ -5178,7 +5192,7 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
5178 5192
5179 for_each_subsys(ss, i) 5193 for_each_subsys(ss, i)
5180 seq_printf(m, "%s\t%d\t%d\t%d\n", 5194 seq_printf(m, "%s\t%d\t%d\t%d\n",
5181 ss->name, ss->root->hierarchy_id, 5195 ss->legacy_name, ss->root->hierarchy_id,
5182 atomic_read(&ss->root->nr_cgrps), !ss->disabled); 5196 atomic_read(&ss->root->nr_cgrps), !ss->disabled);
5183 5197
5184 mutex_unlock(&cgroup_mutex); 5198 mutex_unlock(&cgroup_mutex);
@@ -5197,6 +5211,19 @@ static const struct file_operations proc_cgroupstats_operations = {
5197 .release = single_release, 5211 .release = single_release,
5198}; 5212};
5199 5213
5214static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5215{
5216 if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END)
5217 return &ss_priv[i - CGROUP_CANFORK_START];
5218 return NULL;
5219}
5220
5221static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i)
5222{
5223 void **private = subsys_canfork_priv_p(ss_priv, i);
5224 return private ? *private : NULL;
5225}
5226
5200/** 5227/**
5201 * cgroup_fork - initialize cgroup related fields during copy_process() 5228 * cgroup_fork - initialize cgroup related fields during copy_process()
5202 * @child: pointer to task_struct of forking parent process. 5229 * @child: pointer to task_struct of forking parent process.
@@ -5212,6 +5239,57 @@ void cgroup_fork(struct task_struct *child)
5212} 5239}
5213 5240
5214/** 5241/**
5242 * cgroup_can_fork - called on a new task before the process is exposed
5243 * @child: the task in question.
5244 *
5245 * This calls the subsystem can_fork() callbacks. If the can_fork() callback
 5246 * returns an error, the fork aborts with that error code. This allows
5247 * a cgroup subsystem to conditionally allow or deny new forks.
5248 */
5249int cgroup_can_fork(struct task_struct *child,
5250 void *ss_priv[CGROUP_CANFORK_COUNT])
5251{
5252 struct cgroup_subsys *ss;
5253 int i, j, ret;
5254
5255 for_each_subsys_which(ss, i, &have_canfork_callback) {
5256 ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i));
5257 if (ret)
5258 goto out_revert;
5259 }
5260
5261 return 0;
5262
5263out_revert:
5264 for_each_subsys(ss, j) {
5265 if (j >= i)
5266 break;
5267 if (ss->cancel_fork)
5268 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j));
5269 }
5270
5271 return ret;
5272}
5273
5274/**
5275 * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
5276 * @child: the task in question
5277 *
5278 * This calls the cancel_fork() callbacks if a fork failed *after*
 5279 * cgroup_can_fork() succeeded.
5280 */
5281void cgroup_cancel_fork(struct task_struct *child,
5282 void *ss_priv[CGROUP_CANFORK_COUNT])
5283{
5284 struct cgroup_subsys *ss;
5285 int i;
5286
5287 for_each_subsys(ss, i)
5288 if (ss->cancel_fork)
5289 ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i));
5290}
5291
5292/**
5215 * cgroup_post_fork - called on a new task after adding it to the task list 5293 * cgroup_post_fork - called on a new task after adding it to the task list
5216 * @child: the task in question 5294 * @child: the task in question
5217 * 5295 *
@@ -5221,7 +5299,8 @@ void cgroup_fork(struct task_struct *child)
5221 * cgroup_task_iter_start() - to guarantee that the new task ends up on its 5299 * cgroup_task_iter_start() - to guarantee that the new task ends up on its
5222 * list. 5300 * list.
5223 */ 5301 */
5224void cgroup_post_fork(struct task_struct *child) 5302void cgroup_post_fork(struct task_struct *child,
5303 void *old_ss_priv[CGROUP_CANFORK_COUNT])
5225{ 5304{
5226 struct cgroup_subsys *ss; 5305 struct cgroup_subsys *ss;
5227 int i; 5306 int i;
@@ -5266,7 +5345,7 @@ void cgroup_post_fork(struct task_struct *child)
5266 * and addition to css_set. 5345 * and addition to css_set.
5267 */ 5346 */
5268 for_each_subsys_which(ss, i, &have_fork_callback) 5347 for_each_subsys_which(ss, i, &have_fork_callback)
5269 ss->fork(child); 5348 ss->fork(child, subsys_canfork_priv(old_ss_priv, i));
5270} 5349}
5271 5350
5272/** 5351/**
@@ -5400,12 +5479,14 @@ static int __init cgroup_disable(char *str)
5400 continue; 5479 continue;
5401 5480
5402 for_each_subsys(ss, i) { 5481 for_each_subsys(ss, i) {
5403 if (!strcmp(token, ss->name)) { 5482 if (strcmp(token, ss->name) &&
5404 ss->disabled = 1; 5483 strcmp(token, ss->legacy_name))
5405 printk(KERN_INFO "Disabling %s control group" 5484 continue;
5406 " subsystem\n", ss->name); 5485
5407 break; 5486 ss->disabled = 1;
5408 } 5487 printk(KERN_INFO "Disabling %s control group subsystem\n",
5488 ss->name);
5489 break;
5409 } 5490 }
5410 } 5491 }
5411 return 1; 5492 return 1;
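The cgroup.c hunks above route mount options, /proc/cgroups, /proc/<pid>/cgroup and the cgroup_disable= boot parameter through ss->legacy_name, falling back to the canonical name when a controller does not set one. A hypothetical controller declaration showing where that field would be set (the controller, its callbacks and both names are made up for illustration; only the legacy_name and legacy_cftypes fields come from the code above):

struct cgroup_subsys foo_cgrp_subsys = {
        .css_alloc      = foo_css_alloc,        /* hypothetical callbacks */
        .css_free       = foo_css_free,
        .legacy_cftypes = foo_files,
        /* listed as "foo" on the default hierarchy, but matched as
         * "foo_legacy" on legacy hierarchies and in cgroup_disable= */
        .legacy_name    = "foo_legacy",
};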
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 92b98cc0ee76..f1b30ad5dc6d 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -203,7 +203,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
203 * to do anything as freezer_attach() will put @task into the appropriate 203 * to do anything as freezer_attach() will put @task into the appropriate
204 * state. 204 * state.
205 */ 205 */
206static void freezer_fork(struct task_struct *task) 206static void freezer_fork(struct task_struct *task, void *private)
207{ 207{
208 struct freezer *freezer; 208 struct freezer *freezer;
209 209
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
new file mode 100644
index 000000000000..806cd7693ac8
--- /dev/null
+++ b/kernel/cgroup_pids.c
@@ -0,0 +1,355 @@
1/*
2 * Process number limiting controller for cgroups.
3 *
4 * Used to allow a cgroup hierarchy to stop any new processes from fork()ing
5 * after a certain limit is reached.
6 *
 7 * Since it is trivial to hit the task limit without hitting any of the kmemcg limits
8 * in place, PIDs are a fundamental resource. As such, PID exhaustion must be
9 * preventable in the scope of a cgroup hierarchy by allowing resource limiting
10 * of the number of tasks in a cgroup.
11 *
12 * In order to use the `pids` controller, set the maximum number of tasks in
13 * pids.max (this is not available in the root cgroup for obvious reasons). The
14 * number of processes currently in the cgroup is given by pids.current.
15 * Organisational operations are not blocked by cgroup policies, so it is
16 * possible to have pids.current > pids.max. However, it is not possible to
17 * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking
18 * would cause a cgroup policy to be violated.
19 *
20 * To set a cgroup to have no limit, set pids.max to "max". This is the default
21 * for all new cgroups (N.B. that PID limits are hierarchical, so the most
22 * stringent limit in the hierarchy is followed).
23 *
24 * pids.current tracks all child cgroup hierarchies, so parent/pids.current is
25 * a superset of parent/child/pids.current.
26 *
27 * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com>
28 *
29 * This file is subject to the terms and conditions of version 2 of the GNU
30 * General Public License. See the file COPYING in the main directory of the
31 * Linux distribution for more details.
32 */
33
34#include <linux/kernel.h>
35#include <linux/threads.h>
36#include <linux/atomic.h>
37#include <linux/cgroup.h>
38#include <linux/slab.h>
39
40#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
41#define PIDS_MAX_STR "max"
42
43struct pids_cgroup {
44 struct cgroup_subsys_state css;
45
46 /*
47 * Use 64-bit types so that we can safely represent "max" as
48 * %PIDS_MAX = (%PID_MAX_LIMIT + 1).
49 */
50 atomic64_t counter;
51 int64_t limit;
52};
53
54static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css)
55{
56 return container_of(css, struct pids_cgroup, css);
57}
58
59static struct pids_cgroup *parent_pids(struct pids_cgroup *pids)
60{
61 return css_pids(pids->css.parent);
62}
63
64static struct cgroup_subsys_state *
65pids_css_alloc(struct cgroup_subsys_state *parent)
66{
67 struct pids_cgroup *pids;
68
69 pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL);
70 if (!pids)
71 return ERR_PTR(-ENOMEM);
72
73 pids->limit = PIDS_MAX;
74 atomic64_set(&pids->counter, 0);
75 return &pids->css;
76}
77
78static void pids_css_free(struct cgroup_subsys_state *css)
79{
80 kfree(css_pids(css));
81}
82
83/**
84 * pids_cancel - uncharge the local pid count
85 * @pids: the pid cgroup state
86 * @num: the number of pids to cancel
87 *
88 * This function will WARN if the pid count goes under 0, because such a case is
89 * a bug in the pids controller proper.
90 */
91static void pids_cancel(struct pids_cgroup *pids, int num)
92{
93 /*
94 * A negative count (or overflow for that matter) is invalid,
95 * and indicates a bug in the `pids` controller proper.
96 */
97 WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter));
98}
99
100/**
101 * pids_uncharge - hierarchically uncharge the pid count
102 * @pids: the pid cgroup state
103 * @num: the number of pids to uncharge
104 */
105static void pids_uncharge(struct pids_cgroup *pids, int num)
106{
107 struct pids_cgroup *p;
108
109 for (p = pids; p; p = parent_pids(p))
110 pids_cancel(p, num);
111}
112
113/**
114 * pids_charge - hierarchically charge the pid count
115 * @pids: the pid cgroup state
116 * @num: the number of pids to charge
117 *
118 * This function does *not* follow the pid limit set. It cannot fail and the new
119 * pid count may exceed the limit. This is only used for reverting failed
120 * attaches, where there is no other way out than violating the limit.
121 */
122static void pids_charge(struct pids_cgroup *pids, int num)
123{
124 struct pids_cgroup *p;
125
126 for (p = pids; p; p = parent_pids(p))
127 atomic64_add(num, &p->counter);
128}
129
130/**
131 * pids_try_charge - hierarchically try to charge the pid count
132 * @pids: the pid cgroup state
133 * @num: the number of pids to charge
134 *
135 * This function follows the set limit. It will fail if the charge would cause
136 * the new value to exceed the hierarchical limit. Returns 0 if the charge
 137 * succeeded, otherwise -EAGAIN.
138 */
139static int pids_try_charge(struct pids_cgroup *pids, int num)
140{
141 struct pids_cgroup *p, *q;
142
143 for (p = pids; p; p = parent_pids(p)) {
144 int64_t new = atomic64_add_return(num, &p->counter);
145
146 /*
147 * Since new is capped to the maximum number of pid_t, if
148 * p->limit is %PIDS_MAX then we know that this test will never
149 * fail.
150 */
151 if (new > p->limit)
152 goto revert;
153 }
154
155 return 0;
156
157revert:
158 for (q = pids; q != p; q = parent_pids(q))
159 pids_cancel(q, num);
160 pids_cancel(p, num);
161
162 return -EAGAIN;
163}
164
165static int pids_can_attach(struct cgroup_subsys_state *css,
166 struct cgroup_taskset *tset)
167{
168 struct pids_cgroup *pids = css_pids(css);
169 struct task_struct *task;
170
171 cgroup_taskset_for_each(task, tset) {
172 struct cgroup_subsys_state *old_css;
173 struct pids_cgroup *old_pids;
174
175 /*
176 * No need to pin @old_css between here and cancel_attach()
177 * because cgroup core protects it from being freed before
178 * the migration completes or fails.
179 */
180 old_css = task_css(task, pids_cgrp_id);
181 old_pids = css_pids(old_css);
182
183 pids_charge(pids, 1);
184 pids_uncharge(old_pids, 1);
185 }
186
187 return 0;
188}
189
190static void pids_cancel_attach(struct cgroup_subsys_state *css,
191 struct cgroup_taskset *tset)
192{
193 struct pids_cgroup *pids = css_pids(css);
194 struct task_struct *task;
195
196 cgroup_taskset_for_each(task, tset) {
197 struct cgroup_subsys_state *old_css;
198 struct pids_cgroup *old_pids;
199
200 old_css = task_css(task, pids_cgrp_id);
201 old_pids = css_pids(old_css);
202
203 pids_charge(old_pids, 1);
204 pids_uncharge(pids, 1);
205 }
206}
207
208static int pids_can_fork(struct task_struct *task, void **priv_p)
209{
210 struct cgroup_subsys_state *css;
211 struct pids_cgroup *pids;
212 int err;
213
214 /*
215 * Use the "current" task_css for the pids subsystem as the tentative
216 * css. It is possible we will charge the wrong hierarchy, in which
217 * case we will forcefully revert/reapply the charge on the right
218 * hierarchy after it is committed to the task proper.
219 */
220 css = task_get_css(current, pids_cgrp_id);
221 pids = css_pids(css);
222
223 err = pids_try_charge(pids, 1);
224 if (err)
225 goto err_css_put;
226
227 *priv_p = css;
228 return 0;
229
230err_css_put:
231 css_put(css);
232 return err;
233}
234
235static void pids_cancel_fork(struct task_struct *task, void *priv)
236{
237 struct cgroup_subsys_state *css = priv;
238 struct pids_cgroup *pids = css_pids(css);
239
240 pids_uncharge(pids, 1);
241 css_put(css);
242}
243
244static void pids_fork(struct task_struct *task, void *priv)
245{
246 struct cgroup_subsys_state *css;
247 struct cgroup_subsys_state *old_css = priv;
248 struct pids_cgroup *pids;
249 struct pids_cgroup *old_pids = css_pids(old_css);
250
251 css = task_get_css(task, pids_cgrp_id);
252 pids = css_pids(css);
253
254 /*
255 * If the association has changed, we have to revert and reapply the
256 * charge/uncharge on the wrong hierarchy to the current one. Since
 257 * the association can only change due to an organisation event, it's
258 * okay for us to ignore the limit in this case.
259 */
260 if (pids != old_pids) {
261 pids_uncharge(old_pids, 1);
262 pids_charge(pids, 1);
263 }
264
265 css_put(css);
266 css_put(old_css);
267}
268
269static void pids_exit(struct cgroup_subsys_state *css,
270 struct cgroup_subsys_state *old_css,
271 struct task_struct *task)
272{
273 struct pids_cgroup *pids = css_pids(old_css);
274
275 pids_uncharge(pids, 1);
276}
277
278static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf,
279 size_t nbytes, loff_t off)
280{
281 struct cgroup_subsys_state *css = of_css(of);
282 struct pids_cgroup *pids = css_pids(css);
283 int64_t limit;
284 int err;
285
286 buf = strstrip(buf);
287 if (!strcmp(buf, PIDS_MAX_STR)) {
288 limit = PIDS_MAX;
289 goto set_limit;
290 }
291
292 err = kstrtoll(buf, 0, &limit);
293 if (err)
294 return err;
295
296 if (limit < 0 || limit >= PIDS_MAX)
297 return -EINVAL;
298
299set_limit:
300 /*
301 * Limit updates don't need to be mutex'd, since it isn't
302 * critical that any racing fork()s follow the new limit.
303 */
304 pids->limit = limit;
305 return nbytes;
306}
307
308static int pids_max_show(struct seq_file *sf, void *v)
309{
310 struct cgroup_subsys_state *css = seq_css(sf);
311 struct pids_cgroup *pids = css_pids(css);
312 int64_t limit = pids->limit;
313
314 if (limit >= PIDS_MAX)
315 seq_printf(sf, "%s\n", PIDS_MAX_STR);
316 else
317 seq_printf(sf, "%lld\n", limit);
318
319 return 0;
320}
321
322static s64 pids_current_read(struct cgroup_subsys_state *css,
323 struct cftype *cft)
324{
325 struct pids_cgroup *pids = css_pids(css);
326
327 return atomic64_read(&pids->counter);
328}
329
330static struct cftype pids_files[] = {
331 {
332 .name = "max",
333 .write = pids_max_write,
334 .seq_show = pids_max_show,
335 .flags = CFTYPE_NOT_ON_ROOT,
336 },
337 {
338 .name = "current",
339 .read_s64 = pids_current_read,
340 },
341 { } /* terminate */
342};
343
344struct cgroup_subsys pids_cgrp_subsys = {
345 .css_alloc = pids_css_alloc,
346 .css_free = pids_css_free,
347 .can_attach = pids_can_attach,
348 .cancel_attach = pids_cancel_attach,
349 .can_fork = pids_can_fork,
350 .cancel_fork = pids_cancel_fork,
351 .fork = pids_fork,
352 .exit = pids_exit,
353 .legacy_cftypes = pids_files,
354 .dfl_cftypes = pids_files,
355};
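The header comment of cgroup_pids.c describes the pids.max / pids.current interface in prose; a minimal user-space sketch of exercising it could look like the following (the mount point and the "demo" cgroup are assumptions for illustration, not part of the patch, and the pids controller must already be enabled for that cgroup):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[32];
        ssize_t n;
        int fd;

        /* Cap the cgroup at 64 tasks; writing "max" would remove the limit. */
        fd = open("/sys/fs/cgroup/demo/pids.max", O_WRONLY);
        if (fd < 0 || write(fd, "64", 2) != 2)
                perror("pids.max");
        if (fd >= 0)
                close(fd);

        /* Read back how many tasks are currently charged to the cgroup. */
        fd = open("/sys/fs/cgroup/demo/pids.current", O_RDONLY);
        if (fd >= 0) {
                n = read(fd, buf, sizeof(buf) - 1);
                if (n > 0) {
                        buf[n] = '\0';
                        printf("pids.current: %s", buf);
                }
                close(fd);
        }
        return 0;
}

Once the limit is reached, further fork()/clone() calls in that cgroup fail with -EAGAIN, as described in the header comment.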
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6a374544d495..82cf9dff4295 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -191,21 +191,22 @@ void cpu_hotplug_done(void)
191void cpu_hotplug_disable(void) 191void cpu_hotplug_disable(void)
192{ 192{
193 cpu_maps_update_begin(); 193 cpu_maps_update_begin();
194 cpu_hotplug_disabled = 1; 194 cpu_hotplug_disabled++;
195 cpu_maps_update_done(); 195 cpu_maps_update_done();
196} 196}
197EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
197 198
198void cpu_hotplug_enable(void) 199void cpu_hotplug_enable(void)
199{ 200{
200 cpu_maps_update_begin(); 201 cpu_maps_update_begin();
201 cpu_hotplug_disabled = 0; 202 WARN_ON(--cpu_hotplug_disabled < 0);
202 cpu_maps_update_done(); 203 cpu_maps_update_done();
203} 204}
204 205EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
205#endif /* CONFIG_HOTPLUG_CPU */ 206#endif /* CONFIG_HOTPLUG_CPU */
206 207
207/* Need to know about CPUs going up/down? */ 208/* Need to know about CPUs going up/down? */
208int __ref register_cpu_notifier(struct notifier_block *nb) 209int register_cpu_notifier(struct notifier_block *nb)
209{ 210{
210 int ret; 211 int ret;
211 cpu_maps_update_begin(); 212 cpu_maps_update_begin();
@@ -214,7 +215,7 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
214 return ret; 215 return ret;
215} 216}
216 217
217int __ref __register_cpu_notifier(struct notifier_block *nb) 218int __register_cpu_notifier(struct notifier_block *nb)
218{ 219{
219 return raw_notifier_chain_register(&cpu_chain, nb); 220 return raw_notifier_chain_register(&cpu_chain, nb);
220} 221}
@@ -244,7 +245,7 @@ static void cpu_notify_nofail(unsigned long val, void *v)
244EXPORT_SYMBOL(register_cpu_notifier); 245EXPORT_SYMBOL(register_cpu_notifier);
245EXPORT_SYMBOL(__register_cpu_notifier); 246EXPORT_SYMBOL(__register_cpu_notifier);
246 247
247void __ref unregister_cpu_notifier(struct notifier_block *nb) 248void unregister_cpu_notifier(struct notifier_block *nb)
248{ 249{
249 cpu_maps_update_begin(); 250 cpu_maps_update_begin();
250 raw_notifier_chain_unregister(&cpu_chain, nb); 251 raw_notifier_chain_unregister(&cpu_chain, nb);
@@ -252,7 +253,7 @@ void __ref unregister_cpu_notifier(struct notifier_block *nb)
252} 253}
253EXPORT_SYMBOL(unregister_cpu_notifier); 254EXPORT_SYMBOL(unregister_cpu_notifier);
254 255
255void __ref __unregister_cpu_notifier(struct notifier_block *nb) 256void __unregister_cpu_notifier(struct notifier_block *nb)
256{ 257{
257 raw_notifier_chain_unregister(&cpu_chain, nb); 258 raw_notifier_chain_unregister(&cpu_chain, nb);
258} 259}
@@ -329,7 +330,7 @@ struct take_cpu_down_param {
329}; 330};
330 331
331/* Take this CPU down. */ 332/* Take this CPU down. */
332static int __ref take_cpu_down(void *_param) 333static int take_cpu_down(void *_param)
333{ 334{
334 struct take_cpu_down_param *param = _param; 335 struct take_cpu_down_param *param = _param;
335 int err; 336 int err;
@@ -348,7 +349,7 @@ static int __ref take_cpu_down(void *_param)
348} 349}
349 350
350/* Requires cpu_add_remove_lock to be held */ 351/* Requires cpu_add_remove_lock to be held */
351static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 352static int _cpu_down(unsigned int cpu, int tasks_frozen)
352{ 353{
353 int err, nr_calls = 0; 354 int err, nr_calls = 0;
354 void *hcpu = (void *)(long)cpu; 355 void *hcpu = (void *)(long)cpu;
@@ -381,14 +382,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
381 * will observe it. 382 * will observe it.
382 * 383 *
383 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might 384 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
384 * not imply sync_sched(), so explicitly call both. 385 * not imply sync_sched(), so wait for both.
385 * 386 *
386 * Do sync before park smpboot threads to take care the rcu boost case. 387 * Do sync before park smpboot threads to take care the rcu boost case.
387 */ 388 */
388#ifdef CONFIG_PREEMPT 389 if (IS_ENABLED(CONFIG_PREEMPT))
389 synchronize_sched(); 390 synchronize_rcu_mult(call_rcu, call_rcu_sched);
390#endif 391 else
391 synchronize_rcu(); 392 synchronize_rcu();
392 393
393 smpboot_park_threads(cpu); 394 smpboot_park_threads(cpu);
394 395
@@ -401,7 +402,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
401 /* 402 /*
402 * So now all preempt/rcu users must observe !cpu_active(). 403 * So now all preempt/rcu users must observe !cpu_active().
403 */ 404 */
404 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 405 err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
405 if (err) { 406 if (err) {
406 /* CPU didn't die: tell everyone. Can't complain. */ 407 /* CPU didn't die: tell everyone. Can't complain. */
407 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); 408 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
@@ -442,7 +443,7 @@ out_release:
442 return err; 443 return err;
443} 444}
444 445
445int __ref cpu_down(unsigned int cpu) 446int cpu_down(unsigned int cpu)
446{ 447{
447 int err; 448 int err;
448 449
@@ -527,18 +528,9 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen)
527 goto out_notify; 528 goto out_notify;
528 } 529 }
529 530
530 /*
531 * Some architectures have to walk the irq descriptors to
532 * setup the vector space for the cpu which comes online.
533 * Prevent irq alloc/free across the bringup.
534 */
535 irq_lock_sparse();
536
537 /* Arch-specific enabling code. */ 531 /* Arch-specific enabling code. */
538 ret = __cpu_up(cpu, idle); 532 ret = __cpu_up(cpu, idle);
539 533
540 irq_unlock_sparse();
541
542 if (ret != 0) 534 if (ret != 0)
543 goto out_notify; 535 goto out_notify;
544 BUG_ON(!cpu_online(cpu)); 536 BUG_ON(!cpu_online(cpu));
@@ -617,13 +609,18 @@ int disable_nonboot_cpus(void)
617 } 609 }
618 } 610 }
619 611
620 if (!error) { 612 if (!error)
621 BUG_ON(num_online_cpus() > 1); 613 BUG_ON(num_online_cpus() > 1);
622 /* Make sure the CPUs won't be enabled by someone else */ 614 else
623 cpu_hotplug_disabled = 1;
624 } else {
625 pr_err("Non-boot CPUs are not disabled\n"); 615 pr_err("Non-boot CPUs are not disabled\n");
626 } 616
617 /*
618 * Make sure the CPUs won't be enabled by someone else. We need to do
619 * this even in case of failure as all disable_nonboot_cpus() users are
620 * supposed to do enable_nonboot_cpus() on the failure path.
621 */
622 cpu_hotplug_disabled++;
623
627 cpu_maps_update_done(); 624 cpu_maps_update_done();
628 return error; 625 return error;
629} 626}
@@ -636,13 +633,13 @@ void __weak arch_enable_nonboot_cpus_end(void)
636{ 633{
637} 634}
638 635
639void __ref enable_nonboot_cpus(void) 636void enable_nonboot_cpus(void)
640{ 637{
641 int cpu, error; 638 int cpu, error;
642 639
643 /* Allow everyone to use the CPU hotplug again */ 640 /* Allow everyone to use the CPU hotplug again */
644 cpu_maps_update_begin(); 641 cpu_maps_update_begin();
645 cpu_hotplug_disabled = 0; 642 WARN_ON(--cpu_hotplug_disabled < 0);
646 if (cpumask_empty(frozen_cpus)) 643 if (cpumask_empty(frozen_cpus))
647 goto out; 644 goto out;
648 645
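The cpu.c changes above turn cpu_hotplug_disabled from a boolean into a reference count, so cpu_hotplug_disable()/cpu_hotplug_enable() pairs may nest (and disable_nonboot_cpus() now always takes a reference, even on failure). A kernel-side sketch of the calling pattern this permits (the helpers and the starting counter value are illustrative only):

static void quiesce_and_do_work(void)
{
        cpu_hotplug_disable();          /* e.g. counter 0 -> 1 */
        do_outer_work();                /* hypothetical helper */

        cpu_hotplug_disable();          /* 1 -> 2: nested disable is now safe */
        do_inner_work();                /* hypothetical helper */
        cpu_hotplug_enable();           /* 2 -> 1: hotplug stays blocked */

        cpu_hotplug_enable();           /* 1 -> 0: hotplug allowed again */
}

With the old boolean, the inner cpu_hotplug_enable() would have re-enabled hotplug while the outer section still expected it to be blocked.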
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 9656a3c36503..009cc9a17d95 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -180,7 +180,7 @@ EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
180 * low power state that may have caused some blocks in the same power domain 180 * low power state that may have caused some blocks in the same power domain
181 * to reset. 181 * to reset.
182 * 182 *
183 * Must be called after cpu_pm_exit has been called on all cpus in the power 183 * Must be called after cpu_cluster_pm_enter has been called for the power
184 * domain, and before cpu_pm_exit has been called on any cpu in the power 184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller 185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which 186 * and its PM extensions, local CPU timers context save/restore which
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee14e3a35a29..f0acff0f66c9 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1223,7 +1223,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1223 spin_unlock_irq(&callback_lock); 1223 spin_unlock_irq(&callback_lock);
1224 1224
1225 /* use trialcs->mems_allowed as a temp variable */ 1225 /* use trialcs->mems_allowed as a temp variable */
1226 update_nodemasks_hier(cs, &cs->mems_allowed); 1226 update_nodemasks_hier(cs, &trialcs->mems_allowed);
1227done: 1227done:
1228 return retval; 1228 return retval;
1229} 1229}
diff --git a/kernel/cred.c b/kernel/cred.c
index ec1c07667ec1..71179a09c1d6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -20,11 +20,16 @@
20#include <linux/cn_proc.h> 20#include <linux/cn_proc.h>
21 21
22#if 0 22#if 0
23#define kdebug(FMT, ...) \ 23#define kdebug(FMT, ...) \
24 printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 24 printk("[%-5.5s%5u] " FMT "\n", \
25 current->comm, current->pid, ##__VA_ARGS__)
25#else 26#else
26#define kdebug(FMT, ...) \ 27#define kdebug(FMT, ...) \
27 no_printk("[%-5.5s%5u] "FMT"\n", current->comm, current->pid ,##__VA_ARGS__) 28do { \
29 if (0) \
30 no_printk("[%-5.5s%5u] " FMT "\n", \
31 current->comm, current->pid, ##__VA_ARGS__); \
32} while (0)
28#endif 33#endif
29 34
30static struct kmem_cache *cred_jar; 35static struct kmem_cache *cred_jar;
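The cred.c hunk rewrites the disabled kdebug() as do { if (0) no_printk(...); } while (0), which keeps the format string and arguments visible to the compiler for type checking while ensuring they are never evaluated at run time. The same idiom in a stand-alone user-space form (DEBUG and dbg() are illustrative names, not taken from the patch):

#include <stdio.h>

#ifdef DEBUG
#define dbg(fmt, ...) printf("[dbg] " fmt "\n", ##__VA_ARGS__)
#else
#define dbg(fmt, ...) \
do { \
        if (0) \
                printf("[dbg] " fmt "\n", ##__VA_ARGS__); \
} while (0)
#endif

int main(void)
{
        int users = 3;

        /* Compiles and format-checks either way, prints only with -DDEBUG. */
        dbg("cred users=%d", users);
        return 0;
}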
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d3dae3419b99..f548f69c4299 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -163,6 +163,7 @@ static atomic_t nr_mmap_events __read_mostly;
163static atomic_t nr_comm_events __read_mostly; 163static atomic_t nr_comm_events __read_mostly;
164static atomic_t nr_task_events __read_mostly; 164static atomic_t nr_task_events __read_mostly;
165static atomic_t nr_freq_events __read_mostly; 165static atomic_t nr_freq_events __read_mostly;
166static atomic_t nr_switch_events __read_mostly;
166 167
167static LIST_HEAD(pmus); 168static LIST_HEAD(pmus);
168static DEFINE_MUTEX(pmus_lock); 169static DEFINE_MUTEX(pmus_lock);
@@ -1868,8 +1869,6 @@ event_sched_in(struct perf_event *event,
1868 1869
1869 perf_pmu_disable(event->pmu); 1870 perf_pmu_disable(event->pmu);
1870 1871
1871 event->tstamp_running += tstamp - event->tstamp_stopped;
1872
1873 perf_set_shadow_time(event, ctx, tstamp); 1872 perf_set_shadow_time(event, ctx, tstamp);
1874 1873
1875 perf_log_itrace_start(event); 1874 perf_log_itrace_start(event);
@@ -1881,6 +1880,8 @@ event_sched_in(struct perf_event *event,
1881 goto out; 1880 goto out;
1882 } 1881 }
1883 1882
1883 event->tstamp_running += tstamp - event->tstamp_stopped;
1884
1884 if (!is_software_event(event)) 1885 if (!is_software_event(event))
1885 cpuctx->active_oncpu++; 1886 cpuctx->active_oncpu++;
1886 if (!ctx->nr_active++) 1887 if (!ctx->nr_active++)
@@ -2619,6 +2620,9 @@ static void perf_pmu_sched_task(struct task_struct *prev,
2619 local_irq_restore(flags); 2620 local_irq_restore(flags);
2620} 2621}
2621 2622
2623static void perf_event_switch(struct task_struct *task,
2624 struct task_struct *next_prev, bool sched_in);
2625
2622#define for_each_task_context_nr(ctxn) \ 2626#define for_each_task_context_nr(ctxn) \
2623 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) 2627 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2624 2628
@@ -2641,6 +2645,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
2641 if (__this_cpu_read(perf_sched_cb_usages)) 2645 if (__this_cpu_read(perf_sched_cb_usages))
2642 perf_pmu_sched_task(task, next, false); 2646 perf_pmu_sched_task(task, next, false);
2643 2647
2648 if (atomic_read(&nr_switch_events))
2649 perf_event_switch(task, next, false);
2650
2644 for_each_task_context_nr(ctxn) 2651 for_each_task_context_nr(ctxn)
2645 perf_event_context_sched_out(task, ctxn, next); 2652 perf_event_context_sched_out(task, ctxn, next);
2646 2653
@@ -2831,6 +2838,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2831 if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) 2838 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2832 perf_cgroup_sched_in(prev, task); 2839 perf_cgroup_sched_in(prev, task);
2833 2840
2841 if (atomic_read(&nr_switch_events))
2842 perf_event_switch(task, prev, true);
2843
2834 if (__this_cpu_read(perf_sched_cb_usages)) 2844 if (__this_cpu_read(perf_sched_cb_usages))
2835 perf_pmu_sched_task(prev, task, true); 2845 perf_pmu_sched_task(prev, task, true);
2836} 2846}
@@ -3212,6 +3222,59 @@ static inline u64 perf_event_count(struct perf_event *event)
3212 return __perf_event_count(event); 3222 return __perf_event_count(event);
3213} 3223}
3214 3224
3225/*
3226 * NMI-safe method to read a local event, that is an event that
3227 * is:
3228 * - either for the current task, or for this CPU
 3229 * - does not have inherit set, because inherited task events
 3230 * will not be local and we cannot read them atomically
3231 * - must not have a pmu::count method
3232 */
3233u64 perf_event_read_local(struct perf_event *event)
3234{
3235 unsigned long flags;
3236 u64 val;
3237
3238 /*
3239 * Disabling interrupts avoids all counter scheduling (context
3240 * switches, timer based rotation and IPIs).
3241 */
3242 local_irq_save(flags);
3243
3244 /* If this is a per-task event, it must be for current */
3245 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3246 event->hw.target != current);
3247
3248 /* If this is a per-CPU event, it must be for this CPU */
3249 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3250 event->cpu != smp_processor_id());
3251
3252 /*
3253 * It must not be an event with inherit set, we cannot read
3254 * all child counters from atomic context.
3255 */
3256 WARN_ON_ONCE(event->attr.inherit);
3257
3258 /*
3259 * It must not have a pmu::count method, those are not
3260 * NMI safe.
3261 */
3262 WARN_ON_ONCE(event->pmu->count);
3263
3264 /*
3265 * If the event is currently on this CPU, its either a per-task event,
3266 * or local to this CPU. Furthermore it means its ACTIVE (otherwise
3267 * oncpu == -1).
3268 */
3269 if (event->oncpu == smp_processor_id())
3270 event->pmu->read(event);
3271
3272 val = local64_read(&event->count);
3273 local_irq_restore(flags);
3274
3275 return val;
3276}
3277
3215static u64 perf_event_read(struct perf_event *event) 3278static u64 perf_event_read(struct perf_event *event)
3216{ 3279{
3217 /* 3280 /*
@@ -3454,6 +3517,10 @@ static void unaccount_event(struct perf_event *event)
3454 atomic_dec(&nr_task_events); 3517 atomic_dec(&nr_task_events);
3455 if (event->attr.freq) 3518 if (event->attr.freq)
3456 atomic_dec(&nr_freq_events); 3519 atomic_dec(&nr_freq_events);
3520 if (event->attr.context_switch) {
3521 static_key_slow_dec_deferred(&perf_sched_events);
3522 atomic_dec(&nr_switch_events);
3523 }
3457 if (is_cgroup_event(event)) 3524 if (is_cgroup_event(event))
3458 static_key_slow_dec_deferred(&perf_sched_events); 3525 static_key_slow_dec_deferred(&perf_sched_events);
3459 if (has_branch_stack(event)) 3526 if (has_branch_stack(event))
@@ -3958,28 +4025,21 @@ static void perf_event_for_each(struct perf_event *event,
3958 perf_event_for_each_child(sibling, func); 4025 perf_event_for_each_child(sibling, func);
3959} 4026}
3960 4027
3961static int perf_event_period(struct perf_event *event, u64 __user *arg) 4028struct period_event {
3962{ 4029 struct perf_event *event;
3963 struct perf_event_context *ctx = event->ctx;
3964 int ret = 0, active;
3965 u64 value; 4030 u64 value;
4031};
3966 4032
3967 if (!is_sampling_event(event)) 4033static int __perf_event_period(void *info)
3968 return -EINVAL; 4034{
3969 4035 struct period_event *pe = info;
3970 if (copy_from_user(&value, arg, sizeof(value))) 4036 struct perf_event *event = pe->event;
3971 return -EFAULT; 4037 struct perf_event_context *ctx = event->ctx;
3972 4038 u64 value = pe->value;
3973 if (!value) 4039 bool active;
3974 return -EINVAL;
3975 4040
3976 raw_spin_lock_irq(&ctx->lock); 4041 raw_spin_lock(&ctx->lock);
3977 if (event->attr.freq) { 4042 if (event->attr.freq) {
3978 if (value > sysctl_perf_event_sample_rate) {
3979 ret = -EINVAL;
3980 goto unlock;
3981 }
3982
3983 event->attr.sample_freq = value; 4043 event->attr.sample_freq = value;
3984 } else { 4044 } else {
3985 event->attr.sample_period = value; 4045 event->attr.sample_period = value;
@@ -3998,11 +4058,53 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
3998 event->pmu->start(event, PERF_EF_RELOAD); 4058 event->pmu->start(event, PERF_EF_RELOAD);
3999 perf_pmu_enable(ctx->pmu); 4059 perf_pmu_enable(ctx->pmu);
4000 } 4060 }
4061 raw_spin_unlock(&ctx->lock);
4001 4062
4002unlock: 4063 return 0;
4064}
4065
4066static int perf_event_period(struct perf_event *event, u64 __user *arg)
4067{
4068 struct period_event pe = { .event = event, };
4069 struct perf_event_context *ctx = event->ctx;
4070 struct task_struct *task;
4071 u64 value;
4072
4073 if (!is_sampling_event(event))
4074 return -EINVAL;
4075
4076 if (copy_from_user(&value, arg, sizeof(value)))
4077 return -EFAULT;
4078
4079 if (!value)
4080 return -EINVAL;
4081
4082 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4083 return -EINVAL;
4084
4085 task = ctx->task;
4086 pe.value = value;
4087
4088 if (!task) {
4089 cpu_function_call(event->cpu, __perf_event_period, &pe);
4090 return 0;
4091 }
4092
4093retry:
4094 if (!task_function_call(task, __perf_event_period, &pe))
4095 return 0;
4096
4097 raw_spin_lock_irq(&ctx->lock);
4098 if (ctx->is_active) {
4099 raw_spin_unlock_irq(&ctx->lock);
4100 task = ctx->task;
4101 goto retry;
4102 }
4103
4104 __perf_event_period(&pe);
4003 raw_spin_unlock_irq(&ctx->lock); 4105 raw_spin_unlock_irq(&ctx->lock);
4004 4106
4005 return ret; 4107 return 0;
4006} 4108}
4007 4109
4008static const struct file_operations perf_fops; 4110static const struct file_operations perf_fops;
@@ -4740,12 +4842,20 @@ static const struct file_operations perf_fops = {
4740 * to user-space before waking everybody up. 4842 * to user-space before waking everybody up.
4741 */ 4843 */
4742 4844
4845static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4846{
4847 /* only the parent has fasync state */
4848 if (event->parent)
4849 event = event->parent;
4850 return &event->fasync;
4851}
4852
4743void perf_event_wakeup(struct perf_event *event) 4853void perf_event_wakeup(struct perf_event *event)
4744{ 4854{
4745 ring_buffer_wakeup(event); 4855 ring_buffer_wakeup(event);
4746 4856
4747 if (event->pending_kill) { 4857 if (event->pending_kill) {
4748 kill_fasync(&event->fasync, SIGIO, event->pending_kill); 4858 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4749 event->pending_kill = 0; 4859 event->pending_kill = 0;
4750 } 4860 }
4751} 4861}
@@ -5982,6 +6092,91 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost)
5982} 6092}
5983 6093
5984/* 6094/*
6095 * context_switch tracking
6096 */
6097
6098struct perf_switch_event {
6099 struct task_struct *task;
6100 struct task_struct *next_prev;
6101
6102 struct {
6103 struct perf_event_header header;
6104 u32 next_prev_pid;
6105 u32 next_prev_tid;
6106 } event_id;
6107};
6108
6109static int perf_event_switch_match(struct perf_event *event)
6110{
6111 return event->attr.context_switch;
6112}
6113
6114static void perf_event_switch_output(struct perf_event *event, void *data)
6115{
6116 struct perf_switch_event *se = data;
6117 struct perf_output_handle handle;
6118 struct perf_sample_data sample;
6119 int ret;
6120
6121 if (!perf_event_switch_match(event))
6122 return;
6123
6124 /* Only CPU-wide events are allowed to see next/prev pid/tid */
6125 if (event->ctx->task) {
6126 se->event_id.header.type = PERF_RECORD_SWITCH;
6127 se->event_id.header.size = sizeof(se->event_id.header);
6128 } else {
6129 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6130 se->event_id.header.size = sizeof(se->event_id);
6131 se->event_id.next_prev_pid =
6132 perf_event_pid(event, se->next_prev);
6133 se->event_id.next_prev_tid =
6134 perf_event_tid(event, se->next_prev);
6135 }
6136
6137 perf_event_header__init_id(&se->event_id.header, &sample, event);
6138
6139 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6140 if (ret)
6141 return;
6142
6143 if (event->ctx->task)
6144 perf_output_put(&handle, se->event_id.header);
6145 else
6146 perf_output_put(&handle, se->event_id);
6147
6148 perf_event__output_id_sample(event, &handle, &sample);
6149
6150 perf_output_end(&handle);
6151}
6152
6153static void perf_event_switch(struct task_struct *task,
6154 struct task_struct *next_prev, bool sched_in)
6155{
6156 struct perf_switch_event switch_event;
6157
6158 /* N.B. caller checks nr_switch_events != 0 */
6159
6160 switch_event = (struct perf_switch_event){
6161 .task = task,
6162 .next_prev = next_prev,
6163 .event_id = {
6164 .header = {
6165 /* .type */
6166 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
6167 /* .size */
6168 },
6169 /* .next_prev_pid */
6170 /* .next_prev_tid */
6171 },
6172 };
6173
6174 perf_event_aux(perf_event_switch_output,
6175 &switch_event,
6176 NULL);
6177}
6178
6179/*
5985 * IRQ throttle logging 6180 * IRQ throttle logging
5986 */ 6181 */
5987 6182
@@ -6040,8 +6235,6 @@ static void perf_log_itrace_start(struct perf_event *event)
6040 event->hw.itrace_started) 6235 event->hw.itrace_started)
6041 return; 6236 return;
6042 6237
6043 event->hw.itrace_started = 1;
6044
6045 rec.header.type = PERF_RECORD_ITRACE_START; 6238 rec.header.type = PERF_RECORD_ITRACE_START;
6046 rec.header.misc = 0; 6239 rec.header.misc = 0;
6047 rec.header.size = sizeof(rec); 6240 rec.header.size = sizeof(rec);
@@ -6124,7 +6317,7 @@ static int __perf_event_overflow(struct perf_event *event,
6124 else 6317 else
6125 perf_event_output(event, data, regs); 6318 perf_event_output(event, data, regs);
6126 6319
6127 if (event->fasync && event->pending_kill) { 6320 if (*perf_event_fasync(event) && event->pending_kill) {
6128 event->pending_wakeup = 1; 6321 event->pending_wakeup = 1;
6129 irq_work_queue(&event->pending); 6322 irq_work_queue(&event->pending);
6130 } 6323 }
@@ -6749,8 +6942,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
6749 if (event->tp_event->prog) 6942 if (event->tp_event->prog)
6750 return -EEXIST; 6943 return -EEXIST;
6751 6944
6752 if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) 6945 if (!(event->tp_event->flags & TRACE_EVENT_FL_UKPROBE))
6753 /* bpf programs can only be attached to kprobes */ 6946 /* bpf programs can only be attached to u/kprobes */
6754 return -EINVAL; 6947 return -EINVAL;
6755 6948
6756 prog = bpf_prog_get(prog_fd); 6949 prog = bpf_prog_get(prog_fd);
@@ -7479,6 +7672,10 @@ static void account_event(struct perf_event *event)
7479 if (atomic_inc_return(&nr_freq_events) == 1) 7672 if (atomic_inc_return(&nr_freq_events) == 1)
7480 tick_nohz_full_kick_all(); 7673 tick_nohz_full_kick_all();
7481 } 7674 }
7675 if (event->attr.context_switch) {
7676 atomic_inc(&nr_switch_events);
7677 static_key_slow_inc(&perf_sched_events.key);
7678 }
7482 if (has_branch_stack(event)) 7679 if (has_branch_stack(event))
7483 static_key_slow_inc(&perf_sched_events.key); 7680 static_key_slow_inc(&perf_sched_events.key);
7484 if (is_cgroup_event(event)) 7681 if (is_cgroup_event(event))
@@ -8574,6 +8771,31 @@ void perf_event_delayed_put(struct task_struct *task)
8574 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); 8771 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8575} 8772}
8576 8773
8774struct perf_event *perf_event_get(unsigned int fd)
8775{
8776 int err;
8777 struct fd f;
8778 struct perf_event *event;
8779
8780 err = perf_fget_light(fd, &f);
8781 if (err)
8782 return ERR_PTR(err);
8783
8784 event = f.file->private_data;
8785 atomic_long_inc(&event->refcount);
8786 fdput(f);
8787
8788 return event;
8789}
8790
8791const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
8792{
8793 if (!event)
8794 return ERR_PTR(-EINVAL);
8795
8796 return &event->attr;
8797}
8798
8577/* 8799/*
8578 * inherit a event from parent task to child task: 8800 * inherit a event from parent task to child task:
8579 */ 8801 */
@@ -8872,7 +9094,7 @@ static void perf_event_init_cpu(int cpu)
8872 mutex_unlock(&swhash->hlist_mutex); 9094 mutex_unlock(&swhash->hlist_mutex);
8873} 9095}
8874 9096
8875#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC 9097#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
8876static void __perf_event_exit_context(void *__info) 9098static void __perf_event_exit_context(void *__info)
8877{ 9099{
8878 struct remove_event re = { .detach_group = true }; 9100 struct remove_event re = { .detach_group = true };
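The events/core.c hunks above add an nr_switch_events count and emit PERF_RECORD_SWITCH (or PERF_RECORD_SWITCH_CPU_WIDE, which also carries next/prev pid/tid, for CPU-wide events) from the sched_out/sched_in paths. A minimal user-space sketch of requesting these records, assuming a kernel with the attr.context_switch bit used in the diff (error handling and the ring-buffer mmap are elided):

#include <linux/perf_event.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        struct perf_event_attr attr;
        int fd;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_DUMMY;      /* no samples, side-band records only */
        attr.context_switch = 1;                /* ask for PERF_RECORD_SWITCH */

        /* pid = 0, cpu = -1: this task's context switches on any CPU */
        fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }
        /* ... mmap the fd and parse PERF_RECORD_SWITCH records here ... */
        close(fd);
        return 0;
}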
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index b2be01b1aa9d..182bc30899d5 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -437,7 +437,10 @@ static struct page *rb_alloc_aux_page(int node, int order)
437 437
438 if (page && order) { 438 if (page && order) {
439 /* 439 /*
440 * Communicate the allocation size to the driver 440 * Communicate the allocation size to the driver:
441 * if we managed to secure a high-order allocation,
442 * set its first page's private to this order;
443 * !PagePrivate(page) means it's just a normal page.
441 */ 444 */
442 split_page(page, order); 445 split_page(page, order);
443 SetPagePrivate(page); 446 SetPagePrivate(page);
@@ -559,11 +562,13 @@ static void __rb_free_aux(struct ring_buffer *rb)
559 rb->aux_priv = NULL; 562 rb->aux_priv = NULL;
560 } 563 }
561 564
562 for (pg = 0; pg < rb->aux_nr_pages; pg++) 565 if (rb->aux_nr_pages) {
563 rb_free_aux_page(rb, pg); 566 for (pg = 0; pg < rb->aux_nr_pages; pg++)
567 rb_free_aux_page(rb, pg);
564 568
565 kfree(rb->aux_pages); 569 kfree(rb->aux_pages);
566 rb->aux_nr_pages = 0; 570 rb->aux_nr_pages = 0;
571 }
567} 572}
568 573
569void rb_free_aux(struct ring_buffer *rb) 574void rb_free_aux(struct ring_buffer *rb)
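The expanded comment in rb_alloc_aux_page() spells out the convention for high-order AUX allocations: the first page of the chunk has PagePrivate set, with its private field recording the allocation order. A consumer-side sketch of that convention (illustrative only; assumes the usual <linux/mm.h> page helpers):

/* sketch: order of the AUX chunk starting at @page, 0 for a plain page */
static unsigned long aux_chunk_order(struct page *page)
{
        return PagePrivate(page) ? page_private(page) : 0;
}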
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..4e5e9798aa0c 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -86,15 +86,6 @@ struct uprobe {
86 struct arch_uprobe arch; 86 struct arch_uprobe arch;
87}; 87};
88 88
89struct return_instance {
90 struct uprobe *uprobe;
91 unsigned long func;
92 unsigned long orig_ret_vaddr; /* original return address */
93 bool chained; /* true, if instance is nested */
94
95 struct return_instance *next; /* keep as stack */
96};
97
98/* 89/*
99 * Execute out of line area: anonymous executable mapping installed 90 * Execute out of line area: anonymous executable mapping installed
100 * by the probed task to execute the copy of the original instruction 91 * by the probed task to execute the copy of the original instruction
@@ -105,17 +96,18 @@ struct return_instance {
105 * allocated. 96 * allocated.
106 */ 97 */
107struct xol_area { 98struct xol_area {
108 wait_queue_head_t wq; /* if all slots are busy */ 99 wait_queue_head_t wq; /* if all slots are busy */
109 atomic_t slot_count; /* number of in-use slots */ 100 atomic_t slot_count; /* number of in-use slots */
110 unsigned long *bitmap; /* 0 = free slot */ 101 unsigned long *bitmap; /* 0 = free slot */
111 struct page *page;
112 102
103 struct vm_special_mapping xol_mapping;
104 struct page *pages[2];
113 /* 105 /*
114 * We keep the vma's vm_start rather than a pointer to the vma 106 * We keep the vma's vm_start rather than a pointer to the vma
115 * itself. The probed process or a naughty kernel module could make 107 * itself. The probed process or a naughty kernel module could make
116 * the vma go away, and we must handle that reasonably gracefully. 108 * the vma go away, and we must handle that reasonably gracefully.
117 */ 109 */
118 unsigned long vaddr; /* Page(s) of instruction slots */ 110 unsigned long vaddr; /* Page(s) of instruction slots */
119}; 111};
120 112
121/* 113/*
@@ -366,6 +358,18 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
366 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn); 358 return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)&auprobe->insn);
367} 359}
368 360
361static struct uprobe *get_uprobe(struct uprobe *uprobe)
362{
363 atomic_inc(&uprobe->ref);
364 return uprobe;
365}
366
367static void put_uprobe(struct uprobe *uprobe)
368{
369 if (atomic_dec_and_test(&uprobe->ref))
370 kfree(uprobe);
371}
372
369static int match_uprobe(struct uprobe *l, struct uprobe *r) 373static int match_uprobe(struct uprobe *l, struct uprobe *r)
370{ 374{
371 if (l->inode < r->inode) 375 if (l->inode < r->inode)
@@ -393,10 +397,8 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset)
393 while (n) { 397 while (n) {
394 uprobe = rb_entry(n, struct uprobe, rb_node); 398 uprobe = rb_entry(n, struct uprobe, rb_node);
395 match = match_uprobe(&u, uprobe); 399 match = match_uprobe(&u, uprobe);
396 if (!match) { 400 if (!match)
397 atomic_inc(&uprobe->ref); 401 return get_uprobe(uprobe);
398 return uprobe;
399 }
400 402
401 if (match < 0) 403 if (match < 0)
402 n = n->rb_left; 404 n = n->rb_left;
@@ -432,10 +434,8 @@ static struct uprobe *__insert_uprobe(struct uprobe *uprobe)
432 parent = *p; 434 parent = *p;
433 u = rb_entry(parent, struct uprobe, rb_node); 435 u = rb_entry(parent, struct uprobe, rb_node);
434 match = match_uprobe(uprobe, u); 436 match = match_uprobe(uprobe, u);
435 if (!match) { 437 if (!match)
436 atomic_inc(&u->ref); 438 return get_uprobe(u);
437 return u;
438 }
439 439
440 if (match < 0) 440 if (match < 0)
441 p = &parent->rb_left; 441 p = &parent->rb_left;
@@ -472,12 +472,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
472 return u; 472 return u;
473} 473}
474 474
475static void put_uprobe(struct uprobe *uprobe)
476{
477 if (atomic_dec_and_test(&uprobe->ref))
478 kfree(uprobe);
479}
480
481static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset) 475static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
482{ 476{
483 struct uprobe *uprobe, *cur_uprobe; 477 struct uprobe *uprobe, *cur_uprobe;
@@ -1039,14 +1033,14 @@ static void build_probe_list(struct inode *inode,
1039 if (u->inode != inode || u->offset < min) 1033 if (u->inode != inode || u->offset < min)
1040 break; 1034 break;
1041 list_add(&u->pending_list, head); 1035 list_add(&u->pending_list, head);
1042 atomic_inc(&u->ref); 1036 get_uprobe(u);
1043 } 1037 }
1044 for (t = n; (t = rb_next(t)); ) { 1038 for (t = n; (t = rb_next(t)); ) {
1045 u = rb_entry(t, struct uprobe, rb_node); 1039 u = rb_entry(t, struct uprobe, rb_node);
1046 if (u->inode != inode || u->offset > max) 1040 if (u->inode != inode || u->offset > max)
1047 break; 1041 break;
1048 list_add(&u->pending_list, head); 1042 list_add(&u->pending_list, head);
1049 atomic_inc(&u->ref); 1043 get_uprobe(u);
1050 } 1044 }
1051 } 1045 }
1052 spin_unlock(&uprobes_treelock); 1046 spin_unlock(&uprobes_treelock);
@@ -1132,11 +1126,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1132/* Slot allocation for XOL */ 1126/* Slot allocation for XOL */
1133static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) 1127static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1134{ 1128{
1135 int ret = -EALREADY; 1129 struct vm_area_struct *vma;
1130 int ret;
1136 1131
1137 down_write(&mm->mmap_sem); 1132 down_write(&mm->mmap_sem);
1138 if (mm->uprobes_state.xol_area) 1133 if (mm->uprobes_state.xol_area) {
1134 ret = -EALREADY;
1139 goto fail; 1135 goto fail;
1136 }
1140 1137
1141 if (!area->vaddr) { 1138 if (!area->vaddr) {
1142 /* Try to map as high as possible, this is only a hint. */ 1139 /* Try to map as high as possible, this is only a hint. */
@@ -1148,11 +1145,15 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
1148 } 1145 }
1149 } 1146 }
1150 1147
1151 ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE, 1148 vma = _install_special_mapping(mm, area->vaddr, PAGE_SIZE,
1152 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO, &area->page); 1149 VM_EXEC|VM_MAYEXEC|VM_DONTCOPY|VM_IO,
1153 if (ret) 1150 &area->xol_mapping);
1151 if (IS_ERR(vma)) {
1152 ret = PTR_ERR(vma);
1154 goto fail; 1153 goto fail;
1154 }
1155 1155
1156 ret = 0;
1156 smp_wmb(); /* pairs with get_xol_area() */ 1157 smp_wmb(); /* pairs with get_xol_area() */
1157 mm->uprobes_state.xol_area = area; 1158 mm->uprobes_state.xol_area = area;
1158 fail: 1159 fail:
@@ -1175,21 +1176,24 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
1175 if (!area->bitmap) 1176 if (!area->bitmap)
1176 goto free_area; 1177 goto free_area;
1177 1178
1178 area->page = alloc_page(GFP_HIGHUSER); 1179 area->xol_mapping.name = "[uprobes]";
1179 if (!area->page) 1180 area->xol_mapping.pages = area->pages;
1181 area->pages[0] = alloc_page(GFP_HIGHUSER);
1182 if (!area->pages[0])
1180 goto free_bitmap; 1183 goto free_bitmap;
1184 area->pages[1] = NULL;
1181 1185
1182 area->vaddr = vaddr; 1186 area->vaddr = vaddr;
1183 init_waitqueue_head(&area->wq); 1187 init_waitqueue_head(&area->wq);
1184 /* Reserve the 1st slot for get_trampoline_vaddr() */ 1188 /* Reserve the 1st slot for get_trampoline_vaddr() */
1185 set_bit(0, area->bitmap); 1189 set_bit(0, area->bitmap);
1186 atomic_set(&area->slot_count, 1); 1190 atomic_set(&area->slot_count, 1);
1187 copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE); 1191 copy_to_page(area->pages[0], 0, &insn, UPROBE_SWBP_INSN_SIZE);
1188 1192
1189 if (!xol_add_vma(mm, area)) 1193 if (!xol_add_vma(mm, area))
1190 return area; 1194 return area;
1191 1195
1192 __free_page(area->page); 1196 __free_page(area->pages[0]);
1193 free_bitmap: 1197 free_bitmap:
1194 kfree(area->bitmap); 1198 kfree(area->bitmap);
1195 free_area: 1199 free_area:
@@ -1227,7 +1231,7 @@ void uprobe_clear_state(struct mm_struct *mm)
1227 if (!area) 1231 if (!area)
1228 return; 1232 return;
1229 1233
1230 put_page(area->page); 1234 put_page(area->pages[0]);
1231 kfree(area->bitmap); 1235 kfree(area->bitmap);
1232 kfree(area); 1236 kfree(area);
1233} 1237}
@@ -1296,7 +1300,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1296 if (unlikely(!xol_vaddr)) 1300 if (unlikely(!xol_vaddr))
1297 return 0; 1301 return 0;
1298 1302
1299 arch_uprobe_copy_ixol(area->page, xol_vaddr, 1303 arch_uprobe_copy_ixol(area->pages[0], xol_vaddr,
1300 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol)); 1304 &uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
1301 1305
1302 return xol_vaddr; 1306 return xol_vaddr;
@@ -1333,6 +1337,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1333 1337
1334 clear_bit(slot_nr, area->bitmap); 1338 clear_bit(slot_nr, area->bitmap);
1335 atomic_dec(&area->slot_count); 1339 atomic_dec(&area->slot_count);
1340 smp_mb__after_atomic(); /* pairs with prepare_to_wait() */
1336 if (waitqueue_active(&area->wq)) 1341 if (waitqueue_active(&area->wq))
1337 wake_up(&area->wq); 1342 wake_up(&area->wq);
1338 1343
@@ -1376,6 +1381,14 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1376 return instruction_pointer(regs); 1381 return instruction_pointer(regs);
1377} 1382}
1378 1383
1384static struct return_instance *free_ret_instance(struct return_instance *ri)
1385{
1386 struct return_instance *next = ri->next;
1387 put_uprobe(ri->uprobe);
1388 kfree(ri);
1389 return next;
1390}
1391
1379/* 1392/*
1380 * Called with no locks held. 1393 * Called with no locks held.
1381 * Called in context of an exiting or an exec-ing thread. 1394 * Called in context of an exiting or an exec-ing thread.
@@ -1383,7 +1396,7 @@ unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
1383void uprobe_free_utask(struct task_struct *t) 1396void uprobe_free_utask(struct task_struct *t)
1384{ 1397{
1385 struct uprobe_task *utask = t->utask; 1398 struct uprobe_task *utask = t->utask;
1386 struct return_instance *ri, *tmp; 1399 struct return_instance *ri;
1387 1400
1388 if (!utask) 1401 if (!utask)
1389 return; 1402 return;
@@ -1392,13 +1405,8 @@ void uprobe_free_utask(struct task_struct *t)
1392 put_uprobe(utask->active_uprobe); 1405 put_uprobe(utask->active_uprobe);
1393 1406
1394 ri = utask->return_instances; 1407 ri = utask->return_instances;
1395 while (ri) { 1408 while (ri)
1396 tmp = ri; 1409 ri = free_ret_instance(ri);
1397 ri = ri->next;
1398
1399 put_uprobe(tmp->uprobe);
1400 kfree(tmp);
1401 }
1402 1410
1403 xol_free_insn_slot(t); 1411 xol_free_insn_slot(t);
1404 kfree(utask); 1412 kfree(utask);
@@ -1437,7 +1445,7 @@ static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
1437 return -ENOMEM; 1445 return -ENOMEM;
1438 1446
1439 *n = *o; 1447 *n = *o;
1440 atomic_inc(&n->uprobe->ref); 1448 get_uprobe(n->uprobe);
1441 n->next = NULL; 1449 n->next = NULL;
1442 1450
1443 *p = n; 1451 *p = n;
@@ -1515,12 +1523,25 @@ static unsigned long get_trampoline_vaddr(void)
1515 return trampoline_vaddr; 1523 return trampoline_vaddr;
1516} 1524}
1517 1525
1526static void cleanup_return_instances(struct uprobe_task *utask, bool chained,
1527 struct pt_regs *regs)
1528{
1529 struct return_instance *ri = utask->return_instances;
1530 enum rp_check ctx = chained ? RP_CHECK_CHAIN_CALL : RP_CHECK_CALL;
1531
1532 while (ri && !arch_uretprobe_is_alive(ri, ctx, regs)) {
1533 ri = free_ret_instance(ri);
1534 utask->depth--;
1535 }
1536 utask->return_instances = ri;
1537}
1538
1518static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs) 1539static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1519{ 1540{
1520 struct return_instance *ri; 1541 struct return_instance *ri;
1521 struct uprobe_task *utask; 1542 struct uprobe_task *utask;
1522 unsigned long orig_ret_vaddr, trampoline_vaddr; 1543 unsigned long orig_ret_vaddr, trampoline_vaddr;
1523 bool chained = false; 1544 bool chained;
1524 1545
1525 if (!get_xol_area()) 1546 if (!get_xol_area())
1526 return; 1547 return;
@@ -1536,49 +1557,47 @@ static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
1536 return; 1557 return;
1537 } 1558 }
1538 1559
1539 ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL); 1560 ri = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
1540 if (!ri) 1561 if (!ri)
1541 goto fail; 1562 return;
1542 1563
1543 trampoline_vaddr = get_trampoline_vaddr(); 1564 trampoline_vaddr = get_trampoline_vaddr();
1544 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs); 1565 orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
1545 if (orig_ret_vaddr == -1) 1566 if (orig_ret_vaddr == -1)
1546 goto fail; 1567 goto fail;
1547 1568
1569 /* drop the entries invalidated by longjmp() */
1570 chained = (orig_ret_vaddr == trampoline_vaddr);
1571 cleanup_return_instances(utask, chained, regs);
1572
1548 /* 1573 /*
1549 * We don't want to keep trampoline address in stack, rather keep the 1574 * We don't want to keep trampoline address in stack, rather keep the
1550 * original return address of first caller thru all the consequent 1575 * original return address of first caller thru all the consequent
1551 * instances. This also makes breakpoint unwrapping easier. 1576 * instances. This also makes breakpoint unwrapping easier.
1552 */ 1577 */
1553 if (orig_ret_vaddr == trampoline_vaddr) { 1578 if (chained) {
1554 if (!utask->return_instances) { 1579 if (!utask->return_instances) {
1555 /* 1580 /*
1556 * This situation is not possible. Likely we have an 1581 * This situation is not possible. Likely we have an
1557 * attack from user-space. 1582 * attack from user-space.
1558 */ 1583 */
1559 pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n", 1584 uprobe_warn(current, "handle tail call");
1560 current->pid, current->tgid);
1561 goto fail; 1585 goto fail;
1562 } 1586 }
1563
1564 chained = true;
1565 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr; 1587 orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
1566 } 1588 }
1567 1589
1568 atomic_inc(&uprobe->ref); 1590 ri->uprobe = get_uprobe(uprobe);
1569 ri->uprobe = uprobe;
1570 ri->func = instruction_pointer(regs); 1591 ri->func = instruction_pointer(regs);
1592 ri->stack = user_stack_pointer(regs);
1571 ri->orig_ret_vaddr = orig_ret_vaddr; 1593 ri->orig_ret_vaddr = orig_ret_vaddr;
1572 ri->chained = chained; 1594 ri->chained = chained;
1573 1595
1574 utask->depth++; 1596 utask->depth++;
1575
1576 /* add instance to the stack */
1577 ri->next = utask->return_instances; 1597 ri->next = utask->return_instances;
1578 utask->return_instances = ri; 1598 utask->return_instances = ri;
1579 1599
1580 return; 1600 return;
1581
1582 fail: 1601 fail:
1583 kfree(ri); 1602 kfree(ri);
1584} 1603}
@@ -1766,46 +1785,58 @@ handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
1766 up_read(&uprobe->register_rwsem); 1785 up_read(&uprobe->register_rwsem);
1767} 1786}
1768 1787
1769static bool handle_trampoline(struct pt_regs *regs) 1788static struct return_instance *find_next_ret_chain(struct return_instance *ri)
1770{ 1789{
1771 struct uprobe_task *utask;
1772 struct return_instance *ri, *tmp;
1773 bool chained; 1790 bool chained;
1774 1791
1792 do {
1793 chained = ri->chained;
1794 ri = ri->next; /* can't be NULL if chained */
1795 } while (chained);
1796
1797 return ri;
1798}
1799
1800static void handle_trampoline(struct pt_regs *regs)
1801{
1802 struct uprobe_task *utask;
1803 struct return_instance *ri, *next;
1804 bool valid;
1805
1775 utask = current->utask; 1806 utask = current->utask;
1776 if (!utask) 1807 if (!utask)
1777 return false; 1808 goto sigill;
1778 1809
1779 ri = utask->return_instances; 1810 ri = utask->return_instances;
1780 if (!ri) 1811 if (!ri)
1781 return false; 1812 goto sigill;
1782
1783 /*
1784 * TODO: we should throw out return_instance's invalidated by
1785 * longjmp(), currently we assume that the probed function always
1786 * returns.
1787 */
1788 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1789
1790 for (;;) {
1791 handle_uretprobe_chain(ri, regs);
1792
1793 chained = ri->chained;
1794 put_uprobe(ri->uprobe);
1795
1796 tmp = ri;
1797 ri = ri->next;
1798 kfree(tmp);
1799 utask->depth--;
1800 1813
1801 if (!chained) 1814 do {
1802 break; 1815 /*
1803 BUG_ON(!ri); 1816 * We should throw out the frames invalidated by longjmp().
1804 } 1817 * If this chain is valid, then the next one should be alive
1818 * or NULL; the latter case means that nobody but ri->func
1819 * could hit this trampoline on return. TODO: sigaltstack().
1820 */
1821 next = find_next_ret_chain(ri);
1822 valid = !next || arch_uretprobe_is_alive(next, RP_CHECK_RET, regs);
1823
1824 instruction_pointer_set(regs, ri->orig_ret_vaddr);
1825 do {
1826 if (valid)
1827 handle_uretprobe_chain(ri, regs);
1828 ri = free_ret_instance(ri);
1829 utask->depth--;
1830 } while (ri != next);
1831 } while (!valid);
1805 1832
1806 utask->return_instances = ri; 1833 utask->return_instances = ri;
1834 return;
1835
1836 sigill:
1837 uprobe_warn(current, "handle uretprobe, sending SIGILL.");
1838 force_sig_info(SIGILL, SEND_SIG_FORCED, current);
1807 1839
1808 return true;
1809} 1840}
1810 1841
1811bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs) 1842bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
@@ -1813,6 +1844,12 @@ bool __weak arch_uprobe_ignore(struct arch_uprobe *aup, struct pt_regs *regs)
1813 return false; 1844 return false;
1814} 1845}
1815 1846
1847bool __weak arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
1848 struct pt_regs *regs)
1849{
1850 return true;
1851}
1852
1816/* 1853/*
1817 * Run handler and ask thread to singlestep. 1854 * Run handler and ask thread to singlestep.
1818 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1855 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1824,13 +1861,8 @@ static void handle_swbp(struct pt_regs *regs)
1824 int uninitialized_var(is_swbp); 1861 int uninitialized_var(is_swbp);
1825 1862
1826 bp_vaddr = uprobe_get_swbp_addr(regs); 1863 bp_vaddr = uprobe_get_swbp_addr(regs);
1827 if (bp_vaddr == get_trampoline_vaddr()) { 1864 if (bp_vaddr == get_trampoline_vaddr())
1828 if (handle_trampoline(regs)) 1865 return handle_trampoline(regs);
1829 return;
1830
1831 pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
1832 current->pid, current->tgid);
1833 }
1834 1866
1835 uprobe = find_active_uprobe(bp_vaddr, &is_swbp); 1867 uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
1836 if (!uprobe) { 1868 if (!uprobe) {
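
For illustration, the uprobes hunks above mostly fold open-coded refcount bumps and list teardown into small helpers (get_uprobe(), free_ret_instance()). A minimal stand-alone sketch of that helper pattern, using hypothetical names rather than the kernel's own types, could look like this:

#include <stdlib.h>

struct obj {
	int ref;			/* refcount; the kernel uses atomic_t */
};

static struct obj *get_obj(struct obj *o)
{
	o->ref++;			/* analogous to get_uprobe() bumping uprobe->ref */
	return o;			/* returning the pointer allows p = get_obj(o); */
}

static void put_obj(struct obj *o)
{
	if (--o->ref == 0)
		free(o);
}

struct frame {				/* stands in for struct return_instance */
	struct obj *owner;
	struct frame *next;
};

/* Free one frame and hand back its successor, like free_ret_instance(). */
static struct frame *free_frame(struct frame *f)
{
	struct frame *next = f->next;

	put_obj(f->owner);
	free(f);
	return next;
}

int main(void)
{
	struct obj *o = calloc(1, sizeof(*o));
	struct frame *stack = NULL;
	int i;

	o->ref = 1;
	for (i = 0; i < 3; i++) {	/* push three frames, each holding a reference */
		struct frame *f = malloc(sizeof(*f));

		f->owner = get_obj(o);
		f->next = stack;
		stack = f;
	}
	while (stack)			/* teardown mirrors uprobe_free_utask() */
		stack = free_frame(stack);
	put_obj(o);			/* drop the creation reference */
	return 0;
}

The free-and-return-next style is what lets uprobe_free_utask(), cleanup_return_instances() and handle_trampoline() above all walk and unwind the return-instance chain with a single loop.
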
diff --git a/kernel/exit.c b/kernel/exit.c
index 031325e9acf9..ea95ee1b5ef7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1471,7 +1471,7 @@ static long do_wait(struct wait_opts *wo)
1471 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait); 1471 add_wait_queue(&current->signal->wait_chldexit, &wo->child_wait);
1472repeat: 1472repeat:
1473 /* 1473 /*
1474 * If there is nothing that can match our critiera just get out. 1474 * If there is nothing that can match our criteria, just get out.
1475 * We will clear ->notask_error to zero if we see any child that 1475 * We will clear ->notask_error to zero if we see any child that
1476 * might later match our criteria, even if we are not able to reap 1476 * might later match our criteria, even if we are not able to reap
1477 * it yet. 1477 * it yet.
diff --git a/kernel/extable.c b/kernel/extable.c
index c98f926277a8..e820ccee9846 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -18,7 +18,6 @@
18#include <linux/ftrace.h> 18#include <linux/ftrace.h>
19#include <linux/memory.h> 19#include <linux/memory.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/ftrace.h>
22#include <linux/mutex.h> 21#include <linux/mutex.h>
23#include <linux/init.h> 22#include <linux/init.h>
24 23
diff --git a/kernel/fork.c b/kernel/fork.c
index 1bfefc6f96a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -287,6 +287,11 @@ static void set_max_threads(unsigned int max_threads_suggested)
287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS); 287 max_threads = clamp_t(u64, threads, MIN_THREADS, MAX_THREADS);
288} 288}
289 289
290#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
291/* Initialized by the architecture: */
292int arch_task_struct_size __read_mostly;
293#endif
294
290void __init fork_init(void) 295void __init fork_init(void)
291{ 296{
292#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR 297#ifndef CONFIG_ARCH_TASK_STRUCT_ALLOCATOR
@@ -295,7 +300,7 @@ void __init fork_init(void)
295#endif 300#endif
296 /* create a slab on which task_structs can be allocated */ 301 /* create a slab on which task_structs can be allocated */
297 task_struct_cachep = 302 task_struct_cachep =
298 kmem_cache_create("task_struct", sizeof(struct task_struct), 303 kmem_cache_create("task_struct", arch_task_struct_size,
299 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); 304 ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
300#endif 305#endif
301 306
@@ -449,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
449 tmp->vm_mm = mm; 454 tmp->vm_mm = mm;
450 if (anon_vma_fork(tmp, mpnt)) 455 if (anon_vma_fork(tmp, mpnt))
451 goto fail_nomem_anon_vma_fork; 456 goto fail_nomem_anon_vma_fork;
452 tmp->vm_flags &= ~VM_LOCKED; 457 tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
453 tmp->vm_next = tmp->vm_prev = NULL; 458 tmp->vm_next = tmp->vm_prev = NULL;
459 tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
454 file = tmp->vm_file; 460 file = tmp->vm_file;
455 if (file) { 461 if (file) {
456 struct inode *inode = file_inode(file); 462 struct inode *inode = file_inode(file);
@@ -1067,6 +1073,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
1067 rcu_assign_pointer(tsk->sighand, sig); 1073 rcu_assign_pointer(tsk->sighand, sig);
1068 if (!sig) 1074 if (!sig)
1069 return -ENOMEM; 1075 return -ENOMEM;
1076
1070 atomic_set(&sig->count, 1); 1077 atomic_set(&sig->count, 1);
1071 memcpy(sig->action, current->sighand->action, sizeof(sig->action)); 1078 memcpy(sig->action, current->sighand->action, sizeof(sig->action));
1072 return 0; 1079 return 0;
@@ -1128,6 +1135,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1128 init_sigpending(&sig->shared_pending); 1135 init_sigpending(&sig->shared_pending);
1129 INIT_LIST_HEAD(&sig->posix_timers); 1136 INIT_LIST_HEAD(&sig->posix_timers);
1130 seqlock_init(&sig->stats_lock); 1137 seqlock_init(&sig->stats_lock);
1138 prev_cputime_init(&sig->prev_cputime);
1131 1139
1132 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1140 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1133 sig->real_timer.function = it_real_fn; 1141 sig->real_timer.function = it_real_fn;
@@ -1239,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1239{ 1247{
1240 int retval; 1248 int retval;
1241 struct task_struct *p; 1249 struct task_struct *p;
1250 void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
1242 1251
1243 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) 1252 if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
1244 return ERR_PTR(-EINVAL); 1253 return ERR_PTR(-EINVAL);
@@ -1273,10 +1282,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1273 1282
1274 /* 1283 /*
1275 * If the new process will be in a different pid or user namespace 1284 * If the new process will be in a different pid or user namespace
1276 * do not allow it to share a thread group or signal handlers or 1285 * do not allow it to share a thread group with the forking task.
1277 * parent with the forking task.
1278 */ 1286 */
1279 if (clone_flags & CLONE_SIGHAND) { 1287 if (clone_flags & CLONE_THREAD) {
1280 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || 1288 if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
1281 (task_active_pid_ns(current) != 1289 (task_active_pid_ns(current) !=
1282 current->nsproxy->pid_ns_for_children)) 1290 current->nsproxy->pid_ns_for_children))
@@ -1335,9 +1343,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1335 1343
1336 p->utime = p->stime = p->gtime = 0; 1344 p->utime = p->stime = p->gtime = 0;
1337 p->utimescaled = p->stimescaled = 0; 1345 p->utimescaled = p->stimescaled = 0;
1338#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 1346 prev_cputime_init(&p->prev_cputime);
1339 p->prev_cputime.utime = p->prev_cputime.stime = 0; 1347
1340#endif
1341#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN 1348#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
1342 seqlock_init(&p->vtime_seqlock); 1349 seqlock_init(&p->vtime_seqlock);
1343 p->vtime_snap = 0; 1350 p->vtime_snap = 0;
@@ -1513,6 +1520,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1513 p->task_works = NULL; 1520 p->task_works = NULL;
1514 1521
1515 /* 1522 /*
1523 * Ensure that the cgroup subsystem policies allow the new process to be
1524 * forked. It should be noted that the new process's css_set can be changed
1525 * between here and cgroup_post_fork() if an organisation operation is in
1526 * progress.
1527 */
1528 retval = cgroup_can_fork(p, cgrp_ss_priv);
1529 if (retval)
1530 goto bad_fork_free_pid;
1531
1532 /*
1516 * Make it visible to the rest of the system, but don't wake it up yet. 1533 * Make it visible to the rest of the system, but don't wake it up yet.
1517 * Need tasklist lock for parent etc handling! 1534 * Need tasklist lock for parent etc handling!
1518 */ 1535 */
@@ -1548,7 +1565,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1548 spin_unlock(&current->sighand->siglock); 1565 spin_unlock(&current->sighand->siglock);
1549 write_unlock_irq(&tasklist_lock); 1566 write_unlock_irq(&tasklist_lock);
1550 retval = -ERESTARTNOINTR; 1567 retval = -ERESTARTNOINTR;
1551 goto bad_fork_free_pid; 1568 goto bad_fork_cancel_cgroup;
1552 } 1569 }
1553 1570
1554 if (likely(p->pid)) { 1571 if (likely(p->pid)) {
@@ -1590,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1590 write_unlock_irq(&tasklist_lock); 1607 write_unlock_irq(&tasklist_lock);
1591 1608
1592 proc_fork_connector(p); 1609 proc_fork_connector(p);
1593 cgroup_post_fork(p); 1610 cgroup_post_fork(p, cgrp_ss_priv);
1594 if (clone_flags & CLONE_THREAD) 1611 if (clone_flags & CLONE_THREAD)
1595 threadgroup_change_end(current); 1612 threadgroup_change_end(current);
1596 perf_event_fork(p); 1613 perf_event_fork(p);
@@ -1600,6 +1617,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1600 1617
1601 return p; 1618 return p;
1602 1619
1620bad_fork_cancel_cgroup:
1621 cgroup_cancel_fork(p, cgrp_ss_priv);
1603bad_fork_free_pid: 1622bad_fork_free_pid:
1604 if (pid != &init_struct_pid) 1623 if (pid != &init_struct_pid)
1605 free_pid(pid); 1624 free_pid(pid);
@@ -1866,13 +1885,21 @@ static int check_unshare_flags(unsigned long unshare_flags)
1866 CLONE_NEWUSER|CLONE_NEWPID)) 1885 CLONE_NEWUSER|CLONE_NEWPID))
1867 return -EINVAL; 1886 return -EINVAL;
1868 /* 1887 /*
1869 * Not implemented, but pretend it works if there is nothing to 1888 * Not implemented, but pretend it works if there is nothing
1870 * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND 1889 * to unshare. Note that unsharing the address space or the
1871 * needs to unshare vm. 1890 * signal handlers also need to unshare the signal queues (aka
1891 * CLONE_THREAD).
1872 */ 1892 */
1873 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) { 1893 if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
1874 /* FIXME: get_task_mm() increments ->mm_users */ 1894 if (!thread_group_empty(current))
1875 if (atomic_read(&current->mm->mm_users) > 1) 1895 return -EINVAL;
1896 }
1897 if (unshare_flags & (CLONE_SIGHAND | CLONE_VM)) {
1898 if (atomic_read(&current->sighand->count) > 1)
1899 return -EINVAL;
1900 }
1901 if (unshare_flags & CLONE_VM) {
1902 if (!current_is_single_threaded())
1876 return -EINVAL; 1903 return -EINVAL;
1877 } 1904 }
1878 1905
@@ -1936,21 +1963,22 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1936 int err; 1963 int err;
1937 1964
1938 /* 1965 /*
1939 * If unsharing a user namespace, must also unshare the thread. 1966 * If unsharing a user namespace, must also unshare the thread group
1967 * and unshare the filesystem root and working directories.
1940 */ 1968 */
1941 if (unshare_flags & CLONE_NEWUSER) 1969 if (unshare_flags & CLONE_NEWUSER)
1942 unshare_flags |= CLONE_THREAD | CLONE_FS; 1970 unshare_flags |= CLONE_THREAD | CLONE_FS;
1943 /* 1971 /*
1944 * If unsharing a thread from a thread group, must also unshare vm.
1945 */
1946 if (unshare_flags & CLONE_THREAD)
1947 unshare_flags |= CLONE_VM;
1948 /*
1949 * If unsharing vm, must also unshare signal handlers. 1972 * If unsharing vm, must also unshare signal handlers.
1950 */ 1973 */
1951 if (unshare_flags & CLONE_VM) 1974 if (unshare_flags & CLONE_VM)
1952 unshare_flags |= CLONE_SIGHAND; 1975 unshare_flags |= CLONE_SIGHAND;
1953 /* 1976 /*
1977 * If unsharing signal handlers, must also unshare the signal queues.
1978 */
1979 if (unshare_flags & CLONE_SIGHAND)
1980 unshare_flags |= CLONE_THREAD;
1981 /*
1954 * If unsharing namespace, must also unshare filesystem information. 1982 * If unsharing namespace, must also unshare filesystem information.
1955 */ 1983 */
1956 if (unshare_flags & CLONE_NEWNS) 1984 if (unshare_flags & CLONE_NEWNS)
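
For illustration, a minimal user-space sketch of what the tightened check_unshare_flags() rules above mean for callers; the behaviour described in the comment is an assumption drawn from the new flag implications (CLONE_NEWUSER pulling in CLONE_THREAD | CLONE_FS, CLONE_VM pulling in CLONE_SIGHAND, CLONE_SIGHAND pulling in CLONE_THREAD):

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	/*
	 * With the implication chain above, this only succeeds for a caller
	 * that is single-threaded and does not share its VM or signal
	 * handlers with any other task; otherwise unshare() returns EINVAL.
	 */
	if (unshare(CLONE_NEWUSER) == -1)
		perror("unshare(CLONE_NEWUSER)");
	else
		printf("now running in a new user namespace\n");
	return 0;
}
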
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
64#include <linux/hugetlb.h> 64#include <linux/hugetlb.h>
65#include <linux/freezer.h> 65#include <linux/freezer.h>
66#include <linux/bootmem.h> 66#include <linux/bootmem.h>
67#include <linux/fault-inject.h>
67 68
68#include <asm/futex.h> 69#include <asm/futex.h>
69 70
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
258 259
259static struct futex_hash_bucket *futex_queues; 260static struct futex_hash_bucket *futex_queues;
260 261
262/*
263 * Fault injections for futexes.
264 */
265#ifdef CONFIG_FAIL_FUTEX
266
267static struct {
268 struct fault_attr attr;
269
270 u32 ignore_private;
271} fail_futex = {
272 .attr = FAULT_ATTR_INITIALIZER,
273 .ignore_private = 0,
274};
275
276static int __init setup_fail_futex(char *str)
277{
278 return setup_fault_attr(&fail_futex.attr, str);
279}
280__setup("fail_futex=", setup_fail_futex);
281
282static bool should_fail_futex(bool fshared)
283{
284 if (fail_futex.ignore_private && !fshared)
285 return false;
286
287 return should_fail(&fail_futex.attr, 1);
288}
289
290#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
291
292static int __init fail_futex_debugfs(void)
293{
294 umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
295 struct dentry *dir;
296
297 dir = fault_create_debugfs_attr("fail_futex", NULL,
298 &fail_futex.attr);
299 if (IS_ERR(dir))
300 return PTR_ERR(dir);
301
302 if (!debugfs_create_bool("ignore-private", mode, dir,
303 &fail_futex.ignore_private)) {
304 debugfs_remove_recursive(dir);
305 return -ENOMEM;
306 }
307
308 return 0;
309}
310
311late_initcall(fail_futex_debugfs);
312
313#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
314
315#else
316static inline bool should_fail_futex(bool fshared)
317{
318 return false;
319}
320#endif /* CONFIG_FAIL_FUTEX */
321
261static inline void futex_get_mm(union futex_key *key) 322static inline void futex_get_mm(union futex_key *key)
262{ 323{
263 atomic_inc(&key->private.mm->mm_count); 324 atomic_inc(&key->private.mm->mm_count);
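
For illustration, the new fail_futex fault-injection knobs are assumed to follow the generic fault-injection debugfs layout (probability, times, plus the ignore-private attribute created above); the paths below only exist when CONFIG_FAIL_FUTEX and CONFIG_FAULT_INJECTION_DEBUG_FS are enabled and are written here from a small, hypothetical helper program:

#include <stdio.h>

static int write_attr(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	write_attr("/sys/kernel/debug/fail_futex/probability", "10");	/* fail roughly 10% of calls */
	write_attr("/sys/kernel/debug/fail_futex/times", "-1");		/* no cap on injected failures */
	write_attr("/sys/kernel/debug/fail_futex/ignore-private", "0");	/* also fail private futexes */
	return 0;
}

The fail_futex= boot parameter registered by setup_fail_futex() is parsed by the common setup_fault_attr() helper, so it takes the framework's usual comma-separated attribute string.
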
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
413 if (unlikely(!access_ok(rw, uaddr, sizeof(u32)))) 474 if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
414 return -EFAULT; 475 return -EFAULT;
415 476
477 if (unlikely(should_fail_futex(fshared)))
478 return -EFAULT;
479
416 /* 480 /*
417 * PROCESS_PRIVATE futexes are fast. 481 * PROCESS_PRIVATE futexes are fast.
418 * As the mm cannot disappear under us and the 'key' only needs 482 * As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
428 } 492 }
429 493
430again: 494again:
495 /* Ignore any VERIFY_READ mapping (futex common case) */
496 if (unlikely(should_fail_futex(fshared)))
497 return -EFAULT;
498
431 err = get_user_pages_fast(address, 1, 1, &page); 499 err = get_user_pages_fast(address, 1, 1, &page);
432 /* 500 /*
433 * If write access is not required (eg. FUTEX_WAIT), try 501 * If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
516 * A RO anonymous page will never change and thus doesn't make 584 * A RO anonymous page will never change and thus doesn't make
517 * sense for futex operations. 585 * sense for futex operations.
518 */ 586 */
519 if (ro) { 587 if (unlikely(should_fail_futex(fshared)) || ro) {
520 err = -EFAULT; 588 err = -EFAULT;
521 goto out; 589 goto out;
522 } 590 }
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
974{ 1042{
975 u32 uninitialized_var(curval); 1043 u32 uninitialized_var(curval);
976 1044
1045 if (unlikely(should_fail_futex(true)))
1046 return -EFAULT;
1047
977 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) 1048 if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
978 return -EFAULT; 1049 return -EFAULT;
979 1050
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
1015 if (get_futex_value_locked(&uval, uaddr)) 1086 if (get_futex_value_locked(&uval, uaddr))
1016 return -EFAULT; 1087 return -EFAULT;
1017 1088
1089 if (unlikely(should_fail_futex(true)))
1090 return -EFAULT;
1091
1018 /* 1092 /*
1019 * Detect deadlocks. 1093 * Detect deadlocks.
1020 */ 1094 */
1021 if ((unlikely((uval & FUTEX_TID_MASK) == vpid))) 1095 if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
1022 return -EDEADLK; 1096 return -EDEADLK;
1023 1097
1098 if ((unlikely(should_fail_futex(true))))
1099 return -EDEADLK;
1100
1024 /* 1101 /*
1025 * Lookup existing state first. If it exists, try to attach to 1102 * Lookup existing state first. If it exists, try to attach to
1026 * its pi_state. 1103 * its pi_state.
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
1155 */ 1232 */
1156 newval = FUTEX_WAITERS | task_pid_vnr(new_owner); 1233 newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1157 1234
1235 if (unlikely(should_fail_futex(true)))
1236 ret = -EFAULT;
1237
1158 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) 1238 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1159 ret = -EFAULT; 1239 ret = -EFAULT;
1160 else if (curval != uval) 1240 else if (curval != uval)
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1457 if (get_futex_value_locked(&curval, pifutex)) 1537 if (get_futex_value_locked(&curval, pifutex))
1458 return -EFAULT; 1538 return -EFAULT;
1459 1539
1540 if (unlikely(should_fail_futex(true)))
1541 return -EFAULT;
1542
1460 /* 1543 /*
1461 * Find the top_waiter and determine if there are additional waiters. 1544 * Find the top_waiter and determine if there are additional waiters.
1462 * If the caller intends to requeue more than 1 waiter to pifutex, 1545 * If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
2268/* 2351/*
2269 * Userspace tried a 0 -> TID atomic transition of the futex value 2352 * Userspace tried a 0 -> TID atomic transition of the futex value
2270 * and failed. The kernel side here does the whole locking operation: 2353 * and failed. The kernel side here does the whole locking operation:
2271 * if there are waiters then it will block, it does PI, etc. (Due to 2354 * if there are waiters then it will block as a consequence of relying
2272 * races the kernel might see a 0 value of the futex too.) 2355 * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
2356 * a 0 value of the futex too.).
2357 *
2358 * Also serves as futex trylock_pi()'ing, and due semantics.
2273 */ 2359 */
2274static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, 2360static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
2275 ktime_t *time, int trylock) 2361 ktime_t *time, int trylock)
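
For illustration, a hedged sketch of the user-space half that the comment above refers to: the kernel path only runs when the 0 -> TID compare-and-swap fails. The function name is hypothetical; FUTEX_LOCK_PI and the futex() syscall convention are standard.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdatomic.h>

static void pi_lock(atomic_uint *futex_word)
{
	unsigned int expected = 0;
	unsigned int tid = (unsigned int)syscall(SYS_gettid);

	/* Fast path: take the lock entirely in user space. */
	if (atomic_compare_exchange_strong(futex_word, &expected, tid))
		return;

	/* Contended: futex_lock_pi() blocks us and handles PI boosting. */
	syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

Unlocking is symmetric: a TID -> 0 compare-and-swap in user space, falling back to FUTEX_UNLOCK_PI when waiters are present.
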
@@ -2300,6 +2386,10 @@ retry_private:
2300 2386
2301 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); 2387 ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2302 if (unlikely(ret)) { 2388 if (unlikely(ret)) {
2389 /*
2390 * Atomic work succeeded and we got the lock,
2391 * or failed. Either way, we do _not_ block.
2392 */
2303 switch (ret) { 2393 switch (ret) {
2304 case 1: 2394 case 1:
2305 /* We got the lock. */ 2395 /* We got the lock. */
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2530 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 2620 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2531 * @uaddr: the futex we initially wait on (non-pi) 2621 * @uaddr: the futex we initially wait on (non-pi)
2532 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be 2622 * @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2533 * the same type, no requeueing from private to shared, etc. 2623 * the same type, no requeueing from private to shared, etc.
2534 * @val: the expected value of uaddr 2624 * @val: the expected value of uaddr
2535 * @abs_time: absolute timeout 2625 * @abs_time: absolute timeout
2536 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all 2626 * @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
3005 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || 3095 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
3006 cmd == FUTEX_WAIT_BITSET || 3096 cmd == FUTEX_WAIT_BITSET ||
3007 cmd == FUTEX_WAIT_REQUEUE_PI)) { 3097 cmd == FUTEX_WAIT_REQUEUE_PI)) {
3098 if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
3099 return -EFAULT;
3008 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 3100 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
3009 return -EFAULT; 3101 return -EFAULT;
3010 if (!timespec_valid(&ts)) 3102 if (!timespec_valid(&ts))
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 27f4332c7f84..6e40a9539763 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -63,7 +63,7 @@ int irq_set_irq_type(unsigned int irq, unsigned int type)
63 return -EINVAL; 63 return -EINVAL;
64 64
65 type &= IRQ_TYPE_SENSE_MASK; 65 type &= IRQ_TYPE_SENSE_MASK;
66 ret = __irq_set_trigger(desc, irq, type); 66 ret = __irq_set_trigger(desc, type);
67 irq_put_desc_busunlock(desc, flags); 67 irq_put_desc_busunlock(desc, flags);
68 return ret; 68 return ret;
69} 69}
@@ -187,7 +187,7 @@ int irq_startup(struct irq_desc *desc, bool resend)
187 irq_enable(desc); 187 irq_enable(desc);
188 } 188 }
189 if (resend) 189 if (resend)
190 check_irq_resend(desc, desc->irq_data.irq); 190 check_irq_resend(desc);
191 return ret; 191 return ret;
192} 192}
193 193
@@ -315,7 +315,7 @@ void handle_nested_irq(unsigned int irq)
315 raw_spin_lock_irq(&desc->lock); 315 raw_spin_lock_irq(&desc->lock);
316 316
317 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 317 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
318 kstat_incr_irqs_this_cpu(irq, desc); 318 kstat_incr_irqs_this_cpu(desc);
319 319
320 action = desc->action; 320 action = desc->action;
321 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { 321 if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) {
@@ -328,7 +328,7 @@ void handle_nested_irq(unsigned int irq)
328 328
329 action_ret = action->thread_fn(action->irq, action->dev_id); 329 action_ret = action->thread_fn(action->irq, action->dev_id);
330 if (!noirqdebug) 330 if (!noirqdebug)
331 note_interrupt(irq, desc, action_ret); 331 note_interrupt(desc, action_ret);
332 332
333 raw_spin_lock_irq(&desc->lock); 333 raw_spin_lock_irq(&desc->lock);
334 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS); 334 irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@@ -391,7 +391,7 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
391 goto out_unlock; 391 goto out_unlock;
392 392
393 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 393 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
394 kstat_incr_irqs_this_cpu(irq, desc); 394 kstat_incr_irqs_this_cpu(desc);
395 395
396 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { 396 if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
397 desc->istate |= IRQS_PENDING; 397 desc->istate |= IRQS_PENDING;
@@ -443,7 +443,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
443 goto out_unlock; 443 goto out_unlock;
444 444
445 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 445 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
446 kstat_incr_irqs_this_cpu(irq, desc); 446 kstat_incr_irqs_this_cpu(desc);
447 447
448 /* 448 /*
449 * If it's disabled or no action available 449 * If it's disabled or no action available
@@ -515,7 +515,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
515 goto out; 515 goto out;
516 516
517 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 517 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
518 kstat_incr_irqs_this_cpu(irq, desc); 518 kstat_incr_irqs_this_cpu(desc);
519 519
520 /* 520 /*
521 * If it's disabled or no action available 521 * If it's disabled or no action available
@@ -583,7 +583,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
583 goto out_unlock; 583 goto out_unlock;
584 } 584 }
585 585
586 kstat_incr_irqs_this_cpu(irq, desc); 586 kstat_incr_irqs_this_cpu(desc);
587 587
588 /* Start handling the irq */ 588 /* Start handling the irq */
589 desc->irq_data.chip->irq_ack(&desc->irq_data); 589 desc->irq_data.chip->irq_ack(&desc->irq_data);
@@ -646,7 +646,7 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
646 goto out_eoi; 646 goto out_eoi;
647 } 647 }
648 648
649 kstat_incr_irqs_this_cpu(irq, desc); 649 kstat_incr_irqs_this_cpu(desc);
650 650
651 do { 651 do {
652 if (unlikely(!desc->action)) 652 if (unlikely(!desc->action))
@@ -675,7 +675,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
675{ 675{
676 struct irq_chip *chip = irq_desc_get_chip(desc); 676 struct irq_chip *chip = irq_desc_get_chip(desc);
677 677
678 kstat_incr_irqs_this_cpu(irq, desc); 678 kstat_incr_irqs_this_cpu(desc);
679 679
680 if (chip->irq_ack) 680 if (chip->irq_ack)
681 chip->irq_ack(&desc->irq_data); 681 chip->irq_ack(&desc->irq_data);
@@ -705,7 +705,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
705 void *dev_id = raw_cpu_ptr(action->percpu_dev_id); 705 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
706 irqreturn_t res; 706 irqreturn_t res;
707 707
708 kstat_incr_irqs_this_cpu(irq, desc); 708 kstat_incr_irqs_this_cpu(desc);
709 709
710 if (chip->irq_ack) 710 if (chip->irq_ack)
711 chip->irq_ack(&desc->irq_data); 711 chip->irq_ack(&desc->irq_data);
@@ -985,6 +985,23 @@ int irq_chip_set_affinity_parent(struct irq_data *data,
985} 985}
986 986
987/** 987/**
988 * irq_chip_set_type_parent - Set IRQ type on the parent interrupt
989 * @data: Pointer to interrupt specific data
990 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
991 *
992 * Conditional, as the underlying parent chip might not implement it.
993 */
994int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
995{
996 data = data->parent_data;
997
998 if (data->chip->irq_set_type)
999 return data->chip->irq_set_type(data, type);
1000
1001 return -ENOSYS;
1002}
1003
1004/**
988 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware 1005 * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware
989 * @data: Pointer to interrupt specific data 1006 * @data: Pointer to interrupt specific data
990 * 1007 *
@@ -997,13 +1014,13 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data)
997 if (data->chip && data->chip->irq_retrigger) 1014 if (data->chip && data->chip->irq_retrigger)
998 return data->chip->irq_retrigger(data); 1015 return data->chip->irq_retrigger(data);
999 1016
1000 return -ENOSYS; 1017 return 0;
1001} 1018}
1002 1019
1003/** 1020/**
1004 * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt 1021 * irq_chip_set_vcpu_affinity_parent - Set vcpu affinity on the parent interrupt
1005 * @data: Pointer to interrupt specific data 1022 * @data: Pointer to interrupt specific data
1006 * @dest: The vcpu affinity information 1023 * @vcpu_info: The vcpu affinity information
1007 */ 1024 */
1008int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) 1025int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
1009{ 1026{
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 15b370daf234..abd286afbd27 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -553,6 +553,9 @@ static int irq_gc_suspend(void)
553 if (data) 553 if (data)
554 ct->chip.irq_suspend(data); 554 ct->chip.irq_suspend(data);
555 } 555 }
556
557 if (gc->suspend)
558 gc->suspend(gc);
556 } 559 }
557 return 0; 560 return 0;
558} 561}
@@ -564,6 +567,9 @@ static void irq_gc_resume(void)
564 list_for_each_entry(gc, &gc_list, list) { 567 list_for_each_entry(gc, &gc_list, list) {
565 struct irq_chip_type *ct = gc->chip_types; 568 struct irq_chip_type *ct = gc->chip_types;
566 569
570 if (gc->resume)
571 gc->resume(gc);
572
567 if (ct->chip.irq_resume) { 573 if (ct->chip.irq_resume) {
568 struct irq_data *data = irq_gc_get_irq_data(gc); 574 struct irq_data *data = irq_gc_get_irq_data(gc);
569 575
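
For illustration, a sketch of the per-chip hooks the two hunks above start honouring; the callbacks and their bodies are hypothetical, only the gc->suspend/gc->resume members are taken from the change itself:

#include <linux/irq.h>

static void example_gc_suspend(struct irq_chip_generic *gc)
{
	/* e.g. save wake/mask registers before the chip loses state */
}

static void example_gc_resume(struct irq_chip_generic *gc)
{
	/* e.g. restore whatever example_gc_suspend() saved */
}

static void example_gc_setup(struct irq_chip_generic *gc)
{
	gc->suspend = example_gc_suspend;
	gc->resume  = example_gc_resume;
}

irq_gc_suspend() calls the hook after any per-chip-type irq_suspend callbacks, and irq_gc_resume() calls it before the irq_resume callbacks, so a driver can keep whole-chip save/restore separate from per-type handling.
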
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 635480270858..b6eeea8a80c5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -30,7 +30,7 @@
30void handle_bad_irq(unsigned int irq, struct irq_desc *desc) 30void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
31{ 31{
32 print_irq_desc(irq, desc); 32 print_irq_desc(irq, desc);
33 kstat_incr_irqs_this_cpu(irq, desc); 33 kstat_incr_irqs_this_cpu(desc);
34 ack_bad_irq(irq); 34 ack_bad_irq(irq);
35} 35}
36 36
@@ -176,7 +176,7 @@ handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
176 add_interrupt_randomness(irq, flags); 176 add_interrupt_randomness(irq, flags);
177 177
178 if (!noirqdebug) 178 if (!noirqdebug)
179 note_interrupt(irq, desc, retval); 179 note_interrupt(desc, retval);
180 return retval; 180 return retval;
181} 181}
182 182
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 61008b8433ab..eee4b385cffb 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -59,10 +59,9 @@ enum {
59#include "debug.h" 59#include "debug.h"
60#include "settings.h" 60#include "settings.h"
61 61
62extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 62extern int __irq_set_trigger(struct irq_desc *desc, unsigned long flags);
63 unsigned long flags); 63extern void __disable_irq(struct irq_desc *desc);
64extern void __disable_irq(struct irq_desc *desc, unsigned int irq); 64extern void __enable_irq(struct irq_desc *desc);
65extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
66 65
67extern int irq_startup(struct irq_desc *desc, bool resend); 66extern int irq_startup(struct irq_desc *desc, bool resend);
68extern void irq_shutdown(struct irq_desc *desc); 67extern void irq_shutdown(struct irq_desc *desc);
@@ -86,7 +85,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *act
86irqreturn_t handle_irq_event(struct irq_desc *desc); 85irqreturn_t handle_irq_event(struct irq_desc *desc);
87 86
88/* Resending of interrupts :*/ 87/* Resending of interrupts :*/
89void check_irq_resend(struct irq_desc *desc, unsigned int irq); 88void check_irq_resend(struct irq_desc *desc);
90bool irq_wait_for_poll(struct irq_desc *desc); 89bool irq_wait_for_poll(struct irq_desc *desc);
91void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action); 90void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action);
92 91
@@ -187,7 +186,7 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
187 return __irqd_to_state(d) & mask; 186 return __irqd_to_state(d) & mask;
188} 187}
189 188
190static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *desc) 189static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
191{ 190{
192 __this_cpu_inc(*desc->kstat_irqs); 191 __this_cpu_inc(*desc->kstat_irqs);
193 __this_cpu_inc(kstat.irqs_sum); 192 __this_cpu_inc(kstat.irqs_sum);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4afc457613dd..0a2a4b697bcb 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -582,7 +582,7 @@ int irq_set_percpu_devid(unsigned int irq)
582 582
583void kstat_incr_irq_this_cpu(unsigned int irq) 583void kstat_incr_irq_this_cpu(unsigned int irq)
584{ 584{
585 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); 585 kstat_incr_irqs_this_cpu(irq_to_desc(irq));
586} 586}
587 587
588/** 588/**
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 8c3577fef78c..79baaf8a7813 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -187,10 +187,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node,
187EXPORT_SYMBOL_GPL(irq_domain_add_legacy); 187EXPORT_SYMBOL_GPL(irq_domain_add_legacy);
188 188
189/** 189/**
190 * irq_find_host() - Locates a domain for a given device node 190 * irq_find_matching_host() - Locates a domain for a given device node
191 * @node: device-tree node of the interrupt controller 191 * @node: device-tree node of the interrupt controller
192 * @bus_token: domain-specific data
192 */ 193 */
193struct irq_domain *irq_find_host(struct device_node *node) 194struct irq_domain *irq_find_matching_host(struct device_node *node,
195 enum irq_domain_bus_token bus_token)
194{ 196{
195 struct irq_domain *h, *found = NULL; 197 struct irq_domain *h, *found = NULL;
196 int rc; 198 int rc;
@@ -199,13 +201,19 @@ struct irq_domain *irq_find_host(struct device_node *node)
199 * it might potentially be set to match all interrupts in 201 * it might potentially be set to match all interrupts in
200 * the absence of a device node. This isn't a problem so far 202 * the absence of a device node. This isn't a problem so far
201 * yet though... 203 * yet though...
204 *
205 * bus_token == DOMAIN_BUS_ANY matches any domain, any other
206 * values must generate an exact match for the domain to be
207 * selected.
202 */ 208 */
203 mutex_lock(&irq_domain_mutex); 209 mutex_lock(&irq_domain_mutex);
204 list_for_each_entry(h, &irq_domain_list, link) { 210 list_for_each_entry(h, &irq_domain_list, link) {
205 if (h->ops->match) 211 if (h->ops->match)
206 rc = h->ops->match(h, node); 212 rc = h->ops->match(h, node, bus_token);
207 else 213 else
208 rc = (h->of_node != NULL) && (h->of_node == node); 214 rc = ((h->of_node != NULL) && (h->of_node == node) &&
215 ((bus_token == DOMAIN_BUS_ANY) ||
216 (h->bus_token == bus_token)));
209 217
210 if (rc) { 218 if (rc) {
211 found = h; 219 found = h;
@@ -215,7 +223,7 @@ struct irq_domain *irq_find_host(struct device_node *node)
215 mutex_unlock(&irq_domain_mutex); 223 mutex_unlock(&irq_domain_mutex);
216 return found; 224 return found;
217} 225}
218EXPORT_SYMBOL_GPL(irq_find_host); 226EXPORT_SYMBOL_GPL(irq_find_matching_host);
219 227
220/** 228/**
221 * irq_set_default_host() - Set a "default" irq domain 229 * irq_set_default_host() - Set a "default" irq domain
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index f9744853b656..ad1b064f94fe 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -115,6 +115,14 @@ EXPORT_SYMBOL(synchronize_irq);
115#ifdef CONFIG_SMP 115#ifdef CONFIG_SMP
116cpumask_var_t irq_default_affinity; 116cpumask_var_t irq_default_affinity;
117 117
118static int __irq_can_set_affinity(struct irq_desc *desc)
119{
120 if (!desc || !irqd_can_balance(&desc->irq_data) ||
121 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
122 return 0;
123 return 1;
124}
125
118/** 126/**
119 * irq_can_set_affinity - Check if the affinity of a given irq can be set 127 * irq_can_set_affinity - Check if the affinity of a given irq can be set
120 * @irq: Interrupt to check 128 * @irq: Interrupt to check
@@ -122,13 +130,7 @@ cpumask_var_t irq_default_affinity;
122 */ 130 */
123int irq_can_set_affinity(unsigned int irq) 131int irq_can_set_affinity(unsigned int irq)
124{ 132{
125 struct irq_desc *desc = irq_to_desc(irq); 133 return __irq_can_set_affinity(irq_to_desc(irq));
126
127 if (!desc || !irqd_can_balance(&desc->irq_data) ||
128 !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
129 return 0;
130
131 return 1;
132} 134}
133 135
134/** 136/**
@@ -359,14 +361,13 @@ EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
359/* 361/*
360 * Generic version of the affinity autoselector. 362 * Generic version of the affinity autoselector.
361 */ 363 */
362static int 364static int setup_affinity(struct irq_desc *desc, struct cpumask *mask)
363setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
364{ 365{
365 struct cpumask *set = irq_default_affinity; 366 struct cpumask *set = irq_default_affinity;
366 int node = irq_desc_get_node(desc); 367 int node = irq_desc_get_node(desc);
367 368
368 /* Excludes PER_CPU and NO_BALANCE interrupts */ 369 /* Excludes PER_CPU and NO_BALANCE interrupts */
369 if (!irq_can_set_affinity(irq)) 370 if (!__irq_can_set_affinity(desc))
370 return 0; 371 return 0;
371 372
372 /* 373 /*
@@ -393,10 +394,10 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
393 return 0; 394 return 0;
394} 395}
395#else 396#else
396static inline int 397/* Wrapper for ALPHA specific affinity selector magic */
397setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask) 398static inline int setup_affinity(struct irq_desc *d, struct cpumask *mask)
398{ 399{
399 return irq_select_affinity(irq); 400 return irq_select_affinity(irq_desc_get_irq(d));
400} 401}
401#endif 402#endif
402 403
@@ -410,20 +411,20 @@ int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
410 int ret; 411 int ret;
411 412
412 raw_spin_lock_irqsave(&desc->lock, flags); 413 raw_spin_lock_irqsave(&desc->lock, flags);
413 ret = setup_affinity(irq, desc, mask); 414 ret = setup_affinity(desc, mask);
414 raw_spin_unlock_irqrestore(&desc->lock, flags); 415 raw_spin_unlock_irqrestore(&desc->lock, flags);
415 return ret; 416 return ret;
416} 417}
417 418
418#else 419#else
419static inline int 420static inline int
420setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask) 421setup_affinity(struct irq_desc *desc, struct cpumask *mask)
421{ 422{
422 return 0; 423 return 0;
423} 424}
424#endif 425#endif
425 426
426void __disable_irq(struct irq_desc *desc, unsigned int irq) 427void __disable_irq(struct irq_desc *desc)
427{ 428{
428 if (!desc->depth++) 429 if (!desc->depth++)
429 irq_disable(desc); 430 irq_disable(desc);
@@ -436,7 +437,7 @@ static int __disable_irq_nosync(unsigned int irq)
436 437
437 if (!desc) 438 if (!desc)
438 return -EINVAL; 439 return -EINVAL;
439 __disable_irq(desc, irq); 440 __disable_irq(desc);
440 irq_put_desc_busunlock(desc, flags); 441 irq_put_desc_busunlock(desc, flags);
441 return 0; 442 return 0;
442} 443}
@@ -503,12 +504,13 @@ bool disable_hardirq(unsigned int irq)
503} 504}
504EXPORT_SYMBOL_GPL(disable_hardirq); 505EXPORT_SYMBOL_GPL(disable_hardirq);
505 506
506void __enable_irq(struct irq_desc *desc, unsigned int irq) 507void __enable_irq(struct irq_desc *desc)
507{ 508{
508 switch (desc->depth) { 509 switch (desc->depth) {
509 case 0: 510 case 0:
510 err_out: 511 err_out:
511 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 512 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n",
513 irq_desc_get_irq(desc));
512 break; 514 break;
513 case 1: { 515 case 1: {
514 if (desc->istate & IRQS_SUSPENDED) 516 if (desc->istate & IRQS_SUSPENDED)
@@ -516,7 +518,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq)
516 /* Prevent probing on this irq: */ 518 /* Prevent probing on this irq: */
517 irq_settings_set_noprobe(desc); 519 irq_settings_set_noprobe(desc);
518 irq_enable(desc); 520 irq_enable(desc);
519 check_irq_resend(desc, irq); 521 check_irq_resend(desc);
520 /* fall-through */ 522 /* fall-through */
521 } 523 }
522 default: 524 default:
@@ -546,7 +548,7 @@ void enable_irq(unsigned int irq)
546 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) 548 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
547 goto out; 549 goto out;
548 550
549 __enable_irq(desc, irq); 551 __enable_irq(desc);
550out: 552out:
551 irq_put_desc_busunlock(desc, flags); 553 irq_put_desc_busunlock(desc, flags);
552} 554}
@@ -637,8 +639,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
637 return canrequest; 639 return canrequest;
638} 640}
639 641
640int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 642int __irq_set_trigger(struct irq_desc *desc, unsigned long flags)
641 unsigned long flags)
642{ 643{
643 struct irq_chip *chip = desc->irq_data.chip; 644 struct irq_chip *chip = desc->irq_data.chip;
644 int ret, unmask = 0; 645 int ret, unmask = 0;
@@ -648,7 +649,8 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
648 * IRQF_TRIGGER_* but the PIC does not support multiple 649 * IRQF_TRIGGER_* but the PIC does not support multiple
649 * flow-types? 650 * flow-types?
650 */ 651 */
651 pr_debug("No set_type function for IRQ %d (%s)\n", irq, 652 pr_debug("No set_type function for IRQ %d (%s)\n",
653 irq_desc_get_irq(desc),
652 chip ? (chip->name ? : "unknown") : "unknown"); 654 chip ? (chip->name ? : "unknown") : "unknown");
653 return 0; 655 return 0;
654 } 656 }
@@ -685,7 +687,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
685 break; 687 break;
686 default: 688 default:
687 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n", 689 pr_err("Setting trigger mode %lu for irq %u failed (%pF)\n",
688 flags, irq, chip->irq_set_type); 690 flags, irq_desc_get_irq(desc), chip->irq_set_type);
689 } 691 }
690 if (unmask) 692 if (unmask)
691 unmask_irq(desc); 693 unmask_irq(desc);
@@ -1221,8 +1223,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1221 1223
1222 /* Setup the type (level, edge polarity) if configured: */ 1224 /* Setup the type (level, edge polarity) if configured: */
1223 if (new->flags & IRQF_TRIGGER_MASK) { 1225 if (new->flags & IRQF_TRIGGER_MASK) {
1224 ret = __irq_set_trigger(desc, irq, 1226 ret = __irq_set_trigger(desc,
1225 new->flags & IRQF_TRIGGER_MASK); 1227 new->flags & IRQF_TRIGGER_MASK);
1226 1228
1227 if (ret) 1229 if (ret)
1228 goto out_mask; 1230 goto out_mask;
@@ -1253,7 +1255,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1253 } 1255 }
1254 1256
1255 /* Set default affinity mask once everything is setup */ 1257 /* Set default affinity mask once everything is setup */
1256 setup_affinity(irq, desc, mask); 1258 setup_affinity(desc, mask);
1257 1259
1258 } else if (new->flags & IRQF_TRIGGER_MASK) { 1260 } else if (new->flags & IRQF_TRIGGER_MASK) {
1259 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK; 1261 unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
@@ -1280,7 +1282,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1280 */ 1282 */
1281 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { 1283 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1282 desc->istate &= ~IRQS_SPURIOUS_DISABLED; 1284 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1283 __enable_irq(desc, irq); 1285 __enable_irq(desc);
1284 } 1286 }
1285 1287
1286 raw_spin_unlock_irqrestore(&desc->lock, flags); 1288 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1650,7 +1652,7 @@ void enable_percpu_irq(unsigned int irq, unsigned int type)
1650 if (type != IRQ_TYPE_NONE) { 1652 if (type != IRQ_TYPE_NONE) {
1651 int ret; 1653 int ret;
1652 1654
1653 ret = __irq_set_trigger(desc, irq, type); 1655 ret = __irq_set_trigger(desc, type);
1654 1656
1655 if (ret) { 1657 if (ret) {
1656 WARN(1, "failed to set type for IRQ%d\n", irq); 1658 WARN(1, "failed to set type for IRQ%d\n", irq);
@@ -1875,6 +1877,7 @@ int irq_get_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1875 irq_put_desc_busunlock(desc, flags); 1877 irq_put_desc_busunlock(desc, flags);
1876 return err; 1878 return err;
1877} 1879}
1880EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
1878 1881
1879/** 1882/**
1880 * irq_set_irqchip_state - set the state of a forwarded interrupt. 1883 * irq_set_irqchip_state - set the state of a forwarded interrupt.
@@ -1920,3 +1923,4 @@ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
1920 irq_put_desc_busunlock(desc, flags); 1923 irq_put_desc_busunlock(desc, flags);
1921 return err; 1924 return err;
1922} 1925}
1926EXPORT_SYMBOL_GPL(irq_set_irqchip_state);
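
For illustration, the two EXPORT_SYMBOL_GPL additions make the irqchip-state helpers usable from modular code; a hedged sketch of a driver checking and clearing a pending forwarded interrupt (the function and IRQ usage are made up, the helper signatures are the kernel's):

#include <linux/interrupt.h>

static void example_clear_pending(unsigned int irq)
{
	bool pending = false;

	if (!irq_get_irqchip_state(irq, IRQCHIP_STATE_PENDING, &pending) &&
	    pending)
		irq_set_irqchip_state(irq, IRQCHIP_STATE_PENDING, false);
}
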
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 7bf1f1bbb7fa..7e6512b9dc1f 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,6 +18,23 @@
18/* Temporary solution for building, will be removed later */ 18/* Temporary solution for building, will be removed later */
19#include <linux/pci.h> 19#include <linux/pci.h>
20 20
21struct msi_desc *alloc_msi_entry(struct device *dev)
22{
23 struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
24 if (!desc)
25 return NULL;
26
27 INIT_LIST_HEAD(&desc->list);
28 desc->dev = dev;
29
30 return desc;
31}
32
33void free_msi_entry(struct msi_desc *entry)
34{
35 kfree(entry);
36}
37
21void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg) 38void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
22{ 39{
23 *msg = entry->msg; 40 *msg = entry->msg;
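
For illustration, a hedged sketch of how a bus driver might pair the new alloc_msi_entry()/free_msi_entry() constructors; the helper functions and the caller-owned list are assumptions, only the two kernel entry points come from the hunk above:

#include <linux/msi.h>
#include <linux/list.h>

static int example_add_desc(struct device *dev, struct list_head *msi_list)
{
	struct msi_desc *desc = alloc_msi_entry(dev);

	if (!desc)
		return -ENOMEM;
	/* alloc_msi_entry() zeroes the descriptor, sets ->dev and inits ->list */
	list_add_tail(&desc->list, msi_list);
	return 0;
}

static void example_free_descs(struct list_head *msi_list)
{
	struct msi_desc *desc, *tmp;

	list_for_each_entry_safe(desc, tmp, msi_list, list) {
		list_del(&desc->list);
		free_msi_entry(desc);
	}
}
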
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index d22786a6dbde..21c62617a35a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -68,7 +68,7 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
68 desc->cond_suspend_depth--; 68 desc->cond_suspend_depth--;
69} 69}
70 70
71static bool suspend_device_irq(struct irq_desc *desc, int irq) 71static bool suspend_device_irq(struct irq_desc *desc)
72{ 72{
73 if (!desc->action || desc->no_suspend_depth) 73 if (!desc->action || desc->no_suspend_depth)
74 return false; 74 return false;
@@ -85,7 +85,7 @@ static bool suspend_device_irq(struct irq_desc *desc, int irq)
85 } 85 }
86 86
87 desc->istate |= IRQS_SUSPENDED; 87 desc->istate |= IRQS_SUSPENDED;
88 __disable_irq(desc, irq); 88 __disable_irq(desc);
89 89
90 /* 90 /*
91 * Hardware which has no wakeup source configuration facility 91 * Hardware which has no wakeup source configuration facility
@@ -126,7 +126,7 @@ void suspend_device_irqs(void)
126 if (irq_settings_is_nested_thread(desc)) 126 if (irq_settings_is_nested_thread(desc))
127 continue; 127 continue;
128 raw_spin_lock_irqsave(&desc->lock, flags); 128 raw_spin_lock_irqsave(&desc->lock, flags);
129 sync = suspend_device_irq(desc, irq); 129 sync = suspend_device_irq(desc);
130 raw_spin_unlock_irqrestore(&desc->lock, flags); 130 raw_spin_unlock_irqrestore(&desc->lock, flags);
131 131
132 if (sync) 132 if (sync)
@@ -135,7 +135,7 @@ void suspend_device_irqs(void)
135} 135}
136EXPORT_SYMBOL_GPL(suspend_device_irqs); 136EXPORT_SYMBOL_GPL(suspend_device_irqs);
137 137
138static void resume_irq(struct irq_desc *desc, int irq) 138static void resume_irq(struct irq_desc *desc)
139{ 139{
140 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED); 140 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
141 141
@@ -150,7 +150,7 @@ static void resume_irq(struct irq_desc *desc, int irq)
150 desc->depth++; 150 desc->depth++;
151resume: 151resume:
152 desc->istate &= ~IRQS_SUSPENDED; 152 desc->istate &= ~IRQS_SUSPENDED;
153 __enable_irq(desc, irq); 153 __enable_irq(desc);
154} 154}
155 155
156static void resume_irqs(bool want_early) 156static void resume_irqs(bool want_early)
@@ -169,7 +169,7 @@ static void resume_irqs(bool want_early)
169 continue; 169 continue;
170 170
171 raw_spin_lock_irqsave(&desc->lock, flags); 171 raw_spin_lock_irqsave(&desc->lock, flags);
172 resume_irq(desc, irq); 172 resume_irq(desc);
173 raw_spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 } 174 }
175} 175}
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 9065107f083e..dd95f44f99b2 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -53,7 +53,7 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
53 * 53 *
54 * Is called with interrupts disabled and desc->lock held. 54 * Is called with interrupts disabled and desc->lock held.
55 */ 55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq) 56void check_irq_resend(struct irq_desc *desc)
57{ 57{
58 /* 58 /*
59 * We do not resend level type interrupts. Level type 59 * We do not resend level type interrupts. Level type
@@ -74,14 +74,24 @@ void check_irq_resend(struct irq_desc *desc, unsigned int irq)
74 if (!desc->irq_data.chip->irq_retrigger || 74 if (!desc->irq_data.chip->irq_retrigger ||
75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) { 75 !desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
76#ifdef CONFIG_HARDIRQS_SW_RESEND 76#ifdef CONFIG_HARDIRQS_SW_RESEND
77 unsigned int irq = irq_desc_get_irq(desc);
78
77 /* 79 /*
78 * If the interrupt has a parent irq and runs 80 * If the interrupt is running in the thread
79 * in the thread context of the parent irq, 81 * context of the parent irq we need to be
80 * retrigger the parent. 82 * careful, because we cannot trigger it
83 * directly.
81 */ 84 */
82 if (desc->parent_irq && 85 if (irq_settings_is_nested_thread(desc)) {
83 irq_settings_is_nested_thread(desc)) 86 /*
87 * If the parent_irq is valid, we
88 * retrigger the parent, otherwise we
89 * do nothing.
90 */
91 if (!desc->parent_irq)
92 return;
84 irq = desc->parent_irq; 93 irq = desc->parent_irq;
94 }
85 /* Set it pending and activate the softirq: */ 95 /* Set it pending and activate the softirq: */
86 set_bit(irq, irqs_resend); 96 set_bit(irq, irqs_resend);
87 tasklet_schedule(&resend_tasklet); 97 tasklet_schedule(&resend_tasklet);
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index e2514b0e439e..32144175458d 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -60,7 +60,7 @@ bool irq_wait_for_poll(struct irq_desc *desc)
60/* 60/*
61 * Recovery handler for misrouted interrupts. 61 * Recovery handler for misrouted interrupts.
62 */ 62 */
63static int try_one_irq(int irq, struct irq_desc *desc, bool force) 63static int try_one_irq(struct irq_desc *desc, bool force)
64{ 64{
65 irqreturn_t ret = IRQ_NONE; 65 irqreturn_t ret = IRQ_NONE;
66 struct irqaction *action; 66 struct irqaction *action;
@@ -133,7 +133,7 @@ static int misrouted_irq(int irq)
133 if (i == irq) /* Already tried */ 133 if (i == irq) /* Already tried */
134 continue; 134 continue;
135 135
136 if (try_one_irq(i, desc, false)) 136 if (try_one_irq(desc, false))
137 ok = 1; 137 ok = 1;
138 } 138 }
139out: 139out:
@@ -164,7 +164,7 @@ static void poll_spurious_irqs(unsigned long dummy)
164 continue; 164 continue;
165 165
166 local_irq_disable(); 166 local_irq_disable();
167 try_one_irq(i, desc, true); 167 try_one_irq(desc, true);
168 local_irq_enable(); 168 local_irq_enable();
169 } 169 }
170out: 170out:
@@ -188,10 +188,9 @@ static inline int bad_action_ret(irqreturn_t action_ret)
188 * (The other 100-of-100,000 interrupts may have been a correctly 188 * (The other 100-of-100,000 interrupts may have been a correctly
189 * functioning device sharing an IRQ with the failing one) 189 * functioning device sharing an IRQ with the failing one)
190 */ 190 */
191static void 191static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
192__report_bad_irq(unsigned int irq, struct irq_desc *desc,
193 irqreturn_t action_ret)
194{ 192{
193 unsigned int irq = irq_desc_get_irq(desc);
195 struct irqaction *action; 194 struct irqaction *action;
196 unsigned long flags; 195 unsigned long flags;
197 196
@@ -224,14 +223,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
224 raw_spin_unlock_irqrestore(&desc->lock, flags); 223 raw_spin_unlock_irqrestore(&desc->lock, flags);
225} 224}
226 225
227static void 226static void report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
228report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
229{ 227{
230 static int count = 100; 228 static int count = 100;
231 229
232 if (count > 0) { 230 if (count > 0) {
233 count--; 231 count--;
234 __report_bad_irq(irq, desc, action_ret); 232 __report_bad_irq(desc, action_ret);
235 } 233 }
236} 234}
237 235
@@ -272,15 +270,16 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
272 270
273#define SPURIOUS_DEFERRED 0x80000000 271#define SPURIOUS_DEFERRED 0x80000000
274 272
275void note_interrupt(unsigned int irq, struct irq_desc *desc, 273void note_interrupt(struct irq_desc *desc, irqreturn_t action_ret)
276 irqreturn_t action_ret)
277{ 274{
275 unsigned int irq;
276
278 if (desc->istate & IRQS_POLL_INPROGRESS || 277 if (desc->istate & IRQS_POLL_INPROGRESS ||
279 irq_settings_is_polled(desc)) 278 irq_settings_is_polled(desc))
280 return; 279 return;
281 280
282 if (bad_action_ret(action_ret)) { 281 if (bad_action_ret(action_ret)) {
283 report_bad_irq(irq, desc, action_ret); 282 report_bad_irq(desc, action_ret);
284 return; 283 return;
285 } 284 }
286 285
@@ -398,6 +397,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
398 desc->last_unhandled = jiffies; 397 desc->last_unhandled = jiffies;
399 } 398 }
400 399
400 irq = irq_desc_get_irq(desc);
401 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) { 401 if (unlikely(try_misrouted_irq(irq, desc, action_ret))) {
402 int ok = misrouted_irq(irq); 402 int ok = misrouted_irq(irq);
403 if (action_ret == IRQ_NONE) 403 if (action_ret == IRQ_NONE)
@@ -413,7 +413,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
413 /* 413 /*
414 * The interrupt is stuck 414 * The interrupt is stuck
415 */ 415 */
416 __report_bad_irq(irq, desc, action_ret); 416 __report_bad_irq(desc, action_ret);
417 /* 417 /*
418 * Now kill the IRQ 418 * Now kill the IRQ
419 */ 419 */
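After this change note_interrupt() and the bad-IRQ reporters take only the descriptor; the Linux interrupt number is recovered internally with irq_desc_get_irq(), as the hunk shows. A hedged sketch of the caller side; the function here is illustrative (the real call sites live in the flow-handler code, which is not part of this hunk), and the noirqdebug check mirrors how the spurious-IRQ machinery is normally gated:

	static irqreturn_t example_handle_event(struct irq_desc *desc)
	{
		irqreturn_t retval = IRQ_NONE;

		/* ... run the irqaction chain and accumulate retval ... */

		if (!noirqdebug)
			note_interrupt(desc, retval);	/* previously note_interrupt(irq, desc, retval) */

		return retval;
	}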
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
54 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); 54 sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
55} 55}
56 56
57static void jump_label_update(struct static_key *key, int enable); 57static void jump_label_update(struct static_key *key);
58 58
59void static_key_slow_inc(struct static_key *key) 59void static_key_slow_inc(struct static_key *key)
60{ 60{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
63 return; 63 return;
64 64
65 jump_label_lock(); 65 jump_label_lock();
66 if (atomic_read(&key->enabled) == 0) { 66 if (atomic_inc_return(&key->enabled) == 1)
67 if (!jump_label_get_branch_default(key)) 67 jump_label_update(key);
68 jump_label_update(key, JUMP_LABEL_ENABLE);
69 else
70 jump_label_update(key, JUMP_LABEL_DISABLE);
71 }
72 atomic_inc(&key->enabled);
73 jump_label_unlock(); 68 jump_label_unlock();
74} 69}
75EXPORT_SYMBOL_GPL(static_key_slow_inc); 70EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
87 atomic_inc(&key->enabled); 82 atomic_inc(&key->enabled);
88 schedule_delayed_work(work, rate_limit); 83 schedule_delayed_work(work, rate_limit);
89 } else { 84 } else {
90 if (!jump_label_get_branch_default(key)) 85 jump_label_update(key);
91 jump_label_update(key, JUMP_LABEL_DISABLE);
92 else
93 jump_label_update(key, JUMP_LABEL_ENABLE);
94 } 86 }
95 jump_label_unlock(); 87 jump_label_unlock();
96} 88}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
149 return 0; 141 return 0;
150} 142}
151 143
152/* 144/*
153 * Update code which is definitely not currently executing. 145 * Update code which is definitely not currently executing.
154 * Architectures which need heavyweight synchronization to modify 146 * Architectures which need heavyweight synchronization to modify
155 * running code can override this to make the non-live update case 147 * running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
158void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry, 150void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
159 enum jump_label_type type) 151 enum jump_label_type type)
160{ 152{
161 arch_jump_label_transform(entry, type); 153 arch_jump_label_transform(entry, type);
154}
155
156static inline struct jump_entry *static_key_entries(struct static_key *key)
157{
158 return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
159}
160
161static inline bool static_key_type(struct static_key *key)
162{
163 return (unsigned long)key->entries & JUMP_TYPE_MASK;
164}
165
166static inline struct static_key *jump_entry_key(struct jump_entry *entry)
167{
168 return (struct static_key *)((unsigned long)entry->key & ~1UL);
169}
170
171static bool jump_entry_branch(struct jump_entry *entry)
172{
173 return (unsigned long)entry->key & 1UL;
174}
175
176static enum jump_label_type jump_label_type(struct jump_entry *entry)
177{
178 struct static_key *key = jump_entry_key(entry);
179 bool enabled = static_key_enabled(key);
180 bool branch = jump_entry_branch(entry);
181
182 /* See the comment in linux/jump_label.h */
183 return enabled ^ branch;
162} 184}
163 185
164static void __jump_label_update(struct static_key *key, 186static void __jump_label_update(struct static_key *key,
165 struct jump_entry *entry, 187 struct jump_entry *entry,
166 struct jump_entry *stop, int enable) 188 struct jump_entry *stop)
167{ 189{
168 for (; (entry < stop) && 190 for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
169 (entry->key == (jump_label_t)(unsigned long)key);
170 entry++) {
171 /* 191 /*
172 * entry->code set to 0 invalidates module init text sections 192 * entry->code set to 0 invalidates module init text sections
173 * kernel_text_address() verifies we are not in core kernel 193 * kernel_text_address() verifies we are not in core kernel
174 * init code, see jump_label_invalidate_module_init(). 194 * init code, see jump_label_invalidate_module_init().
175 */ 195 */
176 if (entry->code && kernel_text_address(entry->code)) 196 if (entry->code && kernel_text_address(entry->code))
177 arch_jump_label_transform(entry, enable); 197 arch_jump_label_transform(entry, jump_label_type(entry));
178 } 198 }
179} 199}
180 200
181static enum jump_label_type jump_label_type(struct static_key *key)
182{
183 bool true_branch = jump_label_get_branch_default(key);
184 bool state = static_key_enabled(key);
185
186 if ((!true_branch && state) || (true_branch && !state))
187 return JUMP_LABEL_ENABLE;
188
189 return JUMP_LABEL_DISABLE;
190}
191
192void __init jump_label_init(void) 201void __init jump_label_init(void)
193{ 202{
194 struct jump_entry *iter_start = __start___jump_table; 203 struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
202 for (iter = iter_start; iter < iter_stop; iter++) { 211 for (iter = iter_start; iter < iter_stop; iter++) {
203 struct static_key *iterk; 212 struct static_key *iterk;
204 213
205 iterk = (struct static_key *)(unsigned long)iter->key; 214 /* rewrite NOPs */
206 arch_jump_label_transform_static(iter, jump_label_type(iterk)); 215 if (jump_label_type(iter) == JUMP_LABEL_NOP)
216 arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
217
218 iterk = jump_entry_key(iter);
207 if (iterk == key) 219 if (iterk == key)
208 continue; 220 continue;
209 221
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
222 234
223#ifdef CONFIG_MODULES 235#ifdef CONFIG_MODULES
224 236
237static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
238{
239 struct static_key *key = jump_entry_key(entry);
240 bool type = static_key_type(key);
241 bool branch = jump_entry_branch(entry);
242
243 /* See the comment in linux/jump_label.h */
244 return type ^ branch;
245}
246
225struct static_key_mod { 247struct static_key_mod {
226 struct static_key_mod *next; 248 struct static_key_mod *next;
227 struct jump_entry *entries; 249 struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
243 start, end); 265 start, end);
244} 266}
245 267
246static void __jump_label_mod_update(struct static_key *key, int enable) 268static void __jump_label_mod_update(struct static_key *key)
247{ 269{
248 struct static_key_mod *mod = key->next; 270 struct static_key_mod *mod;
249 271
250 while (mod) { 272 for (mod = key->next; mod; mod = mod->next) {
251 struct module *m = mod->mod; 273 struct module *m = mod->mod;
252 274
253 __jump_label_update(key, mod->entries, 275 __jump_label_update(key, mod->entries,
254 m->jump_entries + m->num_jump_entries, 276 m->jump_entries + m->num_jump_entries);
255 enable);
256 mod = mod->next;
257 } 277 }
258} 278}
259 279
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
276 return; 296 return;
277 297
278 for (iter = iter_start; iter < iter_stop; iter++) { 298 for (iter = iter_start; iter < iter_stop; iter++) {
279 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 299 /* Only write NOPs for arch_branch_static(). */
300 if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
301 arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
280 } 302 }
281} 303}
282 304
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
297 for (iter = iter_start; iter < iter_stop; iter++) { 319 for (iter = iter_start; iter < iter_stop; iter++) {
298 struct static_key *iterk; 320 struct static_key *iterk;
299 321
300 iterk = (struct static_key *)(unsigned long)iter->key; 322 iterk = jump_entry_key(iter);
301 if (iterk == key) 323 if (iterk == key)
302 continue; 324 continue;
303 325
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
318 jlm->next = key->next; 340 jlm->next = key->next;
319 key->next = jlm; 341 key->next = jlm;
320 342
321 if (jump_label_type(key) == JUMP_LABEL_ENABLE) 343 /* Only update if we've changed from our initial state */
322 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE); 344 if (jump_label_type(iter) != jump_label_init_type(iter))
345 __jump_label_update(key, iter, iter_stop);
323 } 346 }
324 347
325 return 0; 348 return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
334 struct static_key_mod *jlm, **prev; 357 struct static_key_mod *jlm, **prev;
335 358
336 for (iter = iter_start; iter < iter_stop; iter++) { 359 for (iter = iter_start; iter < iter_stop; iter++) {
337 if (iter->key == (jump_label_t)(unsigned long)key) 360 if (jump_entry_key(iter) == key)
338 continue; 361 continue;
339 362
340 key = (struct static_key *)(unsigned long)iter->key; 363 key = jump_entry_key(iter);
341 364
342 if (within_module(iter->key, mod)) 365 if (within_module(iter->key, mod))
343 continue; 366 continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
439 return ret; 462 return ret;
440} 463}
441 464
442static void jump_label_update(struct static_key *key, int enable) 465static void jump_label_update(struct static_key *key)
443{ 466{
444 struct jump_entry *stop = __stop___jump_table; 467 struct jump_entry *stop = __stop___jump_table;
445 struct jump_entry *entry = jump_label_get_entries(key); 468 struct jump_entry *entry = static_key_entries(key);
446#ifdef CONFIG_MODULES 469#ifdef CONFIG_MODULES
447 struct module *mod; 470 struct module *mod;
448 471
449 __jump_label_mod_update(key, enable); 472 __jump_label_mod_update(key);
450 473
451 preempt_disable(); 474 preempt_disable();
452 mod = __module_address((unsigned long)key); 475 mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
456#endif 479#endif
457 /* if there are no users, entry can be NULL */ 480 /* if there are no users, entry can be NULL */
458 if (entry) 481 if (entry)
459 __jump_label_update(key, entry, stop, enable); 482 __jump_label_update(key, entry, stop);
460} 483}
461 484
462#endif 485#ifdef CONFIG_STATIC_KEYS_SELFTEST
486static DEFINE_STATIC_KEY_TRUE(sk_true);
487static DEFINE_STATIC_KEY_FALSE(sk_false);
488
489static __init int jump_label_test(void)
490{
491 int i;
492
493 for (i = 0; i < 2; i++) {
494 WARN_ON(static_key_enabled(&sk_true.key) != true);
495 WARN_ON(static_key_enabled(&sk_false.key) != false);
496
497 WARN_ON(!static_branch_likely(&sk_true));
498 WARN_ON(!static_branch_unlikely(&sk_true));
499 WARN_ON(static_branch_likely(&sk_false));
500 WARN_ON(static_branch_unlikely(&sk_false));
501
502 static_branch_disable(&sk_true);
503 static_branch_enable(&sk_false);
504
505 WARN_ON(static_key_enabled(&sk_true.key) == true);
506 WARN_ON(static_key_enabled(&sk_false.key) == false);
507
508 WARN_ON(static_branch_likely(&sk_true));
509 WARN_ON(static_branch_unlikely(&sk_true));
510 WARN_ON(!static_branch_likely(&sk_false));
511 WARN_ON(!static_branch_unlikely(&sk_false));
512
513 static_branch_enable(&sk_true);
514 static_branch_disable(&sk_false);
515 }
516
517 return 0;
518}
519late_initcall(jump_label_test);
520#endif /* STATIC_KEYS_SELFTEST */
521
522#endif /* HAVE_JUMP_LABEL */
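The self-test above doubles as a usage guide for the reworked static-key code in this file. A condensed sketch of the same pattern, with a made-up key name, assuming <linux/jump_label.h> just as the self-test does:

	static DEFINE_STATIC_KEY_FALSE(use_fast_path);

	static void example_hot_path(void)
	{
		if (static_branch_unlikely(&use_fast_path)) {
			/* only patched in once example_enable() has run */
		}
	}

	static void example_enable(void)
	{
		static_branch_enable(&use_fast_path);	/* re-patches every site via jump_label_update() */
	}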
diff --git a/kernel/kexec.c b/kernel/kexec.c
index a785c1015e25..4c5edc357923 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1,156 +1,22 @@
1/* 1/*
2 * kexec.c - kexec system call 2 * kexec.c - kexec_load system call
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 * 4 *
5 * This source code is licensed under the GNU General Public License, 5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h> 9#include <linux/capability.h>
12#include <linux/mm.h> 10#include <linux/mm.h>
13#include <linux/file.h> 11#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h> 12#include <linux/kexec.h>
17#include <linux/mutex.h> 13#include <linux/mutex.h>
18#include <linux/list.h> 14#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h> 15#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/console.h>
34#include <linux/vmalloc.h> 16#include <linux/vmalloc.h>
35#include <linux/swap.h> 17#include <linux/slab.h>
36#include <linux/syscore_ops.h>
37#include <linux/compiler.h>
38#include <linux/hugetlb.h>
39
40#include <asm/page.h>
41#include <asm/uaccess.h>
42#include <asm/io.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
48/* Per cpu memory for storing cpu states in case of system crash. */
49note_buf_t __percpu *crash_notes;
50
51/* vmcoreinfo stuff */
52static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
53u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
54size_t vmcoreinfo_size;
55size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
56
57/* Flag to indicate we are going to kexec a new kernel */
58bool kexec_in_progress = false;
59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67#ifdef CONFIG_KEXEC_FILE
68static int kexec_calculate_store_digests(struct kimage *image);
69#endif
70
71/* Location of the reserved area for the crash kernel */
72struct resource crashk_res = {
73 .name = "Crash kernel",
74 .start = 0,
75 .end = 0,
76 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
77};
78struct resource crashk_low_res = {
79 .name = "Crash kernel",
80 .start = 0,
81 .end = 0,
82 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
83};
84
85int kexec_should_crash(struct task_struct *p)
86{
87 /*
88 * If crash_kexec_post_notifiers is enabled, don't run
89 * crash_kexec() here yet, which must be run after panic
90 * notifiers in panic().
91 */
92 if (crash_kexec_post_notifiers)
93 return 0;
94 /*
95 * There are 4 panic() calls in do_exit() path, each of which
96 * corresponds to each of these 4 conditions.
97 */
98 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
99 return 1;
100 return 0;
101}
102
103/*
104 * When kexec transitions to the new kernel there is a one-to-one
105 * mapping between physical and virtual addresses. On processors
106 * where you can disable the MMU this is trivial, and easy. For
107 * others it is still a simple predictable page table to setup.
108 *
109 * In that environment kexec copies the new kernel to its final
110 * resting place. This means I can only support memory whose
111 * physical address can fit in an unsigned long. In particular
112 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
113 * If the assembly stub has more restrictive requirements
114 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
115 * defined more restrictively in <asm/kexec.h>.
116 *
117 * The code for the transition from the current kernel to
118 * the new kernel is placed in the control_code_buffer, whose size
119 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
120 * page of memory is necessary, but some architectures require more.
121 * Because this memory must be identity mapped in the transition from
122 * virtual to physical addresses it must live in the range
123 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
124 * modifiable.
125 *
126 * The assembly stub in the control code buffer is passed a linked list
127 * of descriptor pages detailing the source pages of the new kernel,
128 * and the destination addresses of those source pages. As this data
129 * structure is not used in the context of the current OS, it must
130 * be self-contained.
131 *
132 * The code has been made to work with highmem pages and will use a
133 * destination page in its final resting place (if it happens
134 * to allocate it). The end product of this is that most of the
135 * physical address space, and most of RAM can be used.
136 *
137 * Future directions include:
138 * - allocating a page table with the control code buffer identity
139 * mapped, to simplify machine_kexec and make kexec_on_panic more
140 * reliable.
141 */
142
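/*
 * Rough illustration of the descriptor list described above; the addresses
 * are invented, the tags are the IND_* flags used by kimage_add_entry(),
 * kimage_set_destination(), kimage_add_page() and kimage_terminate() below:
 *
 *   0x20000000 | IND_DESTINATION   start copying to this physical address
 *   0x11000000 | IND_SOURCE        source page for 0x20000000
 *   0x11200000 | IND_SOURCE        source page for 0x20001000 (dest advances by PAGE_SIZE)
 *   0x13000000 | IND_INDIRECTION   continue reading entries from this page
 *   0          | IND_DONE          end of the list
 */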
143/*
144 * KIMAGE_NO_DEST is an impossible destination address..., for
145 * allocating pages whose destination address we do not care about.
146 */
147#define KIMAGE_NO_DEST (-1UL)
148 18
149static int kimage_is_destination_range(struct kimage *image, 19#include "kexec_internal.h"
150 unsigned long start, unsigned long end);
151static struct page *kimage_alloc_page(struct kimage *image,
152 gfp_t gfp_mask,
153 unsigned long dest);
154 20
155static int copy_user_segment_list(struct kimage *image, 21static int copy_user_segment_list(struct kimage *image,
156 unsigned long nr_segments, 22 unsigned long nr_segments,
@@ -169,125 +35,6 @@ static int copy_user_segment_list(struct kimage *image,
169 return ret; 35 return ret;
170} 36}
171 37
172static int sanity_check_segment_list(struct kimage *image)
173{
174 int result, i;
175 unsigned long nr_segments = image->nr_segments;
176
177 /*
178 * Verify we have good destination addresses. The caller is
179 * responsible for making certain we don't attempt to load
180 * the new image into invalid or reserved areas of RAM. This
181 * just verifies it is an address we can use.
182 *
183 * Since the kernel does everything in page size chunks ensure
184 * the destination addresses are page aligned. Too many
185 * special cases crop up when we don't do this. The most
186 * insidious is getting overlapping destination addresses
187 * simply because addresses are changed to page size
188 * granularity.
189 */
190 result = -EADDRNOTAVAIL;
191 for (i = 0; i < nr_segments; i++) {
192 unsigned long mstart, mend;
193
194 mstart = image->segment[i].mem;
195 mend = mstart + image->segment[i].memsz;
196 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
197 return result;
198 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
199 return result;
200 }
201
202 /* Verify our destination addresses do not overlap.
203 * If we allowed overlapping destination addresses
204 * through, very weird things can happen with no
205 * easy explanation as one segment stops on another.
206 */
207 result = -EINVAL;
208 for (i = 0; i < nr_segments; i++) {
209 unsigned long mstart, mend;
210 unsigned long j;
211
212 mstart = image->segment[i].mem;
213 mend = mstart + image->segment[i].memsz;
214 for (j = 0; j < i; j++) {
215 unsigned long pstart, pend;
216 pstart = image->segment[j].mem;
217 pend = pstart + image->segment[j].memsz;
218 /* Do the segments overlap ? */
219 if ((mend > pstart) && (mstart < pend))
220 return result;
221 }
222 }
223
224 /* Ensure our buffer sizes are strictly less than
225 * our memory sizes. This should always be the case,
226 * and it is easier to check up front than to be surprised
227 * later on.
228 */
229 result = -EINVAL;
230 for (i = 0; i < nr_segments; i++) {
231 if (image->segment[i].bufsz > image->segment[i].memsz)
232 return result;
233 }
234
235 /*
236 * Verify we have good destination addresses. Normally
237 * the caller is responsible for making certain we don't
238 * attempt to load the new image into invalid or reserved
239 * areas of RAM. But crash kernels are preloaded into a
240 * reserved area of ram. We must ensure the addresses
241 * are in the reserved area otherwise preloading the
242 * kernel could corrupt things.
243 */
244
245 if (image->type == KEXEC_TYPE_CRASH) {
246 result = -EADDRNOTAVAIL;
247 for (i = 0; i < nr_segments; i++) {
248 unsigned long mstart, mend;
249
250 mstart = image->segment[i].mem;
251 mend = mstart + image->segment[i].memsz - 1;
252 /* Ensure we are within the crash kernel limits */
253 if ((mstart < crashk_res.start) ||
254 (mend > crashk_res.end))
255 return result;
256 }
257 }
258
259 return 0;
260}
261
262static struct kimage *do_kimage_alloc_init(void)
263{
264 struct kimage *image;
265
266 /* Allocate a controlling structure */
267 image = kzalloc(sizeof(*image), GFP_KERNEL);
268 if (!image)
269 return NULL;
270
271 image->head = 0;
272 image->entry = &image->head;
273 image->last_entry = &image->head;
274 image->control_page = ~0; /* By default this does not apply */
275 image->type = KEXEC_TYPE_DEFAULT;
276
277 /* Initialize the list of control pages */
278 INIT_LIST_HEAD(&image->control_pages);
279
280 /* Initialize the list of destination pages */
281 INIT_LIST_HEAD(&image->dest_pages);
282
283 /* Initialize the list of unusable pages */
284 INIT_LIST_HEAD(&image->unusable_pages);
285
286 return image;
287}
288
289static void kimage_free_page_list(struct list_head *list);
290
291static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, 38static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
292 unsigned long nr_segments, 39 unsigned long nr_segments,
293 struct kexec_segment __user *segments, 40 struct kexec_segment __user *segments,
@@ -354,873 +101,6 @@ out_free_image:
354 return ret; 101 return ret;
355} 102}
356 103
357#ifdef CONFIG_KEXEC_FILE
358static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
359{
360 struct fd f = fdget(fd);
361 int ret;
362 struct kstat stat;
363 loff_t pos;
364 ssize_t bytes = 0;
365
366 if (!f.file)
367 return -EBADF;
368
369 ret = vfs_getattr(&f.file->f_path, &stat);
370 if (ret)
371 goto out;
372
373 if (stat.size > INT_MAX) {
374 ret = -EFBIG;
375 goto out;
376 }
377
378 /* Don't hand 0 to vmalloc, it whines. */
379 if (stat.size == 0) {
380 ret = -EINVAL;
381 goto out;
382 }
383
384 *buf = vmalloc(stat.size);
385 if (!*buf) {
386 ret = -ENOMEM;
387 goto out;
388 }
389
390 pos = 0;
391 while (pos < stat.size) {
392 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
393 stat.size - pos);
394 if (bytes < 0) {
395 vfree(*buf);
396 ret = bytes;
397 goto out;
398 }
399
400 if (bytes == 0)
401 break;
402 pos += bytes;
403 }
404
405 if (pos != stat.size) {
406 ret = -EBADF;
407 vfree(*buf);
408 goto out;
409 }
410
411 *buf_len = pos;
412out:
413 fdput(f);
414 return ret;
415}
416
417/* Architectures can provide this probe function */
418int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
419 unsigned long buf_len)
420{
421 return -ENOEXEC;
422}
423
424void * __weak arch_kexec_kernel_image_load(struct kimage *image)
425{
426 return ERR_PTR(-ENOEXEC);
427}
428
429void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
430{
431}
432
433int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
434 unsigned long buf_len)
435{
436 return -EKEYREJECTED;
437}
438
439/* Apply relocations of type RELA */
440int __weak
441arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
442 unsigned int relsec)
443{
444 pr_err("RELA relocation unsupported.\n");
445 return -ENOEXEC;
446}
447
448/* Apply relocations of type REL */
449int __weak
450arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
451 unsigned int relsec)
452{
453 pr_err("REL relocation unsupported.\n");
454 return -ENOEXEC;
455}
456
457/*
458 * Free up memory used by kernel, initrd, and command line. This is temporary
459 * memory allocation which is not needed any more after these buffers have
460 * been loaded into separate segments and have been copied elsewhere.
461 */
462static void kimage_file_post_load_cleanup(struct kimage *image)
463{
464 struct purgatory_info *pi = &image->purgatory_info;
465
466 vfree(image->kernel_buf);
467 image->kernel_buf = NULL;
468
469 vfree(image->initrd_buf);
470 image->initrd_buf = NULL;
471
472 kfree(image->cmdline_buf);
473 image->cmdline_buf = NULL;
474
475 vfree(pi->purgatory_buf);
476 pi->purgatory_buf = NULL;
477
478 vfree(pi->sechdrs);
479 pi->sechdrs = NULL;
480
481 /* See if architecture has anything to cleanup post load */
482 arch_kimage_file_post_load_cleanup(image);
483
484 /*
485 * Above call should have called into bootloader to free up
486 * any data stored in kimage->image_loader_data. It should
487 * be ok now to free it up.
488 */
489 kfree(image->image_loader_data);
490 image->image_loader_data = NULL;
491}
492
493/*
494 * In file mode list of segments is prepared by kernel. Copy relevant
495 * data from user space, do error checking, prepare segment list
496 */
497static int
498kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
499 const char __user *cmdline_ptr,
500 unsigned long cmdline_len, unsigned flags)
501{
502 int ret = 0;
503 void *ldata;
504
505 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
506 &image->kernel_buf_len);
507 if (ret)
508 return ret;
509
510 /* Call arch image probe handlers */
511 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
512 image->kernel_buf_len);
513
514 if (ret)
515 goto out;
516
517#ifdef CONFIG_KEXEC_VERIFY_SIG
518 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
519 image->kernel_buf_len);
520 if (ret) {
521 pr_debug("kernel signature verification failed.\n");
522 goto out;
523 }
524 pr_debug("kernel signature verification successful.\n");
525#endif
526 /* It is possible that no initramfs is being loaded */
527 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
528 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
529 &image->initrd_buf_len);
530 if (ret)
531 goto out;
532 }
533
534 if (cmdline_len) {
535 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
536 if (!image->cmdline_buf) {
537 ret = -ENOMEM;
538 goto out;
539 }
540
541 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
542 cmdline_len);
543 if (ret) {
544 ret = -EFAULT;
545 goto out;
546 }
547
548 image->cmdline_buf_len = cmdline_len;
549
550 /* command line should be a string with last byte null */
551 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
552 ret = -EINVAL;
553 goto out;
554 }
555 }
556
557 /* Call arch image load handlers */
558 ldata = arch_kexec_kernel_image_load(image);
559
560 if (IS_ERR(ldata)) {
561 ret = PTR_ERR(ldata);
562 goto out;
563 }
564
565 image->image_loader_data = ldata;
566out:
567 /* In case of error, free up all allocated memory in this function */
568 if (ret)
569 kimage_file_post_load_cleanup(image);
570 return ret;
571}
572
573static int
574kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
575 int initrd_fd, const char __user *cmdline_ptr,
576 unsigned long cmdline_len, unsigned long flags)
577{
578 int ret;
579 struct kimage *image;
580 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
581
582 image = do_kimage_alloc_init();
583 if (!image)
584 return -ENOMEM;
585
586 image->file_mode = 1;
587
588 if (kexec_on_panic) {
589 /* Enable special crash kernel control page alloc policy. */
590 image->control_page = crashk_res.start;
591 image->type = KEXEC_TYPE_CRASH;
592 }
593
594 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
595 cmdline_ptr, cmdline_len, flags);
596 if (ret)
597 goto out_free_image;
598
599 ret = sanity_check_segment_list(image);
600 if (ret)
601 goto out_free_post_load_bufs;
602
603 ret = -ENOMEM;
604 image->control_code_page = kimage_alloc_control_pages(image,
605 get_order(KEXEC_CONTROL_PAGE_SIZE));
606 if (!image->control_code_page) {
607 pr_err("Could not allocate control_code_buffer\n");
608 goto out_free_post_load_bufs;
609 }
610
611 if (!kexec_on_panic) {
612 image->swap_page = kimage_alloc_control_pages(image, 0);
613 if (!image->swap_page) {
614 pr_err("Could not allocate swap buffer\n");
615 goto out_free_control_pages;
616 }
617 }
618
619 *rimage = image;
620 return 0;
621out_free_control_pages:
622 kimage_free_page_list(&image->control_pages);
623out_free_post_load_bufs:
624 kimage_file_post_load_cleanup(image);
625out_free_image:
626 kfree(image);
627 return ret;
628}
629#else /* CONFIG_KEXEC_FILE */
630static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
631#endif /* CONFIG_KEXEC_FILE */
632
633static int kimage_is_destination_range(struct kimage *image,
634 unsigned long start,
635 unsigned long end)
636{
637 unsigned long i;
638
639 for (i = 0; i < image->nr_segments; i++) {
640 unsigned long mstart, mend;
641
642 mstart = image->segment[i].mem;
643 mend = mstart + image->segment[i].memsz;
644 if ((end > mstart) && (start < mend))
645 return 1;
646 }
647
648 return 0;
649}
650
651static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
652{
653 struct page *pages;
654
655 pages = alloc_pages(gfp_mask, order);
656 if (pages) {
657 unsigned int count, i;
658 pages->mapping = NULL;
659 set_page_private(pages, order);
660 count = 1 << order;
661 for (i = 0; i < count; i++)
662 SetPageReserved(pages + i);
663 }
664
665 return pages;
666}
667
668static void kimage_free_pages(struct page *page)
669{
670 unsigned int order, count, i;
671
672 order = page_private(page);
673 count = 1 << order;
674 for (i = 0; i < count; i++)
675 ClearPageReserved(page + i);
676 __free_pages(page, order);
677}
678
679static void kimage_free_page_list(struct list_head *list)
680{
681 struct list_head *pos, *next;
682
683 list_for_each_safe(pos, next, list) {
684 struct page *page;
685
686 page = list_entry(pos, struct page, lru);
687 list_del(&page->lru);
688 kimage_free_pages(page);
689 }
690}
691
692static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
693 unsigned int order)
694{
695 /* Control pages are special, they are the intermediaries
696 * that are needed while we copy the rest of the pages
697 * to their final resting place. As such they must
698 * not conflict with either the destination addresses
699 * or memory the kernel is already using.
700 *
701 * The only case where we really need more than one of
702 * these is for architectures where we cannot disable
703 * the MMU and must instead generate an identity mapped
704 * page table for all of the memory.
705 *
706 * At worst this runs in O(N) of the image size.
707 */
708 struct list_head extra_pages;
709 struct page *pages;
710 unsigned int count;
711
712 count = 1 << order;
713 INIT_LIST_HEAD(&extra_pages);
714
715 /* Loop while I can allocate a page and the page allocated
716 * is a destination page.
717 */
718 do {
719 unsigned long pfn, epfn, addr, eaddr;
720
721 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
722 if (!pages)
723 break;
724 pfn = page_to_pfn(pages);
725 epfn = pfn + count;
726 addr = pfn << PAGE_SHIFT;
727 eaddr = epfn << PAGE_SHIFT;
728 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
729 kimage_is_destination_range(image, addr, eaddr)) {
730 list_add(&pages->lru, &extra_pages);
731 pages = NULL;
732 }
733 } while (!pages);
734
735 if (pages) {
736 /* Remember the allocated page... */
737 list_add(&pages->lru, &image->control_pages);
738
739 /* Because the page is already in its destination
740 * location we will never allocate another page at
741 * that address. Therefore kimage_alloc_pages
742 * will not return it (again) and we don't need
743 * to give it an entry in image->segment[].
744 */
745 }
746 /* Deal with the destination pages I have inadvertently allocated.
747 *
748 * Ideally I would convert multi-page allocations into single
749 * page allocations, and add everything to image->dest_pages.
750 *
751 * For now it is simpler to just free the pages.
752 */
753 kimage_free_page_list(&extra_pages);
754
755 return pages;
756}
757
758static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
759 unsigned int order)
760{
761 /* Control pages are special, they are the intermediaries
762 * that are needed while we copy the rest of the pages
763 * to their final resting place. As such they must
764 * not conflict with either the destination addresses
765 * or memory the kernel is already using.
766 *
767 * Control pages are also the only pages we must allocate
768 * when loading a crash kernel. All of the other pages
769 * are specified by the segments and we just memcpy
770 * into them directly.
771 *
772 * The only case where we really need more than one of
773 * these is for architectures where we cannot disable
774 * the MMU and must instead generate an identity mapped
775 * page table for all of the memory.
776 *
777 * Given the low demand this implements a very simple
778 * allocator that finds the first hole of the appropriate
779 * size in the reserved memory region, and allocates all
780 * of the memory up to and including the hole.
781 */
782 unsigned long hole_start, hole_end, size;
783 struct page *pages;
784
785 pages = NULL;
786 size = (1 << order) << PAGE_SHIFT;
787 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
788 hole_end = hole_start + size - 1;
789 while (hole_end <= crashk_res.end) {
790 unsigned long i;
791
792 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
793 break;
794 /* See if I overlap any of the segments */
795 for (i = 0; i < image->nr_segments; i++) {
796 unsigned long mstart, mend;
797
798 mstart = image->segment[i].mem;
799 mend = mstart + image->segment[i].memsz - 1;
800 if ((hole_end >= mstart) && (hole_start <= mend)) {
801 /* Advance the hole to the end of the segment */
802 hole_start = (mend + (size - 1)) & ~(size - 1);
803 hole_end = hole_start + size - 1;
804 break;
805 }
806 }
807 /* If I don't overlap any segments I have found my hole! */
808 if (i == image->nr_segments) {
809 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
810 break;
811 }
812 }
813 if (pages)
814 image->control_page = hole_end;
815
816 return pages;
817}
818
819
820struct page *kimage_alloc_control_pages(struct kimage *image,
821 unsigned int order)
822{
823 struct page *pages = NULL;
824
825 switch (image->type) {
826 case KEXEC_TYPE_DEFAULT:
827 pages = kimage_alloc_normal_control_pages(image, order);
828 break;
829 case KEXEC_TYPE_CRASH:
830 pages = kimage_alloc_crash_control_pages(image, order);
831 break;
832 }
833
834 return pages;
835}
836
837static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
838{
839 if (*image->entry != 0)
840 image->entry++;
841
842 if (image->entry == image->last_entry) {
843 kimage_entry_t *ind_page;
844 struct page *page;
845
846 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
847 if (!page)
848 return -ENOMEM;
849
850 ind_page = page_address(page);
851 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
852 image->entry = ind_page;
853 image->last_entry = ind_page +
854 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
855 }
856 *image->entry = entry;
857 image->entry++;
858 *image->entry = 0;
859
860 return 0;
861}
862
863static int kimage_set_destination(struct kimage *image,
864 unsigned long destination)
865{
866 int result;
867
868 destination &= PAGE_MASK;
869 result = kimage_add_entry(image, destination | IND_DESTINATION);
870
871 return result;
872}
873
874
875static int kimage_add_page(struct kimage *image, unsigned long page)
876{
877 int result;
878
879 page &= PAGE_MASK;
880 result = kimage_add_entry(image, page | IND_SOURCE);
881
882 return result;
883}
884
885
886static void kimage_free_extra_pages(struct kimage *image)
887{
888 /* Walk through and free any extra destination pages I may have */
889 kimage_free_page_list(&image->dest_pages);
890
891 /* Walk through and free any unusable pages I have cached */
892 kimage_free_page_list(&image->unusable_pages);
893
894}
895static void kimage_terminate(struct kimage *image)
896{
897 if (*image->entry != 0)
898 image->entry++;
899
900 *image->entry = IND_DONE;
901}
902
903#define for_each_kimage_entry(image, ptr, entry) \
904 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
905 ptr = (entry & IND_INDIRECTION) ? \
906 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
907
908static void kimage_free_entry(kimage_entry_t entry)
909{
910 struct page *page;
911
912 page = pfn_to_page(entry >> PAGE_SHIFT);
913 kimage_free_pages(page);
914}
915
916static void kimage_free(struct kimage *image)
917{
918 kimage_entry_t *ptr, entry;
919 kimage_entry_t ind = 0;
920
921 if (!image)
922 return;
923
924 kimage_free_extra_pages(image);
925 for_each_kimage_entry(image, ptr, entry) {
926 if (entry & IND_INDIRECTION) {
927 /* Free the previous indirection page */
928 if (ind & IND_INDIRECTION)
929 kimage_free_entry(ind);
930 /* Save this indirection page until we are
931 * done with it.
932 */
933 ind = entry;
934 } else if (entry & IND_SOURCE)
935 kimage_free_entry(entry);
936 }
937 /* Free the final indirection page */
938 if (ind & IND_INDIRECTION)
939 kimage_free_entry(ind);
940
941 /* Handle any machine specific cleanup */
942 machine_kexec_cleanup(image);
943
944 /* Free the kexec control pages... */
945 kimage_free_page_list(&image->control_pages);
946
947 /*
948 * Free up any temporary buffers allocated. This might hit if
949 * error occurred much later after buffer allocation.
950 */
951 if (image->file_mode)
952 kimage_file_post_load_cleanup(image);
953
954 kfree(image);
955}
956
957static kimage_entry_t *kimage_dst_used(struct kimage *image,
958 unsigned long page)
959{
960 kimage_entry_t *ptr, entry;
961 unsigned long destination = 0;
962
963 for_each_kimage_entry(image, ptr, entry) {
964 if (entry & IND_DESTINATION)
965 destination = entry & PAGE_MASK;
966 else if (entry & IND_SOURCE) {
967 if (page == destination)
968 return ptr;
969 destination += PAGE_SIZE;
970 }
971 }
972
973 return NULL;
974}
975
976static struct page *kimage_alloc_page(struct kimage *image,
977 gfp_t gfp_mask,
978 unsigned long destination)
979{
980 /*
981 * Here we implement safeguards to ensure that a source page
982 * is not copied to its destination page before the data on
983 * the destination page is no longer useful.
984 *
985 * To do this we maintain the invariant that a source page is
986 * either its own destination page, or it is not a
987 * destination page at all.
988 *
989 * That is slightly stronger than required, but the proof
990 * that no problems will occur is trivial, and the
991 * implementation is simple to verify.
992 *
993 * When allocating all pages normally this algorithm will run
994 * in O(N) time, but in the worst case it will run in O(N^2)
995 * time. If the runtime is a problem the data structures can
996 * be fixed.
997 */
998 struct page *page;
999 unsigned long addr;
1000
1001 /*
1002 * Walk through the list of destination pages, and see if I
1003 * have a match.
1004 */
1005 list_for_each_entry(page, &image->dest_pages, lru) {
1006 addr = page_to_pfn(page) << PAGE_SHIFT;
1007 if (addr == destination) {
1008 list_del(&page->lru);
1009 return page;
1010 }
1011 }
1012 page = NULL;
1013 while (1) {
1014 kimage_entry_t *old;
1015
1016 /* Allocate a page, if we run out of memory give up */
1017 page = kimage_alloc_pages(gfp_mask, 0);
1018 if (!page)
1019 return NULL;
1020 /* If the page cannot be used file it away */
1021 if (page_to_pfn(page) >
1022 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
1023 list_add(&page->lru, &image->unusable_pages);
1024 continue;
1025 }
1026 addr = page_to_pfn(page) << PAGE_SHIFT;
1027
1028 /* If it is the destination page we want, use it */
1029 if (addr == destination)
1030 break;
1031
1032 /* If the page is not a destination page use it */
1033 if (!kimage_is_destination_range(image, addr,
1034 addr + PAGE_SIZE))
1035 break;
1036
1037 /*
1038 * I know that the page is someone's destination page.
1039 * See if there is already a source page for this
1040 * destination page. And if so swap the source pages.
1041 */
1042 old = kimage_dst_used(image, addr);
1043 if (old) {
1044 /* If so move it */
1045 unsigned long old_addr;
1046 struct page *old_page;
1047
1048 old_addr = *old & PAGE_MASK;
1049 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
1050 copy_highpage(page, old_page);
1051 *old = addr | (*old & ~PAGE_MASK);
1052
1053 /* The old page I have found cannot be a
1054 * destination page, so return it if its
1055 * gfp_flags honor the ones passed in.
1056 */
1057 if (!(gfp_mask & __GFP_HIGHMEM) &&
1058 PageHighMem(old_page)) {
1059 kimage_free_pages(old_page);
1060 continue;
1061 }
1062 addr = old_addr;
1063 page = old_page;
1064 break;
1065 } else {
1066 /* Place the page on the destination list; I
1067 * will use it later.
1068 */
1069 list_add(&page->lru, &image->dest_pages);
1070 }
1071 }
1072
1073 return page;
1074}
1075
1076static int kimage_load_normal_segment(struct kimage *image,
1077 struct kexec_segment *segment)
1078{
1079 unsigned long maddr;
1080 size_t ubytes, mbytes;
1081 int result;
1082 unsigned char __user *buf = NULL;
1083 unsigned char *kbuf = NULL;
1084
1085 result = 0;
1086 if (image->file_mode)
1087 kbuf = segment->kbuf;
1088 else
1089 buf = segment->buf;
1090 ubytes = segment->bufsz;
1091 mbytes = segment->memsz;
1092 maddr = segment->mem;
1093
1094 result = kimage_set_destination(image, maddr);
1095 if (result < 0)
1096 goto out;
1097
1098 while (mbytes) {
1099 struct page *page;
1100 char *ptr;
1101 size_t uchunk, mchunk;
1102
1103 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
1104 if (!page) {
1105 result = -ENOMEM;
1106 goto out;
1107 }
1108 result = kimage_add_page(image, page_to_pfn(page)
1109 << PAGE_SHIFT);
1110 if (result < 0)
1111 goto out;
1112
1113 ptr = kmap(page);
1114 /* Start with a clear page */
1115 clear_page(ptr);
1116 ptr += maddr & ~PAGE_MASK;
1117 mchunk = min_t(size_t, mbytes,
1118 PAGE_SIZE - (maddr & ~PAGE_MASK));
1119 uchunk = min(ubytes, mchunk);
1120
1121 /* For file based kexec, source pages are in kernel memory */
1122 if (image->file_mode)
1123 memcpy(ptr, kbuf, uchunk);
1124 else
1125 result = copy_from_user(ptr, buf, uchunk);
1126 kunmap(page);
1127 if (result) {
1128 result = -EFAULT;
1129 goto out;
1130 }
1131 ubytes -= uchunk;
1132 maddr += mchunk;
1133 if (image->file_mode)
1134 kbuf += mchunk;
1135 else
1136 buf += mchunk;
1137 mbytes -= mchunk;
1138 }
1139out:
1140 return result;
1141}
1142
1143static int kimage_load_crash_segment(struct kimage *image,
1144 struct kexec_segment *segment)
1145{
1146 /* For crash dump kernels we simply copy the data from
1147 * user space to its destination.
1148 * We do things a page at a time for the sake of kmap.
1149 */
1150 unsigned long maddr;
1151 size_t ubytes, mbytes;
1152 int result;
1153 unsigned char __user *buf = NULL;
1154 unsigned char *kbuf = NULL;
1155
1156 result = 0;
1157 if (image->file_mode)
1158 kbuf = segment->kbuf;
1159 else
1160 buf = segment->buf;
1161 ubytes = segment->bufsz;
1162 mbytes = segment->memsz;
1163 maddr = segment->mem;
1164 while (mbytes) {
1165 struct page *page;
1166 char *ptr;
1167 size_t uchunk, mchunk;
1168
1169 page = pfn_to_page(maddr >> PAGE_SHIFT);
1170 if (!page) {
1171 result = -ENOMEM;
1172 goto out;
1173 }
1174 ptr = kmap(page);
1175 ptr += maddr & ~PAGE_MASK;
1176 mchunk = min_t(size_t, mbytes,
1177 PAGE_SIZE - (maddr & ~PAGE_MASK));
1178 uchunk = min(ubytes, mchunk);
1179 if (mchunk > uchunk) {
1180 /* Zero the trailing part of the page */
1181 memset(ptr + uchunk, 0, mchunk - uchunk);
1182 }
1183
1184 /* For file based kexec, source pages are in kernel memory */
1185 if (image->file_mode)
1186 memcpy(ptr, kbuf, uchunk);
1187 else
1188 result = copy_from_user(ptr, buf, uchunk);
1189 kexec_flush_icache_page(page);
1190 kunmap(page);
1191 if (result) {
1192 result = -EFAULT;
1193 goto out;
1194 }
1195 ubytes -= uchunk;
1196 maddr += mchunk;
1197 if (image->file_mode)
1198 kbuf += mchunk;
1199 else
1200 buf += mchunk;
1201 mbytes -= mchunk;
1202 }
1203out:
1204 return result;
1205}
1206
1207static int kimage_load_segment(struct kimage *image,
1208 struct kexec_segment *segment)
1209{
1210 int result = -ENOMEM;
1211
1212 switch (image->type) {
1213 case KEXEC_TYPE_DEFAULT:
1214 result = kimage_load_normal_segment(image, segment);
1215 break;
1216 case KEXEC_TYPE_CRASH:
1217 result = kimage_load_crash_segment(image, segment);
1218 break;
1219 }
1220
1221 return result;
1222}
1223
1224/* 104/*
1225 * Exec Kernel system call: for obvious reasons only root may call it. 105 * Exec Kernel system call: for obvious reasons only root may call it.
1226 * 106 *
@@ -1241,11 +121,6 @@ static int kimage_load_segment(struct kimage *image,
1241 * kexec does not sync, or unmount filesystems so if you need 121 * kexec does not sync, or unmount filesystems so if you need
1242 * that to happen you need to do that yourself. 122 * that to happen you need to do that yourself.
1243 */ 123 */
1244struct kimage *kexec_image;
1245struct kimage *kexec_crash_image;
1246int kexec_load_disabled;
1247
1248static DEFINE_MUTEX(kexec_mutex);
1249 124
1250SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, 125SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1251 struct kexec_segment __user *, segments, unsigned long, flags) 126 struct kexec_segment __user *, segments, unsigned long, flags)
@@ -1340,18 +215,6 @@ out:
1340 return result; 215 return result;
1341} 216}
1342 217
1343/*
1344 * Add and remove page tables for crashkernel memory
1345 *
1346 * Provide an empty default implementation here -- architecture
1347 * code may override this
1348 */
1349void __weak crash_map_reserved_pages(void)
1350{}
1351
1352void __weak crash_unmap_reserved_pages(void)
1353{}
1354
1355#ifdef CONFIG_COMPAT 218#ifdef CONFIG_COMPAT
1356COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry, 219COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1357 compat_ulong_t, nr_segments, 220 compat_ulong_t, nr_segments,
@@ -1390,1391 +253,3 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1390 return sys_kexec_load(entry, nr_segments, ksegments, flags); 253 return sys_kexec_load(entry, nr_segments, ksegments, flags);
1391} 254}
1392#endif 255#endif
1393
1394#ifdef CONFIG_KEXEC_FILE
1395SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1396 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1397 unsigned long, flags)
1398{
1399 int ret = 0, i;
1400 struct kimage **dest_image, *image;
1401
1402 /* We only trust the superuser with rebooting the system. */
1403 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1404 return -EPERM;
1405
1406 /* Make sure we have a legal set of flags */
1407 if (flags != (flags & KEXEC_FILE_FLAGS))
1408 return -EINVAL;
1409
1410 image = NULL;
1411
1412 if (!mutex_trylock(&kexec_mutex))
1413 return -EBUSY;
1414
1415 dest_image = &kexec_image;
1416 if (flags & KEXEC_FILE_ON_CRASH)
1417 dest_image = &kexec_crash_image;
1418
1419 if (flags & KEXEC_FILE_UNLOAD)
1420 goto exchange;
1421
1422 /*
1423 * In case of crash, new kernel gets loaded in reserved region. It is
1424 * same memory where old crash kernel might be loaded. Free any
1425 * current crash dump kernel before we corrupt it.
1426 */
1427 if (flags & KEXEC_FILE_ON_CRASH)
1428 kimage_free(xchg(&kexec_crash_image, NULL));
1429
1430 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1431 cmdline_len, flags);
1432 if (ret)
1433 goto out;
1434
1435 ret = machine_kexec_prepare(image);
1436 if (ret)
1437 goto out;
1438
1439 ret = kexec_calculate_store_digests(image);
1440 if (ret)
1441 goto out;
1442
1443 for (i = 0; i < image->nr_segments; i++) {
1444 struct kexec_segment *ksegment;
1445
1446 ksegment = &image->segment[i];
1447 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1448 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1449 ksegment->memsz);
1450
1451 ret = kimage_load_segment(image, &image->segment[i]);
1452 if (ret)
1453 goto out;
1454 }
1455
1456 kimage_terminate(image);
1457
1458 /*
1459 * Free up any temporary buffers allocated which are not needed
1460 * after image has been loaded
1461 */
1462 kimage_file_post_load_cleanup(image);
1463exchange:
1464 image = xchg(dest_image, image);
1465out:
1466 mutex_unlock(&kexec_mutex);
1467 kimage_free(image);
1468 return ret;
1469}
1470
1471#endif /* CONFIG_KEXEC_FILE */
1472
1473void crash_kexec(struct pt_regs *regs)
1474{
1475 /* Take the kexec_mutex here to prevent sys_kexec_load
1476 * running on one cpu from replacing the crash kernel
1477 * we are using after a panic on a different cpu.
1478 *
1479 * If the crash kernel was not located in a fixed area
1480 * of memory the xchg(&kexec_crash_image) would be
1481 * sufficient. But since I reuse the memory...
1482 */
1483 if (mutex_trylock(&kexec_mutex)) {
1484 if (kexec_crash_image) {
1485 struct pt_regs fixed_regs;
1486
1487 crash_setup_regs(&fixed_regs, regs);
1488 crash_save_vmcoreinfo();
1489 machine_crash_shutdown(&fixed_regs);
1490 machine_kexec(kexec_crash_image);
1491 }
1492 mutex_unlock(&kexec_mutex);
1493 }
1494}
1495
1496size_t crash_get_memory_size(void)
1497{
1498 size_t size = 0;
1499 mutex_lock(&kexec_mutex);
1500 if (crashk_res.end != crashk_res.start)
1501 size = resource_size(&crashk_res);
1502 mutex_unlock(&kexec_mutex);
1503 return size;
1504}
1505
1506void __weak crash_free_reserved_phys_range(unsigned long begin,
1507 unsigned long end)
1508{
1509 unsigned long addr;
1510
1511 for (addr = begin; addr < end; addr += PAGE_SIZE)
1512 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
1513}
1514
1515int crash_shrink_memory(unsigned long new_size)
1516{
1517 int ret = 0;
1518 unsigned long start, end;
1519 unsigned long old_size;
1520 struct resource *ram_res;
1521
1522 mutex_lock(&kexec_mutex);
1523
1524 if (kexec_crash_image) {
1525 ret = -ENOENT;
1526 goto unlock;
1527 }
1528 start = crashk_res.start;
1529 end = crashk_res.end;
1530 old_size = (end == 0) ? 0 : end - start + 1;
1531 if (new_size >= old_size) {
1532 ret = (new_size == old_size) ? 0 : -EINVAL;
1533 goto unlock;
1534 }
1535
1536 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
1537 if (!ram_res) {
1538 ret = -ENOMEM;
1539 goto unlock;
1540 }
1541
1542 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1543 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1544
1545 crash_map_reserved_pages();
1546 crash_free_reserved_phys_range(end, crashk_res.end);
1547
1548 if ((start == end) && (crashk_res.parent != NULL))
1549 release_resource(&crashk_res);
1550
1551 ram_res->start = end;
1552 ram_res->end = crashk_res.end;
1553 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
1554 ram_res->name = "System RAM";
1555
1556 crashk_res.end = end - 1;
1557
1558 insert_resource(&iomem_resource, ram_res);
1559 crash_unmap_reserved_pages();
1560
1561unlock:
1562 mutex_unlock(&kexec_mutex);
1563 return ret;
1564}
1565
1566static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1567 size_t data_len)
1568{
1569 struct elf_note note;
1570
1571 note.n_namesz = strlen(name) + 1;
1572 note.n_descsz = data_len;
1573 note.n_type = type;
1574 memcpy(buf, &note, sizeof(note));
1575 buf += (sizeof(note) + 3)/4;
1576 memcpy(buf, name, note.n_namesz);
1577 buf += (note.n_namesz + 3)/4;
1578 memcpy(buf, data, note.n_descsz);
1579 buf += (note.n_descsz + 3)/4;
1580
1581 return buf;
1582}
1583
1584static void final_note(u32 *buf)
1585{
1586 struct elf_note note;
1587
1588 note.n_namesz = 0;
1589 note.n_descsz = 0;
1590 note.n_type = 0;
1591 memcpy(buf, &note, sizeof(note));
1592}
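
append_elf_note() lays each note down as a fixed header followed by the name and the descriptor, each padded to a 4-byte boundary; the (len + 3)/4 arithmetic is that padding expressed in u32 words. Below is a minimal user-space sketch of the same layout rule, using Elf64_Nhdr from <elf.h>; the buffer size and payload are illustrative, not taken from the kernel. final_note() then terminates the sequence with an all-zero header.

#include <elf.h>
#include <stdio.h>
#include <string.h>

/* Round a byte length up to a whole number of u32 words. */
static size_t words(size_t len) { return (len + 3) / 4; }

static unsigned int *append_note(unsigned int *buf, const char *name,
				 unsigned int type, const void *data,
				 size_t data_len)
{
	Elf64_Nhdr hdr = {
		.n_namesz = strlen(name) + 1,
		.n_descsz = data_len,
		.n_type = type,
	};

	memcpy(buf, &hdr, sizeof(hdr));
	buf += words(sizeof(hdr));
	memcpy(buf, name, hdr.n_namesz);
	buf += words(hdr.n_namesz);
	memcpy(buf, data, data_len);
	buf += words(data_len);
	return buf;
}

int main(void)
{
	unsigned int buf[64] = { 0 };
	unsigned int *end;
	char payload[] = "example payload";

	end = append_note(buf, "CORE", 1 /* NT_PRSTATUS */, payload,
			  sizeof(payload));
	printf("note occupies %zu words\n", (size_t)(end - buf));
	return 0;
}
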
1593
1594void crash_save_cpu(struct pt_regs *regs, int cpu)
1595{
1596 struct elf_prstatus prstatus;
1597 u32 *buf;
1598
1599 if ((cpu < 0) || (cpu >= nr_cpu_ids))
1600 return;
1601
1602 /* Using ELF notes here is opportunistic.
1603 * I need a well defined structure format
1604 * for the data I pass, and I need tags
1605 * on the data to indicate what information I have
1606 * squirrelled away. ELF notes happen to provide
1607 * all of that, so there is no need to invent something new.
1608 */
1609 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
1610 if (!buf)
1611 return;
1612 memset(&prstatus, 0, sizeof(prstatus));
1613 prstatus.pr_pid = current->pid;
1614 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1615 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1616 &prstatus, sizeof(prstatus));
1617 final_note(buf);
1618}
1619
1620static int __init crash_notes_memory_init(void)
1621{
1622 /* Allocate memory for saving cpu registers. */
1623 crash_notes = alloc_percpu(note_buf_t);
1624 if (!crash_notes) {
1625 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1626 return -ENOMEM;
1627 }
1628 return 0;
1629}
1630subsys_initcall(crash_notes_memory_init);
1631
1632
1633/*
1634 * parsing the "crashkernel" commandline
1635 *
1636 * this code is intended to be called from architecture specific code
1637 */
1638
1639
1640/*
1641 * This function parses command lines in the format
1642 *
1643 * crashkernel=ramsize-range:size[,...][@offset]
1644 *
1645 * The function returns 0 on success and -EINVAL on failure.
1646 */
1647static int __init parse_crashkernel_mem(char *cmdline,
1648 unsigned long long system_ram,
1649 unsigned long long *crash_size,
1650 unsigned long long *crash_base)
1651{
1652 char *cur = cmdline, *tmp;
1653
1654 /* for each entry of the comma-separated list */
1655 do {
1656 unsigned long long start, end = ULLONG_MAX, size;
1657
1658 /* get the start of the range */
1659 start = memparse(cur, &tmp);
1660 if (cur == tmp) {
1661 pr_warn("crashkernel: Memory value expected\n");
1662 return -EINVAL;
1663 }
1664 cur = tmp;
1665 if (*cur != '-') {
1666 pr_warn("crashkernel: '-' expected\n");
1667 return -EINVAL;
1668 }
1669 cur++;
1670
1671		/* if no ':' is here, then we read the end */
1672 if (*cur != ':') {
1673 end = memparse(cur, &tmp);
1674 if (cur == tmp) {
1675 pr_warn("crashkernel: Memory value expected\n");
1676 return -EINVAL;
1677 }
1678 cur = tmp;
1679 if (end <= start) {
1680 pr_warn("crashkernel: end <= start\n");
1681 return -EINVAL;
1682 }
1683 }
1684
1685 if (*cur != ':') {
1686 pr_warn("crashkernel: ':' expected\n");
1687 return -EINVAL;
1688 }
1689 cur++;
1690
1691 size = memparse(cur, &tmp);
1692 if (cur == tmp) {
1693 pr_warn("Memory value expected\n");
1694 return -EINVAL;
1695 }
1696 cur = tmp;
1697 if (size >= system_ram) {
1698 pr_warn("crashkernel: invalid size\n");
1699 return -EINVAL;
1700 }
1701
1702 /* match ? */
1703 if (system_ram >= start && system_ram < end) {
1704 *crash_size = size;
1705 break;
1706 }
1707 } while (*cur++ == ',');
1708
1709 if (*crash_size > 0) {
1710 while (*cur && *cur != ' ' && *cur != '@')
1711 cur++;
1712 if (*cur == '@') {
1713 cur++;
1714 *crash_base = memparse(cur, &tmp);
1715 if (cur == tmp) {
1716 pr_warn("Memory value expected after '@'\n");
1717 return -EINVAL;
1718 }
1719 }
1720 }
1721
1722 return 0;
1723}
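
The extended syntax lets the reservation scale with installed memory: with a hypothetical crashkernel=512M-2G:64M,2G-:128M, a 1 GiB machine reserves 64 MiB while an 8 GiB machine reserves 128 MiB. A self-contained sketch of the range-matching rule applied above; the table values are illustrative, not taken from any real command line.

#include <stdio.h>

#define MIB (1024ULL * 1024)
#define GIB (1024 * MIB)

struct range_rule {
	unsigned long long start, end;	/* [start, end) of system RAM */
	unsigned long long size;	/* reservation for that range */
};

/* Mirrors the "first matching range wins" rule of parse_crashkernel_mem(). */
static unsigned long long pick_crash_size(unsigned long long system_ram,
					  const struct range_rule *rules,
					  int n)
{
	for (int i = 0; i < n; i++)
		if (system_ram >= rules[i].start && system_ram < rules[i].end)
			return rules[i].size;
	return 0;
}

int main(void)
{
	/* crashkernel=512M-2G:64M,2G-:128M */
	struct range_rule rules[] = {
		{ 512 * MIB, 2 * GIB, 64 * MIB },
		{ 2 * GIB, ~0ULL, 128 * MIB },
	};

	printf("1G RAM -> %llu MiB\n", pick_crash_size(1 * GIB, rules, 2) / MIB);
	printf("8G RAM -> %llu MiB\n", pick_crash_size(8 * GIB, rules, 2) / MIB);
	return 0;
}
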
1724
1725/*
1726 * This function parses "simple" (old) crashkernel command lines like
1727 *
1728 * crashkernel=size[@offset]
1729 *
1730 * It returns 0 on success and -EINVAL on failure.
1731 */
1732static int __init parse_crashkernel_simple(char *cmdline,
1733 unsigned long long *crash_size,
1734 unsigned long long *crash_base)
1735{
1736 char *cur = cmdline;
1737
1738 *crash_size = memparse(cmdline, &cur);
1739 if (cmdline == cur) {
1740 pr_warn("crashkernel: memory value expected\n");
1741 return -EINVAL;
1742 }
1743
1744 if (*cur == '@')
1745 *crash_base = memparse(cur+1, &cur);
1746 else if (*cur != ' ' && *cur != '\0') {
1747 pr_warn("crashkernel: unrecognized char\n");
1748 return -EINVAL;
1749 }
1750
1751 return 0;
1752}
1753
1754#define SUFFIX_HIGH 0
1755#define SUFFIX_LOW 1
1756#define SUFFIX_NULL 2
1757static __initdata char *suffix_tbl[] = {
1758 [SUFFIX_HIGH] = ",high",
1759 [SUFFIX_LOW] = ",low",
1760 [SUFFIX_NULL] = NULL,
1761};
1762
1763/*
1764 * This function parses "suffix" crashkernel command lines like
1765 *
1766 * crashkernel=size,[high|low]
1767 *
1768 * It returns 0 on success and -EINVAL on failure.
1769 */
1770static int __init parse_crashkernel_suffix(char *cmdline,
1771 unsigned long long *crash_size,
1772 const char *suffix)
1773{
1774 char *cur = cmdline;
1775
1776 *crash_size = memparse(cmdline, &cur);
1777 if (cmdline == cur) {
1778 pr_warn("crashkernel: memory value expected\n");
1779 return -EINVAL;
1780 }
1781
1782 /* check with suffix */
1783 if (strncmp(cur, suffix, strlen(suffix))) {
1784 pr_warn("crashkernel: unrecognized char\n");
1785 return -EINVAL;
1786 }
1787 cur += strlen(suffix);
1788 if (*cur != ' ' && *cur != '\0') {
1789 pr_warn("crashkernel: unrecognized char\n");
1790 return -EINVAL;
1791 }
1792
1793 return 0;
1794}
1795
1796static __init char *get_last_crashkernel(char *cmdline,
1797 const char *name,
1798 const char *suffix)
1799{
1800 char *p = cmdline, *ck_cmdline = NULL;
1801
1802 /* find crashkernel and use the last one if there are more */
1803 p = strstr(p, name);
1804 while (p) {
1805 char *end_p = strchr(p, ' ');
1806 char *q;
1807
1808 if (!end_p)
1809 end_p = p + strlen(p);
1810
1811 if (!suffix) {
1812 int i;
1813
1814 /* skip the one with any known suffix */
1815 for (i = 0; suffix_tbl[i]; i++) {
1816 q = end_p - strlen(suffix_tbl[i]);
1817 if (!strncmp(q, suffix_tbl[i],
1818 strlen(suffix_tbl[i])))
1819 goto next;
1820 }
1821 ck_cmdline = p;
1822 } else {
1823 q = end_p - strlen(suffix);
1824 if (!strncmp(q, suffix, strlen(suffix)))
1825 ck_cmdline = p;
1826 }
1827next:
1828 p = strstr(p+1, name);
1829 }
1830
1831 if (!ck_cmdline)
1832 return NULL;
1833
1834 return ck_cmdline;
1835}
1836
1837static int __init __parse_crashkernel(char *cmdline,
1838 unsigned long long system_ram,
1839 unsigned long long *crash_size,
1840 unsigned long long *crash_base,
1841 const char *name,
1842 const char *suffix)
1843{
1844 char *first_colon, *first_space;
1845 char *ck_cmdline;
1846
1847 BUG_ON(!crash_size || !crash_base);
1848 *crash_size = 0;
1849 *crash_base = 0;
1850
1851 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1852
1853 if (!ck_cmdline)
1854 return -EINVAL;
1855
1856 ck_cmdline += strlen(name);
1857
1858 if (suffix)
1859 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1860 suffix);
1861 /*
1862 * if the commandline contains a ':', then that's the extended
1863 * syntax -- if not, it must be the classic syntax
1864 */
1865 first_colon = strchr(ck_cmdline, ':');
1866 first_space = strchr(ck_cmdline, ' ');
1867 if (first_colon && (!first_space || first_colon < first_space))
1868 return parse_crashkernel_mem(ck_cmdline, system_ram,
1869 crash_size, crash_base);
1870
1871 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1872}
1873
1874/*
1875 * This function is the entry point for command line parsing and should be
1876 * called from the arch-specific code.
1877 */
1878int __init parse_crashkernel(char *cmdline,
1879 unsigned long long system_ram,
1880 unsigned long long *crash_size,
1881 unsigned long long *crash_base)
1882{
1883 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1884 "crashkernel=", NULL);
1885}
1886
1887int __init parse_crashkernel_high(char *cmdline,
1888 unsigned long long system_ram,
1889 unsigned long long *crash_size,
1890 unsigned long long *crash_base)
1891{
1892 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1893 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1894}
1895
1896int __init parse_crashkernel_low(char *cmdline,
1897 unsigned long long system_ram,
1898 unsigned long long *crash_size,
1899 unsigned long long *crash_base)
1900{
1901 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1902 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1903}
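
For orientation, a hedged sketch of how architecture setup code might consume these entry points: pass the boot command line and the system RAM size, then reserve whatever the parser returns. reserve_region() below is a stand-in, not a real kernel API, and the crash_base handling is simplified.

/* Illustrative only -- reserve_region() is a placeholder for the
 * architecture's own memory reservation mechanism. */
static void __init example_reserve_crashkernel(char *boot_cmdline,
					       unsigned long long system_ram)
{
	unsigned long long crash_size, crash_base;
	int ret;

	ret = parse_crashkernel(boot_cmdline, system_ram,
				&crash_size, &crash_base);
	if (ret || !crash_size)
		return;		/* nothing requested on the command line */

	/* crash_base == 0 means no fixed offset was requested. */
	reserve_region(crash_base, crash_size);
}
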
1904
1905static void update_vmcoreinfo_note(void)
1906{
1907 u32 *buf = vmcoreinfo_note;
1908
1909 if (!vmcoreinfo_size)
1910 return;
1911 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1912 vmcoreinfo_size);
1913 final_note(buf);
1914}
1915
1916void crash_save_vmcoreinfo(void)
1917{
1918 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1919 update_vmcoreinfo_note();
1920}
1921
1922void vmcoreinfo_append_str(const char *fmt, ...)
1923{
1924 va_list args;
1925 char buf[0x50];
1926 size_t r;
1927
1928 va_start(args, fmt);
1929 r = vscnprintf(buf, sizeof(buf), fmt, args);
1930 va_end(args);
1931
1932 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1933
1934 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1935
1936 vmcoreinfo_size += r;
1937}
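
vmcoreinfo_append_str() formats into a small stack buffer and then clamps the copy length against the space left in vmcoreinfo_data, so later appends are silently truncated rather than overflowing. A user-space sketch of the same clamping pattern; buffer sizes and names are illustrative, and where the kernel's vscnprintf already returns the truncated length, the sketch has to clamp by hand.

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static char info_data[128];
static size_t info_size;

static void info_append(const char *fmt, ...)
{
	char buf[0x50];
	va_list args;
	size_t r, room;

	va_start(args, fmt);
	r = vsnprintf(buf, sizeof(buf), fmt, args);
	va_end(args);

	if (r >= sizeof(buf))		/* vsnprintf reports the untruncated length */
		r = sizeof(buf) - 1;

	room = sizeof(info_data) - info_size;
	if (r > room)			/* clamp, as the kernel helper does */
		r = room;

	memcpy(info_data + info_size, buf, r);
	info_size += r;
}

int main(void)
{
	info_append("CRASHTIME=%ld\n", 1444666167L);
	info_append("OSRELEASE=%s\n", "4.3.0-rc4");
	printf("%zu bytes:\n%.*s", info_size, (int)info_size, info_data);
	return 0;
}
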
1938
1939/*
1940 * provide an empty default implementation here -- architecture
1941 * code may override this
1942 */
1943void __weak arch_crash_save_vmcoreinfo(void)
1944{}
1945
1946unsigned long __weak paddr_vmcoreinfo_note(void)
1947{
1948 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1949}
1950
1951static int __init crash_save_vmcoreinfo_init(void)
1952{
1953 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1954 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1955
1956 VMCOREINFO_SYMBOL(init_uts_ns);
1957 VMCOREINFO_SYMBOL(node_online_map);
1958#ifdef CONFIG_MMU
1959 VMCOREINFO_SYMBOL(swapper_pg_dir);
1960#endif
1961 VMCOREINFO_SYMBOL(_stext);
1962 VMCOREINFO_SYMBOL(vmap_area_list);
1963
1964#ifndef CONFIG_NEED_MULTIPLE_NODES
1965 VMCOREINFO_SYMBOL(mem_map);
1966 VMCOREINFO_SYMBOL(contig_page_data);
1967#endif
1968#ifdef CONFIG_SPARSEMEM
1969 VMCOREINFO_SYMBOL(mem_section);
1970 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1971 VMCOREINFO_STRUCT_SIZE(mem_section);
1972 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1973#endif
1974 VMCOREINFO_STRUCT_SIZE(page);
1975 VMCOREINFO_STRUCT_SIZE(pglist_data);
1976 VMCOREINFO_STRUCT_SIZE(zone);
1977 VMCOREINFO_STRUCT_SIZE(free_area);
1978 VMCOREINFO_STRUCT_SIZE(list_head);
1979 VMCOREINFO_SIZE(nodemask_t);
1980 VMCOREINFO_OFFSET(page, flags);
1981 VMCOREINFO_OFFSET(page, _count);
1982 VMCOREINFO_OFFSET(page, mapping);
1983 VMCOREINFO_OFFSET(page, lru);
1984 VMCOREINFO_OFFSET(page, _mapcount);
1985 VMCOREINFO_OFFSET(page, private);
1986 VMCOREINFO_OFFSET(pglist_data, node_zones);
1987 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1988#ifdef CONFIG_FLAT_NODE_MEM_MAP
1989 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1990#endif
1991 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1992 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1993 VMCOREINFO_OFFSET(pglist_data, node_id);
1994 VMCOREINFO_OFFSET(zone, free_area);
1995 VMCOREINFO_OFFSET(zone, vm_stat);
1996 VMCOREINFO_OFFSET(zone, spanned_pages);
1997 VMCOREINFO_OFFSET(free_area, free_list);
1998 VMCOREINFO_OFFSET(list_head, next);
1999 VMCOREINFO_OFFSET(list_head, prev);
2000 VMCOREINFO_OFFSET(vmap_area, va_start);
2001 VMCOREINFO_OFFSET(vmap_area, list);
2002 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
2003 log_buf_kexec_setup();
2004 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
2005 VMCOREINFO_NUMBER(NR_FREE_PAGES);
2006 VMCOREINFO_NUMBER(PG_lru);
2007 VMCOREINFO_NUMBER(PG_private);
2008 VMCOREINFO_NUMBER(PG_swapcache);
2009 VMCOREINFO_NUMBER(PG_slab);
2010#ifdef CONFIG_MEMORY_FAILURE
2011 VMCOREINFO_NUMBER(PG_hwpoison);
2012#endif
2013 VMCOREINFO_NUMBER(PG_head_mask);
2014 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
2015#ifdef CONFIG_HUGETLBFS
2016 VMCOREINFO_SYMBOL(free_huge_page);
2017#endif
2018
2019 arch_crash_save_vmcoreinfo();
2020 update_vmcoreinfo_note();
2021
2022 return 0;
2023}
2024
2025subsys_initcall(crash_save_vmcoreinfo_init);
2026
2027#ifdef CONFIG_KEXEC_FILE
2028static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2029 struct kexec_buf *kbuf)
2030{
2031 struct kimage *image = kbuf->image;
2032 unsigned long temp_start, temp_end;
2033
2034 temp_end = min(end, kbuf->buf_max);
2035 temp_start = temp_end - kbuf->memsz;
2036
2037 do {
2038 /* align down start */
2039 temp_start = temp_start & (~(kbuf->buf_align - 1));
2040
2041 if (temp_start < start || temp_start < kbuf->buf_min)
2042 return 0;
2043
2044 temp_end = temp_start + kbuf->memsz - 1;
2045
2046 /*
2047		 * Make sure this does not conflict with any existing
2048 * segments
2049 */
2050 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2051 temp_start = temp_start - PAGE_SIZE;
2052 continue;
2053 }
2054
2055 /* We found a suitable memory range */
2056 break;
2057 } while (1);
2058
2059 /* If we are here, we found a suitable memory range */
2060 kbuf->mem = temp_start;
2061
2062 /* Success, stop navigating through remaining System RAM ranges */
2063 return 1;
2064}
2065
2066static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2067 struct kexec_buf *kbuf)
2068{
2069 struct kimage *image = kbuf->image;
2070 unsigned long temp_start, temp_end;
2071
2072 temp_start = max(start, kbuf->buf_min);
2073
2074 do {
2075 temp_start = ALIGN(temp_start, kbuf->buf_align);
2076 temp_end = temp_start + kbuf->memsz - 1;
2077
2078 if (temp_end > end || temp_end > kbuf->buf_max)
2079 return 0;
2080 /*
2081		 * Make sure this does not conflict with any existing
2082 * segments
2083 */
2084 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2085 temp_start = temp_start + PAGE_SIZE;
2086 continue;
2087 }
2088
2089 /* We found a suitable memory range */
2090 break;
2091 } while (1);
2092
2093 /* If we are here, we found a suitable memory range */
2094 kbuf->mem = temp_start;
2095
2096 /* Success, stop navigating through remaining System RAM ranges */
2097 return 1;
2098}
2099
2100static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2101{
2102 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2103 unsigned long sz = end - start + 1;
2104
2105	/* Returning 0 will take us to the next memory range */
2106 if (sz < kbuf->memsz)
2107 return 0;
2108
2109 if (end < kbuf->buf_min || start > kbuf->buf_max)
2110 return 0;
2111
2112 /*
2113	 * Allocate memory top down within the RAM range; otherwise
2114	 * allocate bottom up.
2115 */
2116 if (kbuf->top_down)
2117 return locate_mem_hole_top_down(start, end, kbuf);
2118 return locate_mem_hole_bottom_up(start, end, kbuf);
2119}
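
Both hole walkers reduce to two alignment idioms: the top-down path aligns a candidate start downwards with a power-of-two mask, the bottom-up path aligns upwards with ALIGN(), and each retries a page further on a conflict. A small sketch of just those two steps, assuming the alignment is a power of two; the addresses are illustrative.

#include <stdio.h>

/* Align down / up to a power-of-two boundary, as the hole walkers do. */
static unsigned long align_down(unsigned long x, unsigned long a)
{
	return x & ~(a - 1);
}

static unsigned long align_up(unsigned long x, unsigned long a)
{
	return (x + a - 1) & ~(a - 1);
}

int main(void)
{
	unsigned long align = 0x200000;	/* e.g. a 2 MiB buf_align */

	/* Top-down: candidate end minus memsz, then aligned down. */
	printf("top-down start:  %#lx\n", align_down(0x7fe54321, align));

	/* Bottom-up: candidate start aligned up. */
	printf("bottom-up start: %#lx\n", align_up(0x10000123, align));
	return 0;
}
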
2120
2121/*
2122 * Helper function for placing a buffer in a kexec segment. This assumes
2123 * that kexec_mutex is held.
2124 */
2125int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2126 unsigned long memsz, unsigned long buf_align,
2127 unsigned long buf_min, unsigned long buf_max,
2128 bool top_down, unsigned long *load_addr)
2129{
2130
2131 struct kexec_segment *ksegment;
2132 struct kexec_buf buf, *kbuf;
2133 int ret;
2134
2135 /* Currently adding segment this way is allowed only in file mode */
2136 if (!image->file_mode)
2137 return -EINVAL;
2138
2139 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2140 return -EINVAL;
2141
2142 /*
2143	 * Make sure we are not trying to add a buffer after allocating
2144	 * control pages. All segments need to be placed before any
2145	 * control pages are allocated, as the control page allocation
2146	 * logic goes through the list of segments to make sure there
2147	 * are no destination overlaps.
2148 */
2149 if (!list_empty(&image->control_pages)) {
2150 WARN_ON(1);
2151 return -EINVAL;
2152 }
2153
2154 memset(&buf, 0, sizeof(struct kexec_buf));
2155 kbuf = &buf;
2156 kbuf->image = image;
2157 kbuf->buffer = buffer;
2158 kbuf->bufsz = bufsz;
2159
2160 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2161 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2162 kbuf->buf_min = buf_min;
2163 kbuf->buf_max = buf_max;
2164 kbuf->top_down = top_down;
2165
2166 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2167 if (image->type == KEXEC_TYPE_CRASH)
2168 ret = walk_iomem_res("Crash kernel",
2169 IORESOURCE_MEM | IORESOURCE_BUSY,
2170 crashk_res.start, crashk_res.end, kbuf,
2171 locate_mem_hole_callback);
2172 else
2173 ret = walk_system_ram_res(0, -1, kbuf,
2174 locate_mem_hole_callback);
2175 if (ret != 1) {
2176 /* A suitable memory range could not be found for buffer */
2177 return -EADDRNOTAVAIL;
2178 }
2179
2180 /* Found a suitable memory range */
2181 ksegment = &image->segment[image->nr_segments];
2182 ksegment->kbuf = kbuf->buffer;
2183 ksegment->bufsz = kbuf->bufsz;
2184 ksegment->mem = kbuf->mem;
2185 ksegment->memsz = kbuf->memsz;
2186 image->nr_segments++;
2187 *load_addr = ksegment->mem;
2188 return 0;
2189}
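
A hedged sketch of how a kexec_file image loader might call kexec_add_buffer() to place a kernel blob: the buffer lives in kernel memory and the helper returns the chosen physical load address. The 4 GiB limit and the function name here are illustrative, not taken from any particular architecture loader.

/* Illustrative only: place kernel_buf somewhere below 4 GiB,
 * page aligned, searching top-down. */
static int example_place_kernel(struct kimage *image,
				char *kernel_buf, unsigned long kernel_len)
{
	unsigned long kernel_load_addr;
	int ret;

	ret = kexec_add_buffer(image, kernel_buf, kernel_len, kernel_len,
			       PAGE_SIZE,		/* buf_align */
			       0,			/* buf_min */
			       0xffffffffUL,		/* buf_max (example) */
			       true,			/* top_down */
			       &kernel_load_addr);
	if (ret)
		return ret;

	pr_debug("kernel loaded at 0x%lx\n", kernel_load_addr);
	return 0;
}
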
2190
2191/* Calculate and store the digest of segments */
2192static int kexec_calculate_store_digests(struct kimage *image)
2193{
2194 struct crypto_shash *tfm;
2195 struct shash_desc *desc;
2196 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2197 size_t desc_size, nullsz;
2198 char *digest;
2199 void *zero_buf;
2200 struct kexec_sha_region *sha_regions;
2201 struct purgatory_info *pi = &image->purgatory_info;
2202
2203 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2204 zero_buf_sz = PAGE_SIZE;
2205
2206 tfm = crypto_alloc_shash("sha256", 0, 0);
2207 if (IS_ERR(tfm)) {
2208 ret = PTR_ERR(tfm);
2209 goto out;
2210 }
2211
2212 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2213 desc = kzalloc(desc_size, GFP_KERNEL);
2214 if (!desc) {
2215 ret = -ENOMEM;
2216 goto out_free_tfm;
2217 }
2218
2219 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2220 sha_regions = vzalloc(sha_region_sz);
2221 if (!sha_regions)
2222 goto out_free_desc;
2223
2224 desc->tfm = tfm;
2225 desc->flags = 0;
2226
2227 ret = crypto_shash_init(desc);
2228 if (ret < 0)
2229 goto out_free_sha_regions;
2230
2231 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2232 if (!digest) {
2233 ret = -ENOMEM;
2234 goto out_free_sha_regions;
2235 }
2236
2237 for (j = i = 0; i < image->nr_segments; i++) {
2238 struct kexec_segment *ksegment;
2239
2240 ksegment = &image->segment[i];
2241 /*
2242 * Skip purgatory as it will be modified once we put digest
2243 * info in purgatory.
2244 */
2245 if (ksegment->kbuf == pi->purgatory_buf)
2246 continue;
2247
2248 ret = crypto_shash_update(desc, ksegment->kbuf,
2249 ksegment->bufsz);
2250 if (ret)
2251 break;
2252
2253 /*
2254 * Assume rest of the buffer is filled with zero and
2255 * update digest accordingly.
2256 */
2257 nullsz = ksegment->memsz - ksegment->bufsz;
2258 while (nullsz) {
2259 unsigned long bytes = nullsz;
2260
2261 if (bytes > zero_buf_sz)
2262 bytes = zero_buf_sz;
2263 ret = crypto_shash_update(desc, zero_buf, bytes);
2264 if (ret)
2265 break;
2266 nullsz -= bytes;
2267 }
2268
2269 if (ret)
2270 break;
2271
2272 sha_regions[j].start = ksegment->mem;
2273 sha_regions[j].len = ksegment->memsz;
2274 j++;
2275 }
2276
2277 if (!ret) {
2278 ret = crypto_shash_final(desc, digest);
2279 if (ret)
2280 goto out_free_digest;
2281 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2282 sha_regions, sha_region_sz, 0);
2283 if (ret)
2284 goto out_free_digest;
2285
2286 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2287 digest, SHA256_DIGEST_SIZE, 0);
2288 if (ret)
2289 goto out_free_digest;
2290 }
2291
2292out_free_digest:
2293 kfree(digest);
2294out_free_sha_regions:
2295 vfree(sha_regions);
2296out_free_desc:
2297 kfree(desc);
2298out_free_tfm:
2299 kfree(tfm);
2300out:
2301 return ret;
2302}
2303
2304/* Actually load purgatory. Lot of code taken from kexec-tools */
2305static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2306 unsigned long max, int top_down)
2307{
2308 struct purgatory_info *pi = &image->purgatory_info;
2309 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2310 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2311 unsigned char *buf_addr, *src;
2312 int i, ret = 0, entry_sidx = -1;
2313 const Elf_Shdr *sechdrs_c;
2314 Elf_Shdr *sechdrs = NULL;
2315 void *purgatory_buf = NULL;
2316
2317 /*
2318	 * sechdrs_c points to the section headers in purgatory and is
2319	 * read only. No modifications allowed.
2320 */
2321 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2322
2323 /*
2324 * We can not modify sechdrs_c[] and its fields. It is read only.
2325 * Copy it over to a local copy where one can store some temporary
2326 * data and free it at the end. We need to modify ->sh_addr and
2327 * ->sh_offset fields to keep track of permanent and temporary
2328 * locations of sections.
2329 */
2330 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2331 if (!sechdrs)
2332 return -ENOMEM;
2333
2334 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2335
2336 /*
2337	 * There are multiple copies of sections. The first copy is the one
2338	 * embedded in the kernel in a read only section. Some of these
2339	 * sections will be copied to a temporary buffer and relocated, and
2340	 * those sections will finally be copied to their final destination
2341	 * at segment load time.
2342 *
2343 * Use ->sh_offset to reflect section address in memory. It will
2344 * point to original read only copy if section is not allocatable.
2345 * Otherwise it will point to temporary copy which will be relocated.
2346 *
2347 * Use ->sh_addr to contain final address of the section where it
2348 * will go during execution time.
2349 */
2350 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2351 if (sechdrs[i].sh_type == SHT_NOBITS)
2352 continue;
2353
2354 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2355 sechdrs[i].sh_offset;
2356 }
2357
2358 /*
2359 * Identify entry point section and make entry relative to section
2360 * start.
2361 */
2362 entry = pi->ehdr->e_entry;
2363 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2364 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2365 continue;
2366
2367 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2368 continue;
2369
2370 /* Make entry section relative */
2371 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2372 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2373 pi->ehdr->e_entry)) {
2374 entry_sidx = i;
2375 entry -= sechdrs[i].sh_addr;
2376 break;
2377 }
2378 }
2379
2380 /* Determine how much memory is needed to load relocatable object. */
2381 buf_align = 1;
2382 bss_align = 1;
2383 buf_sz = 0;
2384 bss_sz = 0;
2385
2386 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2387 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2388 continue;
2389
2390 align = sechdrs[i].sh_addralign;
2391 if (sechdrs[i].sh_type != SHT_NOBITS) {
2392 if (buf_align < align)
2393 buf_align = align;
2394 buf_sz = ALIGN(buf_sz, align);
2395 buf_sz += sechdrs[i].sh_size;
2396 } else {
2397 /* bss section */
2398 if (bss_align < align)
2399 bss_align = align;
2400 bss_sz = ALIGN(bss_sz, align);
2401 bss_sz += sechdrs[i].sh_size;
2402 }
2403 }
2404
2405 /* Determine the bss padding required to align bss properly */
2406 bss_pad = 0;
2407 if (buf_sz & (bss_align - 1))
2408 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2409
2410 memsz = buf_sz + bss_pad + bss_sz;
2411
2412 /* Allocate buffer for purgatory */
2413 purgatory_buf = vzalloc(buf_sz);
2414 if (!purgatory_buf) {
2415 ret = -ENOMEM;
2416 goto out;
2417 }
2418
2419 if (buf_align < bss_align)
2420 buf_align = bss_align;
2421
2422 /* Add buffer to segment list */
2423 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2424 buf_align, min, max, top_down,
2425 &pi->purgatory_load_addr);
2426 if (ret)
2427 goto out;
2428
2429 /* Load SHF_ALLOC sections */
2430 buf_addr = purgatory_buf;
2431 load_addr = curr_load_addr = pi->purgatory_load_addr;
2432 bss_addr = load_addr + buf_sz + bss_pad;
2433
2434 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2435 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2436 continue;
2437
2438 align = sechdrs[i].sh_addralign;
2439 if (sechdrs[i].sh_type != SHT_NOBITS) {
2440 curr_load_addr = ALIGN(curr_load_addr, align);
2441 offset = curr_load_addr - load_addr;
2442			/* We already modified ->sh_offset to keep src addr */
2443 src = (char *) sechdrs[i].sh_offset;
2444 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2445
2446 /* Store load address and source address of section */
2447 sechdrs[i].sh_addr = curr_load_addr;
2448
2449 /*
2450 * This section got copied to temporary buffer. Update
2451 * ->sh_offset accordingly.
2452 */
2453 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2454
2455 /* Advance to the next address */
2456 curr_load_addr += sechdrs[i].sh_size;
2457 } else {
2458 bss_addr = ALIGN(bss_addr, align);
2459 sechdrs[i].sh_addr = bss_addr;
2460 bss_addr += sechdrs[i].sh_size;
2461 }
2462 }
2463
2464 /* Update entry point based on load address of text section */
2465 if (entry_sidx >= 0)
2466 entry += sechdrs[entry_sidx].sh_addr;
2467
2468 /* Make kernel jump to purgatory after shutdown */
2469 image->start = entry;
2470
2471 /* Used later to get/set symbol values */
2472 pi->sechdrs = sechdrs;
2473
2474 /*
2475 * Used later to identify which section is purgatory and skip it
2476 * from checksumming.
2477 */
2478 pi->purgatory_buf = purgatory_buf;
2479 return ret;
2480out:
2481 vfree(sechdrs);
2482 vfree(purgatory_buf);
2483 return ret;
2484}
2485
2486static int kexec_apply_relocations(struct kimage *image)
2487{
2488 int i, ret;
2489 struct purgatory_info *pi = &image->purgatory_info;
2490 Elf_Shdr *sechdrs = pi->sechdrs;
2491
2492 /* Apply relocations */
2493 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2494 Elf_Shdr *section, *symtab;
2495
2496 if (sechdrs[i].sh_type != SHT_RELA &&
2497 sechdrs[i].sh_type != SHT_REL)
2498 continue;
2499
2500 /*
2501 * For section of type SHT_RELA/SHT_REL,
2502		 * ->sh_link contains the section header index of the
2503		 * associated symbol table, and ->sh_info contains the section
2504		 * header index of the section to which the relocations apply.
2505 */
2506 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2507 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2508 return -ENOEXEC;
2509
2510 section = &sechdrs[sechdrs[i].sh_info];
2511 symtab = &sechdrs[sechdrs[i].sh_link];
2512
2513 if (!(section->sh_flags & SHF_ALLOC))
2514 continue;
2515
2516		 * symtab->sh_link contains the section header index of the
2517		 * associated string table.
2518 * string table.
2519 */
2520 if (symtab->sh_link >= pi->ehdr->e_shnum)
2521 /* Invalid section number? */
2522 continue;
2523
2524 /*
2525		 * The respective architecture needs to provide support for applying
2526 * relocations of type SHT_RELA/SHT_REL.
2527 */
2528 if (sechdrs[i].sh_type == SHT_RELA)
2529 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2530 sechdrs, i);
2531 else if (sechdrs[i].sh_type == SHT_REL)
2532 ret = arch_kexec_apply_relocations(pi->ehdr,
2533 sechdrs, i);
2534 if (ret)
2535 return ret;
2536 }
2537
2538 return 0;
2539}
2540
2541/* Load relocatable purgatory object and relocate it appropriately */
2542int kexec_load_purgatory(struct kimage *image, unsigned long min,
2543 unsigned long max, int top_down,
2544 unsigned long *load_addr)
2545{
2546 struct purgatory_info *pi = &image->purgatory_info;
2547 int ret;
2548
2549 if (kexec_purgatory_size <= 0)
2550 return -EINVAL;
2551
2552 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2553 return -ENOEXEC;
2554
2555 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2556
2557 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2558 || pi->ehdr->e_type != ET_REL
2559 || !elf_check_arch(pi->ehdr)
2560 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2561 return -ENOEXEC;
2562
2563 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2564 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2565 kexec_purgatory_size - pi->ehdr->e_shoff))
2566 return -ENOEXEC;
2567
2568 ret = __kexec_load_purgatory(image, min, max, top_down);
2569 if (ret)
2570 return ret;
2571
2572 ret = kexec_apply_relocations(image);
2573 if (ret)
2574 goto out;
2575
2576 *load_addr = pi->purgatory_load_addr;
2577 return 0;
2578out:
2579 vfree(pi->sechdrs);
2580 vfree(pi->purgatory_buf);
2581 return ret;
2582}
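
A hedged sketch of the usual sequence around purgatory: load the relocatable object into an allowed window, then patch one of its global symbols with kexec_purgatory_get_set_symbol() before handing control over. The window limits and the symbol name are illustrative.

/* Illustrative only: load purgatory below 4 GiB and patch one symbol. */
static int example_setup_purgatory(struct kimage *image,
				   unsigned long entry_addr)
{
	unsigned long purgatory_load_addr;
	int ret;

	ret = kexec_load_purgatory(image, 0, 0xffffffffUL, /* min, max */
				   true /* top_down */, &purgatory_load_addr);
	if (ret)
		return ret;

	/* "entry64_addr" is a hypothetical global defined by purgatory. */
	return kexec_purgatory_get_set_symbol(image, "entry64_addr",
					      &entry_addr, sizeof(entry_addr),
					      false /* set, don't get */);
}
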
2583
2584static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2585 const char *name)
2586{
2587 Elf_Sym *syms;
2588 Elf_Shdr *sechdrs;
2589 Elf_Ehdr *ehdr;
2590 int i, k;
2591 const char *strtab;
2592
2593 if (!pi->sechdrs || !pi->ehdr)
2594 return NULL;
2595
2596 sechdrs = pi->sechdrs;
2597 ehdr = pi->ehdr;
2598
2599 for (i = 0; i < ehdr->e_shnum; i++) {
2600 if (sechdrs[i].sh_type != SHT_SYMTAB)
2601 continue;
2602
2603 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2604 /* Invalid strtab section number */
2605 continue;
2606 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2607 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2608
2609 /* Go through symbols for a match */
2610 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2611 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2612 continue;
2613
2614 if (strcmp(strtab + syms[k].st_name, name) != 0)
2615 continue;
2616
2617 if (syms[k].st_shndx == SHN_UNDEF ||
2618 syms[k].st_shndx >= ehdr->e_shnum) {
2619 pr_debug("Symbol: %s has bad section index %d.\n",
2620 name, syms[k].st_shndx);
2621 return NULL;
2622 }
2623
2624 /* Found the symbol we are looking for */
2625 return &syms[k];
2626 }
2627 }
2628
2629 return NULL;
2630}
2631
2632void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2633{
2634 struct purgatory_info *pi = &image->purgatory_info;
2635 Elf_Sym *sym;
2636 Elf_Shdr *sechdr;
2637
2638 sym = kexec_purgatory_find_symbol(pi, name);
2639 if (!sym)
2640 return ERR_PTR(-EINVAL);
2641
2642 sechdr = &pi->sechdrs[sym->st_shndx];
2643
2644 /*
2645 * Returns the address where symbol will finally be loaded after
2646 * kexec_load_segment()
2647 */
2648 return (void *)(sechdr->sh_addr + sym->st_value);
2649}
2650
2651/*
2652 * Get or set the value of a symbol. If "get_value" is true, the symbol value
2653 * is returned in buf; otherwise the symbol value is set from the value in buf.
2654 */
2655int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2656 void *buf, unsigned int size, bool get_value)
2657{
2658 Elf_Sym *sym;
2659 Elf_Shdr *sechdrs;
2660 struct purgatory_info *pi = &image->purgatory_info;
2661 char *sym_buf;
2662
2663 sym = kexec_purgatory_find_symbol(pi, name);
2664 if (!sym)
2665 return -EINVAL;
2666
2667 if (sym->st_size != size) {
2668 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2669 name, (unsigned long)sym->st_size, size);
2670 return -EINVAL;
2671 }
2672
2673 sechdrs = pi->sechdrs;
2674
2675 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2676 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2677 get_value ? "get" : "set");
2678 return -EINVAL;
2679 }
2680
2681 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2682 sym->st_value;
2683
2684 if (get_value)
2685 memcpy((void *)buf, sym_buf, size);
2686 else
2687 memcpy((void *)sym_buf, buf, size);
2688
2689 return 0;
2690}
2691#endif /* CONFIG_KEXEC_FILE */
2692
2693/*
2694 * Move into place and start executing a preloaded standalone
2695 * executable. If nothing was preloaded return an error.
2696 */
2697int kernel_kexec(void)
2698{
2699 int error = 0;
2700
2701 if (!mutex_trylock(&kexec_mutex))
2702 return -EBUSY;
2703 if (!kexec_image) {
2704 error = -EINVAL;
2705 goto Unlock;
2706 }
2707
2708#ifdef CONFIG_KEXEC_JUMP
2709 if (kexec_image->preserve_context) {
2710 lock_system_sleep();
2711 pm_prepare_console();
2712 error = freeze_processes();
2713 if (error) {
2714 error = -EBUSY;
2715 goto Restore_console;
2716 }
2717 suspend_console();
2718 error = dpm_suspend_start(PMSG_FREEZE);
2719 if (error)
2720 goto Resume_console;
2721 /* At this point, dpm_suspend_start() has been called,
2722 * but *not* dpm_suspend_end(). We *must* call
2723 * dpm_suspend_end() now. Otherwise, drivers for
2724 * some devices (e.g. interrupt controllers) become
2725 * desynchronized with the actual state of the
2726 * hardware at resume time, and evil weirdness ensues.
2727 */
2728 error = dpm_suspend_end(PMSG_FREEZE);
2729 if (error)
2730 goto Resume_devices;
2731 error = disable_nonboot_cpus();
2732 if (error)
2733 goto Enable_cpus;
2734 local_irq_disable();
2735 error = syscore_suspend();
2736 if (error)
2737 goto Enable_irqs;
2738 } else
2739#endif
2740 {
2741 kexec_in_progress = true;
2742 kernel_restart_prepare(NULL);
2743 migrate_to_reboot_cpu();
2744
2745 /*
2746 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
2747 * no further code needs to use CPU hotplug (which is true in
2748 * the reboot case). However, the kexec path depends on using
2749 * CPU hotplug again; so re-enable it here.
2750 */
2751 cpu_hotplug_enable();
2752 pr_emerg("Starting new kernel\n");
2753 machine_shutdown();
2754 }
2755
2756 machine_kexec(kexec_image);
2757
2758#ifdef CONFIG_KEXEC_JUMP
2759 if (kexec_image->preserve_context) {
2760 syscore_resume();
2761 Enable_irqs:
2762 local_irq_enable();
2763 Enable_cpus:
2764 enable_nonboot_cpus();
2765 dpm_resume_start(PMSG_RESTORE);
2766 Resume_devices:
2767 dpm_resume_end(PMSG_RESTORE);
2768 Resume_console:
2769 resume_console();
2770 thaw_processes();
2771 Restore_console:
2772 pm_restore_console();
2773 unlock_system_sleep();
2774 }
2775#endif
2776
2777 Unlock:
2778 mutex_unlock(&kexec_mutex);
2779 return error;
2780}
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
new file mode 100644
index 000000000000..201b45327804
--- /dev/null
+++ b/kernel/kexec_core.c
@@ -0,0 +1,1534 @@
1/*
2 * kexec.c - kexec system call core code.
3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com>
4 *
5 * This source code is licensed under the GNU General Public License,
6 * Version 2. See the file COPYING for more details.
7 */
8
9#define pr_fmt(fmt) "kexec: " fmt
10
11#include <linux/capability.h>
12#include <linux/mm.h>
13#include <linux/file.h>
14#include <linux/slab.h>
15#include <linux/fs.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <linux/highmem.h>
20#include <linux/syscalls.h>
21#include <linux/reboot.h>
22#include <linux/ioport.h>
23#include <linux/hardirq.h>
24#include <linux/elf.h>
25#include <linux/elfcore.h>
26#include <linux/utsname.h>
27#include <linux/numa.h>
28#include <linux/suspend.h>
29#include <linux/device.h>
30#include <linux/freezer.h>
31#include <linux/pm.h>
32#include <linux/cpu.h>
33#include <linux/uaccess.h>
34#include <linux/io.h>
35#include <linux/console.h>
36#include <linux/vmalloc.h>
37#include <linux/swap.h>
38#include <linux/syscore_ops.h>
39#include <linux/compiler.h>
40#include <linux/hugetlb.h>
41
42#include <asm/page.h>
43#include <asm/sections.h>
44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47#include "kexec_internal.h"
48
49DEFINE_MUTEX(kexec_mutex);
50
51/* Per cpu memory for storing cpu states in case of system crash. */
52note_buf_t __percpu *crash_notes;
53
54/* vmcoreinfo stuff */
55static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
56u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
57size_t vmcoreinfo_size;
58size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
59
60/* Flag to indicate we are going to kexec a new kernel */
61bool kexec_in_progress = false;
62
63
64/* Location of the reserved area for the crash kernel */
65struct resource crashk_res = {
66 .name = "Crash kernel",
67 .start = 0,
68 .end = 0,
69 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
70};
71struct resource crashk_low_res = {
72 .name = "Crash kernel",
73 .start = 0,
74 .end = 0,
75 .flags = IORESOURCE_BUSY | IORESOURCE_MEM
76};
77
78int kexec_should_crash(struct task_struct *p)
79{
80 /*
81 * If crash_kexec_post_notifiers is enabled, don't run
82 * crash_kexec() here yet, which must be run after panic
83 * notifiers in panic().
84 */
85 if (crash_kexec_post_notifiers)
86 return 0;
87 /*
88 * There are 4 panic() calls in do_exit() path, each of which
89 * corresponds to each of these 4 conditions.
90 */
91 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
92 return 1;
93 return 0;
94}
95
96/*
97 * When kexec transitions to the new kernel there is a one-to-one
98 * mapping between physical and virtual addresses. On processors
99 * where you can disable the MMU this is trivial, and easy. For
100 * others it is still a simple predictable page table to setup.
101 *
102 * In that environment kexec copies the new kernel to its final
103 * resting place. This means I can only support memory whose
104 * physical address can fit in an unsigned long. In particular
105 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
106 * If the assembly stub has more restrictive requirements
107 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
108 * defined more restrictively in <asm/kexec.h>.
109 *
110 * The code for the transition from the current kernel to the
111 * new kernel is placed in the control_code_buffer, whose size
112 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
113 * page of memory is necessary, but some architectures require more.
114 * Because this memory must be identity mapped in the transition from
115 * virtual to physical addresses it must live in the range
116 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
117 * modifiable.
118 *
119 * The assembly stub in the control code buffer is passed a linked list
120 * of descriptor pages detailing the source pages of the new kernel,
121 * and the destination addresses of those source pages. As this data
122 * structure is not used in the context of the current OS, it must
123 * be self-contained.
124 *
125 * The code has been made to work with highmem pages and will use a
126 * destination page in its final resting place (if it happens
127 * to allocate it). The end product of this is that most of the
128 * physical address space, and most of RAM can be used.
129 *
130 * Future directions include:
131 * - allocating a page table with the control code buffer identity
132 * mapped, to simplify machine_kexec and make kexec_on_panic more
133 * reliable.
134 */
135
136/*
137 * KIMAGE_NO_DEST is an impossible destination address..., for
138 * allocating pages whose destination address we do not care about.
139 */
140#define KIMAGE_NO_DEST (-1UL)
141
142static struct page *kimage_alloc_page(struct kimage *image,
143 gfp_t gfp_mask,
144 unsigned long dest);
145
146int sanity_check_segment_list(struct kimage *image)
147{
148 int result, i;
149 unsigned long nr_segments = image->nr_segments;
150
151 /*
152 * Verify we have good destination addresses. The caller is
153 * responsible for making certain we don't attempt to load
154 * the new image into invalid or reserved areas of RAM. This
155 * just verifies it is an address we can use.
156 *
157 * Since the kernel does everything in page size chunks ensure
158 * the destination addresses are page aligned. Too many
159	 * special cases crop up when we don't do this. The most
160 * insidious is getting overlapping destination addresses
161 * simply because addresses are changed to page size
162 * granularity.
163 */
164 result = -EADDRNOTAVAIL;
165 for (i = 0; i < nr_segments; i++) {
166 unsigned long mstart, mend;
167
168 mstart = image->segment[i].mem;
169 mend = mstart + image->segment[i].memsz;
170 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
171 return result;
172 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
173 return result;
174 }
175
176 /* Verify our destination addresses do not overlap.
177	 * If we allowed overlapping destination addresses
178	 * through, very weird things can happen with no
179	 * easy explanation as one segment stomps on another.
180 */
181 result = -EINVAL;
182 for (i = 0; i < nr_segments; i++) {
183 unsigned long mstart, mend;
184 unsigned long j;
185
186 mstart = image->segment[i].mem;
187 mend = mstart + image->segment[i].memsz;
188 for (j = 0; j < i; j++) {
189 unsigned long pstart, pend;
190
191 pstart = image->segment[j].mem;
192 pend = pstart + image->segment[j].memsz;
193 /* Do the segments overlap ? */
194 if ((mend > pstart) && (mstart < pend))
195 return result;
196 }
197 }
198
199 /* Ensure our buffer sizes are strictly less than
200 * our memory sizes. This should always be the case,
201 * and it is easier to check up front than to be surprised
202 * later on.
203 */
204 result = -EINVAL;
205 for (i = 0; i < nr_segments; i++) {
206 if (image->segment[i].bufsz > image->segment[i].memsz)
207 return result;
208 }
209
210 /*
211 * Verify we have good destination addresses. Normally
212 * the caller is responsible for making certain we don't
213 * attempt to load the new image into invalid or reserved
214 * areas of RAM. But crash kernels are preloaded into a
215	 * reserved area of RAM. We must ensure the addresses
216	 * are in the reserved area, otherwise preloading the
217 * kernel could corrupt things.
218 */
219
220 if (image->type == KEXEC_TYPE_CRASH) {
221 result = -EADDRNOTAVAIL;
222 for (i = 0; i < nr_segments; i++) {
223 unsigned long mstart, mend;
224
225 mstart = image->segment[i].mem;
226 mend = mstart + image->segment[i].memsz - 1;
227 /* Ensure we are within the crash kernel limits */
228 if ((mstart < crashk_res.start) ||
229 (mend > crashk_res.end))
230 return result;
231 }
232 }
233
234 return 0;
235}
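
The overlap test used in sanity_check_segment_list() and in kimage_is_destination_range() below is the standard interval check: two half-open ranges collide exactly when each one starts before the other ends. A tiny self-contained sketch:

#include <stdio.h>
#include <stdbool.h>

/* Half-open ranges [astart, aend) and [bstart, bend) overlap iff
 * each one starts before the other one ends. */
static bool ranges_overlap(unsigned long astart, unsigned long aend,
			   unsigned long bstart, unsigned long bend)
{
	return aend > bstart && astart < bend;
}

int main(void)
{
	printf("%d\n", ranges_overlap(0x1000, 0x3000, 0x2000, 0x4000)); /* 1 */
	printf("%d\n", ranges_overlap(0x1000, 0x2000, 0x2000, 0x3000)); /* 0 */
	return 0;
}
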
236
237struct kimage *do_kimage_alloc_init(void)
238{
239 struct kimage *image;
240
241 /* Allocate a controlling structure */
242 image = kzalloc(sizeof(*image), GFP_KERNEL);
243 if (!image)
244 return NULL;
245
246 image->head = 0;
247 image->entry = &image->head;
248 image->last_entry = &image->head;
249 image->control_page = ~0; /* By default this does not apply */
250 image->type = KEXEC_TYPE_DEFAULT;
251
252 /* Initialize the list of control pages */
253 INIT_LIST_HEAD(&image->control_pages);
254
255 /* Initialize the list of destination pages */
256 INIT_LIST_HEAD(&image->dest_pages);
257
258 /* Initialize the list of unusable pages */
259 INIT_LIST_HEAD(&image->unusable_pages);
260
261 return image;
262}
263
264int kimage_is_destination_range(struct kimage *image,
265 unsigned long start,
266 unsigned long end)
267{
268 unsigned long i;
269
270 for (i = 0; i < image->nr_segments; i++) {
271 unsigned long mstart, mend;
272
273 mstart = image->segment[i].mem;
274 mend = mstart + image->segment[i].memsz;
275 if ((end > mstart) && (start < mend))
276 return 1;
277 }
278
279 return 0;
280}
281
282static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
283{
284 struct page *pages;
285
286 pages = alloc_pages(gfp_mask, order);
287 if (pages) {
288 unsigned int count, i;
289
290 pages->mapping = NULL;
291 set_page_private(pages, order);
292 count = 1 << order;
293 for (i = 0; i < count; i++)
294 SetPageReserved(pages + i);
295 }
296
297 return pages;
298}
299
300static void kimage_free_pages(struct page *page)
301{
302 unsigned int order, count, i;
303
304 order = page_private(page);
305 count = 1 << order;
306 for (i = 0; i < count; i++)
307 ClearPageReserved(page + i);
308 __free_pages(page, order);
309}
310
311void kimage_free_page_list(struct list_head *list)
312{
313 struct list_head *pos, *next;
314
315 list_for_each_safe(pos, next, list) {
316 struct page *page;
317
318 page = list_entry(pos, struct page, lru);
319 list_del(&page->lru);
320 kimage_free_pages(page);
321 }
322}
323
324static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
325 unsigned int order)
326{
327	/* Control pages are special; they are the intermediaries
328 * that are needed while we copy the rest of the pages
329 * to their final resting place. As such they must
330 * not conflict with either the destination addresses
331 * or memory the kernel is already using.
332 *
333 * The only case where we really need more than one of
334 * these are for architectures where we cannot disable
335 * the MMU and must instead generate an identity mapped
336 * page table for all of the memory.
337 *
338 * At worst this runs in O(N) of the image size.
339 */
340 struct list_head extra_pages;
341 struct page *pages;
342 unsigned int count;
343
344 count = 1 << order;
345 INIT_LIST_HEAD(&extra_pages);
346
347 /* Loop while I can allocate a page and the page allocated
348 * is a destination page.
349 */
350 do {
351 unsigned long pfn, epfn, addr, eaddr;
352
353 pages = kimage_alloc_pages(KEXEC_CONTROL_MEMORY_GFP, order);
354 if (!pages)
355 break;
356 pfn = page_to_pfn(pages);
357 epfn = pfn + count;
358 addr = pfn << PAGE_SHIFT;
359 eaddr = epfn << PAGE_SHIFT;
360 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
361 kimage_is_destination_range(image, addr, eaddr)) {
362 list_add(&pages->lru, &extra_pages);
363 pages = NULL;
364 }
365 } while (!pages);
366
367 if (pages) {
368 /* Remember the allocated page... */
369 list_add(&pages->lru, &image->control_pages);
370
371		/* Because the page is already in its destination
372 * location we will never allocate another page at
373 * that address. Therefore kimage_alloc_pages
374 * will not return it (again) and we don't need
375 * to give it an entry in image->segment[].
376 */
377 }
378 /* Deal with the destination pages I have inadvertently allocated.
379 *
380 * Ideally I would convert multi-page allocations into single
381 * page allocations, and add everything to image->dest_pages.
382 *
383 * For now it is simpler to just free the pages.
384 */
385 kimage_free_page_list(&extra_pages);
386
387 return pages;
388}
389
390static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
391 unsigned int order)
392{
393	/* Control pages are special; they are the intermediaries
394 * that are needed while we copy the rest of the pages
395 * to their final resting place. As such they must
396 * not conflict with either the destination addresses
397 * or memory the kernel is already using.
398 *
399	 * Control pages are also the only pages we must allocate
400 * when loading a crash kernel. All of the other pages
401 * are specified by the segments and we just memcpy
402 * into them directly.
403 *
404 * The only case where we really need more than one of
405 * these are for architectures where we cannot disable
406 * the MMU and must instead generate an identity mapped
407 * page table for all of the memory.
408 *
409 * Given the low demand this implements a very simple
410 * allocator that finds the first hole of the appropriate
411 * size in the reserved memory region, and allocates all
412 * of the memory up to and including the hole.
413 */
414 unsigned long hole_start, hole_end, size;
415 struct page *pages;
416
417 pages = NULL;
418 size = (1 << order) << PAGE_SHIFT;
419 hole_start = (image->control_page + (size - 1)) & ~(size - 1);
420 hole_end = hole_start + size - 1;
421 while (hole_end <= crashk_res.end) {
422 unsigned long i;
423
424 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
425 break;
426 /* See if I overlap any of the segments */
427 for (i = 0; i < image->nr_segments; i++) {
428 unsigned long mstart, mend;
429
430 mstart = image->segment[i].mem;
431 mend = mstart + image->segment[i].memsz - 1;
432 if ((hole_end >= mstart) && (hole_start <= mend)) {
433 /* Advance the hole to the end of the segment */
434 hole_start = (mend + (size - 1)) & ~(size - 1);
435 hole_end = hole_start + size - 1;
436 break;
437 }
438 }
439 /* If I don't overlap any segments I have found my hole! */
440 if (i == image->nr_segments) {
441 pages = pfn_to_page(hole_start >> PAGE_SHIFT);
442 image->control_page = hole_end;
443 break;
444 }
445 }
446
447 return pages;
448}
449
450
451struct page *kimage_alloc_control_pages(struct kimage *image,
452 unsigned int order)
453{
454 struct page *pages = NULL;
455
456 switch (image->type) {
457 case KEXEC_TYPE_DEFAULT:
458 pages = kimage_alloc_normal_control_pages(image, order);
459 break;
460 case KEXEC_TYPE_CRASH:
461 pages = kimage_alloc_crash_control_pages(image, order);
462 break;
463 }
464
465 return pages;
466}
467
468static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
469{
470 if (*image->entry != 0)
471 image->entry++;
472
473 if (image->entry == image->last_entry) {
474 kimage_entry_t *ind_page;
475 struct page *page;
476
477 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
478 if (!page)
479 return -ENOMEM;
480
481 ind_page = page_address(page);
482 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
483 image->entry = ind_page;
484 image->last_entry = ind_page +
485 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
486 }
487 *image->entry = entry;
488 image->entry++;
489 *image->entry = 0;
490
491 return 0;
492}
493
494static int kimage_set_destination(struct kimage *image,
495 unsigned long destination)
496{
497 int result;
498
499 destination &= PAGE_MASK;
500 result = kimage_add_entry(image, destination | IND_DESTINATION);
501
502 return result;
503}
504
505
506static int kimage_add_page(struct kimage *image, unsigned long page)
507{
508 int result;
509
510 page &= PAGE_MASK;
511 result = kimage_add_entry(image, page | IND_SOURCE);
512
513 return result;
514}
515
516
517static void kimage_free_extra_pages(struct kimage *image)
518{
519 /* Walk through and free any extra destination pages I may have */
520 kimage_free_page_list(&image->dest_pages);
521
522 /* Walk through and free any unusable pages I have cached */
523 kimage_free_page_list(&image->unusable_pages);
524
525}
526void kimage_terminate(struct kimage *image)
527{
528 if (*image->entry != 0)
529 image->entry++;
530
531 *image->entry = IND_DONE;
532}
533
534#define for_each_kimage_entry(image, ptr, entry) \
535 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
536 ptr = (entry & IND_INDIRECTION) ? \
537 phys_to_virt((entry & PAGE_MASK)) : ptr + 1)
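
The entry list built by kimage_add_entry() is a compact encoding: an IND_DESTINATION entry sets the current destination address, each IND_SOURCE entry that follows names one source page to copy there (the destination then advances by a page), IND_INDIRECTION chains to the next page of entries, and IND_DONE ends the list. A minimal decoder sketch over a flat array, with local stand-in flag values and the indirection-page chaining left out:

#include <stdio.h>

/* Local stand-ins for the kimage entry flags (values illustrative). */
#define EX_DESTINATION	0x1UL
#define EX_SOURCE	0x2UL
#define EX_DONE		0x4UL
#define EX_PAGE_MASK	(~0xfffUL)
#define EX_PAGE_SIZE	0x1000UL

static void decode(const unsigned long *entry)
{
	unsigned long destination = 0;

	for (; !(*entry & EX_DONE); entry++) {
		if (*entry & EX_DESTINATION) {
			destination = *entry & EX_PAGE_MASK;
		} else if (*entry & EX_SOURCE) {
			printf("copy page %#lx -> %#lx\n",
			       *entry & EX_PAGE_MASK, destination);
			destination += EX_PAGE_SIZE;
		}
	}
}

int main(void)
{
	unsigned long list[] = {
		0x100000 | EX_DESTINATION,
		0x7f1000 | EX_SOURCE,	/* lands at 0x100000 */
		0x7f5000 | EX_SOURCE,	/* lands at 0x101000 */
		EX_DONE,
	};

	decode(list);
	return 0;
}
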
538
539static void kimage_free_entry(kimage_entry_t entry)
540{
541 struct page *page;
542
543 page = pfn_to_page(entry >> PAGE_SHIFT);
544 kimage_free_pages(page);
545}
546
547void kimage_free(struct kimage *image)
548{
549 kimage_entry_t *ptr, entry;
550 kimage_entry_t ind = 0;
551
552 if (!image)
553 return;
554
555 kimage_free_extra_pages(image);
556 for_each_kimage_entry(image, ptr, entry) {
557 if (entry & IND_INDIRECTION) {
558 /* Free the previous indirection page */
559 if (ind & IND_INDIRECTION)
560 kimage_free_entry(ind);
561 /* Save this indirection page until we are
562 * done with it.
563 */
564 ind = entry;
565 } else if (entry & IND_SOURCE)
566 kimage_free_entry(entry);
567 }
568 /* Free the final indirection page */
569 if (ind & IND_INDIRECTION)
570 kimage_free_entry(ind);
571
572 /* Handle any machine specific cleanup */
573 machine_kexec_cleanup(image);
574
575 /* Free the kexec control pages... */
576 kimage_free_page_list(&image->control_pages);
577
578 /*
579	 * Free up any temporary buffers allocated. This path might be hit
580	 * if an error occurred much later, after buffer allocation.
581 */
582 if (image->file_mode)
583 kimage_file_post_load_cleanup(image);
584
585 kfree(image);
586}
587
588static kimage_entry_t *kimage_dst_used(struct kimage *image,
589 unsigned long page)
590{
591 kimage_entry_t *ptr, entry;
592 unsigned long destination = 0;
593
594 for_each_kimage_entry(image, ptr, entry) {
595 if (entry & IND_DESTINATION)
596 destination = entry & PAGE_MASK;
597 else if (entry & IND_SOURCE) {
598 if (page == destination)
599 return ptr;
600 destination += PAGE_SIZE;
601 }
602 }
603
604 return NULL;
605}
606
607static struct page *kimage_alloc_page(struct kimage *image,
608 gfp_t gfp_mask,
609 unsigned long destination)
610{
611 /*
612 * Here we implement safeguards to ensure that a source page
613 * is not copied to its destination page before the data on
614 * the destination page is no longer useful.
615 *
616 * To do this we maintain the invariant that a source page is
617 * either its own destination page, or it is not a
618 * destination page at all.
619 *
620 * That is slightly stronger than required, but the proof
621	 * that no problems will occur is trivial, and the
622	 * implementation is simple to verify.
623 *
624 * When allocating all pages normally this algorithm will run
625 * in O(N) time, but in the worst case it will run in O(N^2)
626 * time. If the runtime is a problem the data structures can
627 * be fixed.
628 */
629 struct page *page;
630 unsigned long addr;
631
632 /*
633 * Walk through the list of destination pages, and see if I
634 * have a match.
635 */
636 list_for_each_entry(page, &image->dest_pages, lru) {
637 addr = page_to_pfn(page) << PAGE_SHIFT;
638 if (addr == destination) {
639 list_del(&page->lru);
640 return page;
641 }
642 }
643 page = NULL;
644 while (1) {
645 kimage_entry_t *old;
646
647 /* Allocate a page, if we run out of memory give up */
648 page = kimage_alloc_pages(gfp_mask, 0);
649 if (!page)
650 return NULL;
651 /* If the page cannot be used file it away */
652 if (page_to_pfn(page) >
653 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
654 list_add(&page->lru, &image->unusable_pages);
655 continue;
656 }
657 addr = page_to_pfn(page) << PAGE_SHIFT;
658
659		/* If it is the destination page we want, use it */
660 if (addr == destination)
661 break;
662
663 /* If the page is not a destination page use it */
664 if (!kimage_is_destination_range(image, addr,
665 addr + PAGE_SIZE))
666 break;
667
668 /*
669		 * I know that the page is someone's destination page.
670 * See if there is already a source page for this
671 * destination page. And if so swap the source pages.
672 */
673 old = kimage_dst_used(image, addr);
674 if (old) {
675 /* If so move it */
676 unsigned long old_addr;
677 struct page *old_page;
678
679 old_addr = *old & PAGE_MASK;
680 old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
681 copy_highpage(page, old_page);
682 *old = addr | (*old & ~PAGE_MASK);
683
684 /* The old page I have found cannot be a
685			 * destination page, so return it if its
686			 * gfp_flags honor the ones passed in.
687 */
688 if (!(gfp_mask & __GFP_HIGHMEM) &&
689 PageHighMem(old_page)) {
690 kimage_free_pages(old_page);
691 continue;
692 }
693 addr = old_addr;
694 page = old_page;
695 break;
696 }
697 /* Place the page on the destination list, to be used later */
698 list_add(&page->lru, &image->dest_pages);
699 }
700
701 return page;
702}
703
704static int kimage_load_normal_segment(struct kimage *image,
705 struct kexec_segment *segment)
706{
707 unsigned long maddr;
708 size_t ubytes, mbytes;
709 int result;
710 unsigned char __user *buf = NULL;
711 unsigned char *kbuf = NULL;
712
713 result = 0;
714 if (image->file_mode)
715 kbuf = segment->kbuf;
716 else
717 buf = segment->buf;
718 ubytes = segment->bufsz;
719 mbytes = segment->memsz;
720 maddr = segment->mem;
721
722 result = kimage_set_destination(image, maddr);
723 if (result < 0)
724 goto out;
725
726 while (mbytes) {
727 struct page *page;
728 char *ptr;
729 size_t uchunk, mchunk;
730
731 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
732 if (!page) {
733 result = -ENOMEM;
734 goto out;
735 }
736 result = kimage_add_page(image, page_to_pfn(page)
737 << PAGE_SHIFT);
738 if (result < 0)
739 goto out;
740
741 ptr = kmap(page);
742 /* Start with a clear page */
743 clear_page(ptr);
744 ptr += maddr & ~PAGE_MASK;
745 mchunk = min_t(size_t, mbytes,
746 PAGE_SIZE - (maddr & ~PAGE_MASK));
747 uchunk = min(ubytes, mchunk);
748
749 /* For file based kexec, source pages are in kernel memory */
750 if (image->file_mode)
751 memcpy(ptr, kbuf, uchunk);
752 else
753 result = copy_from_user(ptr, buf, uchunk);
754 kunmap(page);
755 if (result) {
756 result = -EFAULT;
757 goto out;
758 }
759 ubytes -= uchunk;
760 maddr += mchunk;
761 if (image->file_mode)
762 kbuf += mchunk;
763 else
764 buf += mchunk;
765 mbytes -= mchunk;
766 }
767out:
768 return result;
769}
770
771static int kimage_load_crash_segment(struct kimage *image,
772 struct kexec_segment *segment)
773{
774	/* For crash dump kernels we simply copy the data from
775	 * user space to its destination.
776 * We do things a page at a time for the sake of kmap.
777 */
778 unsigned long maddr;
779 size_t ubytes, mbytes;
780 int result;
781 unsigned char __user *buf = NULL;
782 unsigned char *kbuf = NULL;
783
784 result = 0;
785 if (image->file_mode)
786 kbuf = segment->kbuf;
787 else
788 buf = segment->buf;
789 ubytes = segment->bufsz;
790 mbytes = segment->memsz;
791 maddr = segment->mem;
792 while (mbytes) {
793 struct page *page;
794 char *ptr;
795 size_t uchunk, mchunk;
796
797 page = pfn_to_page(maddr >> PAGE_SHIFT);
798 if (!page) {
799 result = -ENOMEM;
800 goto out;
801 }
802 ptr = kmap(page);
803 ptr += maddr & ~PAGE_MASK;
804 mchunk = min_t(size_t, mbytes,
805 PAGE_SIZE - (maddr & ~PAGE_MASK));
806 uchunk = min(ubytes, mchunk);
807 if (mchunk > uchunk) {
808 /* Zero the trailing part of the page */
809 memset(ptr + uchunk, 0, mchunk - uchunk);
810 }
811
812 /* For file based kexec, source pages are in kernel memory */
813 if (image->file_mode)
814 memcpy(ptr, kbuf, uchunk);
815 else
816 result = copy_from_user(ptr, buf, uchunk);
817 kexec_flush_icache_page(page);
818 kunmap(page);
819 if (result) {
820 result = -EFAULT;
821 goto out;
822 }
823 ubytes -= uchunk;
824 maddr += mchunk;
825 if (image->file_mode)
826 kbuf += mchunk;
827 else
828 buf += mchunk;
829 mbytes -= mchunk;
830 }
831out:
832 return result;
833}
834
835int kimage_load_segment(struct kimage *image,
836 struct kexec_segment *segment)
837{
838 int result = -ENOMEM;
839
840 switch (image->type) {
841 case KEXEC_TYPE_DEFAULT:
842 result = kimage_load_normal_segment(image, segment);
843 break;
844 case KEXEC_TYPE_CRASH:
845 result = kimage_load_crash_segment(image, segment);
846 break;
847 }
848
849 return result;
850}
851
852struct kimage *kexec_image;
853struct kimage *kexec_crash_image;
854int kexec_load_disabled;
855
856void crash_kexec(struct pt_regs *regs)
857{
858 /* Take the kexec_mutex here to prevent sys_kexec_load
859 * running on one cpu from replacing the crash kernel
860 * we are using after a panic on a different cpu.
861 *
862 * If the crash kernel was not located in a fixed area
863 * of memory the xchg(&kexec_crash_image) would be
864 * sufficient. But since I reuse the memory...
865 */
866 if (mutex_trylock(&kexec_mutex)) {
867 if (kexec_crash_image) {
868 struct pt_regs fixed_regs;
869
870 crash_setup_regs(&fixed_regs, regs);
871 crash_save_vmcoreinfo();
872 machine_crash_shutdown(&fixed_regs);
873 machine_kexec(kexec_crash_image);
874 }
875 mutex_unlock(&kexec_mutex);
876 }
877}
878
879size_t crash_get_memory_size(void)
880{
881 size_t size = 0;
882
883 mutex_lock(&kexec_mutex);
884 if (crashk_res.end != crashk_res.start)
885 size = resource_size(&crashk_res);
886 mutex_unlock(&kexec_mutex);
887 return size;
888}
889
890void __weak crash_free_reserved_phys_range(unsigned long begin,
891 unsigned long end)
892{
893 unsigned long addr;
894
895 for (addr = begin; addr < end; addr += PAGE_SIZE)
896 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
897}
898
899int crash_shrink_memory(unsigned long new_size)
900{
901 int ret = 0;
902 unsigned long start, end;
903 unsigned long old_size;
904 struct resource *ram_res;
905
906 mutex_lock(&kexec_mutex);
907
908 if (kexec_crash_image) {
909 ret = -ENOENT;
910 goto unlock;
911 }
912 start = crashk_res.start;
913 end = crashk_res.end;
914 old_size = (end == 0) ? 0 : end - start + 1;
915 if (new_size >= old_size) {
916 ret = (new_size == old_size) ? 0 : -EINVAL;
917 goto unlock;
918 }
919
920 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL);
921 if (!ram_res) {
922 ret = -ENOMEM;
923 goto unlock;
924 }
925
926 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
927 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
928
929 crash_map_reserved_pages();
930 crash_free_reserved_phys_range(end, crashk_res.end);
931
932 if ((start == end) && (crashk_res.parent != NULL))
933 release_resource(&crashk_res);
934
935 ram_res->start = end;
936 ram_res->end = crashk_res.end;
937 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
938 ram_res->name = "System RAM";
939
940 crashk_res.end = end - 1;
941
942 insert_resource(&iomem_resource, ram_res);
943 crash_unmap_reserved_pages();
944
945unlock:
946 mutex_unlock(&kexec_mutex);
947 return ret;
948}
949
950static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
951 size_t data_len)
952{
953 struct elf_note note;
954
955 note.n_namesz = strlen(name) + 1;
956 note.n_descsz = data_len;
957 note.n_type = type;
958 memcpy(buf, &note, sizeof(note));
959 buf += (sizeof(note) + 3)/4;
960 memcpy(buf, name, note.n_namesz);
961 buf += (note.n_namesz + 3)/4;
962 memcpy(buf, data, note.n_descsz);
963 buf += (note.n_descsz + 3)/4;
964
965 return buf;
966}
967
968static void final_note(u32 *buf)
969{
970 struct elf_note note;
971
972 note.n_namesz = 0;
973 note.n_descsz = 0;
974 note.n_type = 0;
975 memcpy(buf, &note, sizeof(note));
976}
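
To make the word-rounding above concrete, here is a standalone userspace sketch (not part of this file) that mirrors append_elf_note(): the note header, the name and the descriptor are each padded to a 4-byte boundary inside the u32 buffer. The name "CORE", the 8-byte payload and the NT_PRSTATUS type value are illustrative only.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct elf_note {                       /* mirrors the kernel's struct elf_note */
        uint32_t n_namesz;
        uint32_t n_descsz;
        uint32_t n_type;
};

int main(void)
{
        const char *name = "CORE";
        char data[8] = "payload";       /* 7 chars + NUL = 8-byte descriptor */
        uint32_t buf[64] = { 0 }, *p = buf;
        struct elf_note note = {
                .n_namesz = strlen(name) + 1,
                .n_descsz = sizeof(data),
                .n_type   = 1,          /* NT_PRSTATUS */
        };

        memcpy(p, &note, sizeof(note));
        p += (sizeof(note) + 3) / 4;    /* 12 bytes -> 3 words */
        memcpy(p, name, note.n_namesz);
        p += (note.n_namesz + 3) / 4;   /* 5 bytes  -> 2 words */
        memcpy(p, data, note.n_descsz);
        p += (note.n_descsz + 3) / 4;   /* 8 bytes  -> 2 words */

        printf("note occupies %zu bytes\n", (p - buf) * sizeof(*p));    /* 28 */
        return 0;
}
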
977
978void crash_save_cpu(struct pt_regs *regs, int cpu)
979{
980 struct elf_prstatus prstatus;
981 u32 *buf;
982
983 if ((cpu < 0) || (cpu >= nr_cpu_ids))
984 return;
985
986 /* Using ELF notes here is opportunistic.
987 * I need a well-defined structure format
988 * for the data I pass, and I need tags
989 * on the data to indicate what information I have
990 * squirrelled away. ELF notes happen to provide
991 * all of that, so there is no need to invent something new.
992 */
993 buf = (u32 *)per_cpu_ptr(crash_notes, cpu);
994 if (!buf)
995 return;
996 memset(&prstatus, 0, sizeof(prstatus));
997 prstatus.pr_pid = current->pid;
998 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
999 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1000 &prstatus, sizeof(prstatus));
1001 final_note(buf);
1002}
1003
1004static int __init crash_notes_memory_init(void)
1005{
1006 /* Allocate memory for saving cpu registers. */
1007 size_t size, align;
1008
1009 /*
1010 * crash_notes could be allocated across 2 vmalloc pages when percpu
1011 * is vmalloc based. vmalloc doesn't guarantee that 2 contiguous vmalloc
1012 * pages are also on 2 contiguous physical pages. In this case the
1013 * 2nd part of crash_notes in the 2nd page could be lost since only the
1014 * starting address and size of crash_notes are exported through sysfs.
1015 * Here round up the size of crash_notes to the nearest power of two
1016 * and pass it to __alloc_percpu as align value. This can make sure
1017 * crash_notes is allocated inside one physical page.
1018 */
1019 size = sizeof(note_buf_t);
1020 align = min(roundup_pow_of_two(sizeof(note_buf_t)), PAGE_SIZE);
1021
1022 /*
1023 * Break the build if size is bigger than PAGE_SIZE, since crash_notes
1024 * would then definitely span 2 pages.
1025 */
1026 BUILD_BUG_ON(size > PAGE_SIZE);
1027
1028 crash_notes = __alloc_percpu(size, align);
1029 if (!crash_notes) {
1030 pr_warn("Kexec: Memory allocation for saving cpu register states failed\n");
1031 return -ENOMEM;
1032 }
1033 return 0;
1034}
1035subsys_initcall(crash_notes_memory_init);
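
A small userspace sketch of the alignment argument made above: an allocation whose alignment is a power of two no larger than the page size, and whose size does not exceed that alignment, cannot straddle a page boundary. The 1008-byte size is a made-up stand-in for sizeof(note_buf_t).

#include <stdio.h>

int main(void)
{
        unsigned long page = 4096, size = 1008, align = 1;

        while (align < size)            /* roundup_pow_of_two() */
                align <<= 1;

        /* Worst case: the block starts at the last aligned offset in a page. */
        printf("align=%lu, worst-case end offset=%lu (<= %lu)\n",
               align, page - align + size, page);
        return 0;
}
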
1036
1037
1038/*
1039 * parsing the "crashkernel" commandline
1040 *
1041 * this code is intended to be called from architecture specific code
1042 */
1043
1044
1045/*
1046 * This function parses command lines in the format
1047 *
1048 * crashkernel=ramsize-range:size[,...][@offset]
1049 *
1050 * The function returns 0 on success and -EINVAL on failure.
1051 */
1052static int __init parse_crashkernel_mem(char *cmdline,
1053 unsigned long long system_ram,
1054 unsigned long long *crash_size,
1055 unsigned long long *crash_base)
1056{
1057 char *cur = cmdline, *tmp;
1058
1059 /* for each entry of the comma-separated list */
1060 do {
1061 unsigned long long start, end = ULLONG_MAX, size;
1062
1063 /* get the start of the range */
1064 start = memparse(cur, &tmp);
1065 if (cur == tmp) {
1066 pr_warn("crashkernel: Memory value expected\n");
1067 return -EINVAL;
1068 }
1069 cur = tmp;
1070 if (*cur != '-') {
1071 pr_warn("crashkernel: '-' expected\n");
1072 return -EINVAL;
1073 }
1074 cur++;
1075
1076 /* if no ':' is here, then we read the end */
1077 if (*cur != ':') {
1078 end = memparse(cur, &tmp);
1079 if (cur == tmp) {
1080 pr_warn("crashkernel: Memory value expected\n");
1081 return -EINVAL;
1082 }
1083 cur = tmp;
1084 if (end <= start) {
1085 pr_warn("crashkernel: end <= start\n");
1086 return -EINVAL;
1087 }
1088 }
1089
1090 if (*cur != ':') {
1091 pr_warn("crashkernel: ':' expected\n");
1092 return -EINVAL;
1093 }
1094 cur++;
1095
1096 size = memparse(cur, &tmp);
1097 if (cur == tmp) {
1098 pr_warn("Memory value expected\n");
1099 return -EINVAL;
1100 }
1101 cur = tmp;
1102 if (size >= system_ram) {
1103 pr_warn("crashkernel: invalid size\n");
1104 return -EINVAL;
1105 }
1106
1107 /* match ? */
1108 if (system_ram >= start && system_ram < end) {
1109 *crash_size = size;
1110 break;
1111 }
1112 } while (*cur++ == ',');
1113
1114 if (*crash_size > 0) {
1115 while (*cur && *cur != ' ' && *cur != '@')
1116 cur++;
1117 if (*cur == '@') {
1118 cur++;
1119 *crash_base = memparse(cur, &tmp);
1120 if (cur == tmp) {
1121 pr_warn("Memory value expected after '@'\n");
1122 return -EINVAL;
1123 }
1124 }
1125 }
1126
1127 return 0;
1128}
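
A self-contained userspace sketch of the range-matching grammar handled above, with strtoull standing in for memparse; the command line and RAM size are example values only. With 1 GiB of system RAM, the 512M-2G range matches and 64M is chosen.

#include <stdio.h>
#include <stdlib.h>

static unsigned long long miniparse(const char *s, char **end)
{
        unsigned long long v = strtoull(s, end, 0);

        switch (**end) {
        case 'G': v <<= 10;     /* fall through */
        case 'M': v <<= 10;     /* fall through */
        case 'K': v <<= 10; (*end)++; break;
        }
        return v;
}

int main(void)
{
        char cmdline[] = "512M-2G:64M,2G-:128M";      /* example, assumed well-formed */
        unsigned long long ram = 1ULL << 30;          /* 1 GiB of system RAM */
        char *cur = cmdline;

        do {
                unsigned long long start, end = ~0ULL, size;

                start = miniparse(cur, &cur);
                cur++;                                /* skip '-' */
                if (*cur != ':')
                        end = miniparse(cur, &cur);
                cur++;                                /* skip ':' */
                size = miniparse(cur, &cur);

                if (ram >= start && ram < end) {
                        printf("crashkernel size: %lluM\n", size >> 20);
                        break;
                }
        } while (*cur++ == ',');
        return 0;
}
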
1129
1130/*
1131 * This function parses "simple" (old) crashkernel command lines like
1132 *
1133 * crashkernel=size[@offset]
1134 *
1135 * It returns 0 on success and -EINVAL on failure.
1136 */
1137static int __init parse_crashkernel_simple(char *cmdline,
1138 unsigned long long *crash_size,
1139 unsigned long long *crash_base)
1140{
1141 char *cur = cmdline;
1142
1143 *crash_size = memparse(cmdline, &cur);
1144 if (cmdline == cur) {
1145 pr_warn("crashkernel: memory value expected\n");
1146 return -EINVAL;
1147 }
1148
1149 if (*cur == '@')
1150 *crash_base = memparse(cur+1, &cur);
1151 else if (*cur != ' ' && *cur != '\0') {
1152 pr_warn("crashkernel: unrecognized char\n");
1153 return -EINVAL;
1154 }
1155
1156 return 0;
1157}
1158
1159#define SUFFIX_HIGH 0
1160#define SUFFIX_LOW 1
1161#define SUFFIX_NULL 2
1162static __initdata char *suffix_tbl[] = {
1163 [SUFFIX_HIGH] = ",high",
1164 [SUFFIX_LOW] = ",low",
1165 [SUFFIX_NULL] = NULL,
1166};
1167
1168/*
1169 * This function parses "suffix" crashkernel command lines like
1170 *
1171 * crashkernel=size,[high|low]
1172 *
1173 * It returns 0 on success and -EINVAL on failure.
1174 */
1175static int __init parse_crashkernel_suffix(char *cmdline,
1176 unsigned long long *crash_size,
1177 const char *suffix)
1178{
1179 char *cur = cmdline;
1180
1181 *crash_size = memparse(cmdline, &cur);
1182 if (cmdline == cur) {
1183 pr_warn("crashkernel: memory value expected\n");
1184 return -EINVAL;
1185 }
1186
1187 /* check with suffix */
1188 if (strncmp(cur, suffix, strlen(suffix))) {
1189 pr_warn("crashkernel: unrecognized char\n");
1190 return -EINVAL;
1191 }
1192 cur += strlen(suffix);
1193 if (*cur != ' ' && *cur != '\0') {
1194 pr_warn("crashkernel: unrecognized char\n");
1195 return -EINVAL;
1196 }
1197
1198 return 0;
1199}
1200
1201static __init char *get_last_crashkernel(char *cmdline,
1202 const char *name,
1203 const char *suffix)
1204{
1205 char *p = cmdline, *ck_cmdline = NULL;
1206
1207 /* find crashkernel and use the last one if there are more */
1208 p = strstr(p, name);
1209 while (p) {
1210 char *end_p = strchr(p, ' ');
1211 char *q;
1212
1213 if (!end_p)
1214 end_p = p + strlen(p);
1215
1216 if (!suffix) {
1217 int i;
1218
1219 /* skip the one with any known suffix */
1220 for (i = 0; suffix_tbl[i]; i++) {
1221 q = end_p - strlen(suffix_tbl[i]);
1222 if (!strncmp(q, suffix_tbl[i],
1223 strlen(suffix_tbl[i])))
1224 goto next;
1225 }
1226 ck_cmdline = p;
1227 } else {
1228 q = end_p - strlen(suffix);
1229 if (!strncmp(q, suffix, strlen(suffix)))
1230 ck_cmdline = p;
1231 }
1232next:
1233 p = strstr(p+1, name);
1234 }
1235
1236 if (!ck_cmdline)
1237 return NULL;
1238
1239 return ck_cmdline;
1240}
1241
1242static int __init __parse_crashkernel(char *cmdline,
1243 unsigned long long system_ram,
1244 unsigned long long *crash_size,
1245 unsigned long long *crash_base,
1246 const char *name,
1247 const char *suffix)
1248{
1249 char *first_colon, *first_space;
1250 char *ck_cmdline;
1251
1252 BUG_ON(!crash_size || !crash_base);
1253 *crash_size = 0;
1254 *crash_base = 0;
1255
1256 ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
1257
1258 if (!ck_cmdline)
1259 return -EINVAL;
1260
1261 ck_cmdline += strlen(name);
1262
1263 if (suffix)
1264 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1265 suffix);
1266 /*
1267 * if the commandline contains a ':', then that's the extended
1268 * syntax -- if not, it must be the classic syntax
1269 */
1270 first_colon = strchr(ck_cmdline, ':');
1271 first_space = strchr(ck_cmdline, ' ');
1272 if (first_colon && (!first_space || first_colon < first_space))
1273 return parse_crashkernel_mem(ck_cmdline, system_ram,
1274 crash_size, crash_base);
1275
1276 return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
1277}
1278
1279/*
1280 * This function is the entry point for command line parsing and should be
1281 * called from the arch-specific code.
1282 */
1283int __init parse_crashkernel(char *cmdline,
1284 unsigned long long system_ram,
1285 unsigned long long *crash_size,
1286 unsigned long long *crash_base)
1287{
1288 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1289 "crashkernel=", NULL);
1290}
1291
1292int __init parse_crashkernel_high(char *cmdline,
1293 unsigned long long system_ram,
1294 unsigned long long *crash_size,
1295 unsigned long long *crash_base)
1296{
1297 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1298 "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
1299}
1300
1301int __init parse_crashkernel_low(char *cmdline,
1302 unsigned long long system_ram,
1303 unsigned long long *crash_size,
1304 unsigned long long *crash_base)
1305{
1306 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
1307 "crashkernel=", suffix_tbl[SUFFIX_LOW]);
1308}
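
For orientation, a hedged sketch of how architecture setup code typically consumes these parsers and then reserves the region; reserve_crashkernel() and the 4 GiB search window are illustrative, not taken from any particular architecture.

static void __init reserve_crashkernel(void)
{
        unsigned long long crash_size, crash_base;
        int ret;

        ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(),
                                &crash_size, &crash_base);
        if (ret || !crash_size)
                return;

        if (!crash_base)        /* no @offset given, pick a spot ourselves */
                crash_base = memblock_find_in_range(0, SZ_4G, crash_size,
                                                    KEXEC_CRASH_MEM_ALIGN);
        if (!crash_base || memblock_reserve(crash_base, crash_size))
                return;

        crashk_res.start = crash_base;
        crashk_res.end   = crash_base + crash_size - 1;
}
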
1309
1310static void update_vmcoreinfo_note(void)
1311{
1312 u32 *buf = vmcoreinfo_note;
1313
1314 if (!vmcoreinfo_size)
1315 return;
1316 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1317 vmcoreinfo_size);
1318 final_note(buf);
1319}
1320
1321void crash_save_vmcoreinfo(void)
1322{
1323 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
1324 update_vmcoreinfo_note();
1325}
1326
1327void vmcoreinfo_append_str(const char *fmt, ...)
1328{
1329 va_list args;
1330 char buf[0x50];
1331 size_t r;
1332
1333 va_start(args, fmt);
1334 r = vscnprintf(buf, sizeof(buf), fmt, args);
1335 va_end(args);
1336
1337 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
1338
1339 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1340
1341 vmcoreinfo_size += r;
1342}
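
For reference, the text accumulated here ends up as a flat list of key=value lines inside the vmcoreinfo ELF note, which tools such as makedumpfile and crash parse. The excerpt below is illustrative only; the addresses and numeric values are made up.

        OSRELEASE=4.3.0-rc1
        PAGESIZE=4096
        SYMBOL(init_uts_ns)=ffffffff81a42560
        SIZE(page)=64
        OFFSET(page.flags)=0
        LENGTH(zone.free_area)=11
        NUMBER(PG_lru)=5
        CRASHTIME=1444650000
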
1343
1344/*
1345 * provide an empty default implementation here -- architecture
1346 * code may override this
1347 */
1348void __weak arch_crash_save_vmcoreinfo(void)
1349{}
1350
1351unsigned long __weak paddr_vmcoreinfo_note(void)
1352{
1353 return __pa((unsigned long)(char *)&vmcoreinfo_note);
1354}
1355
1356static int __init crash_save_vmcoreinfo_init(void)
1357{
1358 VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1359 VMCOREINFO_PAGESIZE(PAGE_SIZE);
1360
1361 VMCOREINFO_SYMBOL(init_uts_ns);
1362 VMCOREINFO_SYMBOL(node_online_map);
1363#ifdef CONFIG_MMU
1364 VMCOREINFO_SYMBOL(swapper_pg_dir);
1365#endif
1366 VMCOREINFO_SYMBOL(_stext);
1367 VMCOREINFO_SYMBOL(vmap_area_list);
1368
1369#ifndef CONFIG_NEED_MULTIPLE_NODES
1370 VMCOREINFO_SYMBOL(mem_map);
1371 VMCOREINFO_SYMBOL(contig_page_data);
1372#endif
1373#ifdef CONFIG_SPARSEMEM
1374 VMCOREINFO_SYMBOL(mem_section);
1375 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1376 VMCOREINFO_STRUCT_SIZE(mem_section);
1377 VMCOREINFO_OFFSET(mem_section, section_mem_map);
1378#endif
1379 VMCOREINFO_STRUCT_SIZE(page);
1380 VMCOREINFO_STRUCT_SIZE(pglist_data);
1381 VMCOREINFO_STRUCT_SIZE(zone);
1382 VMCOREINFO_STRUCT_SIZE(free_area);
1383 VMCOREINFO_STRUCT_SIZE(list_head);
1384 VMCOREINFO_SIZE(nodemask_t);
1385 VMCOREINFO_OFFSET(page, flags);
1386 VMCOREINFO_OFFSET(page, _count);
1387 VMCOREINFO_OFFSET(page, mapping);
1388 VMCOREINFO_OFFSET(page, lru);
1389 VMCOREINFO_OFFSET(page, _mapcount);
1390 VMCOREINFO_OFFSET(page, private);
1391 VMCOREINFO_OFFSET(pglist_data, node_zones);
1392 VMCOREINFO_OFFSET(pglist_data, nr_zones);
1393#ifdef CONFIG_FLAT_NODE_MEM_MAP
1394 VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1395#endif
1396 VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1397 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1398 VMCOREINFO_OFFSET(pglist_data, node_id);
1399 VMCOREINFO_OFFSET(zone, free_area);
1400 VMCOREINFO_OFFSET(zone, vm_stat);
1401 VMCOREINFO_OFFSET(zone, spanned_pages);
1402 VMCOREINFO_OFFSET(free_area, free_list);
1403 VMCOREINFO_OFFSET(list_head, next);
1404 VMCOREINFO_OFFSET(list_head, prev);
1405 VMCOREINFO_OFFSET(vmap_area, va_start);
1406 VMCOREINFO_OFFSET(vmap_area, list);
1407 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1408 log_buf_kexec_setup();
1409 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1410 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1411 VMCOREINFO_NUMBER(PG_lru);
1412 VMCOREINFO_NUMBER(PG_private);
1413 VMCOREINFO_NUMBER(PG_swapcache);
1414 VMCOREINFO_NUMBER(PG_slab);
1415#ifdef CONFIG_MEMORY_FAILURE
1416 VMCOREINFO_NUMBER(PG_hwpoison);
1417#endif
1418 VMCOREINFO_NUMBER(PG_head_mask);
1419 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
1420#ifdef CONFIG_X86
1421 VMCOREINFO_NUMBER(KERNEL_IMAGE_SIZE);
1422#endif
1423#ifdef CONFIG_HUGETLBFS
1424 VMCOREINFO_SYMBOL(free_huge_page);
1425#endif
1426
1427 arch_crash_save_vmcoreinfo();
1428 update_vmcoreinfo_note();
1429
1430 return 0;
1431}
1432
1433subsys_initcall(crash_save_vmcoreinfo_init);
1434
1435/*
1436 * Move into place and start executing a preloaded standalone
1437 * executable. If nothing was preloaded return an error.
1438 */
1439int kernel_kexec(void)
1440{
1441 int error = 0;
1442
1443 if (!mutex_trylock(&kexec_mutex))
1444 return -EBUSY;
1445 if (!kexec_image) {
1446 error = -EINVAL;
1447 goto Unlock;
1448 }
1449
1450#ifdef CONFIG_KEXEC_JUMP
1451 if (kexec_image->preserve_context) {
1452 lock_system_sleep();
1453 pm_prepare_console();
1454 error = freeze_processes();
1455 if (error) {
1456 error = -EBUSY;
1457 goto Restore_console;
1458 }
1459 suspend_console();
1460 error = dpm_suspend_start(PMSG_FREEZE);
1461 if (error)
1462 goto Resume_console;
1463 /* At this point, dpm_suspend_start() has been called,
1464 * but *not* dpm_suspend_end(). We *must* call
1465 * dpm_suspend_end() now. Otherwise, drivers for
1466 * some devices (e.g. interrupt controllers) become
1467 * desynchronized with the actual state of the
1468 * hardware at resume time, and evil weirdness ensues.
1469 */
1470 error = dpm_suspend_end(PMSG_FREEZE);
1471 if (error)
1472 goto Resume_devices;
1473 error = disable_nonboot_cpus();
1474 if (error)
1475 goto Enable_cpus;
1476 local_irq_disable();
1477 error = syscore_suspend();
1478 if (error)
1479 goto Enable_irqs;
1480 } else
1481#endif
1482 {
1483 kexec_in_progress = true;
1484 kernel_restart_prepare(NULL);
1485 migrate_to_reboot_cpu();
1486
1487 /*
1488 * migrate_to_reboot_cpu() disables CPU hotplug assuming that
1489 * no further code needs to use CPU hotplug (which is true in
1490 * the reboot case). However, the kexec path depends on using
1491 * CPU hotplug again; so re-enable it here.
1492 */
1493 cpu_hotplug_enable();
1494 pr_emerg("Starting new kernel\n");
1495 machine_shutdown();
1496 }
1497
1498 machine_kexec(kexec_image);
1499
1500#ifdef CONFIG_KEXEC_JUMP
1501 if (kexec_image->preserve_context) {
1502 syscore_resume();
1503 Enable_irqs:
1504 local_irq_enable();
1505 Enable_cpus:
1506 enable_nonboot_cpus();
1507 dpm_resume_start(PMSG_RESTORE);
1508 Resume_devices:
1509 dpm_resume_end(PMSG_RESTORE);
1510 Resume_console:
1511 resume_console();
1512 thaw_processes();
1513 Restore_console:
1514 pm_restore_console();
1515 unlock_system_sleep();
1516 }
1517#endif
1518
1519 Unlock:
1520 mutex_unlock(&kexec_mutex);
1521 return error;
1522}
1523
1524/*
1525 * Add and remove page tables for crashkernel memory
1526 *
1527 * Provide an empty default implementation here -- architecture
1528 * code may override this
1529 */
1530void __weak crash_map_reserved_pages(void)
1531{}
1532
1533void __weak crash_unmap_reserved_pages(void)
1534{}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
new file mode 100644
index 000000000000..6a9a3f2a0e8e
--- /dev/null
+++ b/kernel/kexec_file.c
@@ -0,0 +1,1045 @@
1/*
2 * kexec: kexec_file_load system call
3 *
4 * Copyright (C) 2014 Red Hat Inc.
5 * Authors:
6 * Vivek Goyal <vgoyal@redhat.com>
7 *
8 * This source code is licensed under the GNU General Public License,
9 * Version 2. See the file COPYING for more details.
10 */
11
12#include <linux/capability.h>
13#include <linux/mm.h>
14#include <linux/file.h>
15#include <linux/slab.h>
16#include <linux/kexec.h>
17#include <linux/mutex.h>
18#include <linux/list.h>
19#include <crypto/hash.h>
20#include <crypto/sha.h>
21#include <linux/syscalls.h>
22#include <linux/vmalloc.h>
23#include "kexec_internal.h"
24
25/*
26 * Declare these symbols weak so that if the architecture provides a purgatory,
27 * these will be overridden.
28 */
29char __weak kexec_purgatory[0];
30size_t __weak kexec_purgatory_size = 0;
31
32static int kexec_calculate_store_digests(struct kimage *image);
33
34static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
35{
36 struct fd f = fdget(fd);
37 int ret;
38 struct kstat stat;
39 loff_t pos;
40 ssize_t bytes = 0;
41
42 if (!f.file)
43 return -EBADF;
44
45 ret = vfs_getattr(&f.file->f_path, &stat);
46 if (ret)
47 goto out;
48
49 if (stat.size > INT_MAX) {
50 ret = -EFBIG;
51 goto out;
52 }
53
54 /* Don't hand 0 to vmalloc, it whines. */
55 if (stat.size == 0) {
56 ret = -EINVAL;
57 goto out;
58 }
59
60 *buf = vmalloc(stat.size);
61 if (!*buf) {
62 ret = -ENOMEM;
63 goto out;
64 }
65
66 pos = 0;
67 while (pos < stat.size) {
68 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
69 stat.size - pos);
70 if (bytes < 0) {
71 vfree(*buf);
72 ret = bytes;
73 goto out;
74 }
75
76 if (bytes == 0)
77 break;
78 pos += bytes;
79 }
80
81 if (pos != stat.size) {
82 ret = -EBADF;
83 vfree(*buf);
84 goto out;
85 }
86
87 *buf_len = pos;
88out:
89 fdput(f);
90 return ret;
91}
92
93/* Architectures can provide this probe function */
94int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
95 unsigned long buf_len)
96{
97 return -ENOEXEC;
98}
99
100void * __weak arch_kexec_kernel_image_load(struct kimage *image)
101{
102 return ERR_PTR(-ENOEXEC);
103}
104
105int __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
106{
107 return -EINVAL;
108}
109
110int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
111 unsigned long buf_len)
112{
113 return -EKEYREJECTED;
114}
115
116/* Apply relocations of type RELA */
117int __weak
118arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
119 unsigned int relsec)
120{
121 pr_err("RELA relocation unsupported.\n");
122 return -ENOEXEC;
123}
124
125/* Apply relocations of type REL */
126int __weak
127arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
128 unsigned int relsec)
129{
130 pr_err("REL relocation unsupported.\n");
131 return -ENOEXEC;
132}
133
134/*
135 * Free up memory used by the kernel, initrd, and command line. These are
136 * temporary allocations which are not needed any more after these buffers have
137 * been loaded into separate segments and have been copied elsewhere.
138 */
139void kimage_file_post_load_cleanup(struct kimage *image)
140{
141 struct purgatory_info *pi = &image->purgatory_info;
142
143 vfree(image->kernel_buf);
144 image->kernel_buf = NULL;
145
146 vfree(image->initrd_buf);
147 image->initrd_buf = NULL;
148
149 kfree(image->cmdline_buf);
150 image->cmdline_buf = NULL;
151
152 vfree(pi->purgatory_buf);
153 pi->purgatory_buf = NULL;
154
155 vfree(pi->sechdrs);
156 pi->sechdrs = NULL;
157
158 /* See if architecture has anything to cleanup post load */
159 arch_kimage_file_post_load_cleanup(image);
160
161 /*
162 * The above call should have called into the bootloader to free up
163 * any data stored in kimage->image_loader_data. It should
164 * be ok now to free it up.
165 */
166 kfree(image->image_loader_data);
167 image->image_loader_data = NULL;
168}
169
170/*
171 * In file mode the list of segments is prepared by the kernel. Copy the
172 * relevant data from user space, do error checking, and prepare the segment list.
173 */
174static int
175kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
176 const char __user *cmdline_ptr,
177 unsigned long cmdline_len, unsigned flags)
178{
179 int ret = 0;
180 void *ldata;
181
182 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
183 &image->kernel_buf_len);
184 if (ret)
185 return ret;
186
187 /* Call arch image probe handlers */
188 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
189 image->kernel_buf_len);
190
191 if (ret)
192 goto out;
193
194#ifdef CONFIG_KEXEC_VERIFY_SIG
195 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
196 image->kernel_buf_len);
197 if (ret) {
198 pr_debug("kernel signature verification failed.\n");
199 goto out;
200 }
201 pr_debug("kernel signature verification successful.\n");
202#endif
203 /* It is possible that no initramfs is being loaded */
204 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
205 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
206 &image->initrd_buf_len);
207 if (ret)
208 goto out;
209 }
210
211 if (cmdline_len) {
212 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
213 if (!image->cmdline_buf) {
214 ret = -ENOMEM;
215 goto out;
216 }
217
218 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
219 cmdline_len);
220 if (ret) {
221 ret = -EFAULT;
222 goto out;
223 }
224
225 image->cmdline_buf_len = cmdline_len;
226
227 /* command line should be a string with last byte null */
228 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
229 ret = -EINVAL;
230 goto out;
231 }
232 }
233
234 /* Call arch image load handlers */
235 ldata = arch_kexec_kernel_image_load(image);
236
237 if (IS_ERR(ldata)) {
238 ret = PTR_ERR(ldata);
239 goto out;
240 }
241
242 image->image_loader_data = ldata;
243out:
244 /* In case of error, free up all allocated memory in this function */
245 if (ret)
246 kimage_file_post_load_cleanup(image);
247 return ret;
248}
249
250static int
251kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
252 int initrd_fd, const char __user *cmdline_ptr,
253 unsigned long cmdline_len, unsigned long flags)
254{
255 int ret;
256 struct kimage *image;
257 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
258
259 image = do_kimage_alloc_init();
260 if (!image)
261 return -ENOMEM;
262
263 image->file_mode = 1;
264
265 if (kexec_on_panic) {
266 /* Enable special crash kernel control page alloc policy. */
267 image->control_page = crashk_res.start;
268 image->type = KEXEC_TYPE_CRASH;
269 }
270
271 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
272 cmdline_ptr, cmdline_len, flags);
273 if (ret)
274 goto out_free_image;
275
276 ret = sanity_check_segment_list(image);
277 if (ret)
278 goto out_free_post_load_bufs;
279
280 ret = -ENOMEM;
281 image->control_code_page = kimage_alloc_control_pages(image,
282 get_order(KEXEC_CONTROL_PAGE_SIZE));
283 if (!image->control_code_page) {
284 pr_err("Could not allocate control_code_buffer\n");
285 goto out_free_post_load_bufs;
286 }
287
288 if (!kexec_on_panic) {
289 image->swap_page = kimage_alloc_control_pages(image, 0);
290 if (!image->swap_page) {
291 pr_err("Could not allocate swap buffer\n");
292 goto out_free_control_pages;
293 }
294 }
295
296 *rimage = image;
297 return 0;
298out_free_control_pages:
299 kimage_free_page_list(&image->control_pages);
300out_free_post_load_bufs:
301 kimage_file_post_load_cleanup(image);
302out_free_image:
303 kfree(image);
304 return ret;
305}
306
307SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
308 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
309 unsigned long, flags)
310{
311 int ret = 0, i;
312 struct kimage **dest_image, *image;
313
314 /* We only trust the superuser with rebooting the system. */
315 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
316 return -EPERM;
317
318 /* Make sure we have a legal set of flags */
319 if (flags != (flags & KEXEC_FILE_FLAGS))
320 return -EINVAL;
321
322 image = NULL;
323
324 if (!mutex_trylock(&kexec_mutex))
325 return -EBUSY;
326
327 dest_image = &kexec_image;
328 if (flags & KEXEC_FILE_ON_CRASH)
329 dest_image = &kexec_crash_image;
330
331 if (flags & KEXEC_FILE_UNLOAD)
332 goto exchange;
333
334 /*
335 * In case of crash, the new kernel gets loaded in the reserved region. It is
336 * the same memory where the old crash kernel might be loaded. Free any
337 * current crash dump kernel before we corrupt it.
338 */
339 if (flags & KEXEC_FILE_ON_CRASH)
340 kimage_free(xchg(&kexec_crash_image, NULL));
341
342 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
343 cmdline_len, flags);
344 if (ret)
345 goto out;
346
347 ret = machine_kexec_prepare(image);
348 if (ret)
349 goto out;
350
351 ret = kexec_calculate_store_digests(image);
352 if (ret)
353 goto out;
354
355 for (i = 0; i < image->nr_segments; i++) {
356 struct kexec_segment *ksegment;
357
358 ksegment = &image->segment[i];
359 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
360 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
361 ksegment->memsz);
362
363 ret = kimage_load_segment(image, &image->segment[i]);
364 if (ret)
365 goto out;
366 }
367
368 kimage_terminate(image);
369
370 /*
371 * Free up any temporary buffers allocated which are not needed
372 * after image has been loaded
373 */
374 kimage_file_post_load_cleanup(image);
375exchange:
376 image = xchg(dest_image, image);
377out:
378 mutex_unlock(&kexec_mutex);
379 kimage_free(image);
380 return ret;
381}
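
Since this introduces a new system call, a minimal userspace sketch of invoking it directly may help. The file paths are placeholders, SYS_kexec_file_load needs libc headers recent enough to define it (the raw number is 320 on x86_64), and the caller needs CAP_SYS_BOOT.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        const char *cmdline = "root=/dev/sda1 console=ttyS0";
        int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
        int initrd_fd = open("/boot/initrd.img", O_RDONLY);

        if (kernel_fd < 0 || initrd_fd < 0) {
                perror("open");
                return 1;
        }

        /* cmdline_len must include the trailing '\0'; the kernel checks it. */
        if (syscall(SYS_kexec_file_load, kernel_fd, initrd_fd,
                    strlen(cmdline) + 1, cmdline, 0UL)) {
                perror("kexec_file_load");
                return 1;
        }

        /* The image is now staged; a later "kexec -e" style reboot starts it. */
        return 0;
}
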
382
383static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
384 struct kexec_buf *kbuf)
385{
386 struct kimage *image = kbuf->image;
387 unsigned long temp_start, temp_end;
388
389 temp_end = min(end, kbuf->buf_max);
390 temp_start = temp_end - kbuf->memsz;
391
392 do {
393 /* align down start */
394 temp_start = temp_start & (~(kbuf->buf_align - 1));
395
396 if (temp_start < start || temp_start < kbuf->buf_min)
397 return 0;
398
399 temp_end = temp_start + kbuf->memsz - 1;
400
401 /*
402 * Make sure this does not conflict with any of existing
403 * segments
404 */
405 if (kimage_is_destination_range(image, temp_start, temp_end)) {
406 temp_start = temp_start - PAGE_SIZE;
407 continue;
408 }
409
410 /* We found a suitable memory range */
411 break;
412 } while (1);
413
414 /* If we are here, we found a suitable memory range */
415 kbuf->mem = temp_start;
416
417 /* Success, stop navigating through remaining System RAM ranges */
418 return 1;
419}
420
421static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
422 struct kexec_buf *kbuf)
423{
424 struct kimage *image = kbuf->image;
425 unsigned long temp_start, temp_end;
426
427 temp_start = max(start, kbuf->buf_min);
428
429 do {
430 temp_start = ALIGN(temp_start, kbuf->buf_align);
431 temp_end = temp_start + kbuf->memsz - 1;
432
433 if (temp_end > end || temp_end > kbuf->buf_max)
434 return 0;
435 /*
436 * Make sure this does not conflict with any of existing
437 * segments
438 */
439 if (kimage_is_destination_range(image, temp_start, temp_end)) {
440 temp_start = temp_start + PAGE_SIZE;
441 continue;
442 }
443
444 /* We found a suitable memory range */
445 break;
446 } while (1);
447
448 /* If we are here, we found a suitable memory range */
449 kbuf->mem = temp_start;
450
451 /* Success, stop navigating through remaining System RAM ranges */
452 return 1;
453}
454
455static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
456{
457 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
458 unsigned long sz = end - start + 1;
459
460 /* Returning 0 will take us to the next memory range */
461 if (sz < kbuf->memsz)
462 return 0;
463
464 if (end < kbuf->buf_min || start > kbuf->buf_max)
465 return 0;
466
467 /*
468 * Allocate memory top down within the RAM range; otherwise allocate
469 * bottom up.
470 */
471 if (kbuf->top_down)
472 return locate_mem_hole_top_down(start, end, kbuf);
473 return locate_mem_hole_bottom_up(start, end, kbuf);
474}
475
476/*
477 * Helper function for placing a buffer in a kexec segment. This assumes
478 * that kexec_mutex is held.
479 */
480int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
481 unsigned long memsz, unsigned long buf_align,
482 unsigned long buf_min, unsigned long buf_max,
483 bool top_down, unsigned long *load_addr)
484{
485
486 struct kexec_segment *ksegment;
487 struct kexec_buf buf, *kbuf;
488 int ret;
489
490 /* Currently adding segment this way is allowed only in file mode */
491 if (!image->file_mode)
492 return -EINVAL;
493
494 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
495 return -EINVAL;
496
497 /*
498 * Make sure we are not trying to add a buffer after allocating
499 * control pages. All segments need to be placed before any
500 * control pages are allocated, as the control page allocation
501 * logic goes through the list of segments to make sure there are
502 * no destination overlaps.
503 */
504 if (!list_empty(&image->control_pages)) {
505 WARN_ON(1);
506 return -EINVAL;
507 }
508
509 memset(&buf, 0, sizeof(struct kexec_buf));
510 kbuf = &buf;
511 kbuf->image = image;
512 kbuf->buffer = buffer;
513 kbuf->bufsz = bufsz;
514
515 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
516 kbuf->buf_align = max(buf_align, PAGE_SIZE);
517 kbuf->buf_min = buf_min;
518 kbuf->buf_max = buf_max;
519 kbuf->top_down = top_down;
520
521 /* Walk the RAM ranges and allocate a suitable range for the buffer */
522 if (image->type == KEXEC_TYPE_CRASH)
523 ret = walk_iomem_res("Crash kernel",
524 IORESOURCE_MEM | IORESOURCE_BUSY,
525 crashk_res.start, crashk_res.end, kbuf,
526 locate_mem_hole_callback);
527 else
528 ret = walk_system_ram_res(0, -1, kbuf,
529 locate_mem_hole_callback);
530 if (ret != 1) {
531 /* A suitable memory range could not be found for buffer */
532 return -EADDRNOTAVAIL;
533 }
534
535 /* Found a suitable memory range */
536 ksegment = &image->segment[image->nr_segments];
537 ksegment->kbuf = kbuf->buffer;
538 ksegment->bufsz = kbuf->bufsz;
539 ksegment->mem = kbuf->mem;
540 ksegment->memsz = kbuf->memsz;
541 image->nr_segments++;
542 *load_addr = ksegment->mem;
543 return 0;
544}
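
An illustrative fragment (not lifted from any real loader) of how an arch_kexec_kernel_image_load() implementation might place the initrd with this helper; min_addr and max_addr are assumed locals of the loader.

        unsigned long initrd_load_addr;
        int ret;

        ret = kexec_add_buffer(image, image->initrd_buf, image->initrd_buf_len,
                               image->initrd_buf_len, PAGE_SIZE, min_addr,
                               max_addr, /* top_down */ false,
                               &initrd_load_addr);
        if (ret)
                return ERR_PTR(ret);
        pr_debug("initrd placed at 0x%lx\n", initrd_load_addr);
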
545
546/* Calculate and store the digest of segments */
547static int kexec_calculate_store_digests(struct kimage *image)
548{
549 struct crypto_shash *tfm;
550 struct shash_desc *desc;
551 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
552 size_t desc_size, nullsz;
553 char *digest;
554 void *zero_buf;
555 struct kexec_sha_region *sha_regions;
556 struct purgatory_info *pi = &image->purgatory_info;
557
558 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
559 zero_buf_sz = PAGE_SIZE;
560
561 tfm = crypto_alloc_shash("sha256", 0, 0);
562 if (IS_ERR(tfm)) {
563 ret = PTR_ERR(tfm);
564 goto out;
565 }
566
567 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
568 desc = kzalloc(desc_size, GFP_KERNEL);
569 if (!desc) {
570 ret = -ENOMEM;
571 goto out_free_tfm;
572 }
573
574 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
575 sha_regions = vzalloc(sha_region_sz);
576 if (!sha_regions)
577 goto out_free_desc;
578
579 desc->tfm = tfm;
580 desc->flags = 0;
581
582 ret = crypto_shash_init(desc);
583 if (ret < 0)
584 goto out_free_sha_regions;
585
586 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
587 if (!digest) {
588 ret = -ENOMEM;
589 goto out_free_sha_regions;
590 }
591
592 for (j = i = 0; i < image->nr_segments; i++) {
593 struct kexec_segment *ksegment;
594
595 ksegment = &image->segment[i];
596 /*
597 * Skip purgatory as it will be modified once we put digest
598 * info in purgatory.
599 */
600 if (ksegment->kbuf == pi->purgatory_buf)
601 continue;
602
603 ret = crypto_shash_update(desc, ksegment->kbuf,
604 ksegment->bufsz);
605 if (ret)
606 break;
607
608 /*
609 * Assume rest of the buffer is filled with zero and
610 * update digest accordingly.
611 */
612 nullsz = ksegment->memsz - ksegment->bufsz;
613 while (nullsz) {
614 unsigned long bytes = nullsz;
615
616 if (bytes > zero_buf_sz)
617 bytes = zero_buf_sz;
618 ret = crypto_shash_update(desc, zero_buf, bytes);
619 if (ret)
620 break;
621 nullsz -= bytes;
622 }
623
624 if (ret)
625 break;
626
627 sha_regions[j].start = ksegment->mem;
628 sha_regions[j].len = ksegment->memsz;
629 j++;
630 }
631
632 if (!ret) {
633 ret = crypto_shash_final(desc, digest);
634 if (ret)
635 goto out_free_digest;
636 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
637 sha_regions, sha_region_sz, 0);
638 if (ret)
639 goto out_free_digest;
640
641 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
642 digest, SHA256_DIGEST_SIZE, 0);
643 if (ret)
644 goto out_free_digest;
645 }
646
647out_free_digest:
648 kfree(digest);
649out_free_sha_regions:
650 vfree(sha_regions);
651out_free_desc:
652 kfree(desc);
653out_free_tfm:
654 kfree(tfm);
655out:
656 return ret;
657}
658
659/* Actually load purgatory. A lot of this code is taken from kexec-tools */
660static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
661 unsigned long max, int top_down)
662{
663 struct purgatory_info *pi = &image->purgatory_info;
664 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
665 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
666 unsigned char *buf_addr, *src;
667 int i, ret = 0, entry_sidx = -1;
668 const Elf_Shdr *sechdrs_c;
669 Elf_Shdr *sechdrs = NULL;
670 void *purgatory_buf = NULL;
671
672 /*
673 * sechdrs_c points to the section headers in purgatory, which are
674 * read only. No modifications allowed.
675 */
676 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
677
678 /*
679 * We cannot modify sechdrs_c[] and its fields. It is read only.
680 * Copy it over to a local copy where one can store some temporary
681 * data and free it at the end. We need to modify ->sh_addr and
682 * ->sh_offset fields to keep track of permanent and temporary
683 * locations of sections.
684 */
685 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
686 if (!sechdrs)
687 return -ENOMEM;
688
689 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
690
691 /*
692 * We seem to have multiple copies of sections. The first copy is the one
693 * embedded in the kernel in a read only section. Some of these sections
694 * will be copied to a temporary buffer and relocated. And these
695 * sections will finally be copied to their final destination at
696 * segment load time.
697 *
698 * Use ->sh_offset to reflect section address in memory. It will
699 * point to original read only copy if section is not allocatable.
700 * Otherwise it will point to temporary copy which will be relocated.
701 *
702 * Use ->sh_addr to contain final address of the section where it
703 * will go during execution time.
704 */
705 for (i = 0; i < pi->ehdr->e_shnum; i++) {
706 if (sechdrs[i].sh_type == SHT_NOBITS)
707 continue;
708
709 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
710 sechdrs[i].sh_offset;
711 }
712
713 /*
714 * Identify entry point section and make entry relative to section
715 * start.
716 */
717 entry = pi->ehdr->e_entry;
718 for (i = 0; i < pi->ehdr->e_shnum; i++) {
719 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
720 continue;
721
722 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
723 continue;
724
725 /* Make entry section relative */
726 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
727 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
728 pi->ehdr->e_entry)) {
729 entry_sidx = i;
730 entry -= sechdrs[i].sh_addr;
731 break;
732 }
733 }
734
735 /* Determine how much memory is needed to load relocatable object. */
736 buf_align = 1;
737 bss_align = 1;
738 buf_sz = 0;
739 bss_sz = 0;
740
741 for (i = 0; i < pi->ehdr->e_shnum; i++) {
742 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
743 continue;
744
745 align = sechdrs[i].sh_addralign;
746 if (sechdrs[i].sh_type != SHT_NOBITS) {
747 if (buf_align < align)
748 buf_align = align;
749 buf_sz = ALIGN(buf_sz, align);
750 buf_sz += sechdrs[i].sh_size;
751 } else {
752 /* bss section */
753 if (bss_align < align)
754 bss_align = align;
755 bss_sz = ALIGN(bss_sz, align);
756 bss_sz += sechdrs[i].sh_size;
757 }
758 }
759
760 /* Determine the bss padding required to align bss properly */
761 bss_pad = 0;
762 if (buf_sz & (bss_align - 1))
763 bss_pad = bss_align - (buf_sz & (bss_align - 1));
764
765 memsz = buf_sz + bss_pad + bss_sz;
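        /*
         * Worked example with illustrative numbers: if the progbits sections
         * add up to buf_sz = 0x2f40 and the largest bss alignment is
         * bss_align = 0x1000, then bss_pad = 0x1000 - 0xf40 = 0xc0, so the
         * bss starts page aligned at load_addr + 0x3000 and
         * memsz = 0x3000 + bss_sz.
         */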
766
767 /* Allocate buffer for purgatory */
768 purgatory_buf = vzalloc(buf_sz);
769 if (!purgatory_buf) {
770 ret = -ENOMEM;
771 goto out;
772 }
773
774 if (buf_align < bss_align)
775 buf_align = bss_align;
776
777 /* Add buffer to segment list */
778 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
779 buf_align, min, max, top_down,
780 &pi->purgatory_load_addr);
781 if (ret)
782 goto out;
783
784 /* Load SHF_ALLOC sections */
785 buf_addr = purgatory_buf;
786 load_addr = curr_load_addr = pi->purgatory_load_addr;
787 bss_addr = load_addr + buf_sz + bss_pad;
788
789 for (i = 0; i < pi->ehdr->e_shnum; i++) {
790 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
791 continue;
792
793 align = sechdrs[i].sh_addralign;
794 if (sechdrs[i].sh_type != SHT_NOBITS) {
795 curr_load_addr = ALIGN(curr_load_addr, align);
796 offset = curr_load_addr - load_addr;
797 /* We already modified ->sh_offset to keep the src addr */
798 src = (char *) sechdrs[i].sh_offset;
799 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
800
801 /* Store load address and source address of section */
802 sechdrs[i].sh_addr = curr_load_addr;
803
804 /*
805 * This section got copied to temporary buffer. Update
806 * ->sh_offset accordingly.
807 */
808 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
809
810 /* Advance to the next address */
811 curr_load_addr += sechdrs[i].sh_size;
812 } else {
813 bss_addr = ALIGN(bss_addr, align);
814 sechdrs[i].sh_addr = bss_addr;
815 bss_addr += sechdrs[i].sh_size;
816 }
817 }
818
819 /* Update entry point based on load address of text section */
820 if (entry_sidx >= 0)
821 entry += sechdrs[entry_sidx].sh_addr;
822
823 /* Make kernel jump to purgatory after shutdown */
824 image->start = entry;
825
826 /* Used later to get/set symbol values */
827 pi->sechdrs = sechdrs;
828
829 /*
830 * Used later to identify which section is purgatory and skip it
831 * from checksumming.
832 */
833 pi->purgatory_buf = purgatory_buf;
834 return ret;
835out:
836 vfree(sechdrs);
837 vfree(purgatory_buf);
838 return ret;
839}
840
841static int kexec_apply_relocations(struct kimage *image)
842{
843 int i, ret;
844 struct purgatory_info *pi = &image->purgatory_info;
845 Elf_Shdr *sechdrs = pi->sechdrs;
846
847 /* Apply relocations */
848 for (i = 0; i < pi->ehdr->e_shnum; i++) {
849 Elf_Shdr *section, *symtab;
850
851 if (sechdrs[i].sh_type != SHT_RELA &&
852 sechdrs[i].sh_type != SHT_REL)
853 continue;
854
855 /*
856 * For a section of type SHT_RELA/SHT_REL,
857 * ->sh_link contains the section header index of the associated
858 * symbol table, and ->sh_info contains the section header
859 * index of the section to which the relocations apply.
860 */
861 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
862 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
863 return -ENOEXEC;
864
865 section = &sechdrs[sechdrs[i].sh_info];
866 symtab = &sechdrs[sechdrs[i].sh_link];
867
868 if (!(section->sh_flags & SHF_ALLOC))
869 continue;
870
871 /*
872 * symtab->sh_link contains the section header index of the associated
873 * string table.
874 */
875 if (symtab->sh_link >= pi->ehdr->e_shnum)
876 /* Invalid section number? */
877 continue;
878
879 /*
880 * The respective architecture needs to provide support for applying
881 * relocations of type SHT_RELA/SHT_REL.
882 */
883 if (sechdrs[i].sh_type == SHT_RELA)
884 ret = arch_kexec_apply_relocations_add(pi->ehdr,
885 sechdrs, i);
886 else if (sechdrs[i].sh_type == SHT_REL)
887 ret = arch_kexec_apply_relocations(pi->ehdr,
888 sechdrs, i);
889 if (ret)
890 return ret;
891 }
892
893 return 0;
894}
895
896/* Load relocatable purgatory object and relocate it appropriately */
897int kexec_load_purgatory(struct kimage *image, unsigned long min,
898 unsigned long max, int top_down,
899 unsigned long *load_addr)
900{
901 struct purgatory_info *pi = &image->purgatory_info;
902 int ret;
903
904 if (kexec_purgatory_size <= 0)
905 return -EINVAL;
906
907 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
908 return -ENOEXEC;
909
910 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
911
912 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
913 || pi->ehdr->e_type != ET_REL
914 || !elf_check_arch(pi->ehdr)
915 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
916 return -ENOEXEC;
917
918 if (pi->ehdr->e_shoff >= kexec_purgatory_size
919 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
920 kexec_purgatory_size - pi->ehdr->e_shoff))
921 return -ENOEXEC;
922
923 ret = __kexec_load_purgatory(image, min, max, top_down);
924 if (ret)
925 return ret;
926
927 ret = kexec_apply_relocations(image);
928 if (ret)
929 goto out;
930
931 *load_addr = pi->purgatory_load_addr;
932 return 0;
933out:
934 vfree(pi->sechdrs);
935 vfree(pi->purgatory_buf);
936 return ret;
937}
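
A hedged sketch of the calling sequence an image loader is expected to follow with the purgatory helpers in this file; min_addr, max_addr and the "entry_point" symbol name are assumptions, not the actual x86 code.

        unsigned long purgatory_load_addr;
        unsigned long kernel_entry = 0x1000000;         /* example value */
        int ret;

        ret = kexec_load_purgatory(image, min_addr, max_addr,
                                   /* top_down */ 1, &purgatory_load_addr);
        if (ret)
                return ERR_PTR(ret);

        /* Patch a global inside purgatory so it knows where to jump next. */
        ret = kexec_purgatory_get_set_symbol(image, "entry_point", &kernel_entry,
                                             sizeof(kernel_entry), 0);
        if (ret)
                return ERR_PTR(ret);
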
938
939static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
940 const char *name)
941{
942 Elf_Sym *syms;
943 Elf_Shdr *sechdrs;
944 Elf_Ehdr *ehdr;
945 int i, k;
946 const char *strtab;
947
948 if (!pi->sechdrs || !pi->ehdr)
949 return NULL;
950
951 sechdrs = pi->sechdrs;
952 ehdr = pi->ehdr;
953
954 for (i = 0; i < ehdr->e_shnum; i++) {
955 if (sechdrs[i].sh_type != SHT_SYMTAB)
956 continue;
957
958 if (sechdrs[i].sh_link >= ehdr->e_shnum)
959 /* Invalid strtab section number */
960 continue;
961 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
962 syms = (Elf_Sym *)sechdrs[i].sh_offset;
963
964 /* Go through symbols for a match */
965 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
966 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
967 continue;
968
969 if (strcmp(strtab + syms[k].st_name, name) != 0)
970 continue;
971
972 if (syms[k].st_shndx == SHN_UNDEF ||
973 syms[k].st_shndx >= ehdr->e_shnum) {
974 pr_debug("Symbol: %s has bad section index %d.\n",
975 name, syms[k].st_shndx);
976 return NULL;
977 }
978
979 /* Found the symbol we are looking for */
980 return &syms[k];
981 }
982 }
983
984 return NULL;
985}
986
987void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
988{
989 struct purgatory_info *pi = &image->purgatory_info;
990 Elf_Sym *sym;
991 Elf_Shdr *sechdr;
992
993 sym = kexec_purgatory_find_symbol(pi, name);
994 if (!sym)
995 return ERR_PTR(-EINVAL);
996
997 sechdr = &pi->sechdrs[sym->st_shndx];
998
999 /*
1000 * Returns the address where symbol will finally be loaded after
1001 * kexec_load_segment()
1002 */
1003 return (void *)(sechdr->sh_addr + sym->st_value);
1004}
1005
1006/*
1007 * Get or set the value of a symbol. If "get_value" is true, the symbol value
1008 * is returned in buf; otherwise the symbol value is set based on the value in buf.
1009 */
1010int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
1011 void *buf, unsigned int size, bool get_value)
1012{
1013 Elf_Sym *sym;
1014 Elf_Shdr *sechdrs;
1015 struct purgatory_info *pi = &image->purgatory_info;
1016 char *sym_buf;
1017
1018 sym = kexec_purgatory_find_symbol(pi, name);
1019 if (!sym)
1020 return -EINVAL;
1021
1022 if (sym->st_size != size) {
1023 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
1024 name, (unsigned long)sym->st_size, size);
1025 return -EINVAL;
1026 }
1027
1028 sechdrs = pi->sechdrs;
1029
1030 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
1031 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
1032 get_value ? "get" : "set");
1033 return -EINVAL;
1034 }
1035
1036 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
1037 sym->st_value;
1038
1039 if (get_value)
1040 memcpy((void *)buf, sym_buf, size);
1041 else
1042 memcpy((void *)sym_buf, buf, size);
1043
1044 return 0;
1045}
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
new file mode 100644
index 000000000000..e4392a698ad4
--- /dev/null
+++ b/kernel/kexec_internal.h
@@ -0,0 +1,22 @@
1#ifndef LINUX_KEXEC_INTERNAL_H
2#define LINUX_KEXEC_INTERNAL_H
3
4#include <linux/kexec.h>
5
6struct kimage *do_kimage_alloc_init(void);
7int sanity_check_segment_list(struct kimage *image);
8void kimage_free_page_list(struct list_head *list);
9void kimage_free(struct kimage *image);
10int kimage_load_segment(struct kimage *image, struct kexec_segment *segment);
11void kimage_terminate(struct kimage *image);
12int kimage_is_destination_range(struct kimage *image,
13 unsigned long start, unsigned long end);
14
15extern struct mutex kexec_mutex;
16
17#ifdef CONFIG_KEXEC_FILE
18void kimage_file_post_load_cleanup(struct kimage *image);
19#else /* CONFIG_KEXEC_FILE */
20static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
21#endif /* CONFIG_KEXEC_FILE */
22#endif /* LINUX_KEXEC_INTERNAL_H */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 2777f40a9c7b..da98d0593de2 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
45 45
46extern int max_threads; 46extern int max_threads;
47 47
48static struct workqueue_struct *khelper_wq;
49
50#define CAP_BSET (void *)1 48#define CAP_BSET (void *)1
51#define CAP_PI (void *)2 49#define CAP_PI (void *)2
52 50
@@ -114,10 +112,11 @@ out:
114 * @...: arguments as specified in the format string 112 * @...: arguments as specified in the format string
115 * 113 *
116 * Load a module using the user mode module loader. The function returns 114 * Load a module using the user mode module loader. The function returns
117 * zero on success or a negative errno code on failure. Note that a 115 * zero on success or a negative errno code or positive exit code from
118 * successful module load does not mean the module did not then unload 116 * "modprobe" on failure. Note that a successful module load does not mean
119 * and exit on an error of its own. Callers must check that the service 117 * the module did not then unload and exit on an error of its own. Callers
120 * they requested is now available not blindly invoke it. 118 * must check that the service they requested is now available not blindly
119 * invoke it.
121 * 120 *
122 * If module auto-loading support is disabled then this function 121 * If module auto-loading support is disabled then this function
123 * becomes a no-operation. 122 * becomes a no-operation.
@@ -213,7 +212,7 @@ static void umh_complete(struct subprocess_info *sub_info)
213/* 212/*
214 * This is the task which runs the usermode application 213 * This is the task which runs the usermode application
215 */ 214 */
216static int ____call_usermodehelper(void *data) 215static int call_usermodehelper_exec_async(void *data)
217{ 216{
218 struct subprocess_info *sub_info = data; 217 struct subprocess_info *sub_info = data;
219 struct cred *new; 218 struct cred *new;
@@ -223,12 +222,9 @@ static int ____call_usermodehelper(void *data)
223 flush_signal_handlers(current, 1); 222 flush_signal_handlers(current, 1);
224 spin_unlock_irq(&current->sighand->siglock); 223 spin_unlock_irq(&current->sighand->siglock);
225 224
226 /* We can run anywhere, unlike our parent keventd(). */
227 set_cpus_allowed_ptr(current, cpu_all_mask);
228
229 /* 225 /*
230 * Our parent is keventd, which runs with elevated scheduling priority. 226 * Our parent (unbound workqueue) runs with elevated scheduling
231 * Avoid propagating that into the userspace child. 227 * priority. Avoid propagating that into the userspace child.
232 */ 228 */
233 set_user_nice(current, 0); 229 set_user_nice(current, 0);
234 230
@@ -258,7 +254,10 @@ static int ____call_usermodehelper(void *data)
258 (const char __user *const __user *)sub_info->envp); 254 (const char __user *const __user *)sub_info->envp);
259out: 255out:
260 sub_info->retval = retval; 256 sub_info->retval = retval;
261 /* wait_for_helper() will call umh_complete if UHM_WAIT_PROC. */ 257 /*
258 * call_usermodehelper_exec_sync() will call umh_complete
259 * if UHM_WAIT_PROC.
260 */
262 if (!(sub_info->wait & UMH_WAIT_PROC)) 261 if (!(sub_info->wait & UMH_WAIT_PROC))
263 umh_complete(sub_info); 262 umh_complete(sub_info);
264 if (!retval) 263 if (!retval)
@@ -266,15 +265,14 @@ out:
266 do_exit(0); 265 do_exit(0);
267} 266}
268 267
269/* Keventd can't block, but this (a child) can. */ 268/* Handles UMH_WAIT_PROC. */
270static int wait_for_helper(void *data) 269static void call_usermodehelper_exec_sync(struct subprocess_info *sub_info)
271{ 270{
272 struct subprocess_info *sub_info = data;
273 pid_t pid; 271 pid_t pid;
274 272
275 /* If SIGCLD is ignored sys_wait4 won't populate the status. */ 273 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
276 kernel_sigaction(SIGCHLD, SIG_DFL); 274 kernel_sigaction(SIGCHLD, SIG_DFL);
277 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 275 pid = kernel_thread(call_usermodehelper_exec_async, sub_info, SIGCHLD);
278 if (pid < 0) { 276 if (pid < 0) {
279 sub_info->retval = pid; 277 sub_info->retval = pid;
280 } else { 278 } else {
@@ -282,44 +280,60 @@ static int wait_for_helper(void *data)
282 /* 280 /*
283 * Normally it is bogus to call wait4() from in-kernel because 281 * Normally it is bogus to call wait4() from in-kernel because
284 * wait4() wants to write the exit code to a userspace address. 282 * wait4() wants to write the exit code to a userspace address.
285 * But wait_for_helper() always runs as keventd, and put_user() 283 * But call_usermodehelper_exec_sync() always runs as kernel
286 * to a kernel address works OK for kernel threads, due to their 284 * thread (workqueue) and put_user() to a kernel address works
287 * having an mm_segment_t which spans the entire address space. 285 * OK for kernel threads, due to their having an mm_segment_t
286 * which spans the entire address space.
288 * 287 *
289 * Thus the __user pointer cast is valid here. 288 * Thus the __user pointer cast is valid here.
290 */ 289 */
291 sys_wait4(pid, (int __user *)&ret, 0, NULL); 290 sys_wait4(pid, (int __user *)&ret, 0, NULL);
292 291
293 /* 292 /*
294 * If ret is 0, either ____call_usermodehelper failed and the 293 * If ret is 0, either call_usermodehelper_exec_async failed and
295 * real error code is already in sub_info->retval or 294 * the real error code is already in sub_info->retval or
296 * sub_info->retval is 0 anyway, so don't mess with it then. 295 * sub_info->retval is 0 anyway, so don't mess with it then.
297 */ 296 */
298 if (ret) 297 if (ret)
299 sub_info->retval = ret; 298 sub_info->retval = ret;
300 } 299 }
301 300
301 /* Restore default kernel sig handler */
302 kernel_sigaction(SIGCHLD, SIG_IGN);
303
302 umh_complete(sub_info); 304 umh_complete(sub_info);
303 do_exit(0);
304} 305}
305 306
306/* This is run by khelper thread */ 307/*
307static void __call_usermodehelper(struct work_struct *work) 308 * We need to create the usermodehelper kernel thread from a task that is affine
309 * to an optimized set of CPUs (or nohz housekeeping ones) such that they
310 * inherit a widest affinity irrespective of call_usermodehelper() callers with
311 * possibly reduced affinity (eg: per-cpu workqueues). We don't want
312 * usermodehelper targets to contend a busy CPU.
313 *
314 * Unbound workqueues provide such wide affinity and allow to block on
315 * UMH_WAIT_PROC requests without blocking pending request (up to some limit).
316 *
317 * Besides, workqueues provide the privilege level that caller might not have
318 * to perform the usermodehelper request.
319 *
320 */
321static void call_usermodehelper_exec_work(struct work_struct *work)
308{ 322{
309 struct subprocess_info *sub_info = 323 struct subprocess_info *sub_info =
310 container_of(work, struct subprocess_info, work); 324 container_of(work, struct subprocess_info, work);
311 pid_t pid;
312 325
313 if (sub_info->wait & UMH_WAIT_PROC) 326 if (sub_info->wait & UMH_WAIT_PROC) {
314 pid = kernel_thread(wait_for_helper, sub_info, 327 call_usermodehelper_exec_sync(sub_info);
315 CLONE_FS | CLONE_FILES | SIGCHLD); 328 } else {
316 else 329 pid_t pid;
317 pid = kernel_thread(____call_usermodehelper, sub_info,
318 SIGCHLD);
319 330
320 if (pid < 0) { 331 pid = kernel_thread(call_usermodehelper_exec_async, sub_info,
321 sub_info->retval = pid; 332 SIGCHLD);
322 umh_complete(sub_info); 333 if (pid < 0) {
334 sub_info->retval = pid;
335 umh_complete(sub_info);
336 }
323 } 337 }
324} 338}
325 339
@@ -509,7 +523,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
509 if (!sub_info) 523 if (!sub_info)
510 goto out; 524 goto out;
511 525
512 INIT_WORK(&sub_info->work, __call_usermodehelper); 526 INIT_WORK(&sub_info->work, call_usermodehelper_exec_work);
513 sub_info->path = path; 527 sub_info->path = path;
514 sub_info->argv = argv; 528 sub_info->argv = argv;
515 sub_info->envp = envp; 529 sub_info->envp = envp;
@@ -531,8 +545,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
531 * from interrupt context. 545 * from interrupt context.
532 * 546 *
533 * Runs a user-space application. The application is started 547 * Runs a user-space application. The application is started
534 * asynchronously if wait is not set, and runs as a child of keventd. 548 * asynchronously if wait is not set, and runs as a child of system workqueues.
535 * (ie. it runs with full root capabilities). 549 * (ie. it runs with full root capabilities and optimized affinity).
536 */ 550 */
537int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) 551int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
538{ 552{
@@ -544,7 +558,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
544 return -EINVAL; 558 return -EINVAL;
545 } 559 }
546 helper_lock(); 560 helper_lock();
547 if (!khelper_wq || usermodehelper_disabled) { 561 if (usermodehelper_disabled) {
548 retval = -EBUSY; 562 retval = -EBUSY;
549 goto out; 563 goto out;
550 } 564 }
@@ -556,7 +570,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
556 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done; 570 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
557 sub_info->wait = wait; 571 sub_info->wait = wait;
558 572
559 queue_work(khelper_wq, &sub_info->work); 573 queue_work(system_unbound_wq, &sub_info->work);
560 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 574 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
561 goto unlock; 575 goto unlock;
562 576
@@ -686,9 +700,3 @@ struct ctl_table usermodehelper_table[] = {
686 }, 700 },
687 { } 701 { }
688}; 702};
689
690void __init usermodehelper_init(void)
691{
692 khelper_wq = create_singlethread_workqueue("khelper");
693 BUG_ON(!khelper_wq);
694}
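
The khelper removal above is transparent to callers: the setup/exec API is unchanged, only the work item now runs on an unbound system workqueue. A minimal illustrative caller, not part of the patch (the helper path and wrapper function are hypothetical):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/kmod.h>

/* Run a hypothetical helper binary and wait for it to exit (UMH_WAIT_PROC). */
static int run_helper_example(void)
{
	char *argv[] = { "/sbin/example-helper", NULL };	/* hypothetical path */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
	struct subprocess_info *info;

	info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL,
					 NULL, NULL, NULL);
	if (!info)
		return -ENOMEM;

	/* After this patch the request is queued on system_unbound_wq. */
	return call_usermodehelper_exec(info, UMH_WAIT_PROC);
}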
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index c90e417bb963..d10ab6b9b5e0 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1332,7 +1332,7 @@ bool __weak arch_within_kprobe_blacklist(unsigned long addr)
1332 addr < (unsigned long)__kprobes_text_end; 1332 addr < (unsigned long)__kprobes_text_end;
1333} 1333}
1334 1334
1335static bool within_kprobe_blacklist(unsigned long addr) 1335bool within_kprobe_blacklist(unsigned long addr)
1336{ 1336{
1337 struct kprobe_blacklist_entry *ent; 1337 struct kprobe_blacklist_entry *ent;
1338 1338
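
Dropping the static qualifier from within_kprobe_blacklist() lets code outside kprobes.c filter addresses before attempting registration. A hedged sketch of such a caller, assuming the declaration is made visible through <linux/kprobes.h> (the wrapper name is hypothetical):

#include <linux/errno.h>
#include <linux/kprobes.h>

/* Hypothetical pre-check used by a probe-registration path. */
static int check_probe_addr(void *addr)
{
	if (within_kprobe_blacklist((unsigned long)addr))
		return -EINVAL;		/* refuse to probe blacklisted text */
	return 0;
}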
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 6683ccef9fff..e83b26464061 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -90,7 +90,7 @@ static ssize_t profiling_store(struct kobject *kobj,
90KERNEL_ATTR_RW(profiling); 90KERNEL_ATTR_RW(profiling);
91#endif 91#endif
92 92
93#ifdef CONFIG_KEXEC 93#ifdef CONFIG_KEXEC_CORE
94static ssize_t kexec_loaded_show(struct kobject *kobj, 94static ssize_t kexec_loaded_show(struct kobject *kobj,
95 struct kobj_attribute *attr, char *buf) 95 struct kobj_attribute *attr, char *buf)
96{ 96{
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
134} 134}
135KERNEL_ATTR_RO(vmcoreinfo); 135KERNEL_ATTR_RO(vmcoreinfo);
136 136
137#endif /* CONFIG_KEXEC */ 137#endif /* CONFIG_KEXEC_CORE */
138 138
139/* whether file capabilities are enabled */ 139/* whether file capabilities are enabled */
140static ssize_t fscaps_show(struct kobject *kobj, 140static ssize_t fscaps_show(struct kobject *kobj,
@@ -196,7 +196,7 @@ static struct attribute * kernel_attrs[] = {
196#ifdef CONFIG_PROFILING 196#ifdef CONFIG_PROFILING
197 &profiling_attr.attr, 197 &profiling_attr.attr,
198#endif 198#endif
199#ifdef CONFIG_KEXEC 199#ifdef CONFIG_KEXEC_CORE
200 &kexec_loaded_attr.attr, 200 &kexec_loaded_attr.attr,
201 &kexec_crash_loaded_attr.attr, 201 &kexec_crash_loaded_attr.attr,
202 &kexec_crash_size_attr.attr, 202 &kexec_crash_size_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 10e489c448fe..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -97,6 +97,7 @@ bool kthread_should_park(void)
97{ 97{
98 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags); 98 return test_bit(KTHREAD_SHOULD_PARK, &to_kthread(current)->flags);
99} 99}
100EXPORT_SYMBOL_GPL(kthread_should_park);
100 101
101/** 102/**
102 * kthread_freezable_should_stop - should this freezable kthread return now? 103 * kthread_freezable_should_stop - should this freezable kthread return now?
@@ -171,6 +172,7 @@ void kthread_parkme(void)
171{ 172{
172 __kthread_parkme(to_kthread(current)); 173 __kthread_parkme(to_kthread(current));
173} 174}
175EXPORT_SYMBOL_GPL(kthread_parkme);
174 176
175static int kthread(void *_create) 177static int kthread(void *_create)
176{ 178{
@@ -246,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
246 * kthread_create_on_node - create a kthread. 248 * kthread_create_on_node - create a kthread.
247 * @threadfn: the function to run until signal_pending(current). 249 * @threadfn: the function to run until signal_pending(current).
248 * @data: data ptr for @threadfn. 250 * @data: data ptr for @threadfn.
249 * @node: memory node number. 251 * @node: task and thread structures for the thread are allocated on this node
250 * @namefmt: printf-style name for the thread. 252 * @namefmt: printf-style name for the thread.
251 * 253 *
252 * Description: This helper function creates and names a kernel 254 * Description: This helper function creates and names a kernel
253 * thread. The thread will be stopped: use wake_up_process() to start 255 * thread. The thread will be stopped: use wake_up_process() to start
254 * it. See also kthread_run(). 256 * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
257 * is affine to all CPUs.
255 * 258 *
256 * If thread is going to be bound on a particular cpu, give its node 259 * If thread is going to be bound on a particular cpu, give its node
257 * in @node, to get NUMA affinity for kthread stack, or else give -1. 260 * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
258 * When woken, the thread will run @threadfn() with @data as its 261 * When woken, the thread will run @threadfn() with @data as its
259 * argument. @threadfn() can either call do_exit() directly if it is a 262 * argument. @threadfn() can either call do_exit() directly if it is a
260 * standalone thread for which no one will call kthread_stop(), or 263 * standalone thread for which no one will call kthread_stop(), or
@@ -325,16 +328,30 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
325} 328}
326EXPORT_SYMBOL(kthread_create_on_node); 329EXPORT_SYMBOL(kthread_create_on_node);
327 330
328static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) 331static void __kthread_bind_mask(struct task_struct *p, const struct cpumask *mask, long state)
329{ 332{
330 /* Must have done schedule() in kthread() before we set_task_cpu */ 333 unsigned long flags;
334
331 if (!wait_task_inactive(p, state)) { 335 if (!wait_task_inactive(p, state)) {
332 WARN_ON(1); 336 WARN_ON(1);
333 return; 337 return;
334 } 338 }
339
335 /* It's safe because the task is inactive. */ 340 /* It's safe because the task is inactive. */
336 do_set_cpus_allowed(p, cpumask_of(cpu)); 341 raw_spin_lock_irqsave(&p->pi_lock, flags);
342 do_set_cpus_allowed(p, mask);
337 p->flags |= PF_NO_SETAFFINITY; 343 p->flags |= PF_NO_SETAFFINITY;
344 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
345}
346
347static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
348{
349 __kthread_bind_mask(p, cpumask_of(cpu), state);
350}
351
352void kthread_bind_mask(struct task_struct *p, const struct cpumask *mask)
353{
354 __kthread_bind_mask(p, mask, TASK_UNINTERRUPTIBLE);
338} 355}
339 356
340/** 357/**
@@ -411,6 +428,7 @@ void kthread_unpark(struct task_struct *k)
411 if (kthread) 428 if (kthread)
412 __kthread_unpark(k, kthread); 429 __kthread_unpark(k, kthread);
413} 430}
431EXPORT_SYMBOL_GPL(kthread_unpark);
414 432
415/** 433/**
416 * kthread_park - park a thread created by kthread_create(). 434 * kthread_park - park a thread created by kthread_create().
@@ -441,6 +459,7 @@ int kthread_park(struct task_struct *k)
441 } 459 }
442 return ret; 460 return ret;
443} 461}
462EXPORT_SYMBOL_GPL(kthread_park);
444 463
445/** 464/**
446 * kthread_stop - stop a thread created by kthread_create(). 465 * kthread_stop - stop a thread created by kthread_create().
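
kthread_bind_mask() mirrors kthread_bind() but pins a still-inactive kthread to an entire cpumask, now under pi_lock. A rough usage sketch, assuming the new helper is declared in <linux/kthread.h> (the wrapper name is illustrative):

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Create a kthread, restrict it to @mask, then let it run. */
static struct task_struct *start_masked_worker(int (*fn)(void *), void *data,
					       const struct cpumask *mask)
{
	struct task_struct *tsk;

	tsk = kthread_create_on_node(fn, data, NUMA_NO_NODE, "masked-worker");
	if (IS_ERR(tsk))
		return tsk;

	kthread_bind_mask(tsk, mask);	/* task has not been woken yet */
	wake_up_process(tsk);
	return tsk;
}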
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index c40ebcca0495..6e5344112419 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -348,8 +348,10 @@ static void klp_disable_func(struct klp_func *func)
348{ 348{
349 struct klp_ops *ops; 349 struct klp_ops *ops;
350 350
351 WARN_ON(func->state != KLP_ENABLED); 351 if (WARN_ON(func->state != KLP_ENABLED))
352 WARN_ON(!func->old_addr); 352 return;
353 if (WARN_ON(!func->old_addr))
354 return;
353 355
354 ops = klp_find_ops(func->old_addr); 356 ops = klp_find_ops(func->old_addr);
355 if (WARN_ON(!ops)) 357 if (WARN_ON(!ops))
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y += mutex.o semaphore.o rwsem.o 2obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
3 3
4ifdef CONFIG_FUNCTION_TRACER 4ifdef CONFIG_FUNCTION_TRACER
5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) 5CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
20obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o 20obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
21obj-$(CONFIG_RT_MUTEXES) += rtmutex.o 21obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
22obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o 22obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
23obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
24obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 23obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
25obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o 24obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
26obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o 25obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
27obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o 26obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
28obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
29obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o 27obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
30obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o 28obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
88 __up_read(&brw->rw_sem); 88 __up_read(&brw->rw_sem);
89} 89}
90 90
91int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
92{
93 if (unlikely(!update_fast_ctr(brw, +1))) {
94 if (!__down_read_trylock(&brw->rw_sem))
95 return 0;
96 atomic_inc(&brw->slow_read_ctr);
97 __up_read(&brw->rw_sem);
98 }
99
100 rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
101 return 1;
102}
103
91void percpu_up_read(struct percpu_rw_semaphore *brw) 104void percpu_up_read(struct percpu_rw_semaphore *brw)
92{ 105{
93 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); 106 rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
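
percpu_down_read_trylock() gives readers a non-blocking entry point: the per-cpu fast path is tried first and the shared rwsem is only trylocked on the slow path. A short illustrative caller, assuming the declaration is added to <linux/percpu-rwsem.h> alongside this change (the function name is hypothetical):

#include <linux/errno.h>
#include <linux/percpu-rwsem.h>

static int read_side_or_busy(struct percpu_rw_semaphore *sem)
{
	if (!percpu_down_read_trylock(sem))
		return -EBUSY;		/* a writer holds or is taking it */

	/* ... read-side critical section ... */

	percpu_up_read(sem);
	return 0;
}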
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
55{ 55{
56 while ((cnts & _QW_WMASK) == _QW_LOCKED) { 56 while ((cnts & _QW_WMASK) == _QW_LOCKED) {
57 cpu_relax_lowlatency(); 57 cpu_relax_lowlatency();
58 cnts = smp_load_acquire((u32 *)&lock->cnts); 58 cnts = atomic_read_acquire(&lock->cnts);
59 } 59 }
60} 60}
61 61
62/** 62/**
63 * queue_read_lock_slowpath - acquire read lock of a queue rwlock 63 * queued_read_lock_slowpath - acquire read lock of a queue rwlock
64 * @lock: Pointer to queue rwlock structure 64 * @lock: Pointer to queue rwlock structure
65 * @cnts: Current qrwlock lock value
65 */ 66 */
66void queue_read_lock_slowpath(struct qrwlock *lock) 67void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
67{ 68{
68 u32 cnts;
69
70 /* 69 /*
71 * Readers come here when they cannot get the lock without waiting 70 * Readers come here when they cannot get the lock without waiting
72 */ 71 */
73 if (unlikely(in_interrupt())) { 72 if (unlikely(in_interrupt())) {
74 /* 73 /*
75 * Readers in interrupt context will spin until the lock is 74 * Readers in interrupt context will get the lock immediately
76 * available without waiting in the queue. 75 * if the writer is just waiting (not holding the lock yet).
76 * The rspin_until_writer_unlock() function returns immediately
77 * in this case. Otherwise, they will spin (with ACQUIRE
78 * semantics) until the lock is available without waiting in
79 * the queue.
77 */ 80 */
78 cnts = smp_load_acquire((u32 *)&lock->cnts);
79 rspin_until_writer_unlock(lock, cnts); 81 rspin_until_writer_unlock(lock, cnts);
80 return; 82 return;
81 } 83 }
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
87 arch_spin_lock(&lock->lock); 89 arch_spin_lock(&lock->lock);
88 90
89 /* 91 /*
90 * At the head of the wait queue now, wait until the writer state 92 * The ACQUIRE semantics of the following spinning code ensure
91 * goes to 0 and then try to increment the reader count and get 93 * that accesses can't leak upwards out of our subsequent critical
92 * the lock. It is possible that an incoming writer may steal the 94 * section in the case that the lock is currently held for write.
93 * lock in the interim, so it is necessary to check the writer byte
94 * to make sure that the write lock isn't taken.
95 */ 95 */
96 while (atomic_read(&lock->cnts) & _QW_WMASK) 96 cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
97 cpu_relax_lowlatency();
98
99 cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
100 rspin_until_writer_unlock(lock, cnts); 97 rspin_until_writer_unlock(lock, cnts);
101 98
102 /* 99 /*
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
104 */ 101 */
105 arch_spin_unlock(&lock->lock); 102 arch_spin_unlock(&lock->lock);
106} 103}
107EXPORT_SYMBOL(queue_read_lock_slowpath); 104EXPORT_SYMBOL(queued_read_lock_slowpath);
108 105
109/** 106/**
110 * queue_write_lock_slowpath - acquire write lock of a queue rwlock 107 * queued_write_lock_slowpath - acquire write lock of a queue rwlock
111 * @lock : Pointer to queue rwlock structure 108 * @lock : Pointer to queue rwlock structure
112 */ 109 */
113void queue_write_lock_slowpath(struct qrwlock *lock) 110void queued_write_lock_slowpath(struct qrwlock *lock)
114{ 111{
115 u32 cnts; 112 u32 cnts;
116 113
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
119 116
120 /* Try to acquire the lock directly if no reader is present */ 117 /* Try to acquire the lock directly if no reader is present */
121 if (!atomic_read(&lock->cnts) && 118 if (!atomic_read(&lock->cnts) &&
122 (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0)) 119 (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
123 goto unlock; 120 goto unlock;
124 121
125 /* 122 /*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
130 struct __qrwlock *l = (struct __qrwlock *)lock; 127 struct __qrwlock *l = (struct __qrwlock *)lock;
131 128
132 if (!READ_ONCE(l->wmode) && 129 if (!READ_ONCE(l->wmode) &&
133 (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0)) 130 (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
134 break; 131 break;
135 132
136 cpu_relax_lowlatency(); 133 cpu_relax_lowlatency();
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
140 for (;;) { 137 for (;;) {
141 cnts = atomic_read(&lock->cnts); 138 cnts = atomic_read(&lock->cnts);
142 if ((cnts == _QW_WAITING) && 139 if ((cnts == _QW_WAITING) &&
143 (atomic_cmpxchg(&lock->cnts, _QW_WAITING, 140 (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
144 _QW_LOCKED) == _QW_WAITING)) 141 _QW_LOCKED) == _QW_WAITING))
145 break; 142 break;
146 143
147 cpu_relax_lowlatency(); 144 cpu_relax_lowlatency();
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
149unlock: 146unlock:
150 arch_spin_unlock(&lock->lock); 147 arch_spin_unlock(&lock->lock);
151} 148}
152EXPORT_SYMBOL(queue_write_lock_slowpath); 149EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
239 239
240static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } 240static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
241static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } 241static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
242static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { } 242static __always_inline void __pv_kick_node(struct qspinlock *lock,
243 243 struct mcs_spinlock *node) { }
244static __always_inline void __pv_wait_head(struct qspinlock *lock, 244static __always_inline void __pv_wait_head(struct qspinlock *lock,
245 struct mcs_spinlock *node) { } 245 struct mcs_spinlock *node) { }
246 246
@@ -440,7 +440,7 @@ queue:
440 cpu_relax(); 440 cpu_relax();
441 441
442 arch_mcs_spin_unlock_contended(&next->locked); 442 arch_mcs_spin_unlock_contended(&next->locked);
443 pv_kick_node(next); 443 pv_kick_node(lock, next);
444 444
445release: 445release:
446 /* 446 /*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 04ab18151cc8..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -4,6 +4,7 @@
4 4
5#include <linux/hash.h> 5#include <linux/hash.h>
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/debug_locks.h>
7 8
8/* 9/*
9 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead 10 * Implement paravirt qspinlocks; the general idea is to halt the vcpus instead
@@ -21,9 +22,14 @@
21 22
22#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) 23#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
23 24
25/*
26 * Queue node uses: vcpu_running & vcpu_halted.
27 * Queue head uses: vcpu_running & vcpu_hashed.
28 */
24enum vcpu_state { 29enum vcpu_state {
25 vcpu_running = 0, 30 vcpu_running = 0,
26 vcpu_halted, 31 vcpu_halted, /* Used only in pv_wait_node */
32 vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
27}; 33};
28 34
29struct pv_node { 35struct pv_node {
@@ -152,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
152 158
153/* 159/*
154 * Wait for node->locked to become true, halt the vcpu after a short spin. 160 * Wait for node->locked to become true, halt the vcpu after a short spin.
155 * pv_kick_node() is used to wake the vcpu again. 161 * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
162 * behalf.
156 */ 163 */
157static void pv_wait_node(struct mcs_spinlock *node) 164static void pv_wait_node(struct mcs_spinlock *node)
158{ 165{
@@ -171,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
171 * 178 *
172 * [S] pn->state = vcpu_halted [S] next->locked = 1 179 * [S] pn->state = vcpu_halted [S] next->locked = 1
173 * MB MB 180 * MB MB
174 * [L] pn->locked [RmW] pn->state = vcpu_running 181 * [L] pn->locked [RmW] pn->state = vcpu_hashed
175 * 182 *
176 * Matches the xchg() from pv_kick_node(). 183 * Matches the cmpxchg() from pv_kick_node().
177 */ 184 */
178 smp_store_mb(pn->state, vcpu_halted); 185 smp_store_mb(pn->state, vcpu_halted);
179 186
@@ -181,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
181 pv_wait(&pn->state, vcpu_halted); 188 pv_wait(&pn->state, vcpu_halted);
182 189
183 /* 190 /*
184 * Reset the vCPU state to avoid unncessary CPU kicking 191 * If pv_kick_node() changed us to vcpu_hashed, retain that value
192 * so that pv_wait_head() knows to not also try to hash this lock.
185 */ 193 */
186 WRITE_ONCE(pn->state, vcpu_running); 194 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
187 195
188 /* 196 /*
189 * If the locked flag is still not set after wakeup, it is a 197 * If the locked flag is still not set after wakeup, it is a
@@ -193,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
193 * MCS lock will be released soon. 201 * MCS lock will be released soon.
194 */ 202 */
195 } 203 }
204
196 /* 205 /*
197 * By now our node->locked should be 1 and our caller will not actually 206 * By now our node->locked should be 1 and our caller will not actually
198 * spin-wait for it. We do however rely on our caller to do a 207 * spin-wait for it. We do however rely on our caller to do a
@@ -201,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
201} 210}
202 211
203/* 212/*
204 * Called after setting next->locked = 1, used to wake those stuck in 213 * Called after setting next->locked = 1 when we're the lock owner.
205 * pv_wait_node(). 214 *
 215 * Instead of waking the waiters stuck in pv_wait_node(), advance their state
 216 * such that they're waiting in pv_wait_head(); this avoids a wake/sleep cycle.
206 */ 217 */
207static void pv_kick_node(struct mcs_spinlock *node) 218static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
208{ 219{
209 struct pv_node *pn = (struct pv_node *)node; 220 struct pv_node *pn = (struct pv_node *)node;
221 struct __qspinlock *l = (void *)lock;
210 222
211 /* 223 /*
212 * Note that because node->locked is already set, this actual 224 * If the vCPU is indeed halted, advance its state to match that of
213 * mcs_spinlock entry could be re-used already. 225 * pv_wait_node(). If OTOH this fails, the vCPU was running and will
226 * observe its next->locked value and advance itself.
214 * 227 *
215 * This should be fine however, kicking people for no reason is 228 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
216 * harmless. 229 */
230 if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
231 return;
232
233 /*
234 * Put the lock into the hash table and set the _Q_SLOW_VAL.
217 * 235 *
218 * See the comment in pv_wait_node(). 236 * As this is the same vCPU that will check the _Q_SLOW_VAL value and
237 * the hash table later on at unlock time, no atomic instruction is
238 * needed.
219 */ 239 */
220 if (xchg(&pn->state, vcpu_running) == vcpu_halted) 240 WRITE_ONCE(l->locked, _Q_SLOW_VAL);
221 pv_kick(pn->cpu); 241 (void)pv_hash(lock, pn);
222} 242}
223 243
224/* 244/*
@@ -232,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
232 struct qspinlock **lp = NULL; 252 struct qspinlock **lp = NULL;
233 int loop; 253 int loop;
234 254
255 /*
256 * If pv_kick_node() already advanced our state, we don't need to
257 * insert ourselves into the hash table anymore.
258 */
259 if (READ_ONCE(pn->state) == vcpu_hashed)
260 lp = (struct qspinlock **)1;
261
235 for (;;) { 262 for (;;) {
236 for (loop = SPIN_THRESHOLD; loop; loop--) { 263 for (loop = SPIN_THRESHOLD; loop; loop--) {
237 if (!READ_ONCE(l->locked)) 264 if (!READ_ONCE(l->locked))
@@ -239,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
239 cpu_relax(); 266 cpu_relax();
240 } 267 }
241 268
242 WRITE_ONCE(pn->state, vcpu_halted);
243 if (!lp) { /* ONCE */ 269 if (!lp) { /* ONCE */
270 WRITE_ONCE(pn->state, vcpu_hashed);
244 lp = pv_hash(lock, pn); 271 lp = pv_hash(lock, pn);
272
245 /* 273 /*
246 * lp must be set before setting _Q_SLOW_VAL 274 * We must hash before setting _Q_SLOW_VAL, such that
275 * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
276 * we'll be sure to be able to observe our hash entry.
247 * 277 *
248 * [S] lp = lock [RmW] l = l->locked = 0 278 * [S] pn->state
249 * MB MB 279 * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
250 * [S] l->locked = _Q_SLOW_VAL [L] lp 280 * MB RMB
281 * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
282 * [L] pn->state
251 * 283 *
252 * Matches the cmpxchg() in __pv_queued_spin_unlock(). 284 * Matches the smp_rmb() in __pv_queued_spin_unlock().
253 */ 285 */
254 if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { 286 if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
255 /* 287 /*
@@ -286,14 +318,32 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
286{ 318{
287 struct __qspinlock *l = (void *)lock; 319 struct __qspinlock *l = (void *)lock;
288 struct pv_node *node; 320 struct pv_node *node;
321 u8 locked;
289 322
290 /* 323 /*
291 * We must not unlock if SLOW, because in that case we must first 324 * We must not unlock if SLOW, because in that case we must first
292 * unhash. Otherwise it would be possible to have multiple @lock 325 * unhash. Otherwise it would be possible to have multiple @lock
293 * entries, which would be BAD. 326 * entries, which would be BAD.
294 */ 327 */
295 if (likely(cmpxchg(&l->locked, _Q_LOCKED_VAL, 0) == _Q_LOCKED_VAL)) 328 locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
329 if (likely(locked == _Q_LOCKED_VAL))
330 return;
331
332 if (unlikely(locked != _Q_SLOW_VAL)) {
333 WARN(!debug_locks_silent,
334 "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
335 (unsigned long)lock, atomic_read(&lock->val));
296 return; 336 return;
337 }
338
339 /*
340 * A failed cmpxchg doesn't provide any memory-ordering guarantees,
341 * so we need a barrier to order the read of the node data in
342 * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
343 *
344 * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
345 */
346 smp_rmb();
297 347
298 /* 348 /*
299 * Since the above failed to release, this must be the SLOW path. 349 * Since the above failed to release, this must be the SLOW path.
@@ -310,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
310 /* 360 /*
311 * At this point the memory pointed at by lock can be freed/reused, 361 * At this point the memory pointed at by lock can be freed/reused,
312 * however we can still use the pv_node to kick the CPU. 362 * however we can still use the pv_node to kick the CPU.
363 * The other vCPU may not really be halted, but kicking an active
364 * vCPU is harmless other than the additional latency in completing
365 * the unlock.
313 */ 366 */
314 if (READ_ONCE(node->state) == vcpu_halted) 367 if (READ_ONCE(node->state) == vcpu_hashed)
315 pv_kick(node->cpu); 368 pv_kick(node->cpu);
316} 369}
317/* 370/*
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/device.h>
10#include <linux/kthread.h>
11#include <linux/export.h>
12#include <linux/sched.h>
13#include <linux/sched/rt.h>
14#include <linux/spinlock.h>
15#include <linux/timer.h>
16#include <linux/freezer.h>
17#include <linux/stat.h>
18
19#include "rtmutex.h"
20
21#define MAX_RT_TEST_THREADS 8
22#define MAX_RT_TEST_MUTEXES 8
23
24static spinlock_t rttest_lock;
25static atomic_t rttest_event;
26
27struct test_thread_data {
28 int opcode;
29 int opdata;
30 int mutexes[MAX_RT_TEST_MUTEXES];
31 int event;
32 struct device dev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static struct task_struct *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 /* 9, 10 - reserved for BKL commemoration */
50 RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
51 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
52 RTTEST_RESET = 99, /* 99 Reset all pending operations */
53};
54
55static int handle_op(struct test_thread_data *td, int lockwakeup)
56{
57 int i, id, ret = -EINVAL;
58
59 switch(td->opcode) {
60
61 case RTTEST_NOP:
62 return 0;
63
64 case RTTEST_LOCKCONT:
65 td->mutexes[td->opdata] = 1;
66 td->event = atomic_add_return(1, &rttest_event);
67 return 0;
68
69 case RTTEST_RESET:
70 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
71 if (td->mutexes[i] == 4) {
72 rt_mutex_unlock(&mutexes[i]);
73 td->mutexes[i] = 0;
74 }
75 }
76 return 0;
77
78 case RTTEST_RESETEVENT:
79 atomic_set(&rttest_event, 0);
80 return 0;
81
82 default:
83 if (lockwakeup)
84 return ret;
85 }
86
87 switch(td->opcode) {
88
89 case RTTEST_LOCK:
90 case RTTEST_LOCKNOWAIT:
91 id = td->opdata;
92 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
93 return ret;
94
95 td->mutexes[id] = 1;
96 td->event = atomic_add_return(1, &rttest_event);
97 rt_mutex_lock(&mutexes[id]);
98 td->event = atomic_add_return(1, &rttest_event);
99 td->mutexes[id] = 4;
100 return 0;
101
102 case RTTEST_LOCKINT:
103 case RTTEST_LOCKINTNOWAIT:
104 id = td->opdata;
105 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
106 return ret;
107
108 td->mutexes[id] = 1;
109 td->event = atomic_add_return(1, &rttest_event);
110 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
111 td->event = atomic_add_return(1, &rttest_event);
112 td->mutexes[id] = ret ? 0 : 4;
113 return ret ? -EINTR : 0;
114
115 case RTTEST_UNLOCK:
116 id = td->opdata;
117 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
118 return ret;
119
120 td->event = atomic_add_return(1, &rttest_event);
121 rt_mutex_unlock(&mutexes[id]);
122 td->event = atomic_add_return(1, &rttest_event);
123 td->mutexes[id] = 0;
124 return 0;
125
126 default:
127 break;
128 }
129 return ret;
130}
131
132/*
133 * Schedule replacement for rtsem_down(). Only called for threads with
134 * PF_MUTEX_TESTER set.
135 *
136 * This allows us to have finegrained control over the event flow.
137 *
138 */
139void schedule_rt_mutex_test(struct rt_mutex *mutex)
140{
141 int tid, op, dat;
142 struct test_thread_data *td;
143
144 /* We have to lookup the task */
145 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
146 if (threads[tid] == current)
147 break;
148 }
149
150 BUG_ON(tid == MAX_RT_TEST_THREADS);
151
152 td = &thread_data[tid];
153
154 op = td->opcode;
155 dat = td->opdata;
156
157 switch (op) {
158 case RTTEST_LOCK:
159 case RTTEST_LOCKINT:
160 case RTTEST_LOCKNOWAIT:
161 case RTTEST_LOCKINTNOWAIT:
162 if (mutex != &mutexes[dat])
163 break;
164
165 if (td->mutexes[dat] != 1)
166 break;
167
168 td->mutexes[dat] = 2;
169 td->event = atomic_add_return(1, &rttest_event);
170 break;
171
172 default:
173 break;
174 }
175
176 schedule();
177
178
179 switch (op) {
180 case RTTEST_LOCK:
181 case RTTEST_LOCKINT:
182 if (mutex != &mutexes[dat])
183 return;
184
185 if (td->mutexes[dat] != 2)
186 return;
187
188 td->mutexes[dat] = 3;
189 td->event = atomic_add_return(1, &rttest_event);
190 break;
191
192 case RTTEST_LOCKNOWAIT:
193 case RTTEST_LOCKINTNOWAIT:
194 if (mutex != &mutexes[dat])
195 return;
196
197 if (td->mutexes[dat] != 2)
198 return;
199
200 td->mutexes[dat] = 1;
201 td->event = atomic_add_return(1, &rttest_event);
202 return;
203
204 default:
205 return;
206 }
207
208 td->opcode = 0;
209
210 for (;;) {
211 set_current_state(TASK_INTERRUPTIBLE);
212
213 if (td->opcode > 0) {
214 int ret;
215
216 set_current_state(TASK_RUNNING);
217 ret = handle_op(td, 1);
218 set_current_state(TASK_INTERRUPTIBLE);
219 if (td->opcode == RTTEST_LOCKCONT)
220 break;
221 td->opcode = ret;
222 }
223
224 /* Wait for the next command to be executed */
225 schedule();
226 }
227
228 /* Restore previous command and data */
229 td->opcode = op;
230 td->opdata = dat;
231}
232
233static int test_func(void *data)
234{
235 struct test_thread_data *td = data;
236 int ret;
237
238 current->flags |= PF_MUTEX_TESTER;
239 set_freezable();
240 allow_signal(SIGHUP);
241
242 for(;;) {
243
244 set_current_state(TASK_INTERRUPTIBLE);
245
246 if (td->opcode > 0) {
247 set_current_state(TASK_RUNNING);
248 ret = handle_op(td, 0);
249 set_current_state(TASK_INTERRUPTIBLE);
250 td->opcode = ret;
251 }
252
253 /* Wait for the next command to be executed */
254 schedule();
255 try_to_freeze();
256
257 if (signal_pending(current))
258 flush_signals(current);
259
260 if(kthread_should_stop())
261 break;
262 }
263 return 0;
264}
265
266/**
267 * sysfs_test_command - interface for test commands
268 * @dev: thread reference
269 * @buf: command for actual step
270 * @count: length of buffer
271 *
272 * command syntax:
273 *
274 * opcode:data
275 */
276static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
277 const char *buf, size_t count)
278{
279 struct sched_param schedpar;
280 struct test_thread_data *td;
281 char cmdbuf[32];
282 int op, dat, tid, ret;
283
284 td = container_of(dev, struct test_thread_data, dev);
285 tid = td->dev.id;
286
287 /* strings from sysfs write are not 0 terminated! */
288 if (count >= sizeof(cmdbuf))
289 return -EINVAL;
290
291 /* strip of \n: */
292 if (buf[count-1] == '\n')
293 count--;
294 if (count < 1)
295 return -EINVAL;
296
297 memcpy(cmdbuf, buf, count);
298 cmdbuf[count] = 0;
299
300 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
301 return -EINVAL;
302
303 switch (op) {
304 case RTTEST_SCHEDOT:
305 schedpar.sched_priority = 0;
306 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
307 if (ret)
308 return ret;
309 set_user_nice(current, 0);
310 break;
311
312 case RTTEST_SCHEDRT:
313 schedpar.sched_priority = dat;
314 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
315 if (ret)
316 return ret;
317 break;
318
319 case RTTEST_SIGNAL:
320 send_sig(SIGHUP, threads[tid], 0);
321 break;
322
323 default:
324 if (td->opcode > 0)
325 return -EBUSY;
326 td->opdata = dat;
327 td->opcode = op;
328 wake_up_process(threads[tid]);
329 }
330
331 return count;
332}
333
334/**
335 * sysfs_test_status - sysfs interface for rt tester
336 * @dev: thread to query
337 * @buf: char buffer to be filled with thread status info
338 */
339static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
340 char *buf)
341{
342 struct test_thread_data *td;
343 struct task_struct *tsk;
344 char *curr = buf;
345 int i;
346
347 td = container_of(dev, struct test_thread_data, dev);
348 tsk = threads[td->dev.id];
349
350 spin_lock(&rttest_lock);
351
352 curr += sprintf(curr,
353 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
354 td->opcode, td->event, tsk->state,
355 (MAX_RT_PRIO - 1) - tsk->prio,
356 (MAX_RT_PRIO - 1) - tsk->normal_prio,
357 tsk->pi_blocked_on);
358
359 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
360 curr += sprintf(curr, "%d", td->mutexes[i]);
361
362 spin_unlock(&rttest_lock);
363
364 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
365 mutexes[td->dev.id].owner);
366
367 return curr - buf;
368}
369
370static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
371static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
372
373static struct bus_type rttest_subsys = {
374 .name = "rttest",
375 .dev_name = "rttest",
376};
377
378static int init_test_thread(int id)
379{
380 thread_data[id].dev.bus = &rttest_subsys;
381 thread_data[id].dev.id = id;
382
383 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
384 if (IS_ERR(threads[id]))
385 return PTR_ERR(threads[id]);
386
387 return device_register(&thread_data[id].dev);
388}
389
390static int init_rttest(void)
391{
392 int ret, i;
393
394 spin_lock_init(&rttest_lock);
395
396 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
397 rt_mutex_init(&mutexes[i]);
398
399 ret = subsys_system_register(&rttest_subsys, NULL);
400 if (ret)
401 return ret;
402
403 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
404 ret = init_test_thread(i);
405 if (ret)
406 break;
407 ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
408 if (ret)
409 break;
410 ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
411 if (ret)
412 break;
413 }
414
415 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
416
417 return ret;
418}
419
420device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
1120 1120
1121 debug_rt_mutex_print_deadlock(waiter); 1121 debug_rt_mutex_print_deadlock(waiter);
1122 1122
1123 schedule_rt_mutex(lock); 1123 schedule();
1124 1124
1125 raw_spin_lock(&lock->wait_lock); 1125 raw_spin_lock(&lock->wait_lock);
1126 set_current_state(state); 1126 set_current_state(state);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
15#include <linux/rtmutex.h> 15#include <linux/rtmutex.h>
16 16
17/* 17/*
18 * The rtmutex in kernel tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex, 18 * This is the control structure for tasks blocked on a rt_mutex,
 41 * which is allocated on the kernel stack of the blocked task. 19 * which is allocated on the kernel stack of the blocked task.
42 * 20 *
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
new file mode 100644
index 000000000000..536c727a56e9
--- /dev/null
+++ b/kernel/membarrier.c
@@ -0,0 +1,66 @@
1/*
2 * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19
20/*
 21 * Bitmask made from an "or" of all commands within enum membarrier_cmd,
22 * except MEMBARRIER_CMD_QUERY.
23 */
24#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
25
26/**
27 * sys_membarrier - issue memory barriers on a set of threads
28 * @cmd: Takes command values defined in enum membarrier_cmd.
29 * @flags: Currently needs to be 0. For future extensions.
30 *
31 * If this system call is not implemented, -ENOSYS is returned. If the
32 * command specified does not exist, or if the command argument is invalid,
33 * this system call returns -EINVAL. For a given command, with flags argument
34 * set to 0, this system call is guaranteed to always return the same value
35 * until reboot.
36 *
37 * All memory accesses performed in program order from each targeted thread
 38 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
39 * the semantic "barrier()" to represent a compiler barrier forcing memory
40 * accesses to be performed in program order across the barrier, and
41 * smp_mb() to represent explicit memory barriers forcing full memory
42 * ordering across the barrier, we have the following ordering table for
43 * each pair of barrier(), sys_membarrier() and smp_mb():
44 *
45 * The pair ordering is detailed as (O: ordered, X: not ordered):
46 *
 47 *                        barrier()  smp_mb()  sys_membarrier()
 48 * barrier()                  X         X            O
 49 * smp_mb()                   X         O            O
 50 * sys_membarrier()           O         O            O
51 */
52SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
53{
54 if (unlikely(flags))
55 return -EINVAL;
56 switch (cmd) {
57 case MEMBARRIER_CMD_QUERY:
58 return MEMBARRIER_CMD_BITMASK;
59 case MEMBARRIER_CMD_SHARED:
60 if (num_online_cpus() > 1)
61 synchronize_sched();
62 return 0;
63 default:
64 return -EINVAL;
65 }
66}
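
From user space the new system call is reached through syscall(2). A hedged usage sketch, assuming uapi headers that already provide __NR_membarrier and <linux/membarrier.h> (the wrapper names are illustrative):

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/membarrier.h>

static int membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

/* Query support once, then force a barrier across all running threads. */
static int issue_global_barrier(void)
{
	int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED))
		return -1;		/* ENOSYS or command unsupported */

	return membarrier(MEMBARRIER_CMD_SHARED, 0);
}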
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..72b0c66628b6
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,190 @@
1/*
2 * Copyright(c) 2015 Intel Corporation. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 */
13#include <linux/device.h>
14#include <linux/types.h>
15#include <linux/io.h>
16#include <linux/mm.h>
17#include <linux/memory_hotplug.h>
18
19#ifndef ioremap_cache
20/* temporary while we convert existing ioremap_cache users to memremap */
21__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
22{
23 return ioremap(offset, size);
24}
25#endif
26
27/**
28 * memremap() - remap an iomem_resource as cacheable memory
29 * @offset: iomem resource start address
30 * @size: size of remap
31 * @flags: either MEMREMAP_WB or MEMREMAP_WT
32 *
33 * memremap() is "ioremap" for cases where it is known that the resource
34 * being mapped does not have i/o side effects and the __iomem
35 * annotation is not applicable.
36 *
37 * MEMREMAP_WB - matches the default mapping for "System RAM" on
38 * the architecture. This is usually a read-allocate write-back cache.
 39 * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
40 * memremap() will bypass establishing a new mapping and instead return
41 * a pointer into the direct map.
42 *
43 * MEMREMAP_WT - establish a mapping whereby writes either bypass the
44 * cache or are written through to memory and never exist in a
45 * cache-dirty state with respect to program visibility. Attempts to
46 * map "System RAM" with this mapping type will fail.
47 */
48void *memremap(resource_size_t offset, size_t size, unsigned long flags)
49{
50 int is_ram = region_intersects(offset, size, "System RAM");
51 void *addr = NULL;
52
53 if (is_ram == REGION_MIXED) {
54 WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
55 &offset, (unsigned long) size);
56 return NULL;
57 }
58
59 /* Try all mapping types requested until one returns non-NULL */
60 if (flags & MEMREMAP_WB) {
61 flags &= ~MEMREMAP_WB;
62 /*
 63 * MEMREMAP_WB is special in that it can be satisfied
64 * from the direct map. Some archs depend on the
65 * capability of memremap() to autodetect cases where
66 * the requested range is potentially in "System RAM"
67 */
68 if (is_ram == REGION_INTERSECTS)
69 addr = __va(offset);
70 else
71 addr = ioremap_cache(offset, size);
72 }
73
74 /*
75 * If we don't have a mapping yet and more request flags are
76 * pending then we will be attempting to establish a new virtual
77 * address mapping. Enforce that this mapping is not aliasing
78 * "System RAM"
79 */
80 if (!addr && is_ram == REGION_INTERSECTS && flags) {
81 WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
82 &offset, (unsigned long) size);
83 return NULL;
84 }
85
86 if (!addr && (flags & MEMREMAP_WT)) {
87 flags &= ~MEMREMAP_WT;
88 addr = ioremap_wt(offset, size);
89 }
90
91 return addr;
92}
93EXPORT_SYMBOL(memremap);
94
95void memunmap(void *addr)
96{
97 if (is_vmalloc_addr(addr))
98 iounmap((void __iomem *) addr);
99}
100EXPORT_SYMBOL(memunmap);
101
102static void devm_memremap_release(struct device *dev, void *res)
103{
104 memunmap(res);
105}
106
107static int devm_memremap_match(struct device *dev, void *res, void *match_data)
108{
109 return *(void **)res == match_data;
110}
111
112void *devm_memremap(struct device *dev, resource_size_t offset,
113 size_t size, unsigned long flags)
114{
115 void **ptr, *addr;
116
117 ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
118 if (!ptr)
119 return NULL;
120
121 addr = memremap(offset, size, flags);
122 if (addr) {
123 *ptr = addr;
124 devres_add(dev, ptr);
125 } else
126 devres_free(ptr);
127
128 return addr;
129}
130EXPORT_SYMBOL(devm_memremap);
131
132void devm_memunmap(struct device *dev, void *addr)
133{
134 WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
135 addr));
136 memunmap(addr);
137}
138EXPORT_SYMBOL(devm_memunmap);
139
140#ifdef CONFIG_ZONE_DEVICE
141struct page_map {
142 struct resource res;
143};
144
145static void devm_memremap_pages_release(struct device *dev, void *res)
146{
147 struct page_map *page_map = res;
148
149 /* pages are dead and unused, undo the arch mapping */
150 arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
151}
152
153void *devm_memremap_pages(struct device *dev, struct resource *res)
154{
155 int is_ram = region_intersects(res->start, resource_size(res),
156 "System RAM");
157 struct page_map *page_map;
158 int error, nid;
159
160 if (is_ram == REGION_MIXED) {
161 WARN_ONCE(1, "%s attempted on mixed region %pr\n",
162 __func__, res);
163 return ERR_PTR(-ENXIO);
164 }
165
166 if (is_ram == REGION_INTERSECTS)
167 return __va(res->start);
168
169 page_map = devres_alloc(devm_memremap_pages_release,
170 sizeof(*page_map), GFP_KERNEL);
171 if (!page_map)
172 return ERR_PTR(-ENOMEM);
173
174 memcpy(&page_map->res, res, sizeof(*res));
175
176 nid = dev_to_node(dev);
177 if (nid < 0)
178 nid = 0;
179
180 error = arch_add_memory(nid, res->start, resource_size(res), true);
181 if (error) {
182 devres_free(page_map);
183 return ERR_PTR(error);
184 }
185
186 devres_add(dev, page_map);
187 return __va(res->start);
188}
189EXPORT_SYMBOL(devm_memremap_pages);
190#endif /* CONFIG_ZONE_DEVICE */
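
A driver that knows its range has no I/O side effects can now request a cacheable (or write-through) mapping without the __iomem annotation. A rough devm-managed sketch; the wrapper is hypothetical and assumes the declarations from this series are visible via <linux/io.h>:

#include <linux/device.h>
#include <linux/io.h>
#include <linux/ioport.h>

static void *map_side_effect_free_region(struct device *dev,
					 struct resource *res)
{
	/* Try WB first (may resolve to the direct map), then fall back to WT. */
	void *addr = devm_memremap(dev, res->start, resource_size(res),
				   MEMREMAP_WB | MEMREMAP_WT);

	if (!addr)
		dev_err(dev, "memremap of %pR failed\n", res);
	return addr;
}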
diff --git a/kernel/module.c b/kernel/module.c
index 4d2b82e610e2..b86b7bf1be38 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -602,13 +602,16 @@ const struct kernel_symbol *find_symbol(const char *name,
602} 602}
603EXPORT_SYMBOL_GPL(find_symbol); 603EXPORT_SYMBOL_GPL(find_symbol);
604 604
605/* Search for module by name: must hold module_mutex. */ 605/*
606 * Search for module by name: must hold module_mutex (or preempt disabled
607 * for read-only access).
608 */
606static struct module *find_module_all(const char *name, size_t len, 609static struct module *find_module_all(const char *name, size_t len,
607 bool even_unformed) 610 bool even_unformed)
608{ 611{
609 struct module *mod; 612 struct module *mod;
610 613
611 module_assert_mutex(); 614 module_assert_mutex_or_preempt();
612 615
613 list_for_each_entry(mod, &modules, list) { 616 list_for_each_entry(mod, &modules, list) {
614 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) 617 if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
@@ -621,6 +624,7 @@ static struct module *find_module_all(const char *name, size_t len,
621 624
622struct module *find_module(const char *name) 625struct module *find_module(const char *name)
623{ 626{
627 module_assert_mutex();
624 return find_module_all(name, strlen(name), false); 628 return find_module_all(name, strlen(name), false);
625} 629}
626EXPORT_SYMBOL_GPL(find_module); 630EXPORT_SYMBOL_GPL(find_module);
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,8 @@
10 */ 10 */
11 11
12#include <linux/kernel.h> 12#include <linux/kernel.h>
13#include <linux/err.h>
14#include <crypto/public_key.h>
15#include <crypto/hash.h>
16#include <keys/asymmetric-type.h>
17#include <keys/system_keyring.h> 13#include <keys/system_keyring.h>
14#include <crypto/public_key.h>
18#include "module-internal.h" 15#include "module-internal.h"
19 16
20/* 17/*
@@ -28,170 +25,22 @@
28 * - Information block 25 * - Information block
29 */ 26 */
30struct module_signature { 27struct module_signature {
31 u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */ 28 u8 algo; /* Public-key crypto algorithm [0] */
32 u8 hash; /* Digest algorithm [enum hash_algo] */ 29 u8 hash; /* Digest algorithm [0] */
33 u8 id_type; /* Key identifier type [enum pkey_id_type] */ 30 u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
34 u8 signer_len; /* Length of signer's name */ 31 u8 signer_len; /* Length of signer's name [0] */
35 u8 key_id_len; /* Length of key identifier */ 32 u8 key_id_len; /* Length of key identifier [0] */
36 u8 __pad[3]; 33 u8 __pad[3];
37 __be32 sig_len; /* Length of signature data */ 34 __be32 sig_len; /* Length of signature data */
38}; 35};
39 36
40/* 37/*
41 * Digest the module contents.
42 */
43static struct public_key_signature *mod_make_digest(enum hash_algo hash,
44 const void *mod,
45 unsigned long modlen)
46{
47 struct public_key_signature *pks;
48 struct crypto_shash *tfm;
49 struct shash_desc *desc;
50 size_t digest_size, desc_size;
51 int ret;
52
53 pr_devel("==>%s()\n", __func__);
54
55 /* Allocate the hashing algorithm we're going to need and find out how
56 * big the hash operational data will be.
57 */
58 tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
59 if (IS_ERR(tfm))
60 return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
61
62 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
63 digest_size = crypto_shash_digestsize(tfm);
64
65 /* We allocate the hash operational data storage on the end of our
66 * context data and the digest output buffer on the end of that.
67 */
68 ret = -ENOMEM;
69 pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
70 if (!pks)
71 goto error_no_pks;
72
73 pks->pkey_hash_algo = hash;
74 pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
75 pks->digest_size = digest_size;
76
77 desc = (void *)pks + sizeof(*pks);
78 desc->tfm = tfm;
79 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
80
81 ret = crypto_shash_init(desc);
82 if (ret < 0)
83 goto error;
84
85 ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
86 if (ret < 0)
87 goto error;
88
89 crypto_free_shash(tfm);
90 pr_devel("<==%s() = ok\n", __func__);
91 return pks;
92
93error:
94 kfree(pks);
95error_no_pks:
96 crypto_free_shash(tfm);
97 pr_devel("<==%s() = %d\n", __func__, ret);
98 return ERR_PTR(ret);
99}
100
101/*
102 * Extract an MPI array from the signature data. This represents the actual
103 * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
104 * size of the MPI in bytes.
105 *
106 * RSA signatures only have one MPI, so currently we only read one.
107 */
108static int mod_extract_mpi_array(struct public_key_signature *pks,
109 const void *data, size_t len)
110{
111 size_t nbytes;
112 MPI mpi;
113
114 if (len < 3)
115 return -EBADMSG;
116 nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
117 data += 2;
118 len -= 2;
119 if (len != nbytes)
120 return -EBADMSG;
121
122 mpi = mpi_read_raw_data(data, nbytes);
123 if (!mpi)
124 return -ENOMEM;
125 pks->mpi[0] = mpi;
126 pks->nr_mpi = 1;
127 return 0;
128}
129
130/*
131 * Request an asymmetric key.
132 */
133static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
134 const u8 *key_id, size_t key_id_len)
135{
136 key_ref_t key;
137 size_t i;
138 char *id, *q;
139
140 pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
141
142 /* Construct an identifier. */
143 id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
144 if (!id)
145 return ERR_PTR(-ENOKEY);
146
147 memcpy(id, signer, signer_len);
148
149 q = id + signer_len;
150 *q++ = ':';
151 *q++ = ' ';
152 for (i = 0; i < key_id_len; i++) {
153 *q++ = hex_asc[*key_id >> 4];
154 *q++ = hex_asc[*key_id++ & 0x0f];
155 }
156
157 *q = 0;
158
159 pr_debug("Look up: \"%s\"\n", id);
160
161 key = keyring_search(make_key_ref(system_trusted_keyring, 1),
162 &key_type_asymmetric, id);
163 if (IS_ERR(key))
164 pr_warn("Request for unknown module key '%s' err %ld\n",
165 id, PTR_ERR(key));
166 kfree(id);
167
168 if (IS_ERR(key)) {
169 switch (PTR_ERR(key)) {
170 /* Hide some search errors */
171 case -EACCES:
172 case -ENOTDIR:
173 case -EAGAIN:
174 return ERR_PTR(-ENOKEY);
175 default:
176 return ERR_CAST(key);
177 }
178 }
179
180 pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
181 return key_ref_to_ptr(key);
182}
183
184/*
185 * Verify the signature on a module. 38 * Verify the signature on a module.
186 */ 39 */
187int mod_verify_sig(const void *mod, unsigned long *_modlen) 40int mod_verify_sig(const void *mod, unsigned long *_modlen)
188{ 41{
189 struct public_key_signature *pks;
190 struct module_signature ms; 42 struct module_signature ms;
191 struct key *key;
192 const void *sig;
193 size_t modlen = *_modlen, sig_len; 43 size_t modlen = *_modlen, sig_len;
194 int ret;
195 44
196 pr_devel("==>%s(,%zu)\n", __func__, modlen); 45 pr_devel("==>%s(,%zu)\n", __func__, modlen);
197 46
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
205 if (sig_len >= modlen) 54 if (sig_len >= modlen)
206 return -EBADMSG; 55 return -EBADMSG;
207 modlen -= sig_len; 56 modlen -= sig_len;
208 if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
209 return -EBADMSG;
210 modlen -= (size_t)ms.signer_len + ms.key_id_len;
211
212 *_modlen = modlen; 57 *_modlen = modlen;
213 sig = mod + modlen;
214
215 /* For the moment, only support RSA and X.509 identifiers */
216 if (ms.algo != PKEY_ALGO_RSA ||
217 ms.id_type != PKEY_ID_X509)
218 return -ENOPKG;
219 58
220 if (ms.hash >= PKEY_HASH__LAST || 59 if (ms.id_type != PKEY_ID_PKCS7) {
221 !hash_algo_name[ms.hash]) 60 pr_err("Module is not signed with expected PKCS#7 message\n");
222 return -ENOPKG; 61 return -ENOPKG;
223
224 key = request_asymmetric_key(sig, ms.signer_len,
225 sig + ms.signer_len, ms.key_id_len);
226 if (IS_ERR(key))
227 return PTR_ERR(key);
228
229 pks = mod_make_digest(ms.hash, mod, modlen);
230 if (IS_ERR(pks)) {
231 ret = PTR_ERR(pks);
232 goto error_put_key;
233 } 62 }
234 63
235 ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len, 64 if (ms.algo != 0 ||
236 sig_len); 65 ms.hash != 0 ||
237 if (ret < 0) 66 ms.signer_len != 0 ||
238 goto error_free_pks; 67 ms.key_id_len != 0 ||
239 68 ms.__pad[0] != 0 ||
240 ret = verify_signature(key, pks); 69 ms.__pad[1] != 0 ||
241 pr_devel("verify_signature() = %d\n", ret); 70 ms.__pad[2] != 0) {
71 pr_err("PKCS#7 signature info has unexpected non-zero params\n");
72 return -EBADMSG;
73 }
242 74
243error_free_pks: 75 return system_verify_data(mod, modlen, mod + modlen, sig_len,
244 mpi_free(pks->rsa.s); 76 VERIFYING_MODULE_SIGNATURE);
245 kfree(pks);
246error_put_key:
247 key_put(key);
248 pr_devel("<==%s() = %d\n", __func__, ret);
249 return ret;
250} 77}
diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7cc360e..fd2c9acbcc19 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
544 .signr = sig, 544 .signr = sig,
545 545
546 }; 546 };
547 RCU_LOCKDEP_WARN(!rcu_is_watching(),
548 "notify_die called but RCU thinks we're quiescent");
547 return atomic_notifier_call_chain(&die_chain, val, &args); 549 return atomic_notifier_call_chain(&die_chain, val, &args);
548} 550}
549NOKPROBE_SYMBOL(notify_die); 551NOKPROBE_SYMBOL(notify_die);
diff --git a/kernel/pid.c b/kernel/pid.c
index 4fd07d5b7baf..ca368793808e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -451,9 +451,8 @@ EXPORT_SYMBOL(pid_task);
451 */ 451 */
452struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 452struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
453{ 453{
454 rcu_lockdep_assert(rcu_read_lock_held(), 454 RCU_LOCKDEP_WARN(!rcu_read_lock_held(),
455 "find_task_by_pid_ns() needs rcu_read_lock()" 455 "find_task_by_pid_ns() needs rcu_read_lock() protection");
456 " protection");
457 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 456 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
458} 457}
459 458
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 9e302315e33d..02e8dfaa1ce2 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,16 @@ config SUSPEND_FREEZER
18 18
19 Turning OFF this setting is NOT recommended! If in doubt, say Y. 19 Turning OFF this setting is NOT recommended! If in doubt, say Y.
20 20
21config SUSPEND_SKIP_SYNC
22 bool "Skip kernel's sys_sync() on suspend to RAM/standby"
23 depends on SUSPEND
24 depends on EXPERT
25 help
26 Skip the kernel sys_sync() before freezing user processes.
27 Some systems prefer not to pay this cost on every invocation
28 of suspend, or they are content with invoking sync() from
 29 user-space before invoking suspend. Say Y if that applies to your system.
30
21config HIBERNATE_CALLBACKS 31config HIBERNATE_CALLBACKS
22 bool 32 bool
23 33
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 53266b729fd9..7e4cda4a8dd9 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -484,11 +484,13 @@ static int enter_state(suspend_state_t state)
484 if (state == PM_SUSPEND_FREEZE) 484 if (state == PM_SUSPEND_FREEZE)
485 freeze_begin(); 485 freeze_begin();
486 486
487#ifndef CONFIG_SUSPEND_SKIP_SYNC
487 trace_suspend_resume(TPS("sync_filesystems"), 0, true); 488 trace_suspend_resume(TPS("sync_filesystems"), 0, true);
488 printk(KERN_INFO "PM: Syncing filesystems ... "); 489 printk(KERN_INFO "PM: Syncing filesystems ... ");
489 sys_sync(); 490 sys_sync();
490 printk("done.\n"); 491 printk("done.\n");
491 trace_suspend_resume(TPS("sync_filesystems"), 0, false); 492 trace_suspend_resume(TPS("sync_filesystems"), 0, false);
493#endif
492 494
493 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]); 495 pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
494 error = suspend_prepare(state); 496 error = suspend_prepare(state);
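The new CONFIG_SUSPEND_SKIP_SYNC option simply compiles the filesystem sync out of enter_state() via the #ifndef above. For comparison only (this is not what the patch does), the same effect could be expressed with IS_ENABLED(), which keeps the code visible to the compiler in both configurations while still letting dead-code elimination drop the branch:

        if (!IS_ENABLED(CONFIG_SUSPEND_SKIP_SYNC)) {
                trace_suspend_resume(TPS("sync_filesystems"), 0, true);
                printk(KERN_INFO "PM: Syncing filesystems ... ");
                sys_sync();
                printk("done.\n");
                trace_suspend_resume(TPS("sync_filesystems"), 0, false);
        }

The preprocessor form chosen by the patch avoids even the IS_ENABLED() test, at the cost of one more #ifdef block in the function body.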
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 2f30ca91e4fa..b2066fb5b10f 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -227,27 +227,23 @@ static void hib_init_batch(struct hib_bio_batch *hb)
227 hb->error = 0; 227 hb->error = 0;
228} 228}
229 229
230static void hib_end_io(struct bio *bio, int error) 230static void hib_end_io(struct bio *bio)
231{ 231{
232 struct hib_bio_batch *hb = bio->bi_private; 232 struct hib_bio_batch *hb = bio->bi_private;
233 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
234 struct page *page = bio->bi_io_vec[0].bv_page; 233 struct page *page = bio->bi_io_vec[0].bv_page;
235 234
236 if (!uptodate || error) { 235 if (bio->bi_error) {
237 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n", 236 printk(KERN_ALERT "Read-error on swap-device (%u:%u:%Lu)\n",
238 imajor(bio->bi_bdev->bd_inode), 237 imajor(bio->bi_bdev->bd_inode),
239 iminor(bio->bi_bdev->bd_inode), 238 iminor(bio->bi_bdev->bd_inode),
240 (unsigned long long)bio->bi_iter.bi_sector); 239 (unsigned long long)bio->bi_iter.bi_sector);
241
242 if (!error)
243 error = -EIO;
244 } 240 }
245 241
246 if (bio_data_dir(bio) == WRITE) 242 if (bio_data_dir(bio) == WRITE)
247 put_page(page); 243 put_page(page);
248 244
249 if (error && !hb->error) 245 if (bio->bi_error && !hb->error)
250 hb->error = error; 246 hb->error = bio->bi_error;
251 if (atomic_dec_and_test(&hb->count)) 247 if (atomic_dec_and_test(&hb->count))
252 wake_up(&hb->wait); 248 wake_up(&hb->wait);
253 249
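The hib_end_io() change tracks the block layer's new completion convention: a ->bi_end_io callback no longer receives an error argument or consults BIO_UPTODATE, it reads the status from bio->bi_error instead. A hedged skeleton of a completion handler written against that interface; the batch structure and all names are invented for illustration:

        struct my_batch {                       /* hypothetical batch tracker */
                atomic_t                count;
                wait_queue_head_t       wait;
                int                     error;
        };

        /* Completion callback in the bio->bi_error style (sketch only). */
        static void my_end_io(struct bio *bio)
        {
                struct my_batch *batch = bio->bi_private;

                if (bio->bi_error && !batch->error)
                        batch->error = bio->bi_error;   /* remember the first failure */

                if (atomic_dec_and_test(&batch->count))
                        wake_up(&batch->wait);          /* last bio in the batch */
        }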
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 019069c84ff6..1896386e16bb 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -17,6 +17,7 @@
17#include <linux/list.h> 17#include <linux/list.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/workqueue.h>
20 21
21#include "power.h" 22#include "power.h"
22 23
@@ -83,7 +84,9 @@ static inline void decrement_wakelocks_number(void) {}
83#define WL_GC_COUNT_MAX 100 84#define WL_GC_COUNT_MAX 100
84#define WL_GC_TIME_SEC 300 85#define WL_GC_TIME_SEC 300
85 86
87static void __wakelocks_gc(struct work_struct *work);
86static LIST_HEAD(wakelocks_lru_list); 88static LIST_HEAD(wakelocks_lru_list);
89static DECLARE_WORK(wakelock_work, __wakelocks_gc);
87static unsigned int wakelocks_gc_count; 90static unsigned int wakelocks_gc_count;
88 91
89static inline void wakelocks_lru_add(struct wakelock *wl) 92static inline void wakelocks_lru_add(struct wakelock *wl)
@@ -96,13 +99,12 @@ static inline void wakelocks_lru_most_recent(struct wakelock *wl)
96 list_move(&wl->lru, &wakelocks_lru_list); 99 list_move(&wl->lru, &wakelocks_lru_list);
97} 100}
98 101
99static void wakelocks_gc(void) 102static void __wakelocks_gc(struct work_struct *work)
100{ 103{
101 struct wakelock *wl, *aux; 104 struct wakelock *wl, *aux;
102 ktime_t now; 105 ktime_t now;
103 106
104 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) 107 mutex_lock(&wakelocks_lock);
105 return;
106 108
107 now = ktime_get(); 109 now = ktime_get();
108 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { 110 list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
@@ -127,6 +129,16 @@ static void wakelocks_gc(void)
127 } 129 }
128 } 130 }
129 wakelocks_gc_count = 0; 131 wakelocks_gc_count = 0;
132
133 mutex_unlock(&wakelocks_lock);
134}
135
136static void wakelocks_gc(void)
137{
138 if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
139 return;
140
141 schedule_work(&wakelock_work);
130} 142}
131#else /* !CONFIG_PM_WAKELOCKS_GC */ 143#else /* !CONFIG_PM_WAKELOCKS_GC */
132static inline void wakelocks_lru_add(struct wakelock *wl) {} 144static inline void wakelocks_lru_add(struct wakelock *wl) {}
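The wakelock change splits garbage collection in two: the fast path only bumps a counter and calls schedule_work(), while the actual scan runs later in process context under wakelocks_lock. The general deferral pattern, as a minimal sketch with invented names:

        #include <linux/workqueue.h>
        #include <linux/mutex.h>

        static DEFINE_MUTEX(my_lock);                   /* hypothetical lock */
        static void my_gc_worker(struct work_struct *work);
        static DECLARE_WORK(my_gc_work, my_gc_worker);

        static void my_gc_worker(struct work_struct *work)
        {
                mutex_lock(&my_lock);
                /* ... walk the LRU list and free stale entries here ... */
                mutex_unlock(&my_lock);
        }

        /* Callable from contexts where taking my_lock directly is not allowed. */
        static void my_gc_kick(void)
        {
                schedule_work(&my_gc_work);
        }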
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cf8c24203368..8f0324ef72ab 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -835,7 +835,7 @@ const struct file_operations kmsg_fops = {
835 .release = devkmsg_release, 835 .release = devkmsg_release,
836}; 836};
837 837
838#ifdef CONFIG_KEXEC 838#ifdef CONFIG_KEXEC_CORE
839/* 839/*
840 * This appends the listed symbols to /proc/vmcore 840 * This appends the listed symbols to /proc/vmcore
841 * 841 *
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
339 node = cpu_to_mem(cpu); 339 node = cpu_to_mem(cpu);
340 per_cpu(cpu_profile_flip, cpu) = 0; 340 per_cpu(cpu_profile_flip, cpu) = 0;
341 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 341 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
342 page = alloc_pages_exact_node(node, 342 page = __alloc_pages_node(node,
343 GFP_KERNEL | __GFP_ZERO, 343 GFP_KERNEL | __GFP_ZERO,
344 0); 344 0);
345 if (!page) 345 if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
347 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 347 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
348 } 348 }
349 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 349 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
350 page = alloc_pages_exact_node(node, 350 page = __alloc_pages_node(node,
351 GFP_KERNEL | __GFP_ZERO, 351 GFP_KERNEL | __GFP_ZERO,
352 0); 352 0);
353 if (!page) 353 if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
543 int node = cpu_to_mem(cpu); 543 int node = cpu_to_mem(cpu);
544 struct page *page; 544 struct page *page;
545 545
546 page = alloc_pages_exact_node(node, 546 page = __alloc_pages_node(node,
547 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 547 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
548 0); 548 0);
549 if (!page) 549 if (!page)
550 goto out_cleanup; 550 goto out_cleanup;
551 per_cpu(cpu_profile_hits, cpu)[1] 551 per_cpu(cpu_profile_hits, cpu)[1]
552 = (struct profile_hit *)page_address(page); 552 = (struct profile_hit *)page_address(page);
553 page = alloc_pages_exact_node(node, 553 page = __alloc_pages_node(node,
554 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE, 554 GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
555 0); 555 0);
556 if (!page) 556 if (!page)
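alloc_pages_exact_node() is replaced by __alloc_pages_node() with the same argument order (node, GFP mask, order), so the four conversions above are mechanical. A short hedged sketch of the call as it would appear in a caller:

        /* Allocate one zeroed page on a specific NUMA node (sketch). */
        struct page *page = __alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
        if (!page)
                return -ENOMEM;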
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
556 if (data & ~(unsigned long)PTRACE_O_MASK) 556 if (data & ~(unsigned long)PTRACE_O_MASK)
557 return -EINVAL; 557 return -EINVAL;
558 558
559 if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
560 if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
561 !config_enabled(CONFIG_SECCOMP))
562 return -EINVAL;
563
564 if (!capable(CAP_SYS_ADMIN))
565 return -EPERM;
566
567 if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
568 current->ptrace & PT_SUSPEND_SECCOMP)
569 return -EPERM;
570 }
571
559 /* Avoid intermediate state when all opts are cleared */ 572 /* Avoid intermediate state when all opts are cleared */
560 flags = child->ptrace; 573 flags = child->ptrace;
561 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT); 574 flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
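The new check gates PTRACE_O_SUSPEND_SECCOMP behind CONFIG_CHECKPOINT_RESTORE plus CONFIG_SECCOMP, CAP_SYS_ADMIN, and a tracer that neither runs under seccomp nor is itself suspended. From user space, a checkpoint/restore tool would request the option roughly as below; this is a hedged sketch, not code taken from any existing tool, and the fallback #define assumes the uapi value:

        #include <sys/ptrace.h>
        #include <sys/types.h>
        #include <errno.h>
        #include <stdio.h>

        #ifndef PTRACE_O_SUSPEND_SECCOMP
        #define PTRACE_O_SUSPEND_SECCOMP (1 << 21)      /* value assumed from uapi headers */
        #endif

        /* Ask the kernel to suspend seccomp filtering in an already-attached tracee. */
        static int suspend_seccomp(pid_t tracee)
        {
                if (ptrace(PTRACE_SETOPTIONS, tracee, 0,
                           PTRACE_O_TRACESYSGOOD | PTRACE_O_SUSPEND_SECCOMP) == -1) {
                        /* -EINVAL: kernel lacks support; -EPERM: capability/seccomp checks failed */
                        perror("PTRACE_SETOPTIONS");
                        return -errno;
                }
                return 0;
        }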
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 59e32684c23b..77192953dee5 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -635,6 +635,8 @@ static struct rcu_torture_ops sched_ops = {
635 .deferred_free = rcu_sched_torture_deferred_free, 635 .deferred_free = rcu_sched_torture_deferred_free,
636 .sync = synchronize_sched, 636 .sync = synchronize_sched,
637 .exp_sync = synchronize_sched_expedited, 637 .exp_sync = synchronize_sched_expedited,
638 .get_state = get_state_synchronize_sched,
639 .cond_sync = cond_synchronize_sched,
638 .call = call_rcu_sched, 640 .call = call_rcu_sched,
639 .cb_barrier = rcu_barrier_sched, 641 .cb_barrier = rcu_barrier_sched,
640 .fqs = rcu_sched_force_quiescent_state, 642 .fqs = rcu_sched_force_quiescent_state,
@@ -684,10 +686,20 @@ static struct rcu_torture_ops tasks_ops = {
684 686
685#define RCUTORTURE_TASKS_OPS &tasks_ops, 687#define RCUTORTURE_TASKS_OPS &tasks_ops,
686 688
689static bool __maybe_unused torturing_tasks(void)
690{
691 return cur_ops == &tasks_ops;
692}
693
687#else /* #ifdef CONFIG_TASKS_RCU */ 694#else /* #ifdef CONFIG_TASKS_RCU */
688 695
689#define RCUTORTURE_TASKS_OPS 696#define RCUTORTURE_TASKS_OPS
690 697
698static bool torturing_tasks(void)
699{
700 return false;
701}
702
691#endif /* #else #ifdef CONFIG_TASKS_RCU */ 703#endif /* #else #ifdef CONFIG_TASKS_RCU */
692 704
693/* 705/*
@@ -823,9 +835,7 @@ rcu_torture_cbflood(void *arg)
823 } 835 }
824 if (err) { 836 if (err) {
825 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM"); 837 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
826 while (!torture_must_stop()) 838 goto wait_for_stop;
827 schedule_timeout_interruptible(HZ);
828 return 0;
829 } 839 }
830 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started"); 840 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
831 do { 841 do {
@@ -844,6 +854,7 @@ rcu_torture_cbflood(void *arg)
844 stutter_wait("rcu_torture_cbflood"); 854 stutter_wait("rcu_torture_cbflood");
845 } while (!torture_must_stop()); 855 } while (!torture_must_stop());
846 vfree(rhp); 856 vfree(rhp);
857wait_for_stop:
847 torture_kthread_stopping("rcu_torture_cbflood"); 858 torture_kthread_stopping("rcu_torture_cbflood");
848 return 0; 859 return 0;
849} 860}
@@ -1088,7 +1099,8 @@ static void rcu_torture_timer(unsigned long unused)
1088 p = rcu_dereference_check(rcu_torture_current, 1099 p = rcu_dereference_check(rcu_torture_current,
1089 rcu_read_lock_bh_held() || 1100 rcu_read_lock_bh_held() ||
1090 rcu_read_lock_sched_held() || 1101 rcu_read_lock_sched_held() ||
1091 srcu_read_lock_held(srcu_ctlp)); 1102 srcu_read_lock_held(srcu_ctlp) ||
1103 torturing_tasks());
1092 if (p == NULL) { 1104 if (p == NULL) {
1093 /* Leave because rcu_torture_writer is not yet underway */ 1105 /* Leave because rcu_torture_writer is not yet underway */
1094 cur_ops->readunlock(idx); 1106 cur_ops->readunlock(idx);
@@ -1162,7 +1174,8 @@ rcu_torture_reader(void *arg)
1162 p = rcu_dereference_check(rcu_torture_current, 1174 p = rcu_dereference_check(rcu_torture_current,
1163 rcu_read_lock_bh_held() || 1175 rcu_read_lock_bh_held() ||
1164 rcu_read_lock_sched_held() || 1176 rcu_read_lock_sched_held() ||
1165 srcu_read_lock_held(srcu_ctlp)); 1177 srcu_read_lock_held(srcu_ctlp) ||
1178 torturing_tasks());
1166 if (p == NULL) { 1179 if (p == NULL) {
1167 /* Wait for rcu_torture_writer to get underway */ 1180 /* Wait for rcu_torture_writer to get underway */
1168 cur_ops->readunlock(idx); 1181 cur_ops->readunlock(idx);
@@ -1507,7 +1520,7 @@ static int rcu_torture_barrier_init(void)
1507 int i; 1520 int i;
1508 int ret; 1521 int ret;
1509 1522
1510 if (n_barrier_cbs == 0) 1523 if (n_barrier_cbs <= 0)
1511 return 0; 1524 return 0;
1512 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { 1525 if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) {
1513 pr_alert("%s" TORTURE_FLAG 1526 pr_alert("%s" TORTURE_FLAG
@@ -1786,12 +1799,15 @@ rcu_torture_init(void)
1786 writer_task); 1799 writer_task);
1787 if (firsterr) 1800 if (firsterr)
1788 goto unwind; 1801 goto unwind;
1789 fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), 1802 if (nfakewriters > 0) {
1790 GFP_KERNEL); 1803 fakewriter_tasks = kzalloc(nfakewriters *
1791 if (fakewriter_tasks == NULL) { 1804 sizeof(fakewriter_tasks[0]),
1792 VERBOSE_TOROUT_ERRSTRING("out of memory"); 1805 GFP_KERNEL);
1793 firsterr = -ENOMEM; 1806 if (fakewriter_tasks == NULL) {
1794 goto unwind; 1807 VERBOSE_TOROUT_ERRSTRING("out of memory");
1808 firsterr = -ENOMEM;
1809 goto unwind;
1810 }
1795 } 1811 }
1796 for (i = 0; i < nfakewriters; i++) { 1812 for (i = 0; i < nfakewriters; i++) {
1797 firsterr = torture_create_kthread(rcu_torture_fakewriter, 1813 firsterr = torture_create_kthread(rcu_torture_fakewriter,
@@ -1818,7 +1834,7 @@ rcu_torture_init(void)
1818 if (firsterr) 1834 if (firsterr)
1819 goto unwind; 1835 goto unwind;
1820 } 1836 }
1821 if (test_no_idle_hz) { 1837 if (test_no_idle_hz && shuffle_interval > 0) {
1822 firsterr = torture_shuffle_init(shuffle_interval * HZ); 1838 firsterr = torture_shuffle_init(shuffle_interval * HZ);
1823 if (firsterr) 1839 if (firsterr)
1824 goto unwind; 1840 goto unwind;
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index fb33d35ee0b7..d3fcb2ec8536 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -252,14 +252,15 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
252} 252}
253 253
254/** 254/**
 255 * srcu_readers_active - returns approximate number of readers. 255 * srcu_readers_active - returns true if there are readers, and false
 256 * otherwise
256 * @sp: which srcu_struct to count active readers (holding srcu_read_lock). 257 * @sp: which srcu_struct to count active readers (holding srcu_read_lock).
257 * 258 *
258 * Note that this is not an atomic primitive, and can therefore suffer 259 * Note that this is not an atomic primitive, and can therefore suffer
259 * severe errors when invoked on an active srcu_struct. That said, it 260 * severe errors when invoked on an active srcu_struct. That said, it
260 * can be useful as an error check at cleanup time. 261 * can be useful as an error check at cleanup time.
261 */ 262 */
262static int srcu_readers_active(struct srcu_struct *sp) 263static bool srcu_readers_active(struct srcu_struct *sp)
263{ 264{
264 int cpu; 265 int cpu;
265 unsigned long sum = 0; 266 unsigned long sum = 0;
@@ -414,11 +415,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
414 struct rcu_head *head = &rcu.head; 415 struct rcu_head *head = &rcu.head;
415 bool done = false; 416 bool done = false;
416 417
417 rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && 418 RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
418 !lock_is_held(&rcu_bh_lock_map) && 419 lock_is_held(&rcu_bh_lock_map) ||
419 !lock_is_held(&rcu_lock_map) && 420 lock_is_held(&rcu_lock_map) ||
420 !lock_is_held(&rcu_sched_lock_map), 421 lock_is_held(&rcu_sched_lock_map),
421 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 422 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
422 423
423 might_sleep(); 424 might_sleep();
424 init_completion(&rcu.completion); 425 init_completion(&rcu.completion);
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index c291bd65d2cb..d0471056d0af 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -191,10 +191,10 @@ static void rcu_process_callbacks(struct softirq_action *unused)
191 */ 191 */
192void synchronize_sched(void) 192void synchronize_sched(void)
193{ 193{
194 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 194 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
195 !lock_is_held(&rcu_lock_map) && 195 lock_is_held(&rcu_lock_map) ||
196 !lock_is_held(&rcu_sched_lock_map), 196 lock_is_held(&rcu_sched_lock_map),
197 "Illegal synchronize_sched() in RCU read-side critical section"); 197 "Illegal synchronize_sched() in RCU read-side critical section");
198 cond_resched(); 198 cond_resched();
199} 199}
200EXPORT_SYMBOL_GPL(synchronize_sched); 200EXPORT_SYMBOL_GPL(synchronize_sched);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 65137bc28b2b..9f75f25cc5d9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -70,6 +70,8 @@ MODULE_ALIAS("rcutree");
70 70
71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 71static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 72static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
73static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
74static struct lock_class_key rcu_exp_sched_class[RCU_NUM_LVLS];
73 75
74/* 76/*
75 * In order to export the rcu_state name to the tracing tools, it 77 * In order to export the rcu_state name to the tracing tools, it
@@ -124,13 +126,8 @@ module_param(rcu_fanout_exact, bool, 0444);
124static int rcu_fanout_leaf = RCU_FANOUT_LEAF; 126static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
125module_param(rcu_fanout_leaf, int, 0444); 127module_param(rcu_fanout_leaf, int, 0444);
126int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 128int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
127static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ 129/* Number of rcu_nodes at specified level. */
128 NUM_RCU_LVL_0, 130static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
129 NUM_RCU_LVL_1,
130 NUM_RCU_LVL_2,
131 NUM_RCU_LVL_3,
132 NUM_RCU_LVL_4,
133};
134int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 131int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
135 132
136/* 133/*
@@ -649,12 +646,12 @@ static void rcu_eqs_enter_common(long long oldval, bool user)
649 * It is illegal to enter an extended quiescent state while 646 * It is illegal to enter an extended quiescent state while
650 * in an RCU read-side critical section. 647 * in an RCU read-side critical section.
651 */ 648 */
652 rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), 649 RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
653 "Illegal idle entry in RCU read-side critical section."); 650 "Illegal idle entry in RCU read-side critical section.");
654 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), 651 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map),
655 "Illegal idle entry in RCU-bh read-side critical section."); 652 "Illegal idle entry in RCU-bh read-side critical section.");
656 rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), 653 RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map),
657 "Illegal idle entry in RCU-sched read-side critical section."); 654 "Illegal idle entry in RCU-sched read-side critical section.");
658} 655}
659 656
660/* 657/*
@@ -701,7 +698,7 @@ void rcu_idle_enter(void)
701} 698}
702EXPORT_SYMBOL_GPL(rcu_idle_enter); 699EXPORT_SYMBOL_GPL(rcu_idle_enter);
703 700
704#ifdef CONFIG_RCU_USER_QS 701#ifdef CONFIG_NO_HZ_FULL
705/** 702/**
706 * rcu_user_enter - inform RCU that we are resuming userspace. 703 * rcu_user_enter - inform RCU that we are resuming userspace.
707 * 704 *
@@ -714,7 +711,7 @@ void rcu_user_enter(void)
714{ 711{
715 rcu_eqs_enter(1); 712 rcu_eqs_enter(1);
716} 713}
717#endif /* CONFIG_RCU_USER_QS */ 714#endif /* CONFIG_NO_HZ_FULL */
718 715
719/** 716/**
720 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 717 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
@@ -828,7 +825,7 @@ void rcu_idle_exit(void)
828} 825}
829EXPORT_SYMBOL_GPL(rcu_idle_exit); 826EXPORT_SYMBOL_GPL(rcu_idle_exit);
830 827
831#ifdef CONFIG_RCU_USER_QS 828#ifdef CONFIG_NO_HZ_FULL
832/** 829/**
833 * rcu_user_exit - inform RCU that we are exiting userspace. 830 * rcu_user_exit - inform RCU that we are exiting userspace.
834 * 831 *
@@ -839,7 +836,7 @@ void rcu_user_exit(void)
839{ 836{
840 rcu_eqs_exit(1); 837 rcu_eqs_exit(1);
841} 838}
842#endif /* CONFIG_RCU_USER_QS */ 839#endif /* CONFIG_NO_HZ_FULL */
843 840
844/** 841/**
845 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 842 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
@@ -978,9 +975,9 @@ bool notrace rcu_is_watching(void)
978{ 975{
979 bool ret; 976 bool ret;
980 977
981 preempt_disable(); 978 preempt_disable_notrace();
982 ret = __rcu_is_watching(); 979 ret = __rcu_is_watching();
983 preempt_enable(); 980 preempt_enable_notrace();
984 return ret; 981 return ret;
985} 982}
986EXPORT_SYMBOL_GPL(rcu_is_watching); 983EXPORT_SYMBOL_GPL(rcu_is_watching);
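rcu_is_watching() is notrace and can be called from the function tracer itself, so the plain preempt_disable()/preempt_enable() pair is switched to the _notrace variants to avoid recursing back into tracing through preemption accounting. The pattern for any helper reachable from tracing callbacks, sketched with invented names:

        static DEFINE_PER_CPU(bool, my_state);          /* hypothetical per-CPU flag */

        /* Hypothetical helper that must be safe to call from ftrace callbacks. */
        static bool notrace my_state_sample(void)
        {
                bool ret;

                preempt_disable_notrace();      /* no tracing hooks taken here */
                ret = __this_cpu_read(my_state);
                preempt_enable_notrace();
                return ret;
        }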
@@ -1178,9 +1175,11 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1178 j = jiffies; 1175 j = jiffies;
1179 gpa = READ_ONCE(rsp->gp_activity); 1176 gpa = READ_ONCE(rsp->gp_activity);
1180 if (j - gpa > 2 * HZ) 1177 if (j - gpa > 2 * HZ)
1181 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x\n", 1178 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
1182 rsp->name, j - gpa, 1179 rsp->name, j - gpa,
1183 rsp->gpnum, rsp->completed, rsp->gp_flags); 1180 rsp->gpnum, rsp->completed,
1181 rsp->gp_flags, rsp->gp_state,
1182 rsp->gp_kthread ? rsp->gp_kthread->state : 0);
1184} 1183}
1185 1184
1186/* 1185/*
@@ -1906,6 +1905,26 @@ static int rcu_gp_init(struct rcu_state *rsp)
1906} 1905}
1907 1906
1908/* 1907/*
1908 * Helper function for wait_event_interruptible_timeout() wakeup
1909 * at force-quiescent-state time.
1910 */
1911static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
1912{
1913 struct rcu_node *rnp = rcu_get_root(rsp);
1914
1915 /* Someone like call_rcu() requested a force-quiescent-state scan. */
1916 *gfp = READ_ONCE(rsp->gp_flags);
1917 if (*gfp & RCU_GP_FLAG_FQS)
1918 return true;
1919
1920 /* The current grace period has completed. */
1921 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp))
1922 return true;
1923
1924 return false;
1925}
1926
1927/*
1909 * Do one round of quiescent-state forcing. 1928 * Do one round of quiescent-state forcing.
1910 */ 1929 */
1911static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) 1930static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
@@ -2041,6 +2060,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2041 wait_event_interruptible(rsp->gp_wq, 2060 wait_event_interruptible(rsp->gp_wq,
2042 READ_ONCE(rsp->gp_flags) & 2061 READ_ONCE(rsp->gp_flags) &
2043 RCU_GP_FLAG_INIT); 2062 RCU_GP_FLAG_INIT);
2063 rsp->gp_state = RCU_GP_DONE_GPS;
2044 /* Locking provides needed memory barrier. */ 2064 /* Locking provides needed memory barrier. */
2045 if (rcu_gp_init(rsp)) 2065 if (rcu_gp_init(rsp))
2046 break; 2066 break;
@@ -2068,11 +2088,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2068 TPS("fqswait")); 2088 TPS("fqswait"));
2069 rsp->gp_state = RCU_GP_WAIT_FQS; 2089 rsp->gp_state = RCU_GP_WAIT_FQS;
2070 ret = wait_event_interruptible_timeout(rsp->gp_wq, 2090 ret = wait_event_interruptible_timeout(rsp->gp_wq,
2071 ((gf = READ_ONCE(rsp->gp_flags)) & 2091 rcu_gp_fqs_check_wake(rsp, &gf), j);
2072 RCU_GP_FLAG_FQS) || 2092 rsp->gp_state = RCU_GP_DOING_FQS;
2073 (!READ_ONCE(rnp->qsmask) &&
2074 !rcu_preempt_blocked_readers_cgp(rnp)),
2075 j);
2076 /* Locking provides needed memory barriers. */ 2093 /* Locking provides needed memory barriers. */
2077 /* If grace period done, leave loop. */ 2094 /* If grace period done, leave loop. */
2078 if (!READ_ONCE(rnp->qsmask) && 2095 if (!READ_ONCE(rnp->qsmask) &&
@@ -2110,7 +2127,9 @@ static int __noreturn rcu_gp_kthread(void *arg)
2110 } 2127 }
2111 2128
2112 /* Handle grace-period end. */ 2129 /* Handle grace-period end. */
2130 rsp->gp_state = RCU_GP_CLEANUP;
2113 rcu_gp_cleanup(rsp); 2131 rcu_gp_cleanup(rsp);
2132 rsp->gp_state = RCU_GP_CLEANED;
2114 } 2133 }
2115} 2134}
2116 2135
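The force-quiescent-state wait previously open-coded its wake-up condition inside wait_event_interruptible_timeout(); the patch hoists it into rcu_gp_fqs_check_wake(), which also reports the sampled ->gp_flags back through a pointer. The shape of that refactoring, as a hedged sketch with invented names and types:

        struct my_state {                       /* hypothetical state block */
                wait_queue_head_t wq;
                int flags;
                int work_left;
        };
        #define MY_FLAG_KICK 0x1

        /* Predicate helper: re-evaluated each time the waiter is woken (sketch). */
        static bool my_wake_check(struct my_state *st, int *flags_out)
        {
                *flags_out = READ_ONCE(st->flags);
                if (*flags_out & MY_FLAG_KICK)          /* explicit request */
                        return true;
                return READ_ONCE(st->work_left) == 0;   /* or nothing left to do */
        }

        static void my_wait(struct my_state *st, unsigned long timeout)
        {
                int flags;

                wait_event_interruptible_timeout(st->wq,
                                                 my_wake_check(st, &flags),
                                                 timeout);
                /* 'flags' now holds the value sampled by the predicate */
        }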
@@ -3161,10 +3180,10 @@ static inline int rcu_blocking_is_gp(void)
3161 */ 3180 */
3162void synchronize_sched(void) 3181void synchronize_sched(void)
3163{ 3182{
3164 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 3183 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
3165 !lock_is_held(&rcu_lock_map) && 3184 lock_is_held(&rcu_lock_map) ||
3166 !lock_is_held(&rcu_sched_lock_map), 3185 lock_is_held(&rcu_sched_lock_map),
3167 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3186 "Illegal synchronize_sched() in RCU-sched read-side critical section");
3168 if (rcu_blocking_is_gp()) 3187 if (rcu_blocking_is_gp())
3169 return; 3188 return;
3170 if (rcu_gp_is_expedited()) 3189 if (rcu_gp_is_expedited())
@@ -3188,10 +3207,10 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
3188 */ 3207 */
3189void synchronize_rcu_bh(void) 3208void synchronize_rcu_bh(void)
3190{ 3209{
3191 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 3210 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
3192 !lock_is_held(&rcu_lock_map) && 3211 lock_is_held(&rcu_lock_map) ||
3193 !lock_is_held(&rcu_sched_lock_map), 3212 lock_is_held(&rcu_sched_lock_map),
3194 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3213 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
3195 if (rcu_blocking_is_gp()) 3214 if (rcu_blocking_is_gp())
3196 return; 3215 return;
3197 if (rcu_gp_is_expedited()) 3216 if (rcu_gp_is_expedited())
@@ -3253,23 +3272,247 @@ void cond_synchronize_rcu(unsigned long oldstate)
3253} 3272}
3254EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3273EXPORT_SYMBOL_GPL(cond_synchronize_rcu);
3255 3274
3256static int synchronize_sched_expedited_cpu_stop(void *data) 3275/**
3276 * get_state_synchronize_sched - Snapshot current RCU-sched state
3277 *
3278 * Returns a cookie that is used by a later call to cond_synchronize_sched()
3279 * to determine whether or not a full grace period has elapsed in the
3280 * meantime.
3281 */
3282unsigned long get_state_synchronize_sched(void)
3257{ 3283{
3258 /* 3284 /*
3259 * There must be a full memory barrier on each affected CPU 3285 * Any prior manipulation of RCU-protected data must happen
3260 * between the time that try_stop_cpus() is called and the 3286 * before the load from ->gpnum.
3261 * time that it returns. 3287 */
3262 * 3288 smp_mb(); /* ^^^ */
3263 * In the current initial implementation of cpu_stop, the 3289
3264 * above condition is already met when the control reaches 3290 /*
3265 * this point and the following smp_mb() is not strictly 3291 * Make sure this load happens before the purportedly
3266 * necessary. Do smp_mb() anyway for documentation and 3292 * time-consuming work between get_state_synchronize_sched()
3267 * robustness against future implementation changes. 3293 * and cond_synchronize_sched().
3294 */
3295 return smp_load_acquire(&rcu_sched_state.gpnum);
3296}
3297EXPORT_SYMBOL_GPL(get_state_synchronize_sched);
3298
3299/**
3300 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period
3301 *
3302 * @oldstate: return value from earlier call to get_state_synchronize_sched()
3303 *
3304 * If a full RCU-sched grace period has elapsed since the earlier call to
3305 * get_state_synchronize_sched(), just return. Otherwise, invoke
3306 * synchronize_sched() to wait for a full grace period.
3307 *
3308 * Yes, this function does not take counter wrap into account. But
3309 * counter wrap is harmless. If the counter wraps, we have waited for
3310 * more than 2 billion grace periods (and way more on a 64-bit system!),
3311 * so waiting for one additional grace period should be just fine.
3312 */
3313void cond_synchronize_sched(unsigned long oldstate)
3314{
3315 unsigned long newstate;
3316
3317 /*
3318 * Ensure that this load happens before any RCU-destructive
3319 * actions the caller might carry out after we return.
3268 */ 3320 */
3269 smp_mb(); /* See above comment block. */ 3321 newstate = smp_load_acquire(&rcu_sched_state.completed);
3322 if (ULONG_CMP_GE(oldstate, newstate))
3323 synchronize_sched();
3324}
3325EXPORT_SYMBOL_GPL(cond_synchronize_sched);
3326
3327/* Adjust sequence number for start of update-side operation. */
3328static void rcu_seq_start(unsigned long *sp)
3329{
3330 WRITE_ONCE(*sp, *sp + 1);
3331 smp_mb(); /* Ensure update-side operation after counter increment. */
3332 WARN_ON_ONCE(!(*sp & 0x1));
3333}
3334
3335/* Adjust sequence number for end of update-side operation. */
3336static void rcu_seq_end(unsigned long *sp)
3337{
3338 smp_mb(); /* Ensure update-side operation before counter increment. */
3339 WRITE_ONCE(*sp, *sp + 1);
3340 WARN_ON_ONCE(*sp & 0x1);
3341}
3342
3343/* Take a snapshot of the update side's sequence number. */
3344static unsigned long rcu_seq_snap(unsigned long *sp)
3345{
3346 unsigned long s;
3347
3348 smp_mb(); /* Caller's modifications seen first by other CPUs. */
3349 s = (READ_ONCE(*sp) + 3) & ~0x1;
3350 smp_mb(); /* Above access must not bleed into critical section. */
3351 return s;
3352}
3353
3354/*
3355 * Given a snapshot from rcu_seq_snap(), determine whether or not a
3356 * full update-side operation has occurred.
3357 */
3358static bool rcu_seq_done(unsigned long *sp, unsigned long s)
3359{
3360 return ULONG_CMP_GE(READ_ONCE(*sp), s);
3361}
3362
3363/* Wrapper functions for expedited grace periods. */
3364static void rcu_exp_gp_seq_start(struct rcu_state *rsp)
3365{
3366 rcu_seq_start(&rsp->expedited_sequence);
3367}
3368static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
3369{
3370 rcu_seq_end(&rsp->expedited_sequence);
3371 smp_mb(); /* Ensure that consecutive grace periods serialize. */
3372}
3373static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
3374{
3375 return rcu_seq_snap(&rsp->expedited_sequence);
3376}
3377static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
3378{
3379 return rcu_seq_done(&rsp->expedited_sequence, s);
3380}
3381
3382/* Common code for synchronize_{rcu,sched}_expedited() work-done checking. */
3383static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
3384 struct rcu_data *rdp,
3385 atomic_long_t *stat, unsigned long s)
3386{
3387 if (rcu_exp_gp_seq_done(rsp, s)) {
3388 if (rnp)
3389 mutex_unlock(&rnp->exp_funnel_mutex);
3390 else if (rdp)
3391 mutex_unlock(&rdp->exp_funnel_mutex);
3392 /* Ensure test happens before caller kfree(). */
3393 smp_mb__before_atomic(); /* ^^^ */
3394 atomic_long_inc(stat);
3395 return true;
3396 }
3397 return false;
3398}
3399
3400/*
3401 * Funnel-lock acquisition for expedited grace periods. Returns a
3402 * pointer to the root rcu_node structure, or NULL if some other
3403 * task did the expedited grace period for us.
3404 */
3405static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
3406{
3407 struct rcu_data *rdp;
3408 struct rcu_node *rnp0;
3409 struct rcu_node *rnp1 = NULL;
3410
3411 /*
3412 * First try directly acquiring the root lock in order to reduce
3413 * latency in the common case where expedited grace periods are
3414 * rare. We check mutex_is_locked() to avoid pathological levels of
3415 * memory contention on ->exp_funnel_mutex in the heavy-load case.
3416 */
3417 rnp0 = rcu_get_root(rsp);
3418 if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
3419 if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
3420 if (sync_exp_work_done(rsp, rnp0, NULL,
3421 &rsp->expedited_workdone0, s))
3422 return NULL;
3423 return rnp0;
3424 }
3425 }
3426
3427 /*
3428 * Each pass through the following loop works its way
3429 * up the rcu_node tree, returning if others have done the
3430 * work or otherwise falls through holding the root rnp's
3431 * ->exp_funnel_mutex. The mapping from CPU to rcu_node structure
3432 * can be inexact, as it is just promoting locality and is not
3433 * strictly needed for correctness.
3434 */
3435 rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
3436 if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
3437 return NULL;
3438 mutex_lock(&rdp->exp_funnel_mutex);
3439 rnp0 = rdp->mynode;
3440 for (; rnp0 != NULL; rnp0 = rnp0->parent) {
3441 if (sync_exp_work_done(rsp, rnp1, rdp,
3442 &rsp->expedited_workdone2, s))
3443 return NULL;
3444 mutex_lock(&rnp0->exp_funnel_mutex);
3445 if (rnp1)
3446 mutex_unlock(&rnp1->exp_funnel_mutex);
3447 else
3448 mutex_unlock(&rdp->exp_funnel_mutex);
3449 rnp1 = rnp0;
3450 }
3451 if (sync_exp_work_done(rsp, rnp1, rdp,
3452 &rsp->expedited_workdone3, s))
3453 return NULL;
3454 return rnp1;
3455}
3456
3457/* Invoked on each online non-idle CPU for expedited quiescent state. */
3458static int synchronize_sched_expedited_cpu_stop(void *data)
3459{
3460 struct rcu_data *rdp = data;
3461 struct rcu_state *rsp = rdp->rsp;
3462
3463 /* We are here: If we are last, do the wakeup. */
3464 rdp->exp_done = true;
3465 if (atomic_dec_and_test(&rsp->expedited_need_qs))
3466 wake_up(&rsp->expedited_wq);
3270 return 0; 3467 return 0;
3271} 3468}
3272 3469
3470static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
3471{
3472 int cpu;
3473 unsigned long jiffies_stall;
3474 unsigned long jiffies_start;
3475 struct rcu_data *rdp;
3476 int ret;
3477
3478 jiffies_stall = rcu_jiffies_till_stall_check();
3479 jiffies_start = jiffies;
3480
3481 for (;;) {
3482 ret = wait_event_interruptible_timeout(
3483 rsp->expedited_wq,
3484 !atomic_read(&rsp->expedited_need_qs),
3485 jiffies_stall);
3486 if (ret > 0)
3487 return;
3488 if (ret < 0) {
3489 /* Hit a signal, disable CPU stall warnings. */
3490 wait_event(rsp->expedited_wq,
3491 !atomic_read(&rsp->expedited_need_qs));
3492 return;
3493 }
3494 pr_err("INFO: %s detected expedited stalls on CPUs: {",
3495 rsp->name);
3496 for_each_online_cpu(cpu) {
3497 rdp = per_cpu_ptr(rsp->rda, cpu);
3498
3499 if (rdp->exp_done)
3500 continue;
3501 pr_cont(" %d", cpu);
3502 }
3503 pr_cont(" } %lu jiffies s: %lu\n",
3504 jiffies - jiffies_start, rsp->expedited_sequence);
3505 for_each_online_cpu(cpu) {
3506 rdp = per_cpu_ptr(rsp->rda, cpu);
3507
3508 if (rdp->exp_done)
3509 continue;
3510 dump_cpu_task(cpu);
3511 }
3512 jiffies_stall = 3 * rcu_jiffies_till_stall_check() + 3;
3513 }
3514}
3515
3273/** 3516/**
3274 * synchronize_sched_expedited - Brute-force RCU-sched grace period 3517 * synchronize_sched_expedited - Brute-force RCU-sched grace period
3275 * 3518 *
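The rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() quartet added in the hunk above is a tiny sequence-count protocol: the counter is odd while an update is in flight and even otherwise, and a snapshot taken at value s is satisfied once the counter reaches (s + 3) & ~0x1, that is, the end of the first update that begins strictly after the snapshot. A hedged userspace re-implementation, with the kernel's memory barriers and ULONG_CMP_GE wrap handling omitted, just to see the arithmetic in action:

        #include <stdbool.h>
        #include <stdio.h>

        static unsigned long seq;

        static unsigned long seq_snap(void)   { return (seq + 3) & ~0x1UL; }
        static void seq_start(void)           { seq++; }  /* now odd: update running */
        static void seq_end(void)             { seq++; }  /* now even: update done   */
        static bool seq_done(unsigned long s) { return seq >= s; }

        int main(void)
        {
                unsigned long s = seq_snap();   /* seq==0 -> s==2 */

                printf("snap=%lu done=%d\n", s, seq_done(s));   /* not done yet */
                seq_start();                                    /* seq==1 */
                seq_end();                                      /* seq==2 */
                printf("snap=%lu done=%d\n", s, seq_done(s));   /* done now */
                return 0;
        }

The same protocol backs both expedited_sequence and, later in this diff, barrier_sequence, which is why the open-coded n_barrier_done bookkeeping can be deleted.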
@@ -3281,58 +3524,21 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
3281 * restructure your code to batch your updates, and then use a single 3524 * restructure your code to batch your updates, and then use a single
3282 * synchronize_sched() instead. 3525 * synchronize_sched() instead.
3283 * 3526 *
3284 * This implementation can be thought of as an application of ticket 3527 * This implementation can be thought of as an application of sequence
3285 * locking to RCU, with sync_sched_expedited_started and 3528 * locking to expedited grace periods, but using the sequence counter to
3286 * sync_sched_expedited_done taking on the roles of the halves 3529 * determine when someone else has already done the work instead of for
3287 * of the ticket-lock word. Each task atomically increments 3530 * retrying readers.
3288 * sync_sched_expedited_started upon entry, snapshotting the old value,
3289 * then attempts to stop all the CPUs. If this succeeds, then each
3290 * CPU will have executed a context switch, resulting in an RCU-sched
3291 * grace period. We are then done, so we use atomic_cmpxchg() to
3292 * update sync_sched_expedited_done to match our snapshot -- but
3293 * only if someone else has not already advanced past our snapshot.
3294 *
3295 * On the other hand, if try_stop_cpus() fails, we check the value
3296 * of sync_sched_expedited_done. If it has advanced past our
3297 * initial snapshot, then someone else must have forced a grace period
3298 * some time after we took our snapshot. In this case, our work is
3299 * done for us, and we can simply return. Otherwise, we try again,
3300 * but keep our initial snapshot for purposes of checking for someone
3301 * doing our work for us.
3302 *
3303 * If we fail too many times in a row, we fall back to synchronize_sched().
3304 */ 3531 */
3305void synchronize_sched_expedited(void) 3532void synchronize_sched_expedited(void)
3306{ 3533{
3307 cpumask_var_t cm;
3308 bool cma = false;
3309 int cpu; 3534 int cpu;
3310 long firstsnap, s, snap; 3535 unsigned long s;
3311 int trycount = 0; 3536 struct rcu_node *rnp;
3312 struct rcu_state *rsp = &rcu_sched_state; 3537 struct rcu_state *rsp = &rcu_sched_state;
3313 3538
3314 /* 3539 /* Take a snapshot of the sequence number. */
3315 * If we are in danger of counter wrap, just do synchronize_sched(). 3540 s = rcu_exp_gp_seq_snap(rsp);
3316 * By allowing sync_sched_expedited_started to advance no more than
3317 * ULONG_MAX/8 ahead of sync_sched_expedited_done, we are ensuring
3318 * that more than 3.5 billion CPUs would be required to force a
3319 * counter wrap on a 32-bit system. Quite a few more CPUs would of
3320 * course be required on a 64-bit system.
3321 */
3322 if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start),
3323 (ulong)atomic_long_read(&rsp->expedited_done) +
3324 ULONG_MAX / 8)) {
3325 wait_rcu_gp(call_rcu_sched);
3326 atomic_long_inc(&rsp->expedited_wrap);
3327 return;
3328 }
3329 3541
3330 /*
3331 * Take a ticket. Note that atomic_inc_return() implies a
3332 * full memory barrier.
3333 */
3334 snap = atomic_long_inc_return(&rsp->expedited_start);
3335 firstsnap = snap;
3336 if (!try_get_online_cpus()) { 3542 if (!try_get_online_cpus()) {
3337 /* CPU hotplug operation in flight, fall back to normal GP. */ 3543 /* CPU hotplug operation in flight, fall back to normal GP. */
3338 wait_rcu_gp(call_rcu_sched); 3544 wait_rcu_gp(call_rcu_sched);
@@ -3341,100 +3547,38 @@ void synchronize_sched_expedited(void)
3341 } 3547 }
3342 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 3548 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
3343 3549
3344 /* Offline CPUs, idle CPUs, and any CPU we run on are quiescent. */ 3550 rnp = exp_funnel_lock(rsp, s);
3345 cma = zalloc_cpumask_var(&cm, GFP_KERNEL); 3551 if (rnp == NULL) {
3346 if (cma) { 3552 put_online_cpus();
3347 cpumask_copy(cm, cpu_online_mask); 3553 return; /* Someone else did our work for us. */
3348 cpumask_clear_cpu(raw_smp_processor_id(), cm);
3349 for_each_cpu(cpu, cm) {
3350 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3351
3352 if (!(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3353 cpumask_clear_cpu(cpu, cm);
3354 }
3355 if (cpumask_weight(cm) == 0)
3356 goto all_cpus_idle;
3357 } 3554 }
3358 3555
3359 /* 3556 rcu_exp_gp_seq_start(rsp);
3360 * Each pass through the following loop attempts to force a
3361 * context switch on each CPU.
3362 */
3363 while (try_stop_cpus(cma ? cm : cpu_online_mask,
3364 synchronize_sched_expedited_cpu_stop,
3365 NULL) == -EAGAIN) {
3366 put_online_cpus();
3367 atomic_long_inc(&rsp->expedited_tryfail);
3368
3369 /* Check to see if someone else did our work for us. */
3370 s = atomic_long_read(&rsp->expedited_done);
3371 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
3372 /* ensure test happens before caller kfree */
3373 smp_mb__before_atomic(); /* ^^^ */
3374 atomic_long_inc(&rsp->expedited_workdone1);
3375 free_cpumask_var(cm);
3376 return;
3377 }
3378 3557
3379 /* No joy, try again later. Or just synchronize_sched(). */ 3558 /* Stop each CPU that is online, non-idle, and not us. */
3380 if (trycount++ < 10) { 3559 init_waitqueue_head(&rsp->expedited_wq);
3381 udelay(trycount * num_online_cpus()); 3560 atomic_set(&rsp->expedited_need_qs, 1); /* Extra count avoids race. */
3382 } else { 3561 for_each_online_cpu(cpu) {
3383 wait_rcu_gp(call_rcu_sched); 3562 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3384 atomic_long_inc(&rsp->expedited_normal); 3563 struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
3385 free_cpumask_var(cm);
3386 return;
3387 }
3388 3564
3389 /* Recheck to see if someone else did our work for us. */ 3565 rdp->exp_done = false;
3390 s = atomic_long_read(&rsp->expedited_done);
3391 if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) {
3392 /* ensure test happens before caller kfree */
3393 smp_mb__before_atomic(); /* ^^^ */
3394 atomic_long_inc(&rsp->expedited_workdone2);
3395 free_cpumask_var(cm);
3396 return;
3397 }
3398 3566
3399 /* 3567 /* Skip our CPU and any idle CPUs. */
3400 * Refetching sync_sched_expedited_started allows later 3568 if (raw_smp_processor_id() == cpu ||
3401 * callers to piggyback on our grace period. We retry 3569 !(atomic_add_return(0, &rdtp->dynticks) & 0x1))
3402 * after they started, so our grace period works for them, 3570 continue;
3403 * and they started after our first try, so their grace 3571 atomic_inc(&rsp->expedited_need_qs);
3404 * period works for us. 3572 stop_one_cpu_nowait(cpu, synchronize_sched_expedited_cpu_stop,
3405 */ 3573 rdp, &rdp->exp_stop_work);
3406 if (!try_get_online_cpus()) {
3407 /* CPU hotplug operation in flight, use normal GP. */
3408 wait_rcu_gp(call_rcu_sched);
3409 atomic_long_inc(&rsp->expedited_normal);
3410 free_cpumask_var(cm);
3411 return;
3412 }
3413 snap = atomic_long_read(&rsp->expedited_start);
3414 smp_mb(); /* ensure read is before try_stop_cpus(). */
3415 } 3574 }
3416 atomic_long_inc(&rsp->expedited_stoppedcpus);
3417 3575
3418all_cpus_idle: 3576 /* Remove extra count and, if necessary, wait for CPUs to stop. */
3419 free_cpumask_var(cm); 3577 if (!atomic_dec_and_test(&rsp->expedited_need_qs))
3578 synchronize_sched_expedited_wait(rsp);
3420 3579
3421 /* 3580 rcu_exp_gp_seq_end(rsp);
3422 * Everyone up to our most recent fetch is covered by our grace 3581 mutex_unlock(&rnp->exp_funnel_mutex);
3423 * period. Update the counter, but only if our work is still
3424 * relevant -- which it won't be if someone who started later
3425 * than we did already did their update.
3426 */
3427 do {
3428 atomic_long_inc(&rsp->expedited_done_tries);
3429 s = atomic_long_read(&rsp->expedited_done);
3430 if (ULONG_CMP_GE((ulong)s, (ulong)snap)) {
3431 /* ensure test happens before caller kfree */
3432 smp_mb__before_atomic(); /* ^^^ */
3433 atomic_long_inc(&rsp->expedited_done_lost);
3434 break;
3435 }
3436 } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s);
3437 atomic_long_inc(&rsp->expedited_done_exit);
3438 3582
3439 put_online_cpus(); 3583 put_online_cpus();
3440} 3584}
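With get_state_synchronize_sched() and cond_synchronize_sched() exported above, a caller can snapshot the grace-period state, perform unrelated slow work, and only pay for a full synchronize_sched() if no grace period completed in the meantime. A hedged caller-side sketch; the object and the unpublish/cleanup helpers are hypothetical:

        /* Hypothetical teardown path: the slow cleanup work between the snapshot
         * and the conditional wait often lets a grace period elapse "for free". */
        static void my_teardown(struct my_object *obj)
        {
                unsigned long cookie;

                unpublish_object(obj);                  /* hypothetical: no new readers  */
                cookie = get_state_synchronize_sched(); /* snapshot current GP state     */

                do_expensive_cleanup(obj);              /* slow work; GP may complete now */

                cond_synchronize_sched(cookie);         /* waits only if it did not      */
                kfree(obj);
        }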
@@ -3571,10 +3715,10 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
3571 struct rcu_state *rsp = rdp->rsp; 3715 struct rcu_state *rsp = rdp->rsp;
3572 3716
3573 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3717 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
3574 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); 3718 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
3575 complete(&rsp->barrier_completion); 3719 complete(&rsp->barrier_completion);
3576 } else { 3720 } else {
3577 _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); 3721 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
3578 } 3722 }
3579} 3723}
3580 3724
@@ -3586,7 +3730,7 @@ static void rcu_barrier_func(void *type)
3586 struct rcu_state *rsp = type; 3730 struct rcu_state *rsp = type;
3587 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3731 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3588 3732
3589 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); 3733 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
3590 atomic_inc(&rsp->barrier_cpu_count); 3734 atomic_inc(&rsp->barrier_cpu_count);
3591 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 3735 rsp->call(&rdp->barrier_head, rcu_barrier_callback);
3592} 3736}
@@ -3599,55 +3743,24 @@ static void _rcu_barrier(struct rcu_state *rsp)
3599{ 3743{
3600 int cpu; 3744 int cpu;
3601 struct rcu_data *rdp; 3745 struct rcu_data *rdp;
3602 unsigned long snap = READ_ONCE(rsp->n_barrier_done); 3746 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
3603 unsigned long snap_done;
3604 3747
3605 _rcu_barrier_trace(rsp, "Begin", -1, snap); 3748 _rcu_barrier_trace(rsp, "Begin", -1, s);
3606 3749
3607 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3750 /* Take mutex to serialize concurrent rcu_barrier() requests. */
3608 mutex_lock(&rsp->barrier_mutex); 3751 mutex_lock(&rsp->barrier_mutex);
3609 3752
3610 /* 3753 /* Did someone else do our work for us? */
3611 * Ensure that all prior references, including to ->n_barrier_done, 3754 if (rcu_seq_done(&rsp->barrier_sequence, s)) {
3612 * are ordered before the _rcu_barrier() machinery. 3755 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
3613 */
3614 smp_mb(); /* See above block comment. */
3615
3616 /*
3617 * Recheck ->n_barrier_done to see if others did our work for us.
3618 * This means checking ->n_barrier_done for an even-to-odd-to-even
3619 * transition. The "if" expression below therefore rounds the old
3620 * value up to the next even number and adds two before comparing.
3621 */
3622 snap_done = rsp->n_barrier_done;
3623 _rcu_barrier_trace(rsp, "Check", -1, snap_done);
3624
3625 /*
3626 * If the value in snap is odd, we needed to wait for the current
3627 * rcu_barrier() to complete, then wait for the next one, in other
3628 * words, we need the value of snap_done to be three larger than
3629 * the value of snap. On the other hand, if the value in snap is
3630 * even, we only had to wait for the next rcu_barrier() to complete,
3631 * in other words, we need the value of snap_done to be only two
3632 * greater than the value of snap. The "(snap + 3) & ~0x1" computes
3633 * this for us (thank you, Linus!).
3634 */
3635 if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) {
3636 _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
3637 smp_mb(); /* caller's subsequent code after above check. */ 3756 smp_mb(); /* caller's subsequent code after above check. */
3638 mutex_unlock(&rsp->barrier_mutex); 3757 mutex_unlock(&rsp->barrier_mutex);
3639 return; 3758 return;
3640 } 3759 }
3641 3760
3642 /* 3761 /* Mark the start of the barrier operation. */
3643 * Increment ->n_barrier_done to avoid duplicate work. Use 3762 rcu_seq_start(&rsp->barrier_sequence);
3644 * WRITE_ONCE() to prevent the compiler from speculating 3763 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
3645 * the increment to precede the early-exit check.
3646 */
3647 WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
3648 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
3649 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
3650 smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
3651 3764
3652 /* 3765 /*
3653 * Initialize the count to one rather than to zero in order to 3766 * Initialize the count to one rather than to zero in order to
@@ -3671,10 +3784,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
3671 if (rcu_is_nocb_cpu(cpu)) { 3784 if (rcu_is_nocb_cpu(cpu)) {
3672 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3785 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3673 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 3786 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3674 rsp->n_barrier_done); 3787 rsp->barrier_sequence);
3675 } else { 3788 } else {
3676 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3789 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3677 rsp->n_barrier_done); 3790 rsp->barrier_sequence);
3678 smp_mb__before_atomic(); 3791 smp_mb__before_atomic();
3679 atomic_inc(&rsp->barrier_cpu_count); 3792 atomic_inc(&rsp->barrier_cpu_count);
3680 __call_rcu(&rdp->barrier_head, 3793 __call_rcu(&rdp->barrier_head,
@@ -3682,11 +3795,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
3682 } 3795 }
3683 } else if (READ_ONCE(rdp->qlen)) { 3796 } else if (READ_ONCE(rdp->qlen)) {
3684 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3797 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3685 rsp->n_barrier_done); 3798 rsp->barrier_sequence);
3686 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3799 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
3687 } else { 3800 } else {
3688 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 3801 _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
3689 rsp->n_barrier_done); 3802 rsp->barrier_sequence);
3690 } 3803 }
3691 } 3804 }
3692 put_online_cpus(); 3805 put_online_cpus();
@@ -3698,16 +3811,13 @@ static void _rcu_barrier(struct rcu_state *rsp)
3698 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) 3811 if (atomic_dec_and_test(&rsp->barrier_cpu_count))
3699 complete(&rsp->barrier_completion); 3812 complete(&rsp->barrier_completion);
3700 3813
3701 /* Increment ->n_barrier_done to prevent duplicate work. */
3702 smp_mb(); /* Keep increment after above mechanism. */
3703 WRITE_ONCE(rsp->n_barrier_done, rsp->n_barrier_done + 1);
3704 WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
3705 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
3706 smp_mb(); /* Keep increment before caller's subsequent code. */
3707
3708 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 3814 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */
3709 wait_for_completion(&rsp->barrier_completion); 3815 wait_for_completion(&rsp->barrier_completion);
3710 3816
3817 /* Mark the end of the barrier operation. */
3818 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
3819 rcu_seq_end(&rsp->barrier_sequence);
3820
3711 /* Other rcu_barrier() invocations can now safely proceed. */ 3821 /* Other rcu_barrier() invocations can now safely proceed. */
3712 mutex_unlock(&rsp->barrier_mutex); 3822 mutex_unlock(&rsp->barrier_mutex);
3713} 3823}
@@ -3770,6 +3880,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
3770 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); 3880 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
3771 rdp->cpu = cpu; 3881 rdp->cpu = cpu;
3772 rdp->rsp = rsp; 3882 rdp->rsp = rsp;
3883 mutex_init(&rdp->exp_funnel_mutex);
3773 rcu_boot_init_nocb_percpu_data(rdp); 3884 rcu_boot_init_nocb_percpu_data(rdp);
3774 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3885 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3775} 3886}
@@ -3961,22 +4072,22 @@ void rcu_scheduler_starting(void)
3961 * Compute the per-level fanout, either using the exact fanout specified 4072 * Compute the per-level fanout, either using the exact fanout specified
3962 * or balancing the tree, depending on the rcu_fanout_exact boot parameter. 4073 * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
3963 */ 4074 */
3964static void __init rcu_init_levelspread(struct rcu_state *rsp) 4075static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
3965{ 4076{
3966 int i; 4077 int i;
3967 4078
3968 if (rcu_fanout_exact) { 4079 if (rcu_fanout_exact) {
3969 rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; 4080 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
3970 for (i = rcu_num_lvls - 2; i >= 0; i--) 4081 for (i = rcu_num_lvls - 2; i >= 0; i--)
3971 rsp->levelspread[i] = RCU_FANOUT; 4082 levelspread[i] = RCU_FANOUT;
3972 } else { 4083 } else {
3973 int ccur; 4084 int ccur;
3974 int cprv; 4085 int cprv;
3975 4086
3976 cprv = nr_cpu_ids; 4087 cprv = nr_cpu_ids;
3977 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4088 for (i = rcu_num_lvls - 1; i >= 0; i--) {
3978 ccur = rsp->levelcnt[i]; 4089 ccur = levelcnt[i];
3979 rsp->levelspread[i] = (cprv + ccur - 1) / ccur; 4090 levelspread[i] = (cprv + ccur - 1) / ccur;
3980 cprv = ccur; 4091 cprv = ccur;
3981 } 4092 }
3982 } 4093 }
@@ -3988,23 +4099,20 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
3988static void __init rcu_init_one(struct rcu_state *rsp, 4099static void __init rcu_init_one(struct rcu_state *rsp,
3989 struct rcu_data __percpu *rda) 4100 struct rcu_data __percpu *rda)
3990{ 4101{
3991 static const char * const buf[] = { 4102 static const char * const buf[] = RCU_NODE_NAME_INIT;
3992 "rcu_node_0", 4103 static const char * const fqs[] = RCU_FQS_NAME_INIT;
3993 "rcu_node_1", 4104 static const char * const exp[] = RCU_EXP_NAME_INIT;
3994 "rcu_node_2", 4105 static const char * const exp_sched[] = RCU_EXP_SCHED_NAME_INIT;
3995 "rcu_node_3" }; /* Match MAX_RCU_LVLS */
3996 static const char * const fqs[] = {
3997 "rcu_node_fqs_0",
3998 "rcu_node_fqs_1",
3999 "rcu_node_fqs_2",
4000 "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */
4001 static u8 fl_mask = 0x1; 4106 static u8 fl_mask = 0x1;
4107
4108 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */
4109 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */
4002 int cpustride = 1; 4110 int cpustride = 1;
4003 int i; 4111 int i;
4004 int j; 4112 int j;
4005 struct rcu_node *rnp; 4113 struct rcu_node *rnp;
4006 4114
4007 BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 4115 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */
4008 4116
4009 /* Silence gcc 4.8 false positive about array index out of range. */ 4117 /* Silence gcc 4.8 false positive about array index out of range. */
4010 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 4118 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS)
@@ -4013,19 +4121,19 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4013 /* Initialize the level-tracking arrays. */ 4121 /* Initialize the level-tracking arrays. */
4014 4122
4015 for (i = 0; i < rcu_num_lvls; i++) 4123 for (i = 0; i < rcu_num_lvls; i++)
4016 rsp->levelcnt[i] = num_rcu_lvl[i]; 4124 levelcnt[i] = num_rcu_lvl[i];
4017 for (i = 1; i < rcu_num_lvls; i++) 4125 for (i = 1; i < rcu_num_lvls; i++)
4018 rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; 4126 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1];
4019 rcu_init_levelspread(rsp); 4127 rcu_init_levelspread(levelspread, levelcnt);
4020 rsp->flavor_mask = fl_mask; 4128 rsp->flavor_mask = fl_mask;
4021 fl_mask <<= 1; 4129 fl_mask <<= 1;
4022 4130
4023 /* Initialize the elements themselves, starting from the leaves. */ 4131 /* Initialize the elements themselves, starting from the leaves. */
4024 4132
4025 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4133 for (i = rcu_num_lvls - 1; i >= 0; i--) {
4026 cpustride *= rsp->levelspread[i]; 4134 cpustride *= levelspread[i];
4027 rnp = rsp->level[i]; 4135 rnp = rsp->level[i];
4028 for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { 4136 for (j = 0; j < levelcnt[i]; j++, rnp++) {
4029 raw_spin_lock_init(&rnp->lock); 4137 raw_spin_lock_init(&rnp->lock);
4030 lockdep_set_class_and_name(&rnp->lock, 4138 lockdep_set_class_and_name(&rnp->lock,
4031 &rcu_node_class[i], buf[i]); 4139 &rcu_node_class[i], buf[i]);
@@ -4045,14 +4153,23 @@ static void __init rcu_init_one(struct rcu_state *rsp,
4045 rnp->grpmask = 0; 4153 rnp->grpmask = 0;
4046 rnp->parent = NULL; 4154 rnp->parent = NULL;
4047 } else { 4155 } else {
4048 rnp->grpnum = j % rsp->levelspread[i - 1]; 4156 rnp->grpnum = j % levelspread[i - 1];
4049 rnp->grpmask = 1UL << rnp->grpnum; 4157 rnp->grpmask = 1UL << rnp->grpnum;
4050 rnp->parent = rsp->level[i - 1] + 4158 rnp->parent = rsp->level[i - 1] +
4051 j / rsp->levelspread[i - 1]; 4159 j / levelspread[i - 1];
4052 } 4160 }
4053 rnp->level = i; 4161 rnp->level = i;
4054 INIT_LIST_HEAD(&rnp->blkd_tasks); 4162 INIT_LIST_HEAD(&rnp->blkd_tasks);
4055 rcu_init_one_nocb(rnp); 4163 rcu_init_one_nocb(rnp);
4164 mutex_init(&rnp->exp_funnel_mutex);
4165 if (rsp == &rcu_sched_state)
4166 lockdep_set_class_and_name(
4167 &rnp->exp_funnel_mutex,
4168 &rcu_exp_sched_class[i], exp_sched[i]);
4169 else
4170 lockdep_set_class_and_name(
4171 &rnp->exp_funnel_mutex,
4172 &rcu_exp_class[i], exp[i]);
4056 } 4173 }
4057 } 4174 }
4058 4175
@@ -4076,9 +4193,7 @@ static void __init rcu_init_geometry(void)
4076{ 4193{
4077 ulong d; 4194 ulong d;
4078 int i; 4195 int i;
4079 int j; 4196 int rcu_capacity[RCU_NUM_LVLS];
4080 int n = nr_cpu_ids;
4081 int rcu_capacity[MAX_RCU_LVLS + 1];
4082 4197
4083 /* 4198 /*
4084 * Initialize any unspecified boot parameters. 4199 * Initialize any unspecified boot parameters.
@@ -4101,47 +4216,49 @@ static void __init rcu_init_geometry(void)
4101 rcu_fanout_leaf, nr_cpu_ids); 4216 rcu_fanout_leaf, nr_cpu_ids);
4102 4217
4103 /* 4218 /*
4104 * Compute number of nodes that can be handled an rcu_node tree
4105 * with the given number of levels. Setting rcu_capacity[0] makes
4106 * some of the arithmetic easier.
4107 */
4108 rcu_capacity[0] = 1;
4109 rcu_capacity[1] = rcu_fanout_leaf;
4110 for (i = 2; i <= MAX_RCU_LVLS; i++)
4111 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
4112
4113 /*
4114 * The boot-time rcu_fanout_leaf parameter is only permitted 4219 * The boot-time rcu_fanout_leaf parameter is only permitted
4115 * to increase the leaf-level fanout, not decrease it. Of course, 4220 * to increase the leaf-level fanout, not decrease it. Of course,
4116 * the leaf-level fanout cannot exceed the number of bits in 4221 * the leaf-level fanout cannot exceed the number of bits in
4117 * the rcu_node masks. Finally, the tree must be able to accommodate 4222 * the rcu_node masks. Complain and fall back to the compile-
4118 * the configured number of CPUs. Complain and fall back to the 4223 * time values if these limits are exceeded.
4119 * compile-time values if these limits are exceeded.
4120 */ 4224 */
4121 if (rcu_fanout_leaf < RCU_FANOUT_LEAF || 4225 if (rcu_fanout_leaf < RCU_FANOUT_LEAF ||
4122 rcu_fanout_leaf > sizeof(unsigned long) * 8 || 4226 rcu_fanout_leaf > sizeof(unsigned long) * 8) {
4123 n > rcu_capacity[MAX_RCU_LVLS]) { 4227 rcu_fanout_leaf = RCU_FANOUT_LEAF;
4124 WARN_ON(1); 4228 WARN_ON(1);
4125 return; 4229 return;
4126 } 4230 }
4127 4231
4232 /*
 4233 * Compute number of nodes that can be handled by an rcu_node tree
4234 * with the given number of levels.
4235 */
4236 rcu_capacity[0] = rcu_fanout_leaf;
4237 for (i = 1; i < RCU_NUM_LVLS; i++)
4238 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;
4239
4240 /*
4241 * The tree must be able to accommodate the configured number of CPUs.
 4242 * If this limit is exceeded then we have a serious problem elsewhere.
4243 */
4244 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
4245 panic("rcu_init_geometry: rcu_capacity[] is too small");
4246
4247 /* Calculate the number of levels in the tree. */
4248 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) {
4249 }
4250 rcu_num_lvls = i + 1;
4251
4128 /* Calculate the number of rcu_nodes at each level of the tree. */ 4252 /* Calculate the number of rcu_nodes at each level of the tree. */
4129 for (i = 1; i <= MAX_RCU_LVLS; i++) 4253 for (i = 0; i < rcu_num_lvls; i++) {
4130 if (n <= rcu_capacity[i]) { 4254 int cap = rcu_capacity[(rcu_num_lvls - 1) - i];
4131 for (j = 0; j <= i; j++) 4255 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
4132 num_rcu_lvl[j] = 4256 }
4133 DIV_ROUND_UP(n, rcu_capacity[i - j]);
4134 rcu_num_lvls = i;
4135 for (j = i + 1; j <= MAX_RCU_LVLS; j++)
4136 num_rcu_lvl[j] = 0;
4137 break;
4138 }
4139 4257
4140 /* Calculate the total number of rcu_node structures. */ 4258 /* Calculate the total number of rcu_node structures. */
4141 rcu_num_nodes = 0; 4259 rcu_num_nodes = 0;
4142 for (i = 0; i <= MAX_RCU_LVLS; i++) 4260 for (i = 0; i < rcu_num_lvls; i++)
4143 rcu_num_nodes += num_rcu_lvl[i]; 4261 rcu_num_nodes += num_rcu_lvl[i];
4144 rcu_num_nodes -= n;
4145} 4262}
4146 4263
4147/* 4264/*
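The rewritten rcu_init_geometry() above builds the tree shape in three steps: it fills rcu_capacity[] bottom-up (index 0 is now the leaf capacity rather than a sentinel 1), panics outright if nr_cpu_ids cannot fit instead of silently falling back, then picks the smallest number of levels and derives num_rcu_lvl[] top-down, no longer counting the per-CPU leaves that the old code subtracted back out. A minimal stand-alone sketch of the same arithmetic follows; it is user-space C for illustration only, and the fanout and CPU-count values are invented for the example.

/* Stand-alone model of the reworked geometry computation (user space,
 * illustrative only; fanout and CPU-count values are invented). */
#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define RCU_FANOUT		64	/* assumed interior fanout */
#define RCU_FANOUT_LEAF		16	/* assumed leaf fanout */
#define RCU_NUM_LVLS		4

int main(void)
{
	int nr_cpu_ids = 6000;			/* example CPU count */
	int rcu_capacity[RCU_NUM_LVLS];
	int num_rcu_lvl[RCU_NUM_LVLS];
	int rcu_num_lvls, rcu_num_nodes = 0;
	int i;

	/* Capacity of a tree with i+1 levels, the leaf level included. */
	rcu_capacity[0] = RCU_FANOUT_LEAF;
	for (i = 1; i < RCU_NUM_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT;

	/* The kernel panics here instead of silently falling back. */
	if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1])
		return 1;

	/* Smallest number of levels that can hold nr_cpu_ids CPUs. */
	for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++)
		;
	rcu_num_lvls = i + 1;

	/* Nodes per level, root (level 0) first. */
	for (i = 0; i < rcu_num_lvls; i++) {
		int cap = rcu_capacity[(rcu_num_lvls - 1) - i];

		num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap);
		rcu_num_nodes += num_rcu_lvl[i];
		printf("level %d: %d node(s)\n", i, num_rcu_lvl[i]);
	}
	printf("levels=%d, rcu_node structures=%d\n", rcu_num_lvls, rcu_num_nodes);
	return 0;
}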
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 4adb7ca0bf47..2e991f8361e4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -27,6 +27,7 @@
27#include <linux/threads.h> 27#include <linux/threads.h>
28#include <linux/cpumask.h> 28#include <linux/cpumask.h>
29#include <linux/seqlock.h> 29#include <linux/seqlock.h>
30#include <linux/stop_machine.h>
30 31
31/* 32/*
32 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and 33 * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and
@@ -36,8 +37,6 @@
36 * Of course, your mileage may vary. 37 * Of course, your mileage may vary.
37 */ 38 */
38 39
39#define MAX_RCU_LVLS 4
40
41#ifdef CONFIG_RCU_FANOUT 40#ifdef CONFIG_RCU_FANOUT
42#define RCU_FANOUT CONFIG_RCU_FANOUT 41#define RCU_FANOUT CONFIG_RCU_FANOUT
43#else /* #ifdef CONFIG_RCU_FANOUT */ 42#else /* #ifdef CONFIG_RCU_FANOUT */
@@ -66,38 +65,53 @@
66#if NR_CPUS <= RCU_FANOUT_1 65#if NR_CPUS <= RCU_FANOUT_1
67# define RCU_NUM_LVLS 1 66# define RCU_NUM_LVLS 1
68# define NUM_RCU_LVL_0 1 67# define NUM_RCU_LVL_0 1
69# define NUM_RCU_LVL_1 (NR_CPUS) 68# define NUM_RCU_NODES NUM_RCU_LVL_0
70# define NUM_RCU_LVL_2 0 69# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 }
71# define NUM_RCU_LVL_3 0 70# define RCU_NODE_NAME_INIT { "rcu_node_0" }
72# define NUM_RCU_LVL_4 0 71# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" }
72# define RCU_EXP_NAME_INIT { "rcu_node_exp_0" }
73# define RCU_EXP_SCHED_NAME_INIT \
74 { "rcu_node_exp_sched_0" }
73#elif NR_CPUS <= RCU_FANOUT_2 75#elif NR_CPUS <= RCU_FANOUT_2
74# define RCU_NUM_LVLS 2 76# define RCU_NUM_LVLS 2
75# define NUM_RCU_LVL_0 1 77# define NUM_RCU_LVL_0 1
76# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 78# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
77# define NUM_RCU_LVL_2 (NR_CPUS) 79# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1)
78# define NUM_RCU_LVL_3 0 80# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 }
79# define NUM_RCU_LVL_4 0 81# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" }
82# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" }
83# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1" }
84# define RCU_EXP_SCHED_NAME_INIT \
85 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1" }
80#elif NR_CPUS <= RCU_FANOUT_3 86#elif NR_CPUS <= RCU_FANOUT_3
81# define RCU_NUM_LVLS 3 87# define RCU_NUM_LVLS 3
82# define NUM_RCU_LVL_0 1 88# define NUM_RCU_LVL_0 1
83# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 89# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
84# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 90# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
85# define NUM_RCU_LVL_3 (NR_CPUS) 91# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2)
86# define NUM_RCU_LVL_4 0 92# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 }
93# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" }
94# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" }
95# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2" }
96# define RCU_EXP_SCHED_NAME_INIT \
97 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2" }
87#elif NR_CPUS <= RCU_FANOUT_4 98#elif NR_CPUS <= RCU_FANOUT_4
88# define RCU_NUM_LVLS 4 99# define RCU_NUM_LVLS 4
89# define NUM_RCU_LVL_0 1 100# define NUM_RCU_LVL_0 1
90# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) 101# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3)
91# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) 102# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2)
92# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) 103# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1)
93# define NUM_RCU_LVL_4 (NR_CPUS) 104# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3)
105# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 }
106# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" }
107# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" }
108# define RCU_EXP_NAME_INIT { "rcu_node_exp_0", "rcu_node_exp_1", "rcu_node_exp_2", "rcu_node_exp_3" }
109# define RCU_EXP_SCHED_NAME_INIT \
110 { "rcu_node_exp_sched_0", "rcu_node_exp_sched_1", "rcu_node_exp_sched_2", "rcu_node_exp_sched_3" }
94#else 111#else
95# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" 112# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
96#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ 113#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */
97 114
98#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4)
99#define NUM_RCU_NODES (RCU_SUM - NR_CPUS)
100
101extern int rcu_num_lvls; 115extern int rcu_num_lvls;
102extern int rcu_num_nodes; 116extern int rcu_num_nodes;
103 117
@@ -236,6 +250,8 @@ struct rcu_node {
236 int need_future_gp[2]; 250 int need_future_gp[2];
237 /* Counts of upcoming no-CB GP requests. */ 251 /* Counts of upcoming no-CB GP requests. */
238 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; 252 raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp;
253
254 struct mutex exp_funnel_mutex ____cacheline_internodealigned_in_smp;
239} ____cacheline_internodealigned_in_smp; 255} ____cacheline_internodealigned_in_smp;
240 256
241/* 257/*
@@ -287,12 +303,13 @@ struct rcu_data {
287 bool gpwrap; /* Possible gpnum/completed wrap. */ 303 bool gpwrap; /* Possible gpnum/completed wrap. */
288 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ 304 struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
289 unsigned long grpmask; /* Mask to apply to leaf qsmask. */ 305 unsigned long grpmask; /* Mask to apply to leaf qsmask. */
290#ifdef CONFIG_RCU_CPU_STALL_INFO
291 unsigned long ticks_this_gp; /* The number of scheduling-clock */ 306 unsigned long ticks_this_gp; /* The number of scheduling-clock */
292 /* ticks this CPU has handled */ 307 /* ticks this CPU has handled */
293 /* during and after the last grace */ 308 /* during and after the last grace */
294 /* period it is aware of. */ 309 /* period it is aware of. */
295#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ 310 struct cpu_stop_work exp_stop_work;
311 /* Expedited grace-period control */
312 /* for CPU stopping. */
296 313
297 /* 2) batch handling */ 314 /* 2) batch handling */
298 /* 315 /*
@@ -355,11 +372,13 @@ struct rcu_data {
355 unsigned long n_rp_nocb_defer_wakeup; 372 unsigned long n_rp_nocb_defer_wakeup;
356 unsigned long n_rp_need_nothing; 373 unsigned long n_rp_need_nothing;
357 374
358 /* 6) _rcu_barrier() and OOM callbacks. */ 375 /* 6) _rcu_barrier(), OOM callbacks, and expediting. */
359 struct rcu_head barrier_head; 376 struct rcu_head barrier_head;
360#ifdef CONFIG_RCU_FAST_NO_HZ 377#ifdef CONFIG_RCU_FAST_NO_HZ
361 struct rcu_head oom_head; 378 struct rcu_head oom_head;
362#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ 379#endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
380 struct mutex exp_funnel_mutex;
381 bool exp_done; /* Expedited QS for this CPU? */
363 382
364 /* 7) Callback offloading. */ 383 /* 7) Callback offloading. */
365#ifdef CONFIG_RCU_NOCB_CPU 384#ifdef CONFIG_RCU_NOCB_CPU
@@ -387,9 +406,7 @@ struct rcu_data {
387#endif /* #ifdef CONFIG_RCU_NOCB_CPU */ 406#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
388 407
389 /* 8) RCU CPU stall data. */ 408 /* 8) RCU CPU stall data. */
390#ifdef CONFIG_RCU_CPU_STALL_INFO
391 unsigned int softirq_snap; /* Snapshot of softirq activity. */ 409 unsigned int softirq_snap; /* Snapshot of softirq activity. */
392#endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
393 410
394 int cpu; 411 int cpu;
395 struct rcu_state *rsp; 412 struct rcu_state *rsp;
@@ -442,9 +459,9 @@ do { \
442 */ 459 */
443struct rcu_state { 460struct rcu_state {
444 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ 461 struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */
445 struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ 462 struct rcu_node *level[RCU_NUM_LVLS + 1];
446 u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ 463 /* Hierarchy levels (+1 to */
447 u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 464 /* shut bogus gcc warning) */
448 u8 flavor_mask; /* bit in flavor mask. */ 465 u8 flavor_mask; /* bit in flavor mask. */
449 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ 466 struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */
450 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ 467 void (*call)(struct rcu_head *head, /* call_rcu() flavor. */
@@ -479,21 +496,18 @@ struct rcu_state {
479 struct mutex barrier_mutex; /* Guards barrier fields. */ 496 struct mutex barrier_mutex; /* Guards barrier fields. */
480 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 497 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
481 struct completion barrier_completion; /* Wake at barrier end. */ 498 struct completion barrier_completion; /* Wake at barrier end. */
482 unsigned long n_barrier_done; /* ++ at start and end of */ 499 unsigned long barrier_sequence; /* ++ at start and end of */
483 /* _rcu_barrier(). */ 500 /* _rcu_barrier(). */
484 /* End of fields guarded by barrier_mutex. */ 501 /* End of fields guarded by barrier_mutex. */
485 502
486 atomic_long_t expedited_start; /* Starting ticket. */ 503 unsigned long expedited_sequence; /* Take a ticket. */
487 atomic_long_t expedited_done; /* Done ticket. */ 504 atomic_long_t expedited_workdone0; /* # done by others #0. */
488 atomic_long_t expedited_wrap; /* # near-wrap incidents. */
489 atomic_long_t expedited_tryfail; /* # acquisition failures. */
490 atomic_long_t expedited_workdone1; /* # done by others #1. */ 505 atomic_long_t expedited_workdone1; /* # done by others #1. */
491 atomic_long_t expedited_workdone2; /* # done by others #2. */ 506 atomic_long_t expedited_workdone2; /* # done by others #2. */
507 atomic_long_t expedited_workdone3; /* # done by others #3. */
492 atomic_long_t expedited_normal; /* # fallbacks to normal. */ 508 atomic_long_t expedited_normal; /* # fallbacks to normal. */
493 atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ 509 atomic_t expedited_need_qs; /* # CPUs left to check in. */
494 atomic_long_t expedited_done_tries; /* # tries to update _done. */ 510 wait_queue_head_t expedited_wq; /* Wait for check-ins. */
495 atomic_long_t expedited_done_lost; /* # times beaten to _done. */
496 atomic_long_t expedited_done_exit; /* # times exited _done loop. */
497 511
498 unsigned long jiffies_force_qs; /* Time at which to invoke */ 512 unsigned long jiffies_force_qs; /* Time at which to invoke */
499 /* force_quiescent_state(). */ 513 /* force_quiescent_state(). */
@@ -527,7 +541,11 @@ struct rcu_state {
527/* Values for rcu_state structure's gp_flags field. */ 541/* Values for rcu_state structure's gp_flags field. */
528#define RCU_GP_WAIT_INIT 0 /* Initial state. */ 542#define RCU_GP_WAIT_INIT 0 /* Initial state. */
529#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */ 543#define RCU_GP_WAIT_GPS 1 /* Wait for grace-period start. */
530#define RCU_GP_WAIT_FQS 2 /* Wait for force-quiescent-state time. */ 544#define RCU_GP_DONE_GPS 2 /* Wait done for grace-period start. */
545#define RCU_GP_WAIT_FQS 3 /* Wait for force-quiescent-state time. */
546#define RCU_GP_DOING_FQS 4 /* Wait done for force-quiescent-state time. */
547#define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */
548#define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */
531 549
532extern struct list_head rcu_struct_flavors; 550extern struct list_head rcu_struct_flavors;
533 551
@@ -635,3 +653,15 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
635#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ 653#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
636} 654}
637#endif /* #ifdef CONFIG_RCU_TRACE */ 655#endif /* #ifdef CONFIG_RCU_TRACE */
656
657/*
658 * Place this after a lock-acquisition primitive to guarantee that
659 * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies
660 * if the UNLOCK and LOCK are executed by the same CPU or if the
661 * UNLOCK and LOCK operate on the same lock variable.
662 */
663#ifdef CONFIG_PPC
664#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */
665#else /* #ifdef CONFIG_PPC */
666#define smp_mb__after_unlock_lock() do { } while (0)
667#endif /* #else #ifdef CONFIG_PPC */
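In the rcu_state changes above, the expedited_start/expedited_done ticket pair is replaced by a single expedited_sequence counter plus a wait queue. Assuming the usual even/odd convention (odd while an expedited grace period is in flight, even otherwise), a caller can snapshot the value the counter must reach before a grace period starting after the snapshot has certainly completed. The toy model below is not kernel code: the helper names are invented, and counter wraparound (ULONG_CMP_GE in the kernel) is ignored for brevity.

/* Toy model of the even/odd expedited sequence counter (illustrative only). */
#include <stdio.h>

static unsigned long expedited_sequence;	/* even: idle, odd: GP running */

static unsigned long exp_seq_snap(void)
{
	/* Value the counter must reach before a full grace period has
	 * elapsed since this call: the next even value that lies beyond
	 * any grace period already in flight. */
	return (expedited_sequence + 3) & ~0x1UL;
}

static int exp_seq_done(unsigned long snap)
{
	return expedited_sequence >= snap;	/* kernel uses ULONG_CMP_GE */
}

static void exp_seq_start(void) { expedited_sequence++; }	/* now odd */
static void exp_seq_end(void)   { expedited_sequence++; }	/* now even */

int main(void)
{
	unsigned long s = exp_seq_snap();

	printf("snap=%lu done=%d\n", s, exp_seq_done(s));	/* not yet */
	exp_seq_start();
	exp_seq_end();			/* one full grace period elapses */
	printf("snap=%lu done=%d\n", s, exp_seq_done(s));	/* done now */
	return 0;
}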
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 013485fb2b06..b2bf3963a0ae 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -82,10 +82,8 @@ static void __init rcu_bootup_announce_oddness(void)
82 pr_info("\tRCU lockdep checking is enabled.\n"); 82 pr_info("\tRCU lockdep checking is enabled.\n");
83 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) 83 if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
84 pr_info("\tRCU torture testing starts during boot.\n"); 84 pr_info("\tRCU torture testing starts during boot.\n");
85 if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) 85 if (RCU_NUM_LVLS >= 4)
86 pr_info("\tAdditional per-CPU info printed with stalls.\n"); 86 pr_info("\tFour(or more)-level hierarchy is enabled.\n");
87 if (NUM_RCU_LVL_4 != 0)
88 pr_info("\tFour-level hierarchy is enabled.\n");
89 if (RCU_FANOUT_LEAF != 16) 87 if (RCU_FANOUT_LEAF != 16)
90 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", 88 pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
91 RCU_FANOUT_LEAF); 89 RCU_FANOUT_LEAF);
@@ -418,8 +416,6 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
418 rcu_print_detail_task_stall_rnp(rnp); 416 rcu_print_detail_task_stall_rnp(rnp);
419} 417}
420 418
421#ifdef CONFIG_RCU_CPU_STALL_INFO
422
423static void rcu_print_task_stall_begin(struct rcu_node *rnp) 419static void rcu_print_task_stall_begin(struct rcu_node *rnp)
424{ 420{
425 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", 421 pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
@@ -431,18 +427,6 @@ static void rcu_print_task_stall_end(void)
431 pr_cont("\n"); 427 pr_cont("\n");
432} 428}
433 429
434#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
435
436static void rcu_print_task_stall_begin(struct rcu_node *rnp)
437{
438}
439
440static void rcu_print_task_stall_end(void)
441{
442}
443
444#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
445
446/* 430/*
447 * Scan the current list of tasks blocked within RCU read-side critical 431 * Scan the current list of tasks blocked within RCU read-side critical
448 * sections, printing out the tid of each. 432 * sections, printing out the tid of each.
@@ -538,10 +522,10 @@ EXPORT_SYMBOL_GPL(call_rcu);
538 */ 522 */
539void synchronize_rcu(void) 523void synchronize_rcu(void)
540{ 524{
541 rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && 525 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
542 !lock_is_held(&rcu_lock_map) && 526 lock_is_held(&rcu_lock_map) ||
543 !lock_is_held(&rcu_sched_lock_map), 527 lock_is_held(&rcu_sched_lock_map),
544 "Illegal synchronize_rcu() in RCU read-side critical section"); 528 "Illegal synchronize_rcu() in RCU read-side critical section");
545 if (!rcu_scheduler_active) 529 if (!rcu_scheduler_active)
546 return; 530 return;
547 if (rcu_gp_is_expedited()) 531 if (rcu_gp_is_expedited())
@@ -552,8 +536,6 @@ void synchronize_rcu(void)
552EXPORT_SYMBOL_GPL(synchronize_rcu); 536EXPORT_SYMBOL_GPL(synchronize_rcu);
553 537
554static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); 538static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
555static unsigned long sync_rcu_preempt_exp_count;
556static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
557 539
558/* 540/*
559 * Return non-zero if there are any tasks in RCU read-side critical 541 * Return non-zero if there are any tasks in RCU read-side critical
@@ -573,7 +555,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp)
573 * for the current expedited grace period. Works only for preemptible 555 * for the current expedited grace period. Works only for preemptible
574 * RCU -- other RCU implementation use other means. 556 * RCU -- other RCU implementation use other means.
575 * 557 *
576 * Caller must hold sync_rcu_preempt_exp_mutex. 558 * Caller must hold the root rcu_node's exp_funnel_mutex.
577 */ 559 */
578static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) 560static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
579{ 561{
@@ -589,7 +571,7 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
589 * recursively up the tree. (Calm down, calm down, we do the recursion 571 * recursively up the tree. (Calm down, calm down, we do the recursion
590 * iteratively!) 572 * iteratively!)
591 * 573 *
592 * Caller must hold sync_rcu_preempt_exp_mutex. 574 * Caller must hold the root rcu_node's exp_funnel_mutex.
593 */ 575 */
594static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, 576static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
595 bool wake) 577 bool wake)
@@ -628,7 +610,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
628 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 610 * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
629 * that work is needed here. 611 * that work is needed here.
630 * 612 *
631 * Caller must hold sync_rcu_preempt_exp_mutex. 613 * Caller must hold the root rcu_node's exp_funnel_mutex.
632 */ 614 */
633static void 615static void
634sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) 616sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -671,7 +653,7 @@ sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
671 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, 653 * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
672 * enabling rcu_read_unlock_special() to do the bit-clearing. 654 * enabling rcu_read_unlock_special() to do the bit-clearing.
673 * 655 *
674 * Caller must hold sync_rcu_preempt_exp_mutex. 656 * Caller must hold the root rcu_node's exp_funnel_mutex.
675 */ 657 */
676static void 658static void
677sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) 659sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
@@ -719,51 +701,17 @@ sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
719void synchronize_rcu_expedited(void) 701void synchronize_rcu_expedited(void)
720{ 702{
721 struct rcu_node *rnp; 703 struct rcu_node *rnp;
704 struct rcu_node *rnp_unlock;
722 struct rcu_state *rsp = rcu_state_p; 705 struct rcu_state *rsp = rcu_state_p;
723 unsigned long snap; 706 unsigned long s;
724 int trycount = 0;
725 707
726 smp_mb(); /* Caller's modifications seen first by other CPUs. */ 708 s = rcu_exp_gp_seq_snap(rsp);
727 snap = READ_ONCE(sync_rcu_preempt_exp_count) + 1;
728 smp_mb(); /* Above access cannot bleed into critical section. */
729 709
730 /* 710 rnp_unlock = exp_funnel_lock(rsp, s);
731 * Block CPU-hotplug operations. This means that any CPU-hotplug 711 if (rnp_unlock == NULL)
732 * operation that finds an rcu_node structure with tasks in the 712 return; /* Someone else did our work for us. */
733 * process of being boosted will know that all tasks blocking
734 * this expedited grace period will already be in the process of
735 * being boosted. This simplifies the process of moving tasks
736 * from leaf to root rcu_node structures.
737 */
738 if (!try_get_online_cpus()) {
739 /* CPU-hotplug operation in flight, fall back to normal GP. */
740 wait_rcu_gp(call_rcu);
741 return;
742 }
743 713
744 /* 714 rcu_exp_gp_seq_start(rsp);
745 * Acquire lock, falling back to synchronize_rcu() if too many
746 * lock-acquisition failures. Of course, if someone does the
747 * expedited grace period for us, just leave.
748 */
749 while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
750 if (ULONG_CMP_LT(snap,
751 READ_ONCE(sync_rcu_preempt_exp_count))) {
752 put_online_cpus();
753 goto mb_ret; /* Others did our work for us. */
754 }
755 if (trycount++ < 10) {
756 udelay(trycount * num_online_cpus());
757 } else {
758 put_online_cpus();
759 wait_rcu_gp(call_rcu);
760 return;
761 }
762 }
763 if (ULONG_CMP_LT(snap, READ_ONCE(sync_rcu_preempt_exp_count))) {
764 put_online_cpus();
765 goto unlock_mb_ret; /* Others did our work for us. */
766 }
767 715
768 /* force all RCU readers onto ->blkd_tasks lists. */ 716 /* force all RCU readers onto ->blkd_tasks lists. */
769 synchronize_sched_expedited(); 717 synchronize_sched_expedited();
@@ -779,20 +727,14 @@ void synchronize_rcu_expedited(void)
779 rcu_for_each_leaf_node(rsp, rnp) 727 rcu_for_each_leaf_node(rsp, rnp)
780 sync_rcu_preempt_exp_init2(rsp, rnp); 728 sync_rcu_preempt_exp_init2(rsp, rnp);
781 729
782 put_online_cpus();
783
784 /* Wait for snapshotted ->blkd_tasks lists to drain. */ 730 /* Wait for snapshotted ->blkd_tasks lists to drain. */
785 rnp = rcu_get_root(rsp); 731 rnp = rcu_get_root(rsp);
786 wait_event(sync_rcu_preempt_exp_wq, 732 wait_event(sync_rcu_preempt_exp_wq,
787 sync_rcu_preempt_exp_done(rnp)); 733 sync_rcu_preempt_exp_done(rnp));
788 734
789 /* Clean up and exit. */ 735 /* Clean up and exit. */
790 smp_mb(); /* ensure expedited GP seen before counter increment. */ 736 rcu_exp_gp_seq_end(rsp);
791 WRITE_ONCE(sync_rcu_preempt_exp_count, sync_rcu_preempt_exp_count + 1); 737 mutex_unlock(&rnp_unlock->exp_funnel_mutex);
792unlock_mb_ret:
793 mutex_unlock(&sync_rcu_preempt_exp_mutex);
794mb_ret:
795 smp_mb(); /* ensure subsequent action seen after grace period. */
796} 738}
797EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); 739EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
798 740
@@ -1061,8 +1003,7 @@ static int rcu_boost(struct rcu_node *rnp)
1061} 1003}
1062 1004
1063/* 1005/*
1064 * Priority-boosting kthread. One per leaf rcu_node and one for the 1006 * Priority-boosting kthread, one per leaf rcu_node.
1065 * root rcu_node.
1066 */ 1007 */
1067static int rcu_boost_kthread(void *arg) 1008static int rcu_boost_kthread(void *arg)
1068{ 1009{
@@ -1680,12 +1621,10 @@ static int rcu_oom_notify(struct notifier_block *self,
1680 */ 1621 */
1681 atomic_set(&oom_callback_count, 1); 1622 atomic_set(&oom_callback_count, 1);
1682 1623
1683 get_online_cpus();
1684 for_each_online_cpu(cpu) { 1624 for_each_online_cpu(cpu) {
1685 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1625 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1686 cond_resched_rcu_qs(); 1626 cond_resched_rcu_qs();
1687 } 1627 }
1688 put_online_cpus();
1689 1628
1690 /* Unconditionally decrement: no need to wake ourselves up. */ 1629 /* Unconditionally decrement: no need to wake ourselves up. */
1691 atomic_dec(&oom_callback_count); 1630 atomic_dec(&oom_callback_count);
@@ -1706,8 +1645,6 @@ early_initcall(rcu_register_oom_notifier);
1706 1645
1707#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1646#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1708 1647
1709#ifdef CONFIG_RCU_CPU_STALL_INFO
1710
1711#ifdef CONFIG_RCU_FAST_NO_HZ 1648#ifdef CONFIG_RCU_FAST_NO_HZ
1712 1649
1713static void print_cpu_stall_fast_no_hz(char *cp, int cpu) 1650static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
@@ -1796,33 +1733,6 @@ static void increment_cpu_stall_ticks(void)
1796 raw_cpu_inc(rsp->rda->ticks_this_gp); 1733 raw_cpu_inc(rsp->rda->ticks_this_gp);
1797} 1734}
1798 1735
1799#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
1800
1801static void print_cpu_stall_info_begin(void)
1802{
1803 pr_cont(" {");
1804}
1805
1806static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1807{
1808 pr_cont(" %d", cpu);
1809}
1810
1811static void print_cpu_stall_info_end(void)
1812{
1813 pr_cont("} ");
1814}
1815
1816static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1817{
1818}
1819
1820static void increment_cpu_stall_ticks(void)
1821{
1822}
1823
1824#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
1825
1826#ifdef CONFIG_RCU_NOCB_CPU 1736#ifdef CONFIG_RCU_NOCB_CPU
1827 1737
1828/* 1738/*
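The synchronize_rcu_expedited() rewrite above leans on exp_funnel_lock() (added earlier in tree.c, not shown in this hunk): callers climb the rcu_node tree from a leaf toward the root on the new exp_funnel_mutex locks, and drop out at any level where the sequence snapshot shows that another caller's completed grace period already covers them. The skeleton below is a rough user-space rendering of that walk under simplifying assumptions: a single chain of mutexes instead of a tree, and a stub sequence check.

/* Sketch of the funnel walk: climb toward the root, bailing out early when
 * someone else's completed grace period already covers our snapshot.
 * Simplified to a single chain of mutexes; not kernel code. */
#include <pthread.h>

#define NLEVELS 3

static pthread_mutex_t funnel[NLEVELS] = {	/* index 0 plays the root */
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER,
};

static unsigned long expedited_sequence;	/* stub, see the model above */

static int seq_done(unsigned long snap)
{
	return expedited_sequence >= snap;
}

/*
 * Returns 0 when the caller holds the root mutex and must run the grace
 * period itself, or -1 when the work was already done (no mutex held).
 */
static int funnel_lock(unsigned long snap)
{
	int i = NLEVELS - 1;			/* start at the leaf */

	pthread_mutex_lock(&funnel[i]);
	for (;;) {
		if (seq_done(snap)) {		/* someone did our work */
			pthread_mutex_unlock(&funnel[i]);
			return -1;
		}
		if (i == 0)
			return 0;		/* holding the root */
		pthread_mutex_lock(&funnel[i - 1]);	/* take the parent... */
		pthread_mutex_unlock(&funnel[i]);	/* ...then drop the child */
		i--;
	}
}

int main(void)
{
	unsigned long snap = expedited_sequence + 2;	/* pretend snapshot */

	if (funnel_lock(snap) == 0) {
		expedited_sequence += 2;	/* "run" the grace period */
		pthread_mutex_unlock(&funnel[0]);
	}
	return 0;
}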
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index 3ea7ffc7d5c4..6fc4c5ff3bb5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -81,9 +81,9 @@ static void r_stop(struct seq_file *m, void *v)
81static int show_rcubarrier(struct seq_file *m, void *v) 81static int show_rcubarrier(struct seq_file *m, void *v)
82{ 82{
83 struct rcu_state *rsp = (struct rcu_state *)m->private; 83 struct rcu_state *rsp = (struct rcu_state *)m->private;
84 seq_printf(m, "bcc: %d nbd: %lu\n", 84 seq_printf(m, "bcc: %d bseq: %lu\n",
85 atomic_read(&rsp->barrier_cpu_count), 85 atomic_read(&rsp->barrier_cpu_count),
86 rsp->n_barrier_done); 86 rsp->barrier_sequence);
87 return 0; 87 return 0;
88} 88}
89 89
@@ -185,18 +185,15 @@ static int show_rcuexp(struct seq_file *m, void *v)
185{ 185{
186 struct rcu_state *rsp = (struct rcu_state *)m->private; 186 struct rcu_state *rsp = (struct rcu_state *)m->private;
187 187
188 seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", 188 seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
189 atomic_long_read(&rsp->expedited_start), 189 rsp->expedited_sequence,
190 atomic_long_read(&rsp->expedited_done), 190 atomic_long_read(&rsp->expedited_workdone0),
191 atomic_long_read(&rsp->expedited_wrap),
192 atomic_long_read(&rsp->expedited_tryfail),
193 atomic_long_read(&rsp->expedited_workdone1), 191 atomic_long_read(&rsp->expedited_workdone1),
194 atomic_long_read(&rsp->expedited_workdone2), 192 atomic_long_read(&rsp->expedited_workdone2),
193 atomic_long_read(&rsp->expedited_workdone3),
195 atomic_long_read(&rsp->expedited_normal), 194 atomic_long_read(&rsp->expedited_normal),
196 atomic_long_read(&rsp->expedited_stoppedcpus), 195 atomic_read(&rsp->expedited_need_qs),
197 atomic_long_read(&rsp->expedited_done_tries), 196 rsp->expedited_sequence / 2);
198 atomic_long_read(&rsp->expedited_done_lost),
199 atomic_long_read(&rsp->expedited_done_exit));
200 return 0; 197 return 0;
201} 198}
202 199
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index afaecb7a799a..7a0b3bc7c5ed 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,55 @@ MODULE_ALIAS("rcupdate");
62 62
63module_param(rcu_expedited, int, 0); 63module_param(rcu_expedited, int, 0);
64 64
65#if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
66/**
67 * rcu_read_lock_sched_held() - might we be in RCU-sched read-side critical section?
68 *
69 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an
70 * RCU-sched read-side critical section. In absence of
71 * CONFIG_DEBUG_LOCK_ALLOC, this assumes we are in an RCU-sched read-side
72 * critical section unless it can prove otherwise. Note that disabling
73 * of preemption (including disabling irqs) counts as an RCU-sched
74 * read-side critical section. This is useful for debug checks in functions
 75 * that require that they be called within an RCU-sched read-side
76 * critical section.
77 *
78 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot
79 * and while lockdep is disabled.
80 *
81 * Note that if the CPU is in the idle loop from an RCU point of
82 * view (ie: that we are in the section between rcu_idle_enter() and
83 * rcu_idle_exit()) then rcu_read_lock_held() returns false even if the CPU
84 * did an rcu_read_lock(). The reason for this is that RCU ignores CPUs
85 * that are in such a section, considering these as in extended quiescent
86 * state, so such a CPU is effectively never in an RCU read-side critical
87 * section regardless of what RCU primitives it invokes. This state of
88 * affairs is required --- we need to keep an RCU-free window in idle
89 * where the CPU may possibly enter into low power mode. This way we can
90 * notice an extended quiescent state to other CPUs that started a grace
91 * period. Otherwise we would delay any grace period as long as we run in
92 * the idle task.
93 *
94 * Similarly, we avoid claiming an SRCU read lock held if the current
95 * CPU is offline.
96 */
97int rcu_read_lock_sched_held(void)
98{
99 int lockdep_opinion = 0;
100
101 if (!debug_lockdep_rcu_enabled())
102 return 1;
103 if (!rcu_is_watching())
104 return 0;
105 if (!rcu_lockdep_current_cpu_online())
106 return 0;
107 if (debug_locks)
108 lockdep_opinion = lock_is_held(&rcu_sched_lock_map);
109 return lockdep_opinion || preempt_count() != 0 || irqs_disabled();
110}
111EXPORT_SYMBOL(rcu_read_lock_sched_held);
112#endif
113
65#ifndef CONFIG_TINY_RCU 114#ifndef CONFIG_TINY_RCU
66 115
67static atomic_t rcu_expedited_nesting = 116static atomic_t rcu_expedited_nesting =
@@ -269,20 +318,37 @@ void wakeme_after_rcu(struct rcu_head *head)
269 rcu = container_of(head, struct rcu_synchronize, head); 318 rcu = container_of(head, struct rcu_synchronize, head);
270 complete(&rcu->completion); 319 complete(&rcu->completion);
271} 320}
321EXPORT_SYMBOL_GPL(wakeme_after_rcu);
272 322
273void wait_rcu_gp(call_rcu_func_t crf) 323void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
324 struct rcu_synchronize *rs_array)
274{ 325{
275 struct rcu_synchronize rcu; 326 int i;
276 327
277 init_rcu_head_on_stack(&rcu.head); 328 /* Initialize and register callbacks for each flavor specified. */
278 init_completion(&rcu.completion); 329 for (i = 0; i < n; i++) {
279 /* Will wake me after RCU finished. */ 330 if (checktiny &&
280 crf(&rcu.head, wakeme_after_rcu); 331 (crcu_array[i] == call_rcu ||
281 /* Wait for it. */ 332 crcu_array[i] == call_rcu_bh)) {
282 wait_for_completion(&rcu.completion); 333 might_sleep();
283 destroy_rcu_head_on_stack(&rcu.head); 334 continue;
335 }
336 init_rcu_head_on_stack(&rs_array[i].head);
337 init_completion(&rs_array[i].completion);
338 (crcu_array[i])(&rs_array[i].head, wakeme_after_rcu);
339 }
340
341 /* Wait for all callbacks to be invoked. */
342 for (i = 0; i < n; i++) {
343 if (checktiny &&
344 (crcu_array[i] == call_rcu ||
345 crcu_array[i] == call_rcu_bh))
346 continue;
347 wait_for_completion(&rs_array[i].completion);
348 destroy_rcu_head_on_stack(&rs_array[i].head);
349 }
284} 350}
285EXPORT_SYMBOL_GPL(wait_rcu_gp); 351EXPORT_SYMBOL_GPL(__wait_rcu_gp);
286 352
287#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 353#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
288void init_rcu_head(struct rcu_head *head) 354void init_rcu_head(struct rcu_head *head)
@@ -523,8 +589,8 @@ EXPORT_SYMBOL_GPL(call_rcu_tasks);
523void synchronize_rcu_tasks(void) 589void synchronize_rcu_tasks(void)
524{ 590{
525 /* Complain if the scheduler has not started. */ 591 /* Complain if the scheduler has not started. */
526 rcu_lockdep_assert(!rcu_scheduler_active, 592 RCU_LOCKDEP_WARN(!rcu_scheduler_active,
527 "synchronize_rcu_tasks called too soon"); 593 "synchronize_rcu_tasks called too soon");
528 594
529 /* Wait for the grace period. */ 595 /* Wait for the grace period. */
530 wait_rcu_gp(call_rcu_tasks); 596 wait_rcu_gp(call_rcu_tasks);
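The new __wait_rcu_gp() above queues a callback with every requested flavor first and only then waits on the completions, so the total latency is bounded by the slowest flavor's grace period rather than a sum of back-to-back waits; when checktiny is set (presumably the Tiny RCU case), call_rcu()/call_rcu_bh() waiters reduce to might_sleep(). A loose user-space analogue using threads and semaphores is sketched below; all names and timings are invented for the illustration.

/* Thread/semaphore analogue of waiting on several grace periods at once
 * (illustrative only; nothing here is kernel API). */
#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>
#include <unistd.h>

#define NFLAVORS 2

static sem_t done[NFLAVORS];

static void *fake_grace_period(void *arg)
{
	int i = *(int *)arg;

	usleep(1000 * (i + 1));		/* pretend the grace period takes a while */
	sem_post(&done[i]);		/* plays the role of wakeme_after_rcu() */
	return NULL;
}

int main(void)
{
	pthread_t tid[NFLAVORS];
	int idx[NFLAVORS];
	int i;

	/* Phase 1: register a waiter with every "flavor" before waiting. */
	for (i = 0; i < NFLAVORS; i++) {
		idx[i] = i;
		sem_init(&done[i], 0, 0);
		pthread_create(&tid[i], NULL, fake_grace_period, &idx[i]);
	}

	/* Phase 2: wait for each one; the total wait is bounded by the
	 * slowest flavor, not the sum of sequential grace periods. */
	for (i = 0; i < NFLAVORS; i++) {
		sem_wait(&done[i]);
		pthread_join(tid[i], NULL);
	}
	printf("all flavors done\n");
	return 0;
}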
diff --git a/kernel/reboot.c b/kernel/reboot.c
index d20c85d9f8c0..bd30a973fe94 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -346,7 +346,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
346 kernel_restart(buffer); 346 kernel_restart(buffer);
347 break; 347 break;
348 348
349#ifdef CONFIG_KEXEC 349#ifdef CONFIG_KEXEC_CORE
350 case LINUX_REBOOT_CMD_KEXEC: 350 case LINUX_REBOOT_CMD_KEXEC:
351 ret = kernel_kexec(); 351 ret = kernel_kexec();
352 break; 352 break;
diff --git a/kernel/resource.c b/kernel/resource.c
index 90552aab5f2d..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
492} 492}
493EXPORT_SYMBOL_GPL(page_is_ram); 493EXPORT_SYMBOL_GPL(page_is_ram);
494 494
495/* 495/**
496 * Search for a resouce entry that fully contains the specified region. 496 * region_intersects() - determine intersection of region with known resources
497 * If found, return 1 if it is RAM, 0 if not. 497 * @start: region start address
498 * If not found, or region is not fully contained, return -1 498 * @size: size of region
499 * @name: name of resource (in iomem_resource)
499 * 500 *
500 * Used by the ioremap functions to ensure the user is not remapping RAM and is 501 * Check if the specified region partially overlaps or fully eclipses a
501 * a vast speed up over walking through the resource table page by page. 502 * resource identified by @name. Return REGION_DISJOINT if the region
503 * does not overlap @name, return REGION_MIXED if the region overlaps
504 * @type and another resource, and return REGION_INTERSECTS if the
505 * region overlaps @type and no other defined resource. Note, that
506 * REGION_INTERSECTS is also returned in the case when the specified
507 * region overlaps RAM and undefined memory holes.
508 *
 509 * region_intersects() is used by memory remapping functions to ensure
510 * the user is not remapping RAM and is a vast speed up over walking
511 * through the resource table page by page.
502 */ 512 */
503int region_is_ram(resource_size_t start, unsigned long size) 513int region_intersects(resource_size_t start, size_t size, const char *name)
504{ 514{
505 struct resource *p; 515 unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
506 resource_size_t end = start + size - 1; 516 resource_size_t end = start + size - 1;
507 int flags = IORESOURCE_MEM | IORESOURCE_BUSY; 517 int type = 0; int other = 0;
508 const char *name = "System RAM"; 518 struct resource *p;
509 int ret = -1;
510 519
511 read_lock(&resource_lock); 520 read_lock(&resource_lock);
512 for (p = iomem_resource.child; p ; p = p->sibling) { 521 for (p = iomem_resource.child; p ; p = p->sibling) {
513 if (end < p->start) 522 bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
514 continue; 523
515 524 if (start >= p->start && start <= p->end)
516 if (p->start <= start && end <= p->end) { 525 is_type ? type++ : other++;
517 /* resource fully contains region */ 526 if (end >= p->start && end <= p->end)
518 if ((p->flags != flags) || strcmp(p->name, name)) 527 is_type ? type++ : other++;
519 ret = 0; 528 if (p->start >= start && p->end <= end)
520 else 529 is_type ? type++ : other++;
521 ret = 1;
522 break;
523 }
524 if (p->end < start)
525 break; /* not found */
526 } 530 }
527 read_unlock(&resource_lock); 531 read_unlock(&resource_lock);
528 return ret; 532
533 if (other == 0)
534 return type ? REGION_INTERSECTS : REGION_DISJOINT;
535
536 if (type)
537 return REGION_MIXED;
538
539 return REGION_DISJOINT;
529} 540}
530 541
531void __weak arch_remove_reservations(struct resource *avail) 542void __weak arch_remove_reservations(struct resource *avail)
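region_intersects() above classifies a region by counting, for every resource walked, whether the region touches a resource carrying the requested name ("type") or anything else, then folding the two counts into REGION_DISJOINT, REGION_INTERSECTS or REGION_MIXED. The stand-alone model below reproduces that folding over a made-up table; it collapses the three kernel overlap tests into one (only whether each count is non-zero matters) and ignores the resource flags check.

/* Stand-alone model of the REGION_* classification (illustrative only). */
#include <stdio.h>
#include <string.h>

enum { REGION_DISJOINT, REGION_INTERSECTS, REGION_MIXED };

struct res { unsigned long start, end; const char *name; };

static const struct res table[] = {
	{ 0x00000000, 0x0009ffff, "System RAM" },
	{ 0x000a0000, 0x000fffff, "Reserved"   },
	{ 0x00100000, 0x3fffffff, "System RAM" },
};

static int region_intersects(unsigned long start, unsigned long size,
			     const char *name)
{
	unsigned long end = start + size - 1;
	int type = 0, other = 0;
	size_t i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		const struct res *p = &table[i];
		int is_type = strcmp(p->name, name) == 0;

		/* Overlap: either endpoint inside p, or p inside the region. */
		if ((start >= p->start && start <= p->end) ||
		    (end   >= p->start && end   <= p->end) ||
		    (p->start >= start && p->end <= end))
			is_type ? type++ : other++;
	}

	if (other == 0)
		return type ? REGION_INTERSECTS : REGION_DISJOINT;
	return type ? REGION_MIXED : REGION_DISJOINT;
}

int main(void)
{
	printf("%d\n", region_intersects(0x1000,     0x1000,  "System RAM")); /* REGION_INTERSECTS */
	printf("%d\n", region_intersects(0x0009f000, 0x2000,  "System RAM")); /* REGION_MIXED */
	printf("%d\n", region_intersects(0x000a0000, 0x10000, "System RAM")); /* REGION_DISJOINT */
	return 0;
}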
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10081..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
164 164
165static void sched_feat_disable(int i) 165static void sched_feat_disable(int i)
166{ 166{
167 if (static_key_enabled(&sched_feat_keys[i])) 167 static_key_disable(&sched_feat_keys[i]);
168 static_key_slow_dec(&sched_feat_keys[i]);
169} 168}
170 169
171static void sched_feat_enable(int i) 170static void sched_feat_enable(int i)
172{ 171{
173 if (!static_key_enabled(&sched_feat_keys[i])) 172 static_key_enable(&sched_feat_keys[i]);
174 static_key_slow_inc(&sched_feat_keys[i]);
175} 173}
176#else 174#else
177static void sched_feat_disable(int i) { }; 175static void sched_feat_disable(int i) { };
@@ -1151,15 +1149,45 @@ static int migration_cpu_stop(void *data)
1151 return 0; 1149 return 0;
1152} 1150}
1153 1151
1154void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 1152/*
1153 * sched_class::set_cpus_allowed must do the below, but is not required to
1154 * actually call this function.
1155 */
1156void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1155{ 1157{
1156 if (p->sched_class->set_cpus_allowed)
1157 p->sched_class->set_cpus_allowed(p, new_mask);
1158
1159 cpumask_copy(&p->cpus_allowed, new_mask); 1158 cpumask_copy(&p->cpus_allowed, new_mask);
1160 p->nr_cpus_allowed = cpumask_weight(new_mask); 1159 p->nr_cpus_allowed = cpumask_weight(new_mask);
1161} 1160}
1162 1161
1162void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1163{
1164 struct rq *rq = task_rq(p);
1165 bool queued, running;
1166
1167 lockdep_assert_held(&p->pi_lock);
1168
1169 queued = task_on_rq_queued(p);
1170 running = task_current(rq, p);
1171
1172 if (queued) {
1173 /*
1174 * Because __kthread_bind() calls this on blocked tasks without
1175 * holding rq->lock.
1176 */
1177 lockdep_assert_held(&rq->lock);
1178 dequeue_task(rq, p, 0);
1179 }
1180 if (running)
1181 put_prev_task(rq, p);
1182
1183 p->sched_class->set_cpus_allowed(p, new_mask);
1184
1185 if (running)
1186 p->sched_class->set_curr_task(rq);
1187 if (queued)
1188 enqueue_task(rq, p, 0);
1189}
1190
1163/* 1191/*
1164 * Change a given task's CPU affinity. Migrate the thread to a 1192 * Change a given task's CPU affinity. Migrate the thread to a
1165 * proper CPU and schedule it away if the CPU it's executing on 1193 * proper CPU and schedule it away if the CPU it's executing on
@@ -1169,7 +1197,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1169 * task must not exit() & deallocate itself prematurely. The 1197 * task must not exit() & deallocate itself prematurely. The
1170 * call is not atomic; no spinlocks may be held. 1198 * call is not atomic; no spinlocks may be held.
1171 */ 1199 */
1172int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 1200static int __set_cpus_allowed_ptr(struct task_struct *p,
1201 const struct cpumask *new_mask, bool check)
1173{ 1202{
1174 unsigned long flags; 1203 unsigned long flags;
1175 struct rq *rq; 1204 struct rq *rq;
@@ -1178,6 +1207,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1178 1207
1179 rq = task_rq_lock(p, &flags); 1208 rq = task_rq_lock(p, &flags);
1180 1209
1210 /*
1211 * Must re-check here, to close a race against __kthread_bind(),
1212 * sched_setaffinity() is not guaranteed to observe the flag.
1213 */
1214 if (check && (p->flags & PF_NO_SETAFFINITY)) {
1215 ret = -EINVAL;
1216 goto out;
1217 }
1218
1181 if (cpumask_equal(&p->cpus_allowed, new_mask)) 1219 if (cpumask_equal(&p->cpus_allowed, new_mask))
1182 goto out; 1220 goto out;
1183 1221
@@ -1214,6 +1252,11 @@ out:
1214 1252
1215 return ret; 1253 return ret;
1216} 1254}
1255
1256int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1257{
1258 return __set_cpus_allowed_ptr(p, new_mask, false);
1259}
1217EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); 1260EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1218 1261
1219void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1262void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
@@ -1595,6 +1638,15 @@ static void update_avg(u64 *avg, u64 sample)
1595 s64 diff = sample - *avg; 1638 s64 diff = sample - *avg;
1596 *avg += diff >> 3; 1639 *avg += diff >> 3;
1597} 1640}
1641
1642#else
1643
1644static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1645 const struct cpumask *new_mask, bool check)
1646{
1647 return set_cpus_allowed_ptr(p, new_mask);
1648}
1649
1598#endif /* CONFIG_SMP */ 1650#endif /* CONFIG_SMP */
1599 1651
1600static void 1652static void
@@ -1654,9 +1706,9 @@ static void
1654ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) 1706ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1655{ 1707{
1656 check_preempt_curr(rq, p, wake_flags); 1708 check_preempt_curr(rq, p, wake_flags);
1657 trace_sched_wakeup(p, true);
1658
1659 p->state = TASK_RUNNING; 1709 p->state = TASK_RUNNING;
1710 trace_sched_wakeup(p);
1711
1660#ifdef CONFIG_SMP 1712#ifdef CONFIG_SMP
1661 if (p->sched_class->task_woken) { 1713 if (p->sched_class->task_woken) {
1662 /* 1714 /*
@@ -1874,6 +1926,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1874 if (!(p->state & state)) 1926 if (!(p->state & state))
1875 goto out; 1927 goto out;
1876 1928
1929 trace_sched_waking(p);
1930
1877 success = 1; /* we're going to change ->state */ 1931 success = 1; /* we're going to change ->state */
1878 cpu = task_cpu(p); 1932 cpu = task_cpu(p);
1879 1933
@@ -1949,6 +2003,8 @@ static void try_to_wake_up_local(struct task_struct *p)
1949 if (!(p->state & TASK_NORMAL)) 2003 if (!(p->state & TASK_NORMAL))
1950 goto out; 2004 goto out;
1951 2005
2006 trace_sched_waking(p);
2007
1952 if (!task_on_rq_queued(p)) 2008 if (!task_on_rq_queued(p))
1953 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2009 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1954 2010
@@ -2016,9 +2072,6 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2016 p->se.prev_sum_exec_runtime = 0; 2072 p->se.prev_sum_exec_runtime = 0;
2017 p->se.nr_migrations = 0; 2073 p->se.nr_migrations = 0;
2018 p->se.vruntime = 0; 2074 p->se.vruntime = 0;
2019#ifdef CONFIG_SMP
2020 p->se.avg.decay_count = 0;
2021#endif
2022 INIT_LIST_HEAD(&p->se.group_node); 2075 INIT_LIST_HEAD(&p->se.group_node);
2023 2076
2024#ifdef CONFIG_SCHEDSTATS 2077#ifdef CONFIG_SCHEDSTATS
@@ -2200,8 +2253,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
2200#ifdef CONFIG_SMP 2253#ifdef CONFIG_SMP
2201inline struct dl_bw *dl_bw_of(int i) 2254inline struct dl_bw *dl_bw_of(int i)
2202{ 2255{
2203 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2256 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2204 "sched RCU must be held"); 2257 "sched RCU must be held");
2205 return &cpu_rq(i)->rd->dl_bw; 2258 return &cpu_rq(i)->rd->dl_bw;
2206} 2259}
2207 2260
@@ -2210,8 +2263,8 @@ static inline int dl_bw_cpus(int i)
2210 struct root_domain *rd = cpu_rq(i)->rd; 2263 struct root_domain *rd = cpu_rq(i)->rd;
2211 int cpus = 0; 2264 int cpus = 0;
2212 2265
2213 rcu_lockdep_assert(rcu_read_lock_sched_held(), 2266 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
2214 "sched RCU must be held"); 2267 "sched RCU must be held");
2215 for_each_cpu_and(i, rd->span, cpu_active_mask) 2268 for_each_cpu_and(i, rd->span, cpu_active_mask)
2216 cpus++; 2269 cpus++;
2217 2270
@@ -2303,11 +2356,11 @@ void wake_up_new_task(struct task_struct *p)
2303#endif 2356#endif
2304 2357
2305 /* Initialize new task's runnable average */ 2358 /* Initialize new task's runnable average */
2306 init_task_runnable_average(p); 2359 init_entity_runnable_average(&p->se);
2307 rq = __task_rq_lock(p); 2360 rq = __task_rq_lock(p);
2308 activate_task(rq, p, 0); 2361 activate_task(rq, p, 0);
2309 p->on_rq = TASK_ON_RQ_QUEUED; 2362 p->on_rq = TASK_ON_RQ_QUEUED;
2310 trace_sched_wakeup_new(p, true); 2363 trace_sched_wakeup_new(p);
2311 check_preempt_curr(rq, p, WF_FORK); 2364 check_preempt_curr(rq, p, WF_FORK);
2312#ifdef CONFIG_SMP 2365#ifdef CONFIG_SMP
2313 if (p->sched_class->task_woken) 2366 if (p->sched_class->task_woken)
@@ -2469,7 +2522,6 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2469 */ 2522 */
2470 prev_state = prev->state; 2523 prev_state = prev->state;
2471 vtime_task_switch(prev); 2524 vtime_task_switch(prev);
2472 finish_arch_switch(prev);
2473 perf_event_task_sched_in(prev, current); 2525 perf_event_task_sched_in(prev, current);
2474 finish_lock_switch(rq, prev); 2526 finish_lock_switch(rq, prev);
2475 finish_arch_post_lock_switch(); 2527 finish_arch_post_lock_switch();
@@ -2489,7 +2541,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2489 put_task_struct(prev); 2541 put_task_struct(prev);
2490 } 2542 }
2491 2543
2492 tick_nohz_task_switch(current); 2544 tick_nohz_task_switch();
2493 return rq; 2545 return rq;
2494} 2546}
2495 2547
@@ -4340,7 +4392,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4340 } 4392 }
4341#endif 4393#endif
4342again: 4394again:
4343 retval = set_cpus_allowed_ptr(p, new_mask); 4395 retval = __set_cpus_allowed_ptr(p, new_mask, true);
4344 4396
4345 if (!retval) { 4397 if (!retval) {
4346 cpuset_cpus_allowed(p, cpus_allowed); 4398 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4492,7 +4544,7 @@ SYSCALL_DEFINE0(sched_yield)
4492 4544
4493int __sched _cond_resched(void) 4545int __sched _cond_resched(void)
4494{ 4546{
4495 if (should_resched()) { 4547 if (should_resched(0)) {
4496 preempt_schedule_common(); 4548 preempt_schedule_common();
4497 return 1; 4549 return 1;
4498 } 4550 }
@@ -4510,7 +4562,7 @@ EXPORT_SYMBOL(_cond_resched);
4510 */ 4562 */
4511int __cond_resched_lock(spinlock_t *lock) 4563int __cond_resched_lock(spinlock_t *lock)
4512{ 4564{
4513 int resched = should_resched(); 4565 int resched = should_resched(PREEMPT_LOCK_OFFSET);
4514 int ret = 0; 4566 int ret = 0;
4515 4567
4516 lockdep_assert_held(lock); 4568 lockdep_assert_held(lock);
@@ -4532,7 +4584,7 @@ int __sched __cond_resched_softirq(void)
4532{ 4584{
4533 BUG_ON(!in_softirq()); 4585 BUG_ON(!in_softirq());
4534 4586
4535 if (should_resched()) { 4587 if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4536 local_bh_enable(); 4588 local_bh_enable();
4537 preempt_schedule_common(); 4589 preempt_schedule_common();
4538 local_bh_disable(); 4590 local_bh_disable();
@@ -4865,7 +4917,8 @@ void init_idle(struct task_struct *idle, int cpu)
4865 struct rq *rq = cpu_rq(cpu); 4917 struct rq *rq = cpu_rq(cpu);
4866 unsigned long flags; 4918 unsigned long flags;
4867 4919
4868 raw_spin_lock_irqsave(&rq->lock, flags); 4920 raw_spin_lock_irqsave(&idle->pi_lock, flags);
4921 raw_spin_lock(&rq->lock);
4869 4922
4870 __sched_fork(0, idle); 4923 __sched_fork(0, idle);
4871 idle->state = TASK_RUNNING; 4924 idle->state = TASK_RUNNING;
@@ -4891,7 +4944,8 @@ void init_idle(struct task_struct *idle, int cpu)
4891#if defined(CONFIG_SMP) 4944#if defined(CONFIG_SMP)
4892 idle->on_cpu = 1; 4945 idle->on_cpu = 1;
4893#endif 4946#endif
4894 raw_spin_unlock_irqrestore(&rq->lock, flags); 4947 raw_spin_unlock(&rq->lock);
4948 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
4895 4949
4896 /* Set the preempt count _outside_ the spinlocks! */ 4950 /* Set the preempt count _outside_ the spinlocks! */
4897 init_idle_preempt_count(idle, cpu); 4951 init_idle_preempt_count(idle, cpu);
@@ -5311,8 +5365,7 @@ static void register_sched_domain_sysctl(void)
5311/* may be called multiple times per register */ 5365/* may be called multiple times per register */
5312static void unregister_sched_domain_sysctl(void) 5366static void unregister_sched_domain_sysctl(void)
5313{ 5367{
5314 if (sd_sysctl_header) 5368 unregister_sysctl_table(sd_sysctl_header);
5315 unregister_sysctl_table(sd_sysctl_header);
5316 sd_sysctl_header = NULL; 5369 sd_sysctl_header = NULL;
5317 if (sd_ctl_dir[0].child) 5370 if (sd_ctl_dir[0].child)
5318 sd_free_ctl_entry(&sd_ctl_dir[0].child); 5371 sd_free_ctl_entry(&sd_ctl_dir[0].child);
@@ -5433,6 +5486,14 @@ static int sched_cpu_active(struct notifier_block *nfb,
5433 case CPU_STARTING: 5486 case CPU_STARTING:
5434 set_cpu_rq_start_time(); 5487 set_cpu_rq_start_time();
5435 return NOTIFY_OK; 5488 return NOTIFY_OK;
5489 case CPU_ONLINE:
5490 /*
5491 * At this point a starting CPU has marked itself as online via
5492 * set_cpu_online(). But it might not yet have marked itself
5493 * as active, which is essential from here on.
5494 *
5495 * Thus, fall-through and help the starting CPU along.
5496 */
5436 case CPU_DOWN_FAILED: 5497 case CPU_DOWN_FAILED:
5437 set_cpu_active((long)hcpu, true); 5498 set_cpu_active((long)hcpu, true);
5438 return NOTIFY_OK; 5499 return NOTIFY_OK;
@@ -6445,8 +6506,10 @@ static void init_numa_topology_type(void)
6445 6506
6446 n = sched_max_numa_distance; 6507 n = sched_max_numa_distance;
6447 6508
6448 if (n <= 1) 6509 if (sched_domains_numa_levels <= 1) {
6449 sched_numa_topology_type = NUMA_DIRECT; 6510 sched_numa_topology_type = NUMA_DIRECT;
6511 return;
6512 }
6450 6513
6451 for_each_online_node(a) { 6514 for_each_online_node(a) {
6452 for_each_online_node(b) { 6515 for_each_online_node(b) {
@@ -8068,7 +8131,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
8068 sched_offline_group(tg); 8131 sched_offline_group(tg);
8069} 8132}
8070 8133
8071static void cpu_cgroup_fork(struct task_struct *task) 8134static void cpu_cgroup_fork(struct task_struct *task, void *private)
8072{ 8135{
8073 sched_move_task(task); 8136 sched_move_task(task);
8074} 8137}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index f5a64ffad176..8cbc3db671df 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
555} 555}
556 556
557/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu 558 * Adjust tick based cputime random precision against scheduler runtime
559 * scheduling, and scaling inaccuracies can cause cputime_advance 559 * accounting.
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 * 560 *
563 * Normally a caller will only go through this loop once, or not 561 * Tick based cputime accounting depend on random scheduling timeslices of a
564 * at all in case a previous caller updated counter the same jiffy. 562 * task to be interrupted or not by the timer. Depending on these
565 */ 563 * circumstances, the number of these interrupts may be over or
566static void cputime_advance(cputime_t *counter, cputime_t new) 564 * under-optimistic, matching the real user and system cputime with a variable
567{ 565 * precision.
568 cputime_t old; 566 *
569 567 * Fix this by scaling these tick based values against the total runtime
570 while (new > (old = READ_ONCE(*counter))) 568 * accounted by the CFS scheduler.
571 cmpxchg_cputime(counter, old, new); 569 *
572} 570 * This code provides the following guarantees:
573 571 *
574/* 572 * stime + utime == rtime
575 * Adjust tick based cputime random precision against scheduler 573 * stime_i+1 >= stime_i, utime_i+1 >= utime_i
576 * runtime accounting. 574 *
575 * Assuming that rtime_i+1 >= rtime_i.
577 */ 576 */
578static void cputime_adjust(struct task_cputime *curr, 577static void cputime_adjust(struct task_cputime *curr,
579 struct cputime *prev, 578 struct prev_cputime *prev,
580 cputime_t *ut, cputime_t *st) 579 cputime_t *ut, cputime_t *st)
581{ 580{
582 cputime_t rtime, stime, utime; 581 cputime_t rtime, stime, utime;
582 unsigned long flags;
583 583
584 /* 584 /* Serialize concurrent callers such that we can honour our guarantees */
585 * Tick based cputime accounting depend on random scheduling 585 raw_spin_lock_irqsave(&prev->lock, flags);
586 * timeslices of a task to be interrupted or not by the timer.
587 * Depending on these circumstances, the number of these interrupts
588 * may be over or under-optimistic, matching the real user and system
589 * cputime with a variable precision.
590 *
591 * Fix this by scaling these tick based values against the total
592 * runtime accounted by the CFS scheduler.
593 */
594 rtime = nsecs_to_cputime(curr->sum_exec_runtime); 586 rtime = nsecs_to_cputime(curr->sum_exec_runtime);
595 587
596 /* 588 /*
597 * Update userspace visible utime/stime values only if actual execution 589 * This is possible under two circumstances:
598 * time is bigger than already exported. Note that can happen, that we 590 * - rtime isn't monotonic after all (a bug);
599 * provided bigger values due to scaling inaccuracy on big numbers. 591 * - we got reordered by the lock.
592 *
593 * In both cases this acts as a filter such that the rest of the code
594 * can assume it is monotonic regardless of anything else.
600 */ 595 */
601 if (prev->stime + prev->utime >= rtime) 596 if (prev->stime + prev->utime >= rtime)
602 goto out; 597 goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
606 601
607 if (utime == 0) { 602 if (utime == 0) {
608 stime = rtime; 603 stime = rtime;
609 } else if (stime == 0) { 604 goto update;
610 utime = rtime; 605 }
611 } else {
612 cputime_t total = stime + utime;
613 606
614 stime = scale_stime((__force u64)stime, 607 if (stime == 0) {
615 (__force u64)rtime, (__force u64)total); 608 utime = rtime;
616 utime = rtime - stime; 609 goto update;
617 } 610 }
618 611
619 cputime_advance(&prev->stime, stime); 612 stime = scale_stime((__force u64)stime, (__force u64)rtime,
620 cputime_advance(&prev->utime, utime); 613 (__force u64)(stime + utime));
614
615 /*
616 * Make sure stime doesn't go backwards; this preserves monotonicity
617 * for utime because rtime is monotonic.
618 *
619 * utime_i+1 = rtime_i+1 - stime_i
620 * = rtime_i+1 - (rtime_i - utime_i)
621 * = (rtime_i+1 - rtime_i) + utime_i
622 * >= utime_i
623 */
624 if (stime < prev->stime)
625 stime = prev->stime;
626 utime = rtime - stime;
627
628 /*
629 * Make sure utime doesn't go backwards; this still preserves
630 * monotonicity for stime, analogous argument to above.
631 */
632 if (utime < prev->utime) {
633 utime = prev->utime;
634 stime = rtime - utime;
635 }
621 636
637update:
638 prev->stime = stime;
639 prev->utime = utime;
622out: 640out:
623 *ut = prev->utime; 641 *ut = prev->utime;
624 *st = prev->stime; 642 *st = prev->stime;
643 raw_spin_unlock_irqrestore(&prev->lock, flags);
625} 644}
626 645
627void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 646void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
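The reworked cputime_adjust() above serializes callers on a per-prev_cputime lock and clamps the scaled stime (and, symmetrically, utime) against the values reported last time, which keeps stime + utime == rtime while making both components monotonic. The small numeric model below walks through that clamping with plain integers; it substitutes a naive scaling expression for scale_stime() and is illustrative only.

/* Numeric model of the monotonic stime/utime split (illustrative only). */
#include <stdio.h>

struct prev_cputime_model { unsigned long stime, utime; };

static void adjust(struct prev_cputime_model *prev, unsigned long rtime,
		   unsigned long stime_ticks, unsigned long utime_ticks)
{
	unsigned long stime, utime;

	if (prev->stime + prev->utime >= rtime)
		goto out;			/* nothing new to report */

	if (utime_ticks == 0)
		stime = rtime;
	else if (stime_ticks == 0)
		stime = 0;
	else	/* naive stand-in for scale_stime(); may overflow for real values */
		stime = stime_ticks * rtime / (stime_ticks + utime_ticks);

	if (stime < prev->stime)		/* keep stime monotonic... */
		stime = prev->stime;
	utime = rtime - stime;			/* ...so utime stays monotonic too */

	if (utime < prev->utime) {		/* and the symmetric clamp */
		utime = prev->utime;
		stime = rtime - utime;
	}
	prev->stime = stime;
	prev->utime = utime;
out:
	printf("stime=%lu utime=%lu (sum=%lu)\n",
	       prev->stime, prev->utime, prev->stime + prev->utime);
}

int main(void)
{
	struct prev_cputime_model p = { 0, 0 };

	adjust(&p, 100, 30, 70);	/* reports stime=30 utime=70 */
	adjust(&p, 110, 20, 80);	/* scaled stime would be 22; clamped to 30 */
	return 0;
}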
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0a17af35670a..fc8f01083527 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -953,7 +953,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
953 953
954 /* 954 /*
955 * Use the scheduling parameters of the top pi-waiter 955 * Use the scheduling parameters of the top pi-waiter
956 * task if we have one and its (relative) deadline is 956 * task if we have one and its (absolute) deadline is
957 * smaller than our one... OTW we keep our runtime and 957 * smaller than our one... OTW we keep our runtime and
958 * deadline. 958 * deadline.
959 */ 959 */
@@ -1563,7 +1563,7 @@ out:
1563 1563
1564static void push_dl_tasks(struct rq *rq) 1564static void push_dl_tasks(struct rq *rq)
1565{ 1565{
1566 /* Terminates as it moves a -deadline task */ 1566 /* push_dl_task() will return true if it moved a -deadline task */
1567 while (push_dl_task(rq)) 1567 while (push_dl_task(rq))
1568 ; 1568 ;
1569} 1569}
@@ -1657,7 +1657,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1657{ 1657{
1658 if (!task_running(rq, p) && 1658 if (!task_running(rq, p) &&
1659 !test_tsk_need_resched(rq->curr) && 1659 !test_tsk_need_resched(rq->curr) &&
1660 has_pushable_dl_tasks(rq) &&
1661 p->nr_cpus_allowed > 1 && 1660 p->nr_cpus_allowed > 1 &&
1662 dl_task(rq->curr) && 1661 dl_task(rq->curr) &&
1663 (rq->curr->nr_cpus_allowed < 2 || 1662 (rq->curr->nr_cpus_allowed < 2 ||
@@ -1669,9 +1668,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
1669static void set_cpus_allowed_dl(struct task_struct *p, 1668static void set_cpus_allowed_dl(struct task_struct *p,
1670 const struct cpumask *new_mask) 1669 const struct cpumask *new_mask)
1671{ 1670{
1672 struct rq *rq;
1673 struct root_domain *src_rd; 1671 struct root_domain *src_rd;
1674 int weight; 1672 struct rq *rq;
1675 1673
1676 BUG_ON(!dl_task(p)); 1674 BUG_ON(!dl_task(p));
1677 1675
@@ -1697,37 +1695,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
1697 raw_spin_unlock(&src_dl_b->lock); 1695 raw_spin_unlock(&src_dl_b->lock);
1698 } 1696 }
1699 1697
1700 /* 1698 set_cpus_allowed_common(p, new_mask);
1701 * Update only if the task is actually running (i.e.,
1702 * it is on the rq AND it is not throttled).
1703 */
1704 if (!on_dl_rq(&p->dl))
1705 return;
1706
1707 weight = cpumask_weight(new_mask);
1708
1709 /*
1710 * Only update if the process changes its state from whether it
1711 * can migrate or not.
1712 */
1713 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1714 return;
1715
1716 /*
1717 * The process used to be able to migrate OR it can now migrate
1718 */
1719 if (weight <= 1) {
1720 if (!task_current(rq, p))
1721 dequeue_pushable_dl_task(rq, p);
1722 BUG_ON(!rq->dl.dl_nr_migratory);
1723 rq->dl.dl_nr_migratory--;
1724 } else {
1725 if (!task_current(rq, p))
1726 enqueue_pushable_dl_task(rq, p);
1727 rq->dl.dl_nr_migratory++;
1728 }
1729
1730 update_dl_migration(&rq->dl);
1731} 1699}
1732 1700
1733/* Assumes rq->lock is held */ 1701/* Assumes rq->lock is held */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4222ec50ab88..641511771ae6 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -68,13 +68,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
68#define PN(F) \ 68#define PN(F) \
69 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 69 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
70 70
71 if (!se) { 71 if (!se)
72 struct sched_avg *avg = &cpu_rq(cpu)->avg;
73 P(avg->runnable_avg_sum);
74 P(avg->avg_period);
75 return; 72 return;
76 }
77
78 73
79 PN(se->exec_start); 74 PN(se->exec_start);
80 PN(se->vruntime); 75 PN(se->vruntime);
@@ -93,12 +88,8 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
93#endif 88#endif
94 P(se->load.weight); 89 P(se->load.weight);
95#ifdef CONFIG_SMP 90#ifdef CONFIG_SMP
96 P(se->avg.runnable_avg_sum); 91 P(se->avg.load_avg);
97 P(se->avg.running_avg_sum); 92 P(se->avg.util_avg);
98 P(se->avg.avg_period);
99 P(se->avg.load_avg_contrib);
100 P(se->avg.utilization_avg_contrib);
101 P(se->avg.decay_count);
102#endif 93#endif
103#undef PN 94#undef PN
104#undef P 95#undef P
@@ -214,21 +205,21 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
214 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
215 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
216#ifdef CONFIG_SMP 207#ifdef CONFIG_SMP
217 SEQ_printf(m, " .%-30s: %ld\n", "runnable_load_avg", 208 SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
209 cfs_rq->avg.load_avg);
210 SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
218 cfs_rq->runnable_load_avg); 211 cfs_rq->runnable_load_avg);
219 SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", 212 SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
220 cfs_rq->blocked_load_avg); 213 cfs_rq->avg.util_avg);
221 SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", 214 SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
222 cfs_rq->utilization_load_avg); 215 atomic_long_read(&cfs_rq->removed_load_avg));
216 SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
217 atomic_long_read(&cfs_rq->removed_util_avg));
223#ifdef CONFIG_FAIR_GROUP_SCHED 218#ifdef CONFIG_FAIR_GROUP_SCHED
224 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", 219 SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
225 cfs_rq->tg_load_contrib); 220 cfs_rq->tg_load_avg_contrib);
226 SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
227 cfs_rq->tg_runnable_contrib);
228 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg", 221 SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
229 atomic_long_read(&cfs_rq->tg->load_avg)); 222 atomic_long_read(&cfs_rq->tg->load_avg));
230 SEQ_printf(m, " .%-30s: %d\n", "tg->runnable_avg",
231 atomic_read(&cfs_rq->tg->runnable_avg));
232#endif 223#endif
233#endif 224#endif
234#ifdef CONFIG_CFS_BANDWIDTH 225#ifdef CONFIG_CFS_BANDWIDTH
@@ -636,12 +627,11 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
636 627
637 P(se.load.weight); 628 P(se.load.weight);
638#ifdef CONFIG_SMP 629#ifdef CONFIG_SMP
639 P(se.avg.runnable_avg_sum); 630 P(se.avg.load_sum);
640 P(se.avg.running_avg_sum); 631 P(se.avg.util_sum);
641 P(se.avg.avg_period); 632 P(se.avg.load_avg);
642 P(se.avg.load_avg_contrib); 633 P(se.avg.util_avg);
643 P(se.avg.utilization_avg_contrib); 634 P(se.avg.last_update_time);
644 P(se.avg.decay_count);
645#endif 635#endif
646 P(policy); 636 P(policy);
647 P(prio); 637 P(prio);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 65c8f3ebdc3c..6e2e3483b1ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
283 return grp->my_q; 283 return grp->my_q;
284} 284}
285 285
286static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
287 int force_update);
288
289static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) 286static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
290{ 287{
291 if (!cfs_rq->on_list) { 288 if (!cfs_rq->on_list) {
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
305 } 302 }
306 303
307 cfs_rq->on_list = 1; 304 cfs_rq->on_list = 1;
308 /* We should have no load, but we need to update last_decay. */
309 update_cfs_rq_blocked_load(cfs_rq, 0);
310 } 305 }
311} 306}
312 307
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
616 */ 611 */
617static u64 __sched_period(unsigned long nr_running) 612static u64 __sched_period(unsigned long nr_running)
618{ 613{
619 u64 period = sysctl_sched_latency; 614 if (unlikely(nr_running > sched_nr_latency))
620 unsigned long nr_latency = sched_nr_latency; 615 return nr_running * sysctl_sched_min_granularity;
621 616 else
622 if (unlikely(nr_running > nr_latency)) { 617 return sysctl_sched_latency;
623 period = sysctl_sched_min_granularity;
624 period *= nr_running;
625 }
626
627 return period;
628} 618}
629 619
630/* 620/*
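The flattened __sched_period() computes the same thing as the old branch-and-multiply version: one latency period while nr_running fits under sched_nr_latency, and nr_running * min_granularity beyond that so every task still gets a full slice. A quick standalone check with illustrative defaults (the real values are tunables scaled by CPU count):

#include <stdio.h>

/* Illustrative defaults only; the real values are sysctls. */
static const unsigned long long sysctl_sched_latency         = 6000000ULL; /* 6 ms    */
static const unsigned long long sysctl_sched_min_granularity =  750000ULL; /* 0.75 ms */
static const unsigned long      sched_nr_latency             = 8;

static unsigned long long sched_period(unsigned long nr_running)
{
        if (nr_running > sched_nr_latency)
                return nr_running * sysctl_sched_min_granularity;
        return sysctl_sched_latency;
}

int main(void)
{
        /* 8 tasks share one 6 ms period; 12 tasks stretch it to 9 ms. */
        printf("%llu %llu\n", sched_period(8), sched_period(12));
        return 0;
}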
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
669static int select_idle_sibling(struct task_struct *p, int cpu); 659static int select_idle_sibling(struct task_struct *p, int cpu);
670static unsigned long task_h_load(struct task_struct *p); 660static unsigned long task_h_load(struct task_struct *p);
671 661
672static inline void __update_task_entity_contrib(struct sched_entity *se); 662/*
673static inline void __update_task_entity_utilization(struct sched_entity *se); 663 * We choose a half-life close to 1 scheduling period.
664 * Note: The tables below are dependent on this value.
665 */
666#define LOAD_AVG_PERIOD 32
667#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
668#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
674 669
675/* Give new task start runnable values to heavy its load in infant time */ 670/* Give new sched_entity start runnable values so its load weighs heavily in its infancy */
676void init_task_runnable_average(struct task_struct *p) 671void init_entity_runnable_average(struct sched_entity *se)
677{ 672{
678 u32 slice; 673 struct sched_avg *sa = &se->avg;
679 674
680 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; 675 sa->last_update_time = 0;
681 p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; 676 /*
682 p->se.avg.avg_period = slice; 677 * sched_avg's period_contrib should be strictly less than 1024, so
683 __update_task_entity_contrib(&p->se); 678 * we give it 1023 to make sure it is almost a period (1024us), and
684 __update_task_entity_utilization(&p->se); 679 * will definitely be updated (after enqueue).
680 */
681 sa->period_contrib = 1023;
682 sa->load_avg = scale_load_down(se->load.weight);
683 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
684 sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
685 sa->util_sum = LOAD_AVG_MAX;
686 /* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
685} 687}
688
689static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
690static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
686#else 691#else
687void init_task_runnable_average(struct task_struct *p) 692void init_entity_runnable_average(struct sched_entity *se)
688{ 693{
689} 694}
690#endif 695#endif
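The three constants hoisted here encode the PELT decay: y is chosen so that y^32 = 0.5, a full 1024 us period contributes 1024, and the geometric series therefore saturates after roughly LOAD_AVG_MAX_N periods. A double-precision sanity check of those magnitudes (illustrative only; the kernel's 47742 comes from evaluating the same series with its truncating fixed-point tables, so it sits a little below the real-valued limit):

#include <math.h>
#include <stdio.h>

int main(void)
{
        double y = pow(0.5, 1.0 / 32.0);        /* half-life of 32 periods */

        printf("y              = %.6f\n", y);
        printf("1024 / (1 - y) = %.0f   (order of LOAD_AVG_MAX)\n",
               1024.0 / (1.0 - y));
        printf("1024 * y^345   = %.2f   (a contribution from ~345 periods ago\n"
               "                          has decayed below one unit)\n",
               1024.0 * pow(y, 345));
        return 0;
}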
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
1415 * --------------------- vs --------------------- 1420 * --------------------- vs ---------------------
1416 * src->compute_capacity dst->compute_capacity 1421 * src->compute_capacity dst->compute_capacity
1417 */ 1422 */
1418 if (src->load * dst->compute_capacity > 1423 if (src->load * dst->compute_capacity * env->imbalance_pct >
1419 dst->load * src->compute_capacity) 1424
1425 dst->load * src->compute_capacity * 100)
1420 return true; 1426 return true;
1421 1427
1422 return false; 1428 return false;
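Folding env->imbalance_pct into the capacity comparison gives the NUMA placement code the same hysteresis the regular load balancer uses. A toy version of the inequality with made-up numbers (imbalance_pct is typically 125 for non-SMT domains):

#include <stdbool.h>
#include <stdio.h>

/* Same inequality as the hunk above, with illustrative parameter names. */
static bool src_busier(unsigned long src_load, unsigned long src_cap,
                       unsigned long dst_load, unsigned long dst_cap,
                       unsigned int imbalance_pct)
{
        return src_load * dst_cap * imbalance_pct >
               dst_load * src_cap * 100;
}

int main(void)
{
        /* Equal capacities: 850*1024*125 > 1000*1024*100, so the source
         * already counts as busier at 85% of the destination's load... */
        printf("%d\n", src_busier(850, 1024, 1000, 1024, 125));
        /* ...whereas the old form (no margin, i.e. 100) still says no. */
        printf("%d\n", src_busier(850, 1024, 1000, 1024, 100));
        return 0;
}

With a 25% margin the break-even point is 80% of the destination's load; the old form required the source's normalized load to be strictly higher.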
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1702 delta = runtime - p->last_sum_exec_runtime; 1708 delta = runtime - p->last_sum_exec_runtime;
1703 *period = now - p->last_task_numa_placement; 1709 *period = now - p->last_task_numa_placement;
1704 } else { 1710 } else {
1705 delta = p->se.avg.runnable_avg_sum; 1711 delta = p->se.avg.load_sum / p->se.load.weight;
1706 *period = p->se.avg.avg_period; 1712 *period = LOAD_AVG_MAX;
1707 } 1713 }
1708 1714
1709 p->last_sum_exec_runtime = runtime; 1715 p->last_sum_exec_runtime = runtime;
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
2351 long tg_weight; 2357 long tg_weight;
2352 2358
2353 /* 2359 /*
2354 * Use this CPU's actual weight instead of the last load_contribution 2360 * Use this CPU's real-time load instead of the last load contribution
2355 * to gain a more accurate current total weight. See 2361 * as the updating of the contribution is delayed, and we will use the
2356 * update_cfs_rq_load_contribution(). 2362 * real-time load to calc the share. See update_tg_load_avg().
2357 */ 2363 */
2358 tg_weight = atomic_long_read(&tg->load_avg); 2364 tg_weight = atomic_long_read(&tg->load_avg);
2359 tg_weight -= cfs_rq->tg_load_contrib; 2365 tg_weight -= cfs_rq->tg_load_avg_contrib;
2360 tg_weight += cfs_rq->load.weight; 2366 tg_weight += cfs_rq_load_avg(cfs_rq);
2361 2367
2362 return tg_weight; 2368 return tg_weight;
2363} 2369}
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2367 long tg_weight, load, shares; 2373 long tg_weight, load, shares;
2368 2374
2369 tg_weight = calc_tg_weight(tg, cfs_rq); 2375 tg_weight = calc_tg_weight(tg, cfs_rq);
2370 load = cfs_rq->load.weight; 2376 load = cfs_rq_load_avg(cfs_rq);
2371 2377
2372 shares = (tg->shares * load); 2378 shares = (tg->shares * load);
2373 if (tg_weight) 2379 if (tg_weight)
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2429#endif /* CONFIG_FAIR_GROUP_SCHED */ 2435#endif /* CONFIG_FAIR_GROUP_SCHED */
2430 2436
2431#ifdef CONFIG_SMP 2437#ifdef CONFIG_SMP
2432/*
2433 * We choose a half-life close to 1 scheduling period.
2434 * Note: The tables below are dependent on this value.
2435 */
2436#define LOAD_AVG_PERIOD 32
2437#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
2438#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
2439
2440/* Precomputed fixed inverse multiplies for multiplication by y^n */ 2438/* Precomputed fixed inverse multiplies for multiplication by y^n */
2441static const u32 runnable_avg_yN_inv[] = { 2439static const u32 runnable_avg_yN_inv[] = {
2442 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, 2440 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2485 local_n %= LOAD_AVG_PERIOD; 2483 local_n %= LOAD_AVG_PERIOD;
2486 } 2484 }
2487 2485
2488 val *= runnable_avg_yN_inv[local_n]; 2486 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2489 /* We don't use SRR here since we always want to round down. */ 2487 return val;
2490 return val >> 32;
2491} 2488}
2492 2489
2493/* 2490/*
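With per-entity load_sum now carrying the weight, the 64-bit value fed to decay_load() can be large enough that the old "val *= inv; val >>= 32" pattern would overflow, hence the switch to mul_u64_u32_shr(). A portable sketch of that helper for the 32-bit shift used here (plain C, not the kernel's math64.h implementation):

#include <stdint.h>

/* Full 64x32-bit product shifted right by 32, without ever forming a value
 * wider than 64 bits: split val into 32-bit halves and recombine. */
static uint64_t mul_u64_u32_shr32(uint64_t val, uint32_t mul)
{
        uint64_t lo = (uint32_t)val;
        uint64_t hi = val >> 32;

        return hi * mul + (lo * mul >> 32);
}

decay_load() still handles whole half-lives with val >>= n / 32 first; only the remaining n % 32 periods go through the fixed-point multiply against runnable_avg_yN_inv[].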
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
2546 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) 2543 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2547 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}] 2544 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2548 */ 2545 */
2549static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, 2546static __always_inline int
2550 struct sched_avg *sa, 2547__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2551 int runnable, 2548 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2552 int running)
2553{ 2549{
2554 u64 delta, periods; 2550 u64 delta, periods;
2555 u32 runnable_contrib; 2551 u32 contrib;
2556 int delta_w, decayed = 0; 2552 int delta_w, decayed = 0;
2557 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); 2553 unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
2558 2554
2559 delta = now - sa->last_runnable_update; 2555 delta = now - sa->last_update_time;
2560 /* 2556 /*
2561 * This should only happen when time goes backwards, which it 2557 * This should only happen when time goes backwards, which it
2562 * unfortunately does during sched clock init when we swap over to TSC. 2558 * unfortunately does during sched clock init when we swap over to TSC.
2563 */ 2559 */
2564 if ((s64)delta < 0) { 2560 if ((s64)delta < 0) {
2565 sa->last_runnable_update = now; 2561 sa->last_update_time = now;
2566 return 0; 2562 return 0;
2567 } 2563 }
2568 2564
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2573 delta >>= 10; 2569 delta >>= 10;
2574 if (!delta) 2570 if (!delta)
2575 return 0; 2571 return 0;
2576 sa->last_runnable_update = now; 2572 sa->last_update_time = now;
2577 2573
2578 /* delta_w is the amount already accumulated against our next period */ 2574 /* delta_w is the amount already accumulated against our next period */
2579 delta_w = sa->avg_period % 1024; 2575 delta_w = sa->period_contrib;
2580 if (delta + delta_w >= 1024) { 2576 if (delta + delta_w >= 1024) {
2581 /* period roll-over */
2582 decayed = 1; 2577 decayed = 1;
2583 2578
2579 /* how much carries over into the next period is not known yet; start from 0 */
2580 sa->period_contrib = 0;
2581
2584 /* 2582 /*
2585 * Now that we know we're crossing a period boundary, figure 2583 * Now that we know we're crossing a period boundary, figure
2586 * out how much from delta we need to complete the current 2584 * out how much from delta we need to complete the current
2587 * period and accrue it. 2585 * period and accrue it.
2588 */ 2586 */
2589 delta_w = 1024 - delta_w; 2587 delta_w = 1024 - delta_w;
2590 if (runnable) 2588 if (weight) {
2591 sa->runnable_avg_sum += delta_w; 2589 sa->load_sum += weight * delta_w;
2590 if (cfs_rq)
2591 cfs_rq->runnable_load_sum += weight * delta_w;
2592 }
2592 if (running) 2593 if (running)
2593 sa->running_avg_sum += delta_w * scale_freq 2594 sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
2594 >> SCHED_CAPACITY_SHIFT;
2595 sa->avg_period += delta_w;
2596 2595
2597 delta -= delta_w; 2596 delta -= delta_w;
2598 2597
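The accrual bookkeeping is unchanged in shape: a new delta is split into the slice that completes the current 1024 us period, some number of whole periods, and a remainder that becomes the new period_contrib; only the per-slice contributions are now multiplied by the entity weight. A worked split with made-up numbers:

#include <stdio.h>

int main(void)
{
        /* Illustrative numbers only: 800 us already accrued in this period,
         * 1500 us of new delta to account (1 period = 1024 us). */
        unsigned int period_contrib = 800, delta = 1500;

        unsigned int delta_w = 1024 - period_contrib;   /* 224 us closes the period */
        delta -= delta_w;                               /* 1276 us left             */
        unsigned int periods = delta / 1024;            /* 1 full period            */
        delta %= 1024;                                  /* 252 us starts the next   */

        printf("%u + %u*1024 + %u\n", delta_w, periods, delta);
        return 0;
}

224 us closes the period that was already 800 us full, one whole period follows, and 252 us is carried as the next period_contrib, matching the delta_w/periods/delta variables in the hunk.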
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
2600 periods = delta / 1024; 2599 periods = delta / 1024;
2601 delta %= 1024; 2600 delta %= 1024;
2602 2601
2603 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, 2602 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2604 periods + 1); 2603 if (cfs_rq) {
2605 sa->running_avg_sum = decay_load(sa->running_avg_sum, 2604 cfs_rq->runnable_load_sum =
2606 periods + 1); 2605 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2607 sa->avg_period = decay_load(sa->avg_period, 2606 }
2608 periods + 1); 2607 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2609 2608
2610 /* Efficiently calculate \sum (1..n_period) 1024*y^i */ 2609 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2611 runnable_contrib = __compute_runnable_contrib(periods); 2610 contrib = __compute_runnable_contrib(periods);
2612 if (runnable) 2611 if (weight) {
2613 sa->runnable_avg_sum += runnable_contrib; 2612 sa->load_sum += weight * contrib;
2613 if (cfs_rq)
2614 cfs_rq->runnable_load_sum += weight * contrib;
2615 }
2614 if (running) 2616 if (running)
2615 sa->running_avg_sum += runnable_contrib * scale_freq 2617 sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
2616 >> SCHED_CAPACITY_SHIFT;
2617 sa->avg_period += runnable_contrib;
2618 } 2618 }
2619 2619
2620 /* Remainder of delta accrued against u_0` */ 2620 /* Remainder of delta accrued against u_0` */
2621 if (runnable) 2621 if (weight) {
2622 sa->runnable_avg_sum += delta; 2622 sa->load_sum += weight * delta;
2623 if (cfs_rq)
2624 cfs_rq->runnable_load_sum += weight * delta;
2625 }
2623 if (running) 2626 if (running)
2624 sa->running_avg_sum += delta * scale_freq 2627 sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
2625 >> SCHED_CAPACITY_SHIFT;
2626 sa->avg_period += delta;
2627
2628 return decayed;
2629}
2630 2628
2631/* Synchronize an entity's decay with its parenting cfs_rq.*/ 2629 sa->period_contrib += delta;
2632static inline u64 __synchronize_entity_decay(struct sched_entity *se)
2633{
2634 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2635 u64 decays = atomic64_read(&cfs_rq->decay_counter);
2636
2637 decays -= se->avg.decay_count;
2638 se->avg.decay_count = 0;
2639 if (!decays)
2640 return 0;
2641 2630
2642 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); 2631 if (decayed) {
2643 se->avg.utilization_avg_contrib = 2632 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2644 decay_load(se->avg.utilization_avg_contrib, decays); 2633 if (cfs_rq) {
2634 cfs_rq->runnable_load_avg =
2635 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2636 }
2637 sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
2638 }
2645 2639
2646 return decays; 2640 return decayed;
2647} 2641}
2648 2642
2649#ifdef CONFIG_FAIR_GROUP_SCHED 2643#ifdef CONFIG_FAIR_GROUP_SCHED
2650static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2651 int force_update)
2652{
2653 struct task_group *tg = cfs_rq->tg;
2654 long tg_contrib;
2655
2656 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2657 tg_contrib -= cfs_rq->tg_load_contrib;
2658
2659 if (!tg_contrib)
2660 return;
2661
2662 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2663 atomic_long_add(tg_contrib, &tg->load_avg);
2664 cfs_rq->tg_load_contrib += tg_contrib;
2665 }
2666}
2667
2668/* 2644/*
2669 * Aggregate cfs_rq runnable averages into an equivalent task_group 2645 * Updating tg's load_avg is necessary before update_cfs_share (which is done)
2670 * representation for computing load contributions. 2646 * and effective_load (which is not done because it is too costly).
2671 */ 2647 */
2672static inline void __update_tg_runnable_avg(struct sched_avg *sa, 2648static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2673 struct cfs_rq *cfs_rq)
2674{ 2649{
2675 struct task_group *tg = cfs_rq->tg; 2650 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2676 long contrib;
2677 2651
2678 /* The fraction of a cpu used by this cfs_rq */ 2652 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2679 contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, 2653 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2680 sa->avg_period + 1); 2654 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2681 contrib -= cfs_rq->tg_runnable_contrib;
2682
2683 if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
2684 atomic_add(contrib, &tg->runnable_avg);
2685 cfs_rq->tg_runnable_contrib += contrib;
2686 } 2655 }
2687} 2656}
2688 2657
2689static inline void __update_group_entity_contrib(struct sched_entity *se)
2690{
2691 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2692 struct task_group *tg = cfs_rq->tg;
2693 int runnable_avg;
2694
2695 u64 contrib;
2696
2697 contrib = cfs_rq->tg_load_contrib * tg->shares;
2698 se->avg.load_avg_contrib = div_u64(contrib,
2699 atomic_long_read(&tg->load_avg) + 1);
2700
2701 /*
2702 * For group entities we need to compute a correction term in the case
2703 * that they are consuming <1 cpu so that we would contribute the same
2704 * load as a task of equal weight.
2705 *
2706 * Explicitly co-ordinating this measurement would be expensive, but
2707 * fortunately the sum of each cpus contribution forms a usable
2708 * lower-bound on the true value.
2709 *
2710 * Consider the aggregate of 2 contributions. Either they are disjoint
2711 * (and the sum represents true value) or they are disjoint and we are
2712 * understating by the aggregate of their overlap.
2713 *
2714 * Extending this to N cpus, for a given overlap, the maximum amount we
2715 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2716 * cpus that overlap for this interval and w_i is the interval width.
2717 *
2718 * On a small machine; the first term is well-bounded which bounds the
2719 * total error since w_i is a subset of the period. Whereas on a
2720 * larger machine, while this first term can be larger, if w_i is the
2721 * of consequential size guaranteed to see n_i*w_i quickly converge to
2722 * our upper bound of 1-cpu.
2723 */
2724 runnable_avg = atomic_read(&tg->runnable_avg);
2725 if (runnable_avg < NICE_0_LOAD) {
2726 se->avg.load_avg_contrib *= runnable_avg;
2727 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2728 }
2729}
2730
2731static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2732{
2733 __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
2734 runnable, runnable);
2735 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2736}
2737#else /* CONFIG_FAIR_GROUP_SCHED */ 2658#else /* CONFIG_FAIR_GROUP_SCHED */
2738static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq, 2659static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2739 int force_update) {}
2740static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2741 struct cfs_rq *cfs_rq) {}
2742static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2743static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2744#endif /* CONFIG_FAIR_GROUP_SCHED */ 2660#endif /* CONFIG_FAIR_GROUP_SCHED */
2745 2661
2746static inline void __update_task_entity_contrib(struct sched_entity *se) 2662static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2747{
2748 u32 contrib;
2749
2750 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2751 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2752 contrib /= (se->avg.avg_period + 1);
2753 se->avg.load_avg_contrib = scale_load(contrib);
2754}
2755 2663
2756/* Compute the current contribution to load_avg by se, return any delta */ 2664/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
2757static long __update_entity_load_avg_contrib(struct sched_entity *se) 2665static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
2758{ 2666{
2759 long old_contrib = se->avg.load_avg_contrib; 2667 int decayed;
2668 struct sched_avg *sa = &cfs_rq->avg;
2760 2669
2761 if (entity_is_task(se)) { 2670 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
2762 __update_task_entity_contrib(se); 2671 long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
2763 } else { 2672 sa->load_avg = max_t(long, sa->load_avg - r, 0);
2764 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se)); 2673 sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
2765 __update_group_entity_contrib(se);
2766 } 2674 }
2767 2675
2768 return se->avg.load_avg_contrib - old_contrib; 2676 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
2769} 2677 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
2770 2678 sa->util_avg = max_t(long, sa->util_avg - r, 0);
2771 2679 sa->util_sum = max_t(s32, sa->util_sum -
2772static inline void __update_task_entity_utilization(struct sched_entity *se) 2680 ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
2773{ 2681 }
2774 u32 contrib;
2775 2682
2776 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ 2683 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2777 contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); 2684 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
2778 contrib /= (se->avg.avg_period + 1);
2779 se->avg.utilization_avg_contrib = scale_load(contrib);
2780}
2781 2685
2782static long __update_entity_utilization_avg_contrib(struct sched_entity *se) 2686#ifndef CONFIG_64BIT
2783{ 2687 smp_wmb();
2784 long old_contrib = se->avg.utilization_avg_contrib; 2688 cfs_rq->load_last_update_time_copy = sa->last_update_time;
2785 2689#endif
2786 if (entity_is_task(se))
2787 __update_task_entity_utilization(se);
2788 else
2789 se->avg.utilization_avg_contrib =
2790 group_cfs_rq(se)->utilization_load_avg;
2791 2690
2792 return se->avg.utilization_avg_contrib - old_contrib; 2691 return decayed;
2793} 2692}
2794 2693
2795static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, 2694/* Update task and its cfs_rq load average */
2796 long load_contrib) 2695static inline void update_load_avg(struct sched_entity *se, int update_tg)
2797{
2798 if (likely(load_contrib < cfs_rq->blocked_load_avg))
2799 cfs_rq->blocked_load_avg -= load_contrib;
2800 else
2801 cfs_rq->blocked_load_avg = 0;
2802}
2803
2804static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2805
2806/* Update a sched_entity's runnable average */
2807static inline void update_entity_load_avg(struct sched_entity *se,
2808 int update_cfs_rq)
2809{ 2696{
2810 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2697 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2811 long contrib_delta, utilization_delta;
2812 int cpu = cpu_of(rq_of(cfs_rq)); 2698 int cpu = cpu_of(rq_of(cfs_rq));
2813 u64 now; 2699 u64 now = cfs_rq_clock_task(cfs_rq);
2814 2700
2815 /* 2701 /*
2816 * For a group entity we need to use their owned cfs_rq_clock_task() in 2702 * Track task load average for carrying it to the new CPU after it is migrated, and
2817 * case they are the parent of a throttled hierarchy. 2703 * track group sched_entity load average for task_h_load calc in migration
2818 */ 2704 */
2819 if (entity_is_task(se)) 2705 __update_load_avg(now, cpu, &se->avg,
2820 now = cfs_rq_clock_task(cfs_rq); 2706 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
2821 else
2822 now = cfs_rq_clock_task(group_cfs_rq(se));
2823
2824 if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
2825 cfs_rq->curr == se))
2826 return;
2827
2828 contrib_delta = __update_entity_load_avg_contrib(se);
2829 utilization_delta = __update_entity_utilization_avg_contrib(se);
2830
2831 if (!update_cfs_rq)
2832 return;
2833 2707
2834 if (se->on_rq) { 2708 if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
2835 cfs_rq->runnable_load_avg += contrib_delta; 2709 update_tg_load_avg(cfs_rq, 0);
2836 cfs_rq->utilization_load_avg += utilization_delta;
2837 } else {
2838 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2839 }
2840} 2710}
2841 2711
2842/* 2712/* Add the load generated by se into cfs_rq's load average */
2843 * Decay the load contributed by all blocked children and account this so that 2713static inline void
2844 * their contribution may appropriately discounted when they wake up. 2714enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2845 */
2846static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2847{ 2715{
2848 u64 now = cfs_rq_clock_task(cfs_rq) >> 20; 2716 struct sched_avg *sa = &se->avg;
2849 u64 decays; 2717 u64 now = cfs_rq_clock_task(cfs_rq);
2850 2718 int migrated = 0, decayed;
2851 decays = now - cfs_rq->last_decay;
2852 if (!decays && !force_update)
2853 return;
2854 2719
2855 if (atomic_long_read(&cfs_rq->removed_load)) { 2720 if (sa->last_update_time == 0) {
2856 unsigned long removed_load; 2721 sa->last_update_time = now;
2857 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0); 2722 migrated = 1;
2858 subtract_blocked_load_contrib(cfs_rq, removed_load);
2859 } 2723 }
2724 else {
2725 __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
2726 se->on_rq * scale_load_down(se->load.weight),
2727 cfs_rq->curr == se, NULL);
2728 }
2729
2730 decayed = update_cfs_rq_load_avg(now, cfs_rq);
2860 2731
2861 if (decays) { 2732 cfs_rq->runnable_load_avg += sa->load_avg;
2862 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg, 2733 cfs_rq->runnable_load_sum += sa->load_sum;
2863 decays); 2734
2864 atomic64_add(decays, &cfs_rq->decay_counter); 2735 if (migrated) {
2865 cfs_rq->last_decay = now; 2736 cfs_rq->avg.load_avg += sa->load_avg;
2737 cfs_rq->avg.load_sum += sa->load_sum;
2738 cfs_rq->avg.util_avg += sa->util_avg;
2739 cfs_rq->avg.util_sum += sa->util_sum;
2866 } 2740 }
2867 2741
2868 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update); 2742 if (decayed || migrated)
2743 update_tg_load_avg(cfs_rq, 0);
2869} 2744}
2870 2745
2871/* Add the load generated by se into cfs_rq's child load-average */ 2746/* Remove the runnable load generated by se from cfs_rq's runnable load average */
2872static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2747static inline void
2873 struct sched_entity *se, 2748dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2874 int wakeup)
2875{ 2749{
2876 /* 2750 update_load_avg(se, 1);
2877 * We track migrations using entity decay_count <= 0, on a wake-up
2878 * migration we use a negative decay count to track the remote decays
2879 * accumulated while sleeping.
2880 *
2881 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2882 * are seen by enqueue_entity_load_avg() as a migration with an already
2883 * constructed load_avg_contrib.
2884 */
2885 if (unlikely(se->avg.decay_count <= 0)) {
2886 se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
2887 if (se->avg.decay_count) {
2888 /*
2889 * In a wake-up migration we have to approximate the
2890 * time sleeping. This is because we can't synchronize
2891 * clock_task between the two cpus, and it is not
2892 * guaranteed to be read-safe. Instead, we can
2893 * approximate this using our carried decays, which are
2894 * explicitly atomically readable.
2895 */
2896 se->avg.last_runnable_update -= (-se->avg.decay_count)
2897 << 20;
2898 update_entity_load_avg(se, 0);
2899 /* Indicate that we're now synchronized and on-rq */
2900 se->avg.decay_count = 0;
2901 }
2902 wakeup = 0;
2903 } else {
2904 __synchronize_entity_decay(se);
2905 }
2906 2751
2907 /* migrated tasks did not contribute to our blocked load */ 2752 cfs_rq->runnable_load_avg =
2908 if (wakeup) { 2753 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
2909 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); 2754 cfs_rq->runnable_load_sum =
2910 update_entity_load_avg(se, 0); 2755 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
2911 }
2912
2913 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2914 cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
2915 /* we force update consideration on load-balancer moves */
2916 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2917} 2756}
2918 2757
2919/* 2758/*
2920 * Remove se's load from this cfs_rq child load-average, if the entity is 2759 * Task first catches up with cfs_rq, and then subtract
2921 * transitioning to a blocked state we track its projected decay using 2760 * itself from the cfs_rq (task must be off the queue now).
2922 * blocked_load_avg.
2923 */ 2761 */
2924static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, 2762void remove_entity_load_avg(struct sched_entity *se)
2925 struct sched_entity *se,
2926 int sleep)
2927{ 2763{
2928 update_entity_load_avg(se, 1); 2764 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2929 /* we force update consideration on load-balancer moves */ 2765 u64 last_update_time;
2930 update_cfs_rq_blocked_load(cfs_rq, !sleep); 2766
2767#ifndef CONFIG_64BIT
2768 u64 last_update_time_copy;
2931 2769
2932 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; 2770 do {
2933 cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; 2771 last_update_time_copy = cfs_rq->load_last_update_time_copy;
2934 if (sleep) { 2772 smp_rmb();
2935 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; 2773 last_update_time = cfs_rq->avg.last_update_time;
2936 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 2774 } while (last_update_time != last_update_time_copy);
2937 } /* migrations, e.g. sleep=0 leave decay_count == 0 */ 2775#else
2776 last_update_time = cfs_rq->avg.last_update_time;
2777#endif
2778
2779 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
2780 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
2781 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
2938} 2782}
2939 2783
2940/* 2784/*
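remove_entity_load_avg() runs without the destination runqueue's lock, so a departing entity parks its contribution in removed_load_avg/removed_util_avg and update_cfs_rq_load_avg() folds it in on the next locked update. A compressed sketch of that handoff with C11 atomics (toy types and names, not the kernel API):

#include <stdatomic.h>

struct toy_cfs_rq {
        long load_avg;                  /* written only under the queue's lock  */
        atomic_long removed_load_avg;   /* written by remote, migrating tasks   */
};

static void toy_remove_entity(struct toy_cfs_rq *cfs_rq, long entity_load_avg)
{
        atomic_fetch_add(&cfs_rq->removed_load_avg, entity_load_avg);
}

static void toy_fold_removed(struct toy_cfs_rq *cfs_rq)
{
        long r = atomic_exchange(&cfs_rq->removed_load_avg, 0);

        /* clamp at zero: rounding means the sum of parts can exceed the whole */
        cfs_rq->load_avg = cfs_rq->load_avg > r ? cfs_rq->load_avg - r : 0;
}

The clamp mirrors the max_t() calls above: because the averages are maintained with rounding, the removed parts can momentarily add up to more than the aggregate.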
@@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2944 */ 2788 */
2945void idle_enter_fair(struct rq *this_rq) 2789void idle_enter_fair(struct rq *this_rq)
2946{ 2790{
2947 update_rq_runnable_avg(this_rq, 1);
2948} 2791}
2949 2792
2950/* 2793/*
@@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq)
2954 */ 2797 */
2955void idle_exit_fair(struct rq *this_rq) 2798void idle_exit_fair(struct rq *this_rq)
2956{ 2799{
2957 update_rq_runnable_avg(this_rq, 0); 2800}
2801
2802static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
2803{
2804 return cfs_rq->runnable_load_avg;
2805}
2806
2807static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
2808{
2809 return cfs_rq->avg.load_avg;
2958} 2810}
2959 2811
2960static int idle_balance(struct rq *this_rq); 2812static int idle_balance(struct rq *this_rq);
2961 2813
2962#else /* CONFIG_SMP */ 2814#else /* CONFIG_SMP */
2963 2815
2964static inline void update_entity_load_avg(struct sched_entity *se, 2816static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
2965 int update_cfs_rq) {} 2817static inline void
2966static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2818enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2967static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, 2819static inline void
2968 struct sched_entity *se, 2820dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
2969 int wakeup) {} 2821static inline void remove_entity_load_avg(struct sched_entity *se) {}
2970static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2971 struct sched_entity *se,
2972 int sleep) {}
2973static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2974 int force_update) {}
2975 2822
2976static inline int idle_balance(struct rq *rq) 2823static inline int idle_balance(struct rq *rq)
2977{ 2824{
@@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3103 * Update run-time statistics of the 'current'. 2950 * Update run-time statistics of the 'current'.
3104 */ 2951 */
3105 update_curr(cfs_rq); 2952 update_curr(cfs_rq);
3106 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP); 2953 enqueue_entity_load_avg(cfs_rq, se);
3107 account_entity_enqueue(cfs_rq, se); 2954 account_entity_enqueue(cfs_rq, se);
3108 update_cfs_shares(cfs_rq); 2955 update_cfs_shares(cfs_rq);
3109 2956
@@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3178 * Update run-time statistics of the 'current'. 3025 * Update run-time statistics of the 'current'.
3179 */ 3026 */
3180 update_curr(cfs_rq); 3027 update_curr(cfs_rq);
3181 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP); 3028 dequeue_entity_load_avg(cfs_rq, se);
3182 3029
3183 update_stats_dequeue(cfs_rq, se); 3030 update_stats_dequeue(cfs_rq, se);
3184 if (flags & DEQUEUE_SLEEP) { 3031 if (flags & DEQUEUE_SLEEP) {
@@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3268 */ 3115 */
3269 update_stats_wait_end(cfs_rq, se); 3116 update_stats_wait_end(cfs_rq, se);
3270 __dequeue_entity(cfs_rq, se); 3117 __dequeue_entity(cfs_rq, se);
3271 update_entity_load_avg(se, 1); 3118 update_load_avg(se, 1);
3272 } 3119 }
3273 3120
3274 update_stats_curr_start(cfs_rq, se); 3121 update_stats_curr_start(cfs_rq, se);
@@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3368 /* Put 'current' back into the tree. */ 3215 /* Put 'current' back into the tree. */
3369 __enqueue_entity(cfs_rq, prev); 3216 __enqueue_entity(cfs_rq, prev);
3370 /* in !on_rq case, update occurred at dequeue */ 3217 /* in !on_rq case, update occurred at dequeue */
3371 update_entity_load_avg(prev, 1); 3218 update_load_avg(prev, 0);
3372 } 3219 }
3373 cfs_rq->curr = NULL; 3220 cfs_rq->curr = NULL;
3374} 3221}
@@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3384 /* 3231 /*
3385 * Ensure that runnable average is periodically updated. 3232 * Ensure that runnable average is periodically updated.
3386 */ 3233 */
3387 update_entity_load_avg(curr, 1); 3234 update_load_avg(curr, 1);
3388 update_cfs_rq_blocked_load(cfs_rq, 1);
3389 update_cfs_shares(cfs_rq); 3235 update_cfs_shares(cfs_rq);
3390 3236
3391#ifdef CONFIG_SCHED_HRTICK 3237#ifdef CONFIG_SCHED_HRTICK
@@ -3683,7 +3529,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3683 cfs_rq->throttled = 1; 3529 cfs_rq->throttled = 1;
3684 cfs_rq->throttled_clock = rq_clock(rq); 3530 cfs_rq->throttled_clock = rq_clock(rq);
3685 raw_spin_lock(&cfs_b->lock); 3531 raw_spin_lock(&cfs_b->lock);
3686 empty = list_empty(&cfs_rq->throttled_list); 3532 empty = list_empty(&cfs_b->throttled_cfs_rq);
3687 3533
3688 /* 3534 /*
3689 * Add to the _head_ of the list, so that an already-started 3535 * Add to the _head_ of the list, so that an already-started
@@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4258 if (cfs_rq_throttled(cfs_rq)) 4104 if (cfs_rq_throttled(cfs_rq))
4259 break; 4105 break;
4260 4106
4107 update_load_avg(se, 1);
4261 update_cfs_shares(cfs_rq); 4108 update_cfs_shares(cfs_rq);
4262 update_entity_load_avg(se, 1);
4263 } 4109 }
4264 4110
4265 if (!se) { 4111 if (!se)
4266 update_rq_runnable_avg(rq, rq->nr_running);
4267 add_nr_running(rq, 1); 4112 add_nr_running(rq, 1);
4268 } 4113
4269 hrtick_update(rq); 4114 hrtick_update(rq);
4270} 4115}
4271 4116
@@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4319 if (cfs_rq_throttled(cfs_rq)) 4164 if (cfs_rq_throttled(cfs_rq))
4320 break; 4165 break;
4321 4166
4167 update_load_avg(se, 1);
4322 update_cfs_shares(cfs_rq); 4168 update_cfs_shares(cfs_rq);
4323 update_entity_load_avg(se, 1);
4324 } 4169 }
4325 4170
4326 if (!se) { 4171 if (!se)
4327 sub_nr_running(rq, 1); 4172 sub_nr_running(rq, 1);
4328 update_rq_runnable_avg(rq, 1); 4173
4329 }
4330 hrtick_update(rq); 4174 hrtick_update(rq);
4331} 4175}
4332 4176
@@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4439 sched_avg_update(this_rq); 4283 sched_avg_update(this_rq);
4440} 4284}
4441 4285
4286/* Used instead of source_load when we know the type == 0 */
4287static unsigned long weighted_cpuload(const int cpu)
4288{
4289 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
4290}
4291
4442#ifdef CONFIG_NO_HZ_COMMON 4292#ifdef CONFIG_NO_HZ_COMMON
4443/* 4293/*
4444 * There is no sane way to deal with nohz on smp when using jiffies because the 4294 * There is no sane way to deal with nohz on smp when using jiffies because the
@@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
4460static void update_idle_cpu_load(struct rq *this_rq) 4310static void update_idle_cpu_load(struct rq *this_rq)
4461{ 4311{
4462 unsigned long curr_jiffies = READ_ONCE(jiffies); 4312 unsigned long curr_jiffies = READ_ONCE(jiffies);
4463 unsigned long load = this_rq->cfs.runnable_load_avg; 4313 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4464 unsigned long pending_updates; 4314 unsigned long pending_updates;
4465 4315
4466 /* 4316 /*
@@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void)
4506 */ 4356 */
4507void update_cpu_load_active(struct rq *this_rq) 4357void update_cpu_load_active(struct rq *this_rq)
4508{ 4358{
4509 unsigned long load = this_rq->cfs.runnable_load_avg; 4359 unsigned long load = weighted_cpuload(cpu_of(this_rq));
4510 /* 4360 /*
4511 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). 4361 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
4512 */ 4362 */
@@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq)
4514 __update_cpu_load(this_rq, load, 1); 4364 __update_cpu_load(this_rq, load, 1);
4515} 4365}
4516 4366
4517/* Used instead of source_load when we know the type == 0 */
4518static unsigned long weighted_cpuload(const int cpu)
4519{
4520 return cpu_rq(cpu)->cfs.runnable_load_avg;
4521}
4522
4523/* 4367/*
4524 * Return a low guess at the load of a migration-source cpu weighted 4368 * Return a low guess at the load of a migration-source cpu weighted
4525 * according to the scheduling class and "nice" value. 4369 * according to the scheduling class and "nice" value.
@@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
4567{ 4411{
4568 struct rq *rq = cpu_rq(cpu); 4412 struct rq *rq = cpu_rq(cpu);
4569 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); 4413 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
4570 unsigned long load_avg = rq->cfs.runnable_load_avg; 4414 unsigned long load_avg = weighted_cpuload(cpu);
4571 4415
4572 if (nr_running) 4416 if (nr_running)
4573 return load_avg / nr_running; 4417 return load_avg / nr_running;
@@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4686 /* 4530 /*
4687 * w = rw_i + @wl 4531 * w = rw_i + @wl
4688 */ 4532 */
4689 w = se->my_q->load.weight + wl; 4533 w = cfs_rq_load_avg(se->my_q) + wl;
4690 4534
4691 /* 4535 /*
4692 * wl = S * s'_i; see (2) 4536 * wl = S * s'_i; see (2)
@@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4707 /* 4551 /*
4708 * wl = dw_i = S * (s'_i - s_i); see (3) 4552 * wl = dw_i = S * (s'_i - s_i); see (3)
4709 */ 4553 */
4710 wl -= se->load.weight; 4554 wl -= se->avg.load_avg;
4711 4555
4712 /* 4556 /*
4713 * Recursively apply this logic to all parent groups to compute 4557 * Recursively apply this logic to all parent groups to compute
@@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4730 4574
4731#endif 4575#endif
4732 4576
4577/*
4578 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
4579 * A waker of many should wake a different task than the one last awakened
4580 * at a frequency roughly N times higher than one of its wakees. In order
4581 * to determine whether we should let the load spread vs consolidating to
4582 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
4583 * partner, and a factor of llc_size higher frequency in the other. With
4584 * both conditions met, we can be relatively sure that the relationship is
4585 * non-monogamous, with partner count exceeding socket size. Waker/wakee
4586 * being client/server, worker/dispatcher, interrupt source or whatever is
4587 * irrelevant; the spread criterion is that apparent partner count exceeds socket size.
4588 */
4733static int wake_wide(struct task_struct *p) 4589static int wake_wide(struct task_struct *p)
4734{ 4590{
4591 unsigned int master = current->wakee_flips;
4592 unsigned int slave = p->wakee_flips;
4735 int factor = this_cpu_read(sd_llc_size); 4593 int factor = this_cpu_read(sd_llc_size);
4736 4594
4737 /* 4595 if (master < slave)
4738 * Yeah, it's the switching-frequency, could means many wakee or 4596 swap(master, slave);
4739 * rapidly switch, use factor here will just help to automatically 4597 if (slave < factor || master < slave * factor)
4740 * adjust the loose-degree, so bigger node will lead to more pull. 4598 return 0;
4741 */ 4599 return 1;
4742 if (p->wakee_flips > factor) {
4743 /*
4744 * wakee is somewhat hot, it needs certain amount of cpu
4745 * resource, so if waker is far more hot, prefer to leave
4746 * it alone.
4747 */
4748 if (current->wakee_flips > (factor * p->wakee_flips))
4749 return 1;
4750 }
4751
4752 return 0;
4753} 4600}
4754 4601
4755static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4602static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
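The new wake_wide() reduces the old two-branch test to a symmetric one: order the waker/wakee flip counts, then require the smaller to reach llc_size and the larger to exceed it by another factor of llc_size. A standalone restatement (factor stands in for this_cpu_read(sd_llc_size)):

#include <stdbool.h>

static bool wake_wide_toy(unsigned int master, unsigned int slave,
                          unsigned int factor)
{
        if (master < slave) {           /* order the two flip counts */
                unsigned int tmp = master;
                master = slave;
                slave = tmp;
        }
        /* "wide" only if the smaller partner flips at least llc_size times
         * and the larger flips at least llc_size times more often again */
        return slave >= factor && master >= slave * factor;
}

For example, with factor = 8, a dispatcher at 200 flips waking a worker at 12 flips is wide (12 >= 8 and 200 >= 96), so select_task_rq_fair() below skips the affine fast path for it, while a low-flip 1:1 pair is not.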
@@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4761 unsigned long weight; 4608 unsigned long weight;
4762 int balanced; 4609 int balanced;
4763 4610
4764 /*
4765 * If we wake multiple tasks be careful to not bounce
4766 * ourselves around too much.
4767 */
4768 if (wake_wide(p))
4769 return 0;
4770
4771 idx = sd->wake_idx; 4611 idx = sd->wake_idx;
4772 this_cpu = smp_processor_id(); 4612 this_cpu = smp_processor_id();
4773 prev_cpu = task_cpu(p); 4613 prev_cpu = task_cpu(p);
@@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4781 */ 4621 */
4782 if (sync) { 4622 if (sync) {
4783 tg = task_group(current); 4623 tg = task_group(current);
4784 weight = current->se.load.weight; 4624 weight = current->se.avg.load_avg;
4785 4625
4786 this_load += effective_load(tg, this_cpu, -weight, -weight); 4626 this_load += effective_load(tg, this_cpu, -weight, -weight);
4787 load += effective_load(tg, prev_cpu, 0, -weight); 4627 load += effective_load(tg, prev_cpu, 0, -weight);
4788 } 4628 }
4789 4629
4790 tg = task_group(p); 4630 tg = task_group(p);
4791 weight = p->se.load.weight; 4631 weight = p->se.avg.load_avg;
4792 4632
4793 /* 4633 /*
4794 * In low-load situations, where prev_cpu is idle and this_cpu is idle 4634 * In low-load situations, where prev_cpu is idle and this_cpu is idle
@@ -4981,12 +4821,12 @@ done:
4981 * tasks. The unit of the return value must be the one of capacity so we can 4821 * tasks. The unit of the return value must be the one of capacity so we can
4982 * compare the usage with the capacity of the CPU that is available for CFS 4822 * compare the usage with the capacity of the CPU that is available for CFS
4983 * task (ie cpu_capacity). 4823 * task (ie cpu_capacity).
4984 * cfs.utilization_load_avg is the sum of running time of runnable tasks on a 4824 * cfs.avg.util_avg is the sum of running time of runnable tasks on a
4985 * CPU. It represents the amount of utilization of a CPU in the range 4825 * CPU. It represents the amount of utilization of a CPU in the range
4986 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full 4826 * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
4987 * capacity of the CPU because it's about the running time on this CPU. 4827 * capacity of the CPU because it's about the running time on this CPU.
4988 * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE 4828 * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
4989 * because of unfortunate rounding in avg_period and running_load_avg or just 4829 * because of unfortunate rounding in util_avg or just
4990 * after migrating tasks until the average stabilizes with the new running 4830 * after migrating tasks until the average stabilizes with the new running
4991 * time. So we need to check that the usage stays into the range 4831 * time. So we need to check that the usage stays into the range
4992 * [0..cpu_capacity_orig] and cap if necessary. 4832 * [0..cpu_capacity_orig] and cap if necessary.
@@ -4995,7 +4835,7 @@ done:
4995 */ 4835 */
4996static int get_cpu_usage(int cpu) 4836static int get_cpu_usage(int cpu)
4997{ 4837{
4998 unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; 4838 unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
4999 unsigned long capacity = capacity_orig_of(cpu); 4839 unsigned long capacity = capacity_orig_of(cpu);
5000 4840
5001 if (usage >= SCHED_LOAD_SCALE) 4841 if (usage >= SCHED_LOAD_SCALE)
@@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5021{ 4861{
5022 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4862 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5023 int cpu = smp_processor_id(); 4863 int cpu = smp_processor_id();
5024 int new_cpu = cpu; 4864 int new_cpu = prev_cpu;
5025 int want_affine = 0; 4865 int want_affine = 0;
5026 int sync = wake_flags & WF_SYNC; 4866 int sync = wake_flags & WF_SYNC;
5027 4867
5028 if (sd_flag & SD_BALANCE_WAKE) 4868 if (sd_flag & SD_BALANCE_WAKE)
5029 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 4869 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5030 4870
5031 rcu_read_lock(); 4871 rcu_read_lock();
5032 for_each_domain(cpu, tmp) { 4872 for_each_domain(cpu, tmp) {
5033 if (!(tmp->flags & SD_LOAD_BALANCE)) 4873 if (!(tmp->flags & SD_LOAD_BALANCE))
5034 continue; 4874 break;
5035 4875
5036 /* 4876 /*
5037 * If both cpu and prev_cpu are part of this domain, 4877 * If both cpu and prev_cpu are part of this domain,
@@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5045 4885
5046 if (tmp->flags & sd_flag) 4886 if (tmp->flags & sd_flag)
5047 sd = tmp; 4887 sd = tmp;
4888 else if (!want_affine)
4889 break;
5048 } 4890 }
5049 4891
5050 if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 4892 if (affine_sd) {
5051 prev_cpu = cpu; 4893 sd = NULL; /* Prefer wake_affine over balance flags */
5052 4894 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5053 if (sd_flag & SD_BALANCE_WAKE) { 4895 new_cpu = cpu;
5054 new_cpu = select_idle_sibling(p, prev_cpu);
5055 goto unlock;
5056 } 4896 }
5057 4897
5058 while (sd) { 4898 if (!sd) {
4899 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
4900 new_cpu = select_idle_sibling(p, new_cpu);
4901
4902 } else while (sd) {
5059 struct sched_group *group; 4903 struct sched_group *group;
5060 int weight; 4904 int weight;
5061 4905
@@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5089 } 4933 }
5090 /* while loop will break here if sd == NULL */ 4934 /* while loop will break here if sd == NULL */
5091 } 4935 }
5092unlock:
5093 rcu_read_unlock(); 4936 rcu_read_unlock();
5094 4937
5095 return new_cpu; 4938 return new_cpu;
@@ -5101,26 +4944,27 @@ unlock:
5101 * previous cpu. However, the caller only guarantees p->pi_lock is held; no 4944 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
5102 * other assumptions, including the state of rq->lock, should be made. 4945 * other assumptions, including the state of rq->lock, should be made.
5103 */ 4946 */
5104static void 4947static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5105migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5106{ 4948{
5107 struct sched_entity *se = &p->se;
5108 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5109
5110 /* 4949 /*
5111 * Load tracking: accumulate removed load so that it can be processed 4950 * We are supposed to update the task to "current" time, then its up to date
5112 * when we next update owning cfs_rq under rq->lock. Tasks contribute 4951 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
5113 * to blocked load iff they have a positive decay-count. It can never 4952 * what current time is, so simply throw away the out-of-date time. This
5114 * be negative here since on-rq tasks have decay-count == 0. 4953 * will result in the wakee task is less decayed, but giving the wakee more
4954 * load sounds not bad.
5115 */ 4955 */
5116 if (se->avg.decay_count) { 4956 remove_entity_load_avg(&p->se);
5117 se->avg.decay_count = -__synchronize_entity_decay(se); 4957
5118 atomic_long_add(se->avg.load_avg_contrib, 4958 /* Tell new CPU we are migrated */
5119 &cfs_rq->removed_load); 4959 p->se.avg.last_update_time = 0;
5120 }
5121 4960
5122 /* We have migrated, no longer consider this task hot */ 4961 /* We have migrated, no longer consider this task hot */
5123 se->exec_start = 0; 4962 p->se.exec_start = 0;
4963}
4964
4965static void task_dead_fair(struct task_struct *p)
4966{
4967 remove_entity_load_avg(&p->se);
5124} 4968}
5125#endif /* CONFIG_SMP */ 4969#endif /* CONFIG_SMP */
5126 4970
@@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
5670 5514
5671#ifdef CONFIG_NUMA_BALANCING 5515#ifdef CONFIG_NUMA_BALANCING
5672/* 5516/*
5673 * Returns true if the destination node is the preferred node. 5517 * Returns 1, if task migration degrades locality
5674 * Needs to match fbq_classify_rq(): if there is a runnable task 5518 * Returns 0, if task migration improves locality i.e migration preferred.
5675 * that is not on its preferred node, we should identify it. 5519 * Returns -1, if task migration is not affected by locality.
5676 */ 5520 */
5677static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env) 5521static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5678{ 5522{
5679 struct numa_group *numa_group = rcu_dereference(p->numa_group); 5523 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5680 unsigned long src_faults, dst_faults; 5524 unsigned long src_faults, dst_faults;
5681 int src_nid, dst_nid; 5525 int src_nid, dst_nid;
5682 5526
5683 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
5684 !(env->sd->flags & SD_NUMA)) {
5685 return false;
5686 }
5687
5688 src_nid = cpu_to_node(env->src_cpu);
5689 dst_nid = cpu_to_node(env->dst_cpu);
5690
5691 if (src_nid == dst_nid)
5692 return false;
5693
5694 /* Encourage migration to the preferred node. */
5695 if (dst_nid == p->numa_preferred_nid)
5696 return true;
5697
5698 /* Migrating away from the preferred node is bad. */
5699 if (src_nid == p->numa_preferred_nid)
5700 return false;
5701
5702 if (numa_group) {
5703 src_faults = group_faults(p, src_nid);
5704 dst_faults = group_faults(p, dst_nid);
5705 } else {
5706 src_faults = task_faults(p, src_nid);
5707 dst_faults = task_faults(p, dst_nid);
5708 }
5709
5710 return dst_faults > src_faults;
5711}
5712
5713
5714static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5715{
5716 struct numa_group *numa_group = rcu_dereference(p->numa_group);
5717 unsigned long src_faults, dst_faults;
5718 int src_nid, dst_nid;
5719
5720 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
5721 return false;
5722
5723 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5527 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
5724 return false; 5528 return -1;
5529
5530 if (!sched_feat(NUMA))
5531 return -1;
5725 5532
5726 src_nid = cpu_to_node(env->src_cpu); 5533 src_nid = cpu_to_node(env->src_cpu);
5727 dst_nid = cpu_to_node(env->dst_cpu); 5534 dst_nid = cpu_to_node(env->dst_cpu);
5728 5535
5729 if (src_nid == dst_nid) 5536 if (src_nid == dst_nid)
5730 return false; 5537 return -1;
5731 5538
5732 /* Migrating away from the preferred node is bad. */ 5539 /* Migrating away from the preferred node is always bad. */
5733 if (src_nid == p->numa_preferred_nid) 5540 if (src_nid == p->numa_preferred_nid) {
5734 return true; 5541 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
5542 return 1;
5543 else
5544 return -1;
5545 }
5735 5546
5736 /* Encourage migration to the preferred node. */ 5547 /* Encourage migration to the preferred node. */
5737 if (dst_nid == p->numa_preferred_nid) 5548 if (dst_nid == p->numa_preferred_nid)
5738 return false; 5549 return 0;
5739 5550
5740 if (numa_group) { 5551 if (numa_group) {
5741 src_faults = group_faults(p, src_nid); 5552 src_faults = group_faults(p, src_nid);
@@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
5749} 5560}
5750 5561
5751#else 5562#else
5752static inline bool migrate_improves_locality(struct task_struct *p, 5563static inline int migrate_degrades_locality(struct task_struct *p,
5753 struct lb_env *env) 5564 struct lb_env *env)
5754{ 5565{
5755 return false; 5566 return -1;
5756}
5757
5758static inline bool migrate_degrades_locality(struct task_struct *p,
5759 struct lb_env *env)
5760{
5761 return false;
5762} 5567}
5763#endif 5568#endif
5764 5569
@@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
5768static 5573static
5769int can_migrate_task(struct task_struct *p, struct lb_env *env) 5574int can_migrate_task(struct task_struct *p, struct lb_env *env)
5770{ 5575{
5771 int tsk_cache_hot = 0; 5576 int tsk_cache_hot;
5772 5577
5773 lockdep_assert_held(&env->src_rq->lock); 5578 lockdep_assert_held(&env->src_rq->lock);
5774 5579
@@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5826 * 2) task is cache cold, or 5631 * 2) task is cache cold, or
5827 * 3) too many balance attempts have failed. 5632 * 3) too many balance attempts have failed.
5828 */ 5633 */
5829 tsk_cache_hot = task_hot(p, env); 5634 tsk_cache_hot = migrate_degrades_locality(p, env);
5830 if (!tsk_cache_hot) 5635 if (tsk_cache_hot == -1)
5831 tsk_cache_hot = migrate_degrades_locality(p, env); 5636 tsk_cache_hot = task_hot(p, env);
5832 5637
5833 if (migrate_improves_locality(p, env) || !tsk_cache_hot || 5638 if (tsk_cache_hot <= 0 ||
5834 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 5639 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5835 if (tsk_cache_hot) { 5640 if (tsk_cache_hot == 1) {
5836 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5641 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5837 schedstat_inc(p, se.statistics.nr_forced_migrations); 5642 schedstat_inc(p, se.statistics.nr_forced_migrations);
5838 } 5643 }
@@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env)
5906 return 0; 5711 return 0;
5907 5712
5908 while (!list_empty(tasks)) { 5713 while (!list_empty(tasks)) {
5714 /*
5715 * We don't want to steal all, otherwise we may be treated likewise,
5716 * which could at worst lead to a livelock crash.
5717 */
5718 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
5719 break;
5720
5909 p = list_first_entry(tasks, struct task_struct, se.group_node); 5721 p = list_first_entry(tasks, struct task_struct, se.group_node);
5910 5722
5911 env->loop++; 5723 env->loop++;
@@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env)
6015} 5827}
6016 5828
6017#ifdef CONFIG_FAIR_GROUP_SCHED 5829#ifdef CONFIG_FAIR_GROUP_SCHED
6018/*
6019 * update tg->load_weight by folding this cpu's load_avg
6020 */
6021static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6022{
6023 struct sched_entity *se = tg->se[cpu];
6024 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6025
6026 /* throttled entities do not contribute to load */
6027 if (throttled_hierarchy(cfs_rq))
6028 return;
6029
6030 update_cfs_rq_blocked_load(cfs_rq, 1);
6031
6032 if (se) {
6033 update_entity_load_avg(se, 1);
6034 /*
6035 * We pivot on our runnable average having decayed to zero for
6036 * list removal. This generally implies that all our children
6037 * have also been removed (modulo rounding error or bandwidth
6038 * control); however, such cases are rare and we can fix these
6039 * at enqueue.
6040 *
6041 * TODO: fix up out-of-order children on enqueue.
6042 */
6043 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6044 list_del_leaf_cfs_rq(cfs_rq);
6045 } else {
6046 struct rq *rq = rq_of(cfs_rq);
6047 update_rq_runnable_avg(rq, rq->nr_running);
6048 }
6049}
6050
6051static void update_blocked_averages(int cpu) 5830static void update_blocked_averages(int cpu)
6052{ 5831{
6053 struct rq *rq = cpu_rq(cpu); 5832 struct rq *rq = cpu_rq(cpu);
@@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu)
6056 5835
6057 raw_spin_lock_irqsave(&rq->lock, flags); 5836 raw_spin_lock_irqsave(&rq->lock, flags);
6058 update_rq_clock(rq); 5837 update_rq_clock(rq);
5838
6059 /* 5839 /*
6060 * Iterates the task_group tree in a bottom up fashion, see 5840 * Iterates the task_group tree in a bottom up fashion, see
6061 * list_add_leaf_cfs_rq() for details. 5841 * list_add_leaf_cfs_rq() for details.
6062 */ 5842 */
6063 for_each_leaf_cfs_rq(rq, cfs_rq) { 5843 for_each_leaf_cfs_rq(rq, cfs_rq) {
6064 /* 5844 /* throttled entities do not contribute to load */
6065 * Note: We may want to consider periodically releasing 5845 if (throttled_hierarchy(cfs_rq))
6066 * rq->lock about these updates so that creating many task 5846 continue;
6067 * groups does not result in continually extending hold time.
6068 */
6069 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6070 }
6071 5847
5848 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
5849 update_tg_load_avg(cfs_rq, 0);
5850 }
6072 raw_spin_unlock_irqrestore(&rq->lock, flags); 5851 raw_spin_unlock_irqrestore(&rq->lock, flags);
6073} 5852}
6074 5853
@@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
6096 } 5875 }
6097 5876
6098 if (!se) { 5877 if (!se) {
6099 cfs_rq->h_load = cfs_rq->runnable_load_avg; 5878 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
6100 cfs_rq->last_h_load_update = now; 5879 cfs_rq->last_h_load_update = now;
6101 } 5880 }
6102 5881
6103 while ((se = cfs_rq->h_load_next) != NULL) { 5882 while ((se = cfs_rq->h_load_next) != NULL) {
6104 load = cfs_rq->h_load; 5883 load = cfs_rq->h_load;
6105 load = div64_ul(load * se->avg.load_avg_contrib, 5884 load = div64_ul(load * se->avg.load_avg,
6106 cfs_rq->runnable_load_avg + 1); 5885 cfs_rq_load_avg(cfs_rq) + 1);
6107 cfs_rq = group_cfs_rq(se); 5886 cfs_rq = group_cfs_rq(se);
6108 cfs_rq->h_load = load; 5887 cfs_rq->h_load = load;
6109 cfs_rq->last_h_load_update = now; 5888 cfs_rq->last_h_load_update = now;
@@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p)
6115 struct cfs_rq *cfs_rq = task_cfs_rq(p); 5894 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6116 5895
6117 update_cfs_rq_h_load(cfs_rq); 5896 update_cfs_rq_h_load(cfs_rq);
6118 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, 5897 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
6119 cfs_rq->runnable_load_avg + 1); 5898 cfs_rq_load_avg(cfs_rq) + 1);
6120} 5899}
6121#else 5900#else
6122static inline void update_blocked_averages(int cpu) 5901static inline void update_blocked_averages(int cpu)
6123{ 5902{
5903 struct rq *rq = cpu_rq(cpu);
5904 struct cfs_rq *cfs_rq = &rq->cfs;
5905 unsigned long flags;
5906
5907 raw_spin_lock_irqsave(&rq->lock, flags);
5908 update_rq_clock(rq);
5909 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
5910 raw_spin_unlock_irqrestore(&rq->lock, flags);
6124} 5911}
6125 5912
6126static unsigned long task_h_load(struct task_struct *p) 5913static unsigned long task_h_load(struct task_struct *p)
6127{ 5914{
6128 return p->se.avg.load_avg_contrib; 5915 return p->se.avg.load_avg;
6129} 5916}
6130#endif 5917#endif
6131 5918
@@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
8025 7812
8026 if (numabalancing_enabled) 7813 if (numabalancing_enabled)
8027 task_tick_numa(rq, curr); 7814 task_tick_numa(rq, curr);
8028
8029 update_rq_runnable_avg(rq, 1);
8030} 7815}
8031 7816
8032/* 7817/*
@@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
8125 } 7910 }
8126 7911
8127#ifdef CONFIG_SMP 7912#ifdef CONFIG_SMP
8128 /* 7913 /* Catch up with the cfs_rq and remove our load when we leave */
8129 * Remove our load from contribution when we leave sched_fair 7914 __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
8130 * and ensure we don't carry in an old decay_count if we 7915 se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
8131 * switch back. 7916
8132 */ 7917 cfs_rq->avg.load_avg =
8133 if (se->avg.decay_count) { 7918 max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
8134 __synchronize_entity_decay(se); 7919 cfs_rq->avg.load_sum =
8135 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); 7920 max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
8136 } 7921 cfs_rq->avg.util_avg =
7922 max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
7923 cfs_rq->avg.util_sum =
7924 max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
8137#endif 7925#endif
8138} 7926}
8139 7927
@@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
8142 */ 7930 */
8143static void switched_to_fair(struct rq *rq, struct task_struct *p) 7931static void switched_to_fair(struct rq *rq, struct task_struct *p)
8144{ 7932{
8145#ifdef CONFIG_FAIR_GROUP_SCHED
8146 struct sched_entity *se = &p->se; 7933 struct sched_entity *se = &p->se;
7934
7935#ifdef CONFIG_FAIR_GROUP_SCHED
8147 /* 7936 /*
8148 * Since the real-depth could have been changed (only FAIR 7937 * Since the real-depth could have been changed (only FAIR
8149 * class maintain depth value), reset depth properly. 7938 * class maintain depth value), reset depth properly.
8150 */ 7939 */
8151 se->depth = se->parent ? se->parent->depth + 1 : 0; 7940 se->depth = se->parent ? se->parent->depth + 1 : 0;
8152#endif 7941#endif
8153 if (!task_on_rq_queued(p)) 7942
7943 if (!task_on_rq_queued(p)) {
7944
7945 /*
7946 * Ensure the task has a non-normalized vruntime when it is switched
7947 * back to the fair class with !queued, so that enqueue_entity() at
7948 * wake-up time will do the right thing.
7949 *
7950 * If it's queued, then the enqueue_entity(.flags=0) makes the task
7951 * has non-normalized vruntime, if it's !queued, then it still has
7952 * normalized vruntime.
7953 */
7954 if (p->state != TASK_RUNNING)
7955 se->vruntime += cfs_rq_of(se)->min_vruntime;
8154 return; 7956 return;
7957 }
8155 7958
8156 /* 7959 /*
8157 * We were most likely switched from sched_rt, so 7960 * We were most likely switched from sched_rt, so
@@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
8190 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; 7993 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8191#endif 7994#endif
8192#ifdef CONFIG_SMP 7995#ifdef CONFIG_SMP
8193 atomic64_set(&cfs_rq->decay_counter, 1); 7996 atomic_long_set(&cfs_rq->removed_load_avg, 0);
8194 atomic_long_set(&cfs_rq->removed_load, 0); 7997 atomic_long_set(&cfs_rq->removed_util_avg, 0);
8195#endif 7998#endif
8196} 7999}
8197 8000
@@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
8236 if (!queued) { 8039 if (!queued) {
8237 cfs_rq = cfs_rq_of(se); 8040 cfs_rq = cfs_rq_of(se);
8238 se->vruntime += cfs_rq->min_vruntime; 8041 se->vruntime += cfs_rq->min_vruntime;
8042
8239#ifdef CONFIG_SMP 8043#ifdef CONFIG_SMP
8240 /* 8044 /* Virtually synchronize task with its new cfs_rq */
8241 * migrate_task_rq_fair() will have removed our previous 8045 p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
8242 * contribution, but we must synchronize for ongoing future 8046 cfs_rq->avg.load_avg += p->se.avg.load_avg;
8243 * decay. 8047 cfs_rq->avg.load_sum += p->se.avg.load_sum;
8244 */ 8048 cfs_rq->avg.util_avg += p->se.avg.util_avg;
8245 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 8049 cfs_rq->avg.util_sum += p->se.avg.util_sum;
8246 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
8247#endif 8050#endif
8248 } 8051 }
8249} 8052}
@@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg)
8257 for_each_possible_cpu(i) { 8060 for_each_possible_cpu(i) {
8258 if (tg->cfs_rq) 8061 if (tg->cfs_rq)
8259 kfree(tg->cfs_rq[i]); 8062 kfree(tg->cfs_rq[i]);
8260 if (tg->se) 8063 if (tg->se) {
8064 if (tg->se[i])
8065 remove_entity_load_avg(tg->se[i]);
8261 kfree(tg->se[i]); 8066 kfree(tg->se[i]);
8067 }
8262 } 8068 }
8263 8069
8264 kfree(tg->cfs_rq); 8070 kfree(tg->cfs_rq);
@@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8295 8101
8296 init_cfs_rq(cfs_rq); 8102 init_cfs_rq(cfs_rq);
8297 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); 8103 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8104 init_entity_runnable_average(se);
8298 } 8105 }
8299 8106
8300 return 1; 8107 return 1;
@@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = {
8444 .rq_offline = rq_offline_fair, 8251 .rq_offline = rq_offline_fair,
8445 8252
8446 .task_waking = task_waking_fair, 8253 .task_waking = task_waking_fair,
8254 .task_dead = task_dead_fair,
8255 .set_cpus_allowed = set_cpus_allowed_common,
8447#endif 8256#endif
8448 8257
8449 .set_curr_task = set_curr_task_fair, 8258 .set_curr_task = set_curr_task_fair,
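The fair.c hunks above retire the old decay_count/blocked-load bookkeeping in favour of the rewritten per-entity load tracking, where each entity and each cfs_rq carries a sched_avg whose load/util sums decay geometrically and can simply be added to or subtracted from the queue totals (see remove_entity_load_avg(), update_cfs_rq_load_avg() and the switched_from_fair()/task_move_group_fair() hunks). A minimal userspace sketch of that geometric-decay idea follows; it is only an illustration of the shape of the computation, not the kernel's fixed-point __update_load_avg(), and the period length, weight and half-life constant are assumptions.

#include <math.h>
#include <stdio.h>

/*
 * Toy per-entity load average: each fixed period the accumulated sum is
 * scaled by y, chosen so that y^32 == 0.5 (half-life of 32 periods), and
 * the weight of the current period is added when the entity was runnable.
 */
struct toy_avg {
	double load_sum;	/* decayed sum of per-period contributions */
	double load_avg;	/* load_sum normalised to the series maximum */
};

static const double half_life_periods = 32.0;

static void toy_update(struct toy_avg *a, unsigned int periods,
		       double weight, int runnable)
{
	double y = pow(0.5, 1.0 / half_life_periods);
	double max_sum = 1.0 / (1.0 - y);	/* limit of the geometric series */

	/* Decay what was already accumulated over the elapsed periods. */
	a->load_sum *= pow(y, periods);

	/* Add this period's contribution if the entity was runnable. */
	if (runnable)
		a->load_sum += weight;

	a->load_avg = a->load_sum / max_sum;
}

int main(void)
{
	struct toy_avg a = { 0.0, 0.0 };
	int i;

	for (i = 0; i < 64; i++)		/* run for 64 periods */
		toy_update(&a, 1, 1.0, 1);
	printf("after running:  load_avg = %.3f\n", a.load_avg);

	for (i = 0; i < 64; i++)		/* then sleep for 64 periods */
		toy_update(&a, 1, 1.0, 0);
	printf("after sleeping: load_avg = %.3f\n", a.load_avg);
	return 0;
}

Migrating or removing an entity then reduces to subtracting its load_avg/load_sum from the owning queue, which is what the max_t() clamping in switched_from_fair() above is doing.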
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 91e33cd485f6..83a50e7ca533 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -79,20 +79,12 @@ SCHED_FEAT(LB_MIN, false)
79 * numa_balancing= 79 * numa_balancing=
80 */ 80 */
81#ifdef CONFIG_NUMA_BALANCING 81#ifdef CONFIG_NUMA_BALANCING
82SCHED_FEAT(NUMA, false)
83 82
84/* 83/*
85 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a 84 * NUMA will favor moving tasks towards nodes where a higher number of
86 * higher number of hinting faults are recorded during active load 85 * hinting faults are recorded during active load balancing. It will
87 * balancing. 86 * resist moving tasks towards nodes where a lower number of hinting
87 * faults have been recorded.
88 */ 88 */
89SCHED_FEAT(NUMA_FAVOUR_HIGHER, true) 89SCHED_FEAT(NUMA, true)
90
91/*
92 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
93 * lower number of hinting faults have been recorded. As this has
94 * the potential to prevent a task ever migrating to a new node
95 * due to CPU overload it is disabled by default.
96 */
97SCHED_FEAT(NUMA_RESIST_LOWER, false)
98#endif 90#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 594275ed2620..8f177c73ae19 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -83,10 +83,13 @@ void __weak arch_cpu_idle(void)
83 */ 83 */
84void default_idle_call(void) 84void default_idle_call(void)
85{ 85{
86 if (current_clr_polling_and_test()) 86 if (current_clr_polling_and_test()) {
87 local_irq_enable(); 87 local_irq_enable();
88 else 88 } else {
89 stop_critical_timings();
89 arch_cpu_idle(); 90 arch_cpu_idle();
91 start_critical_timings();
92 }
90} 93}
91 94
92static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, 95static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
@@ -141,12 +144,6 @@ static void cpuidle_idle_call(void)
141 } 144 }
142 145
143 /* 146 /*
144 * During the idle period, stop measuring the disabled irqs
145 * critical sections latencies
146 */
147 stop_critical_timings();
148
149 /*
150 * Tell the RCU framework we are entering an idle section, 147 * Tell the RCU framework we are entering an idle section,
151 * so no more rcu read side critical sections and one more 148 * so no more rcu read side critical sections and one more
152 * step to the grace period 149 * step to the grace period
@@ -198,7 +195,6 @@ exit_idle:
198 local_irq_enable(); 195 local_irq_enable();
199 196
200 rcu_idle_exit(); 197 rcu_idle_exit();
201 start_critical_timings();
202} 198}
203 199
204DEFINE_PER_CPU(bool, cpu_dead_idle); 200DEFINE_PER_CPU(bool, cpu_dead_idle);
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index c65dac8c97cd..c4ae0f1fdf9b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -96,6 +96,7 @@ const struct sched_class idle_sched_class = {
96 96
97#ifdef CONFIG_SMP 97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle, 98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
99#endif 100#endif
100 101
101 .set_curr_task = set_curr_task_idle, 102 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 0d193a243e96..d2ea59364a1c 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -2069,7 +2069,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
2069{ 2069{
2070 if (!task_running(rq, p) && 2070 if (!task_running(rq, p) &&
2071 !test_tsk_need_resched(rq->curr) && 2071 !test_tsk_need_resched(rq->curr) &&
2072 has_pushable_tasks(rq) &&
2073 p->nr_cpus_allowed > 1 && 2072 p->nr_cpus_allowed > 1 &&
2074 (dl_task(rq->curr) || rt_task(rq->curr)) && 2073 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2075 (rq->curr->nr_cpus_allowed < 2 || 2074 (rq->curr->nr_cpus_allowed < 2 ||
@@ -2077,45 +2076,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
2077 push_rt_tasks(rq); 2076 push_rt_tasks(rq);
2078} 2077}
2079 2078
2080static void set_cpus_allowed_rt(struct task_struct *p,
2081 const struct cpumask *new_mask)
2082{
2083 struct rq *rq;
2084 int weight;
2085
2086 BUG_ON(!rt_task(p));
2087
2088 if (!task_on_rq_queued(p))
2089 return;
2090
2091 weight = cpumask_weight(new_mask);
2092
2093 /*
2094 * Only update if the process changes its state from whether it
2095 * can migrate or not.
2096 */
2097 if ((p->nr_cpus_allowed > 1) == (weight > 1))
2098 return;
2099
2100 rq = task_rq(p);
2101
2102 /*
2103 * The process used to be able to migrate OR it can now migrate
2104 */
2105 if (weight <= 1) {
2106 if (!task_current(rq, p))
2107 dequeue_pushable_task(rq, p);
2108 BUG_ON(!rq->rt.rt_nr_migratory);
2109 rq->rt.rt_nr_migratory--;
2110 } else {
2111 if (!task_current(rq, p))
2112 enqueue_pushable_task(rq, p);
2113 rq->rt.rt_nr_migratory++;
2114 }
2115
2116 update_rt_migration(&rq->rt);
2117}
2118
2119/* Assumes rq->lock is held */ 2079/* Assumes rq->lock is held */
2120static void rq_online_rt(struct rq *rq) 2080static void rq_online_rt(struct rq *rq)
2121{ 2081{
@@ -2324,7 +2284,7 @@ const struct sched_class rt_sched_class = {
2324#ifdef CONFIG_SMP 2284#ifdef CONFIG_SMP
2325 .select_task_rq = select_task_rq_rt, 2285 .select_task_rq = select_task_rq_rt,
2326 2286
2327 .set_cpus_allowed = set_cpus_allowed_rt, 2287 .set_cpus_allowed = set_cpus_allowed_common,
2328 .rq_online = rq_online_rt, 2288 .rq_online = rq_online_rt,
2329 .rq_offline = rq_offline_rt, 2289 .rq_offline = rq_offline_rt,
2330 .task_woken = task_woken_rt, 2290 .task_woken = task_woken_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d48790bb6d..68cda117574c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -245,7 +245,6 @@ struct task_group {
245 245
246#ifdef CONFIG_SMP 246#ifdef CONFIG_SMP
247 atomic_long_t load_avg; 247 atomic_long_t load_avg;
248 atomic_t runnable_avg;
249#endif 248#endif
250#endif 249#endif
251 250
@@ -366,27 +365,20 @@ struct cfs_rq {
366 365
367#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
368 /* 367 /*
369 * CFS Load tracking 368 * CFS load tracking
370 * Under CFS, load is tracked on a per-entity basis and aggregated up.
371 * This allows for the description of both thread and group usage (in
372 * the FAIR_GROUP_SCHED case).
373 * runnable_load_avg is the sum of the load_avg_contrib of the
374 * sched_entities on the rq.
375 * blocked_load_avg is similar to runnable_load_avg except that its
376 * the blocked sched_entities on the rq.
377 * utilization_load_avg is the sum of the average running time of the
378 * sched_entities on the rq.
379 */ 369 */
380 unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; 370 struct sched_avg avg;
381 atomic64_t decay_counter; 371 u64 runnable_load_sum;
382 u64 last_decay; 372 unsigned long runnable_load_avg;
383 atomic_long_t removed_load;
384
385#ifdef CONFIG_FAIR_GROUP_SCHED 373#ifdef CONFIG_FAIR_GROUP_SCHED
386 /* Required to track per-cpu representation of a task_group */ 374 unsigned long tg_load_avg_contrib;
387 u32 tg_runnable_contrib; 375#endif
388 unsigned long tg_load_contrib; 376 atomic_long_t removed_load_avg, removed_util_avg;
377#ifndef CONFIG_64BIT
378 u64 load_last_update_time_copy;
379#endif
389 380
381#ifdef CONFIG_FAIR_GROUP_SCHED
390 /* 382 /*
391 * h_load = weight * f(tg) 383 * h_load = weight * f(tg)
392 * 384 *
@@ -595,8 +587,6 @@ struct rq {
595#ifdef CONFIG_FAIR_GROUP_SCHED 587#ifdef CONFIG_FAIR_GROUP_SCHED
596 /* list of leaf cfs_rq on this cpu: */ 588 /* list of leaf cfs_rq on this cpu: */
597 struct list_head leaf_cfs_rq_list; 589 struct list_head leaf_cfs_rq_list;
598
599 struct sched_avg avg;
600#endif /* CONFIG_FAIR_GROUP_SCHED */ 590#endif /* CONFIG_FAIR_GROUP_SCHED */
601 591
602 /* 592 /*
@@ -1065,9 +1055,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1065#ifndef prepare_arch_switch 1055#ifndef prepare_arch_switch
1066# define prepare_arch_switch(next) do { } while (0) 1056# define prepare_arch_switch(next) do { } while (0)
1067#endif 1057#endif
1068#ifndef finish_arch_switch
1069# define finish_arch_switch(prev) do { } while (0)
1070#endif
1071#ifndef finish_arch_post_lock_switch 1058#ifndef finish_arch_post_lock_switch
1072# define finish_arch_post_lock_switch() do { } while (0) 1059# define finish_arch_post_lock_switch() do { } while (0)
1073#endif 1060#endif
@@ -1268,6 +1255,8 @@ extern void trigger_load_balance(struct rq *rq);
1268extern void idle_enter_fair(struct rq *this_rq); 1255extern void idle_enter_fair(struct rq *this_rq);
1269extern void idle_exit_fair(struct rq *this_rq); 1256extern void idle_exit_fair(struct rq *this_rq);
1270 1257
1258extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
1259
1271#else 1260#else
1272 1261
1273static inline void idle_enter_fair(struct rq *rq) { } 1262static inline void idle_enter_fair(struct rq *rq) { }
@@ -1319,7 +1308,7 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1319 1308
1320unsigned long to_ratio(u64 period, u64 runtime); 1309unsigned long to_ratio(u64 period, u64 runtime);
1321 1310
1322extern void init_task_runnable_average(struct task_struct *p); 1311extern void init_entity_runnable_average(struct sched_entity *se);
1323 1312
1324static inline void add_nr_running(struct rq *rq, unsigned count) 1313static inline void add_nr_running(struct rq *rq, unsigned count)
1325{ 1314{
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 79ffec45a6ac..cbc67da10954 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -123,6 +123,7 @@ const struct sched_class stop_sched_class = {
123 123
124#ifdef CONFIG_SMP 124#ifdef CONFIG_SMP
125 .select_task_rq = select_task_rq_stop, 125 .select_task_rq = select_task_rq_stop,
126 .set_cpus_allowed = set_cpus_allowed_common,
126#endif 127#endif
127 128
128 .set_curr_task = set_curr_task_stop, 129 .set_curr_task = set_curr_task_stop,
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
106} 106}
107EXPORT_SYMBOL_GPL(__wake_up_locked); 107EXPORT_SYMBOL_GPL(__wake_up_locked);
108 108
109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 109void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
110 void *key)
110{ 111{
111 __wake_up_common(q, mode, 1, 0, key); 112 __wake_up_common(q, mode, nr, 0, key);
112} 113}
113EXPORT_SYMBOL_GPL(__wake_up_locked_key); 114EXPORT_SYMBOL_GPL(__wake_up_locked_key);
114 115
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
283 if (!list_empty(&wait->task_list)) 284 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list); 285 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q)) 286 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key); 287 __wake_up_locked_key(q, mode, 1, key);
287 spin_unlock_irqrestore(&q->lock, flags); 288 spin_unlock_irqrestore(&q->lock, flags);
288} 289}
289EXPORT_SYMBOL(abort_exclusive_wait); 290EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b32b81..5bd4779282df 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
175 */ 175 */
176static u32 seccomp_run_filters(struct seccomp_data *sd) 176static u32 seccomp_run_filters(struct seccomp_data *sd)
177{ 177{
178 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
179 struct seccomp_data sd_local; 178 struct seccomp_data sd_local;
180 u32 ret = SECCOMP_RET_ALLOW; 179 u32 ret = SECCOMP_RET_ALLOW;
180 /* Make sure cross-thread synced filter points somewhere sane. */
181 struct seccomp_filter *f =
182 lockless_dereference(current->seccomp.filter);
181 183
182 /* Ensure unexpected behavior doesn't result in failing open. */ 184 /* Ensure unexpected behavior doesn't result in failing open. */
183 if (unlikely(WARN_ON(f == NULL))) 185 if (unlikely(WARN_ON(f == NULL)))
184 return SECCOMP_RET_KILL; 186 return SECCOMP_RET_KILL;
185 187
186 /* Make sure cross-thread synced filter points somewhere sane. */
187 smp_read_barrier_depends();
188
189 if (!sd) { 188 if (!sd) {
190 populate_seccomp_data(&sd_local); 189 populate_seccomp_data(&sd_local);
191 sd = &sd_local; 190 sd = &sd_local;
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall)
549{ 548{
550 int mode = current->seccomp.mode; 549 int mode = current->seccomp.mode;
551 550
552 if (mode == 0) 551 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
552 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
553 return;
554
555 if (mode == SECCOMP_MODE_DISABLED)
553 return; 556 return;
554 else if (mode == SECCOMP_MODE_STRICT) 557 else if (mode == SECCOMP_MODE_STRICT)
555 __secure_computing_strict(this_syscall); 558 __secure_computing_strict(this_syscall);
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
650 int this_syscall = sd ? sd->nr : 653 int this_syscall = sd ? sd->nr :
651 syscall_get_nr(current, task_pt_regs(current)); 654 syscall_get_nr(current, task_pt_regs(current));
652 655
656 if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
657 unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
658 return SECCOMP_PHASE1_OK;
659
653 switch (mode) { 660 switch (mode) {
654 case SECCOMP_MODE_STRICT: 661 case SECCOMP_MODE_STRICT:
655 __secure_computing_strict(this_syscall); /* may call do_exit */ 662 __secure_computing_strict(this_syscall); /* may call do_exit */
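The seccomp_run_filters() hunk replaces an ACCESS_ONCE() load of the filter pointer plus an explicit smp_read_barrier_depends() with a single lockless_dereference(). In portable C11 terms the same "load the shared pointer once, with enough ordering to safely dereference what it points to" pattern is a consume/acquire-ordered atomic load; the userspace sketch below is only an analogy, and the struct and field names are invented for illustration.

#include <stdatomic.h>
#include <stdio.h>

struct filter {
	int ret_action;
};

/* Published pointer; a writer installs it with a release store. */
static _Atomic(struct filter *) current_filter;

static int run_filters(void)
{
	/*
	 * One ordered load of the pointer; anything reached through it is
	 * at least as new as the pointer itself.  This is the role the
	 * single lockless_dereference() plays in the hunk above.
	 */
	struct filter *f = atomic_load_explicit(&current_filter,
						memory_order_consume);

	if (!f)
		return -1;	/* fail closed, in the spirit of RET_KILL */
	return f->ret_action;
}

int main(void)
{
	static struct filter f = { .ret_action = 0 };

	atomic_store_explicit(&current_filter, &f, memory_order_release);
	printf("filter returned %d\n", run_filters());
	return 0;
}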
diff --git a/kernel/signal.c b/kernel/signal.c
index 836df8dac6cc..0f6bbbe77b46 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2748,12 +2748,15 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
2748 * Other callers might not initialize the si_lsb field, 2748 * Other callers might not initialize the si_lsb field,
2749 * so check explicitly for the right codes here. 2749 * so check explicitly for the right codes here.
2750 */ 2750 */
2751 if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO) 2751 if (from->si_signo == SIGBUS &&
2752 (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO))
2752 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); 2753 err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
2753#endif 2754#endif
2754#ifdef SEGV_BNDERR 2755#ifdef SEGV_BNDERR
2755 err |= __put_user(from->si_lower, &to->si_lower); 2756 if (from->si_signo == SIGSEGV && from->si_code == SEGV_BNDERR) {
2756 err |= __put_user(from->si_upper, &to->si_upper); 2757 err |= __put_user(from->si_lower, &to->si_lower);
2758 err |= __put_user(from->si_upper, &to->si_upper);
2759 }
2757#endif 2760#endif
2758 break; 2761 break;
2759 case __SI_CHLD: 2762 case __SI_CHLD:
@@ -3017,7 +3020,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo,
3017 int, sig, 3020 int, sig,
3018 struct compat_siginfo __user *, uinfo) 3021 struct compat_siginfo __user *, uinfo)
3019{ 3022{
3020 siginfo_t info; 3023 siginfo_t info = {};
3021 int ret = copy_siginfo_from_user32(&info, uinfo); 3024 int ret = copy_siginfo_from_user32(&info, uinfo);
3022 if (unlikely(ret)) 3025 if (unlikely(ret))
3023 return ret; 3026 return ret;
@@ -3061,7 +3064,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
3061 int, sig, 3064 int, sig,
3062 struct compat_siginfo __user *, uinfo) 3065 struct compat_siginfo __user *, uinfo)
3063{ 3066{
3064 siginfo_t info; 3067 siginfo_t info = {};
3065 3068
3066 if (copy_siginfo_from_user32(&info, uinfo)) 3069 if (copy_siginfo_from_user32(&info, uinfo))
3067 return -EFAULT; 3070 return -EFAULT;
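The two compat syscall hunks above initialise the local siginfo_t with "= {}" before copy_siginfo_from_user32() fills it, so fields the 32-bit layout does not cover are not left holding stale kernel stack bytes that could later be copied back out. The underlying pattern is plain aggregate zero-initialisation; the struct and helper below are made up purely to show the difference.

#include <stdio.h>
#include <string.h>

struct info {
	int signo;
	int code;
	unsigned long field_not_filled_by_compat_copy;
};

/* Stand-in for copy_siginfo_from_user32(): fills only the first two fields. */
static void partial_fill(struct info *dst)
{
	dst->signo = 11;
	dst->code = 1;
}

int main(void)
{
	struct info leaky;
	struct info safe = {0};			/* every byte zeroed up front */

	memset(&leaky, 0xAA, sizeof(leaky));	/* simulate stale stack data */
	partial_fill(&leaky);
	partial_fill(&safe);

	printf("leaky field: %lx\n", leaky.field_not_filled_by_compat_copy);
	printf("safe  field: %lx\n", safe.field_not_filled_by_compat_copy);
	return 0;
}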
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
113 if (kthread_should_stop()) { 113 if (kthread_should_stop()) {
114 __set_current_state(TASK_RUNNING); 114 __set_current_state(TASK_RUNNING);
115 preempt_enable(); 115 preempt_enable();
116 if (ht->cleanup) 116 /* cleanup must mirror setup */
117 if (ht->cleanup && td->status != HP_THREAD_NONE)
117 ht->cleanup(td->cpu, cpu_online(td->cpu)); 118 ht->cleanup(td->cpu, cpu_online(td->cpu));
118 kfree(td); 119 kfree(td);
119 return 0; 120 return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
259{ 260{
260 unsigned int cpu; 261 unsigned int cpu;
261 262
262 /* Unpark any threads that were voluntarily parked. */
263 for_each_cpu_not(cpu, ht->cpumask) {
264 if (cpu_online(cpu)) {
265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
266 if (tsk)
267 kthread_unpark(tsk);
268 }
269 }
270
271 /* We need to destroy also the parked threads of offline cpus */ 263 /* We need to destroy also the parked threads of offline cpus */
272 for_each_possible_cpu(cpu) { 264 for_each_possible_cpu(cpu) {
273 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); 265 struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
281} 273}
282 274
283/** 275/**
284 * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug 276 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
277 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 278 * @plug_thread: Hotplug thread descriptor
279 * @cpumask: The cpumask where threads run
286 * 280 *
287 * Creates and starts the threads on all online cpus. 281 * Creates and starts the threads on all online cpus.
288 */ 282 */
289int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread) 283int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
284 const struct cpumask *cpumask)
290{ 285{
291 unsigned int cpu; 286 unsigned int cpu;
292 int ret = 0; 287 int ret = 0;
293 288
294 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL)) 289 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
295 return -ENOMEM; 290 return -ENOMEM;
296 cpumask_copy(plug_thread->cpumask, cpu_possible_mask); 291 cpumask_copy(plug_thread->cpumask, cpumask);
297 292
298 get_online_cpus(); 293 get_online_cpus();
299 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
301 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
302 if (ret) { 297 if (ret) {
303 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
299 free_cpumask_var(plug_thread->cpumask);
304 goto out; 300 goto out;
305 } 301 }
306 smpboot_unpark_thread(plug_thread, cpu); 302 if (cpumask_test_cpu(cpu, cpumask))
303 smpboot_unpark_thread(plug_thread, cpu);
307 } 304 }
308 list_add(&plug_thread->list, &hotplug_threads); 305 list_add(&plug_thread->list, &hotplug_threads);
309out: 306out:
@@ -311,7 +308,7 @@ out:
311 put_online_cpus(); 308 put_online_cpus();
312 return ret; 309 return ret;
313} 310}
314EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread); 311EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
315 312
316/** 313/**
317 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 314 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index fd643d8c4b42..12484e5d5c88 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -35,13 +35,16 @@ struct cpu_stop_done {
35 35
36/* the actual stopper, one per every possible cpu, enabled on online cpus */ 36/* the actual stopper, one per every possible cpu, enabled on online cpus */
37struct cpu_stopper { 37struct cpu_stopper {
38 struct task_struct *thread;
39
38 spinlock_t lock; 40 spinlock_t lock;
39 bool enabled; /* is this stopper enabled? */ 41 bool enabled; /* is this stopper enabled? */
40 struct list_head works; /* list of pending works */ 42 struct list_head works; /* list of pending works */
43
44 struct cpu_stop_work stop_work; /* for stop_cpus */
41}; 45};
42 46
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 47static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static DEFINE_PER_CPU(struct task_struct *, cpu_stopper_task);
45static bool stop_machine_initialized = false; 48static bool stop_machine_initialized = false;
46 49
47/* 50/*
@@ -74,7 +77,6 @@ static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
74static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) 77static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
75{ 78{
76 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 79 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
77 struct task_struct *p = per_cpu(cpu_stopper_task, cpu);
78 80
79 unsigned long flags; 81 unsigned long flags;
80 82
@@ -82,7 +84,7 @@ static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work)
82 84
83 if (stopper->enabled) { 85 if (stopper->enabled) {
84 list_add_tail(&work->list, &stopper->works); 86 list_add_tail(&work->list, &stopper->works);
85 wake_up_process(p); 87 wake_up_process(stopper->thread);
86 } else 88 } else
87 cpu_stop_signal_done(work->done, false); 89 cpu_stop_signal_done(work->done, false);
88 90
@@ -139,7 +141,7 @@ enum multi_stop_state {
139}; 141};
140 142
141struct multi_stop_data { 143struct multi_stop_data {
142 int (*fn)(void *); 144 cpu_stop_fn_t fn;
143 void *data; 145 void *data;
144 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 146 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
145 unsigned int num_threads; 147 unsigned int num_threads;
@@ -293,7 +295,6 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
293 295
294/* static data for stop_cpus */ 296/* static data for stop_cpus */
295static DEFINE_MUTEX(stop_cpus_mutex); 297static DEFINE_MUTEX(stop_cpus_mutex);
296static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
297 298
298static void queue_stop_cpus_work(const struct cpumask *cpumask, 299static void queue_stop_cpus_work(const struct cpumask *cpumask,
299 cpu_stop_fn_t fn, void *arg, 300 cpu_stop_fn_t fn, void *arg,
@@ -302,22 +303,19 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask,
302 struct cpu_stop_work *work; 303 struct cpu_stop_work *work;
303 unsigned int cpu; 304 unsigned int cpu;
304 305
305 /* initialize works and done */
306 for_each_cpu(cpu, cpumask) {
307 work = &per_cpu(stop_cpus_work, cpu);
308 work->fn = fn;
309 work->arg = arg;
310 work->done = done;
311 }
312
313 /* 306 /*
314 * Disable preemption while queueing to avoid getting 307 * Disable preemption while queueing to avoid getting
315 * preempted by a stopper which might wait for other stoppers 308 * preempted by a stopper which might wait for other stoppers
316 * to enter @fn which can lead to deadlock. 309 * to enter @fn which can lead to deadlock.
317 */ 310 */
318 lg_global_lock(&stop_cpus_lock); 311 lg_global_lock(&stop_cpus_lock);
319 for_each_cpu(cpu, cpumask) 312 for_each_cpu(cpu, cpumask) {
320 cpu_stop_queue_work(cpu, &per_cpu(stop_cpus_work, cpu)); 313 work = &per_cpu(cpu_stopper.stop_work, cpu);
314 work->fn = fn;
315 work->arg = arg;
316 work->done = done;
317 cpu_stop_queue_work(cpu, work);
318 }
321 lg_global_unlock(&stop_cpus_lock); 319 lg_global_unlock(&stop_cpus_lock);
322} 320}
323 321
@@ -458,19 +456,21 @@ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
458 456
459static void cpu_stop_create(unsigned int cpu) 457static void cpu_stop_create(unsigned int cpu)
460{ 458{
461 sched_set_stop_task(cpu, per_cpu(cpu_stopper_task, cpu)); 459 sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));
462} 460}
463 461
464static void cpu_stop_park(unsigned int cpu) 462static void cpu_stop_park(unsigned int cpu)
465{ 463{
466 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); 464 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
467 struct cpu_stop_work *work; 465 struct cpu_stop_work *work, *tmp;
468 unsigned long flags; 466 unsigned long flags;
469 467
470 /* drain remaining works */ 468 /* drain remaining works */
471 spin_lock_irqsave(&stopper->lock, flags); 469 spin_lock_irqsave(&stopper->lock, flags);
472 list_for_each_entry(work, &stopper->works, list) 470 list_for_each_entry_safe(work, tmp, &stopper->works, list) {
471 list_del_init(&work->list);
473 cpu_stop_signal_done(work->done, false); 472 cpu_stop_signal_done(work->done, false);
473 }
474 stopper->enabled = false; 474 stopper->enabled = false;
475 spin_unlock_irqrestore(&stopper->lock, flags); 475 spin_unlock_irqrestore(&stopper->lock, flags);
476} 476}
@@ -485,7 +485,7 @@ static void cpu_stop_unpark(unsigned int cpu)
485} 485}
486 486
487static struct smp_hotplug_thread cpu_stop_threads = { 487static struct smp_hotplug_thread cpu_stop_threads = {
488 .store = &cpu_stopper_task, 488 .store = &cpu_stopper.thread,
489 .thread_should_run = cpu_stop_should_run, 489 .thread_should_run = cpu_stop_should_run,
490 .thread_fn = cpu_stopper_thread, 490 .thread_fn = cpu_stopper_thread,
491 .thread_comm = "migration/%u", 491 .thread_comm = "migration/%u",
@@ -515,7 +515,7 @@ early_initcall(cpu_stop_init);
515 515
516#ifdef CONFIG_STOP_MACHINE 516#ifdef CONFIG_STOP_MACHINE
517 517
518int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 518static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
519{ 519{
520 struct multi_stop_data msdata = { 520 struct multi_stop_data msdata = {
521 .fn = fn, 521 .fn = fn,
@@ -548,7 +548,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
548 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata); 548 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
549} 549}
550 550
551int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 551int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
552{ 552{
553 int ret; 553 int ret;
554 554
@@ -582,7 +582,7 @@ EXPORT_SYMBOL_GPL(stop_machine);
582 * 0 if all executions of @fn returned 0, any non zero return value if any 582 * 0 if all executions of @fn returned 0, any non zero return value if any
583 * returned non zero. 583 * returned non zero.
584 */ 584 */
585int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 585int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
586 const struct cpumask *cpus) 586 const struct cpumask *cpus)
587{ 587{
588 struct multi_stop_data msdata = { .fn = fn, .data = data, 588 struct multi_stop_data msdata = { .fn = fn, .data = data,
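The stop_machine.c hunks fold the separate per-CPU cpu_stopper_task and stop_cpus_work variables into the cpu_stopper structure itself, so each stopper owns its thread handle and its stop_cpus work slot in one object. Outside the kernel the same consolidation is just "array of structs instead of parallel arrays"; a small sketch with invented names:

#include <stdio.h>

#define NCPUS 4

struct stop_work {
	int (*fn)(void *);
	void *arg;
};

/* Before: parallel per-CPU arrays kept in sync by hand.          */
/*   int              thread_id[NCPUS];                            */
/*   struct stop_work stop_work[NCPUS];                            */
/* After: one per-CPU object owning both pieces of state.          */
struct stopper {
	int thread_id;
	struct stop_work stop_work;
};

static struct stopper stoppers[NCPUS];

static int say_hello(void *arg)
{
	printf("stop work on cpu %d\n", *(int *)arg);
	return 0;
}

int main(void)
{
	static int cpus[NCPUS] = { 0, 1, 2, 3 };
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++) {
		struct stopper *st = &stoppers[cpu];

		st->thread_id = cpu;		/* stands in for the kthread */
		st->stop_work.fn = say_hello;	/* queue work in its owner   */
		st->stop_work.arg = &cpus[cpu];
		st->stop_work.fn(st->stop_work.arg);
	}
	return 0;
}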
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..fa2f2f671a5c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1668,8 +1668,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1668 * overall picture. 1668 * overall picture.
1669 */ 1669 */
1670 err = -EACCES; 1670 err = -EACCES;
1671 if (!S_ISREG(inode->i_mode) || 1671 if (!S_ISREG(inode->i_mode) || path_noexec(&exe.file->f_path))
1672 exe.file->f_path.mnt->mnt_flags & MNT_NOEXEC)
1673 goto exit; 1672 goto exit;
1674 1673
1675 err = inode_permission(inode, MAY_EXEC); 1674 err = inode_permission(inode, MAY_EXEC);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5868d8..a02decf15583 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
140cond_syscall(sys_ssetmask); 140cond_syscall(sys_ssetmask);
141cond_syscall(sys_vm86old); 141cond_syscall(sys_vm86old);
142cond_syscall(sys_vm86); 142cond_syscall(sys_vm86);
143cond_syscall(sys_modify_ldt);
143cond_syscall(sys_ipc); 144cond_syscall(sys_ipc);
144cond_syscall(compat_sys_ipc); 145cond_syscall(compat_sys_ipc);
145cond_syscall(compat_sys_sysctl); 146cond_syscall(compat_sys_sysctl);
@@ -218,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
218cond_syscall(sys_eventfd); 219cond_syscall(sys_eventfd);
219cond_syscall(sys_eventfd2); 220cond_syscall(sys_eventfd2);
220cond_syscall(sys_memfd_create); 221cond_syscall(sys_memfd_create);
222cond_syscall(sys_userfaultfd);
221 223
222/* performance counters: */ 224/* performance counters: */
223cond_syscall(sys_perf_event_open); 225cond_syscall(sys_perf_event_open);
@@ -243,3 +245,6 @@ cond_syscall(sys_bpf);
243 245
244/* execveat */ 246/* execveat */
245cond_syscall(sys_execveat); 247cond_syscall(sys_execveat);
248
249/* membarrier */
250cond_syscall(sys_membarrier);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 19b62b522158..e69201d8094e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
621 .proc_handler = proc_dointvec, 621 .proc_handler = proc_dointvec,
622 }, 622 },
623#endif 623#endif
624#ifdef CONFIG_KEXEC 624#ifdef CONFIG_KEXEC_CORE
625 { 625 {
626 .procname = "kexec_load_disabled", 626 .procname = "kexec_load_disabled",
627 .data = &kexec_load_disabled, 627 .data = &kexec_load_disabled,
@@ -1995,7 +1995,7 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
1995 int val = *valp; 1995 int val = *valp;
1996 if (val < 0) { 1996 if (val < 0) {
1997 *negp = true; 1997 *negp = true;
1998 *lvalp = (unsigned long)-val; 1998 *lvalp = -(unsigned long)val;
1999 } else { 1999 } else {
2000 *negp = false; 2000 *negp = false;
2001 *lvalp = (unsigned long)val; 2001 *lvalp = (unsigned long)val;
@@ -2201,7 +2201,7 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2201 int val = *valp; 2201 int val = *valp;
2202 if (val < 0) { 2202 if (val < 0) {
2203 *negp = true; 2203 *negp = true;
2204 *lvalp = (unsigned long)-val; 2204 *lvalp = -(unsigned long)val;
2205 } else { 2205 } else {
2206 *negp = false; 2206 *negp = false;
2207 *lvalp = (unsigned long)val; 2207 *lvalp = (unsigned long)val;
@@ -2436,7 +2436,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2436 unsigned long lval; 2436 unsigned long lval;
2437 if (val < 0) { 2437 if (val < 0) {
2438 *negp = true; 2438 *negp = true;
2439 lval = (unsigned long)-val; 2439 lval = -(unsigned long)val;
2440 } else { 2440 } else {
2441 *negp = false; 2441 *negp = false;
2442 lval = (unsigned long)val; 2442 lval = (unsigned long)val;
@@ -2459,7 +2459,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp
2459 unsigned long lval; 2459 unsigned long lval;
2460 if (val < 0) { 2460 if (val < 0) {
2461 *negp = true; 2461 *negp = true;
2462 lval = (unsigned long)-val; 2462 lval = -(unsigned long)val;
2463 } else { 2463 } else {
2464 *negp = false; 2464 *negp = false;
2465 lval = (unsigned long)val; 2465 lval = (unsigned long)val;
@@ -2484,7 +2484,7 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2484 unsigned long lval; 2484 unsigned long lval;
2485 if (val < 0) { 2485 if (val < 0) {
2486 *negp = true; 2486 *negp = true;
2487 lval = (unsigned long)-val; 2487 lval = -(unsigned long)val;
2488 } else { 2488 } else {
2489 *negp = false; 2489 *negp = false;
2490 lval = (unsigned long)val; 2490 lval = (unsigned long)val;
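The repeated sysctl.c change from "(unsigned long)-val" to "-(unsigned long)val" matters when val is INT_MIN: negating the int first overflows signed int, which is undefined behaviour, whereas converting to unsigned long first and then negating is fully defined and yields the intended magnitude. A minimal demonstration:

#include <limits.h>
#include <stdio.h>

int main(void)
{
	int val = INT_MIN;

	/* Undefined behaviour: -val overflows int before the conversion.   */
	/* unsigned long bad = (unsigned long)-val;                          */

	/* Well defined: convert first, then negate in unsigned arithmetic. */
	unsigned long good = -(unsigned long)val;

	printf("val  = %d\n", val);
	printf("-(unsigned long)val = %lu\n", good);
	return 0;
}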
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
1#include <linux/export.h>
2#include <linux/init.h>
3
4 __INITRODATA
5
6 .align 8
7 .globl VMLINUX_SYMBOL(system_certificate_list)
8VMLINUX_SYMBOL(system_certificate_list):
9__cert_list_start:
10 .incbin "kernel/x509_certificate_list"
11__cert_list_end:
12
13 .align 8
14 .globl VMLINUX_SYMBOL(system_certificate_list_size)
15VMLINUX_SYMBOL(system_certificate_list_size):
16#ifdef CONFIG_64BIT
17 .quad __cert_list_end - __cert_list_start
18#else
19 .long __cert_list_end - __cert_list_start
20#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
1/* System trusted keyring for trusted public keys
2 *
3 * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#include <linux/export.h>
13#include <linux/kernel.h>
14#include <linux/sched.h>
15#include <linux/cred.h>
16#include <linux/err.h>
17#include <keys/asymmetric-type.h>
18#include <keys/system_keyring.h>
19#include "module-internal.h"
20
21struct key *system_trusted_keyring;
22EXPORT_SYMBOL_GPL(system_trusted_keyring);
23
24extern __initconst const u8 system_certificate_list[];
25extern __initconst const unsigned long system_certificate_list_size;
26
27/*
28 * Load the compiled-in keys
29 */
30static __init int system_trusted_keyring_init(void)
31{
32 pr_notice("Initialise system trusted keyring\n");
33
34 system_trusted_keyring =
35 keyring_alloc(".system_keyring",
36 KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
37 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
38 KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
39 KEY_ALLOC_NOT_IN_QUOTA, NULL);
40 if (IS_ERR(system_trusted_keyring))
41 panic("Can't allocate system trusted keyring\n");
42
43 set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
44 return 0;
45}
46
47/*
48 * Must be initialised before we try and load the keys into the keyring.
49 */
50device_initcall(system_trusted_keyring_init);
51
52/*
53 * Load the compiled-in list of X.509 certificates.
54 */
55static __init int load_system_certificate_list(void)
56{
57 key_ref_t key;
58 const u8 *p, *end;
59 size_t plen;
60
61 pr_notice("Loading compiled-in X.509 certificates\n");
62
63 p = system_certificate_list;
64 end = p + system_certificate_list_size;
65 while (p < end) {
66 /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
67 * than 256 bytes in size.
68 */
69 if (end - p < 4)
70 goto dodgy_cert;
71 if (p[0] != 0x30 &&
72 p[1] != 0x82)
73 goto dodgy_cert;
74 plen = (p[2] << 8) | p[3];
75 plen += 4;
76 if (plen > end - p)
77 goto dodgy_cert;
78
79 key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
80 "asymmetric",
81 NULL,
82 p,
83 plen,
84 ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
85 KEY_USR_VIEW | KEY_USR_READ),
86 KEY_ALLOC_NOT_IN_QUOTA |
87 KEY_ALLOC_TRUSTED);
88 if (IS_ERR(key)) {
89 pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
90 PTR_ERR(key));
91 } else {
92 set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
93 pr_notice("Loaded X.509 cert '%s'\n",
94 key_ref_to_ptr(key)->description);
95 key_ref_put(key);
96 }
97 p += plen;
98 }
99
100 return 0;
101
102dodgy_cert:
103 pr_err("Problem parsing in-kernel X.509 certificate list\n");
104 return 0;
105}
106late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
18 * This is like the signal handler which runs in kernel mode, but it doesn't 18 * This is like the signal handler which runs in kernel mode, but it doesn't
19 * try to wake up the @task. 19 * try to wake up the @task.
20 * 20 *
21 * Note: there is no ordering guarantee on works queued here.
22 *
21 * RETURNS: 23 * RETURNS:
22 * 0 if succeeds or -ESRCH. 24 * 0 if succeeds or -ESRCH.
23 */ 25 */
@@ -108,16 +110,6 @@ void task_work_run(void)
108 raw_spin_unlock_wait(&task->pi_lock); 110 raw_spin_unlock_wait(&task->pi_lock);
109 smp_mb(); 111 smp_mb();
110 112
111 /* Reverse the list to run the works in fifo order */
112 head = NULL;
113 do {
114 next = work->next;
115 work->next = head;
116 head = work;
117 work = next;
118 } while (work);
119
120 work = head;
121 do { 113 do {
122 next = work->next; 114 next = work->next;
123 work->func(work); 115 work->func(work);
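The task_work.c hunk drops the in-place reversal of the callback list, and the added comment records that queued works therefore no longer run in FIFO order. The removed loop is the classic singly-linked-list reversal; a standalone version of just that step, with illustrative names:

#include <stdio.h>

struct callback {
	struct callback *next;
	const char *name;
};

/* Reverse a NULL-terminated singly linked list; this is the loop the hunk
 * above removes from task_work_run(). */
static struct callback *reverse(struct callback *work)
{
	struct callback *head = NULL;

	while (work) {
		struct callback *next = work->next;

		work->next = head;
		head = work;
		work = next;
	}
	return head;
}

int main(void)
{
	struct callback c = { NULL, "first queued" };
	struct callback b = { &c,   "second queued" };
	struct callback a = { &b,   "third queued" };	/* head = newest, LIFO */
	struct callback *w;

	for (w = reverse(&a); w; w = w->next)
		printf("running %s\n", w->name);	/* now FIFO order */
	return 0;
}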
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 579ce1b929af..4008d9f95dd7 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -92,12 +92,10 @@ config NO_HZ_FULL
92 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS 92 depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS
93 # We need at least one periodic CPU for timekeeping 93 # We need at least one periodic CPU for timekeeping
94 depends on SMP 94 depends on SMP
95 # RCU_USER_QS dependency
96 depends on HAVE_CONTEXT_TRACKING 95 depends on HAVE_CONTEXT_TRACKING
97 # VIRT_CPU_ACCOUNTING_GEN dependency 96 # VIRT_CPU_ACCOUNTING_GEN dependency
98 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN 97 depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
99 select NO_HZ_COMMON 98 select NO_HZ_COMMON
100 select RCU_USER_QS
101 select RCU_NOCB_CPU 99 select RCU_NOCB_CPU
102 select VIRT_CPU_ACCOUNTING_GEN 100 select VIRT_CPU_ACCOUNTING_GEN
103 select IRQ_WORK 101 select IRQ_WORK
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c7ae4b641c4..457a373e2181 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -183,7 +183,7 @@ struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
183 int pinned) 183 int pinned)
184{ 184{
185 if (pinned || !base->migration_enabled) 185 if (pinned || !base->migration_enabled)
186 return this_cpu_ptr(&hrtimer_bases); 186 return base;
187 return &per_cpu(hrtimer_bases, get_nohz_timer_target()); 187 return &per_cpu(hrtimer_bases, get_nohz_timer_target());
188} 188}
189#else 189#else
@@ -191,23 +191,32 @@ static inline
191struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, 191struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
192 int pinned) 192 int pinned)
193{ 193{
194 return this_cpu_ptr(&hrtimer_bases); 194 return base;
195} 195}
196#endif 196#endif
197 197
198/* 198/*
199 * Switch the timer base to the current CPU when possible. 199 * We switch the timer base to a power-optimized selected CPU target,
200 * if:
201 * - NO_HZ_COMMON is enabled
202 * - timer migration is enabled
203 * - the timer callback is not running
204 * - the timer is not the first expiring timer on the new target
205 *
206 * If one of the above requirements is not fulfilled we move the timer
207 * to the current CPU or leave it on the previously assigned CPU if
208 * the timer callback is currently running.
200 */ 209 */
201static inline struct hrtimer_clock_base * 210static inline struct hrtimer_clock_base *
202switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, 211switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
203 int pinned) 212 int pinned)
204{ 213{
205 struct hrtimer_cpu_base *new_cpu_base, *this_base; 214 struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base;
206 struct hrtimer_clock_base *new_base; 215 struct hrtimer_clock_base *new_base;
207 int basenum = base->index; 216 int basenum = base->index;
208 217
209 this_base = this_cpu_ptr(&hrtimer_bases); 218 this_cpu_base = this_cpu_ptr(&hrtimer_bases);
210 new_cpu_base = get_target_base(this_base, pinned); 219 new_cpu_base = get_target_base(this_cpu_base, pinned);
211again: 220again:
212 new_base = &new_cpu_base->clock_base[basenum]; 221 new_base = &new_cpu_base->clock_base[basenum];
213 222
@@ -229,19 +238,19 @@ again:
229 raw_spin_unlock(&base->cpu_base->lock); 238 raw_spin_unlock(&base->cpu_base->lock);
230 raw_spin_lock(&new_base->cpu_base->lock); 239 raw_spin_lock(&new_base->cpu_base->lock);
231 240
232 if (new_cpu_base != this_base && 241 if (new_cpu_base != this_cpu_base &&
233 hrtimer_check_target(timer, new_base)) { 242 hrtimer_check_target(timer, new_base)) {
234 raw_spin_unlock(&new_base->cpu_base->lock); 243 raw_spin_unlock(&new_base->cpu_base->lock);
235 raw_spin_lock(&base->cpu_base->lock); 244 raw_spin_lock(&base->cpu_base->lock);
236 new_cpu_base = this_base; 245 new_cpu_base = this_cpu_base;
237 timer->base = base; 246 timer->base = base;
238 goto again; 247 goto again;
239 } 248 }
240 timer->base = new_base; 249 timer->base = new_base;
241 } else { 250 } else {
242 if (new_cpu_base != this_base && 251 if (new_cpu_base != this_cpu_base &&
243 hrtimer_check_target(timer, new_base)) { 252 hrtimer_check_target(timer, new_base)) {
244 new_cpu_base = this_base; 253 new_cpu_base = this_cpu_base;
245 goto again; 254 goto again;
246 } 255 }
247 } 256 }
@@ -679,14 +688,14 @@ static void retrigger_next_event(void *arg)
679/* 688/*
680 * Switch to high resolution mode 689 * Switch to high resolution mode
681 */ 690 */
682static int hrtimer_switch_to_hres(void) 691static void hrtimer_switch_to_hres(void)
683{ 692{
684 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); 693 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
685 694
686 if (tick_init_highres()) { 695 if (tick_init_highres()) {
687 printk(KERN_WARNING "Could not switch to high resolution " 696 printk(KERN_WARNING "Could not switch to high resolution "
688 "mode on CPU %d\n", base->cpu); 697 "mode on CPU %d\n", base->cpu);
689 return 0; 698 return;
690 } 699 }
691 base->hres_active = 1; 700 base->hres_active = 1;
692 hrtimer_resolution = HIGH_RES_NSEC; 701 hrtimer_resolution = HIGH_RES_NSEC;
@@ -694,7 +703,6 @@ static int hrtimer_switch_to_hres(void)
694 tick_setup_sched_timer(); 703 tick_setup_sched_timer();
695 /* "Retrigger" the interrupt to get things going */ 704 /* "Retrigger" the interrupt to get things going */
696 retrigger_next_event(NULL); 705 retrigger_next_event(NULL);
697 return 1;
698} 706}
699 707
700static void clock_was_set_work(struct work_struct *work) 708static void clock_was_set_work(struct work_struct *work)
@@ -718,7 +726,7 @@ void clock_was_set_delayed(void)
718static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } 726static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
719static inline int hrtimer_hres_active(void) { return 0; } 727static inline int hrtimer_hres_active(void) { return 0; }
720static inline int hrtimer_is_hres_enabled(void) { return 0; } 728static inline int hrtimer_is_hres_enabled(void) { return 0; }
721static inline int hrtimer_switch_to_hres(void) { return 0; } 729static inline void hrtimer_switch_to_hres(void) { }
722static inline void 730static inline void
723hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } 731hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { }
724static inline int hrtimer_reprogram(struct hrtimer *timer, 732static inline int hrtimer_reprogram(struct hrtimer *timer,
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index fb4d98c7fd43..df68cb875248 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -487,6 +487,11 @@ out:
487} 487}
488 488
489#ifdef CONFIG_GENERIC_CMOS_UPDATE 489#ifdef CONFIG_GENERIC_CMOS_UPDATE
490int __weak update_persistent_clock(struct timespec now)
491{
492 return -ENODEV;
493}
494
490int __weak update_persistent_clock64(struct timespec64 now64) 495int __weak update_persistent_clock64(struct timespec64 now64)
491{ 496{
492 struct timespec now; 497 struct timespec now;
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 3e7db49a2381..53d7184da0be 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -18,30 +18,23 @@
18 18
19static struct hrtimer bctimer; 19static struct hrtimer bctimer;
20 20
21static void bc_set_mode(enum clock_event_mode mode, 21static int bc_shutdown(struct clock_event_device *evt)
22 struct clock_event_device *bc)
23{ 22{
24 switch (mode) { 23 /*
25 case CLOCK_EVT_MODE_UNUSED: 24 * Note, we cannot cancel the timer here as we might
26 case CLOCK_EVT_MODE_SHUTDOWN: 25 * run into the following live lock scenario:
27 /* 26 *
28 * Note, we cannot cancel the timer here as we might 27 * cpu 0 cpu1
29 * run into the following live lock scenario: 28 * lock(broadcast_lock);
30 * 29 * hrtimer_interrupt()
31 * cpu 0 cpu1 30 * bc_handler()
32 * lock(broadcast_lock); 31 * tick_handle_oneshot_broadcast();
33 * hrtimer_interrupt() 32 * lock(broadcast_lock);
34 * bc_handler() 33 * hrtimer_cancel()
35 * tick_handle_oneshot_broadcast(); 34 * wait_for_callback()
36 * lock(broadcast_lock); 35 */
37 * hrtimer_cancel() 36 hrtimer_try_to_cancel(&bctimer);
38 * wait_for_callback() 37 return 0;
39 */
40 hrtimer_try_to_cancel(&bctimer);
41 break;
42 default:
43 break;
44 }
45} 38}
46 39
47/* 40/*
@@ -82,7 +75,7 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
82} 75}
83 76
84static struct clock_event_device ce_broadcast_hrtimer = { 77static struct clock_event_device ce_broadcast_hrtimer = {
85 .set_mode = bc_set_mode, 78 .set_state_shutdown = bc_shutdown,
86 .set_next_ktime = bc_set_next, 79 .set_next_ktime = bc_set_next,
87 .features = CLOCK_EVT_FEAT_ONESHOT | 80 .features = CLOCK_EVT_FEAT_ONESHOT |
88 CLOCK_EVT_FEAT_KTIME | 81 CLOCK_EVT_FEAT_KTIME |
@@ -102,13 +95,11 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
102{ 95{
103 ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); 96 ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
104 97
105 switch (ce_broadcast_hrtimer.mode) { 98 if (clockevent_state_oneshot(&ce_broadcast_hrtimer))
106 case CLOCK_EVT_MODE_ONESHOT:
107 if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) 99 if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
108 return HRTIMER_RESTART; 100 return HRTIMER_RESTART;
109 default: 101
110 return HRTIMER_NORESTART; 102 return HRTIMER_NORESTART;
111 }
112} 103}
113 104
114void tick_setup_hrtimer_broadcast(void) 105void tick_setup_hrtimer_broadcast(void)
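
The hunk above replaces the single bc_set_mode() switch with a dedicated set_state_shutdown callback, following the clockevents conversion to per-state callbacks. As a rough standalone illustration of that shape (the struct and functions below are simplified mocks, not the kernel's clock_event_device or clockevents API):

/*
 * Minimal userspace mock of "one callback per state" versus the old
 * single set_mode() switch.  Types are stand-ins only.
 */
#include <stdio.h>

struct mock_clockevent {
	const char *name;
	/* new style: one optional callback per state */
	int (*set_state_shutdown)(struct mock_clockevent *evt);
	int (*set_state_oneshot)(struct mock_clockevent *evt);
};

static int mock_shutdown(struct mock_clockevent *evt)
{
	printf("%s: shutdown\n", evt->name);
	return 0;
}

static int mock_oneshot(struct mock_clockevent *evt)
{
	printf("%s: oneshot\n", evt->name);
	return 0;
}

int main(void)
{
	struct mock_clockevent bc = {
		.name = "bc-hrtimer-mock",
		.set_state_shutdown = mock_shutdown,
		.set_state_oneshot = mock_oneshot,
	};

	/* The core calls exactly the callback for the requested state; a   */
	/* device that cannot handle a state leaves the pointer NULL rather */
	/* than falling through a default: branch of a mode switch.         */
	if (bc.set_state_oneshot)
		bc.set_state_oneshot(&bc);
	if (bc.set_state_shutdown)
		bc.set_state_shutdown(&bc);
	return 0;
}
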
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 52b9e199b5ac..f6aae7977824 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -839,7 +839,6 @@ out:
839 raw_spin_unlock(&tick_broadcast_lock); 839 raw_spin_unlock(&tick_broadcast_lock);
840 return ret; 840 return ret;
841} 841}
842EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
843 842
844/* 843/*
845 * Reset the one shot broadcast for a cpu 844 * Reset the one shot broadcast for a cpu
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 55e13efff1ab..d11c55b6ab7d 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -304,9 +304,6 @@ void tick_check_new_device(struct clock_event_device *newdev)
304 int cpu; 304 int cpu;
305 305
306 cpu = smp_processor_id(); 306 cpu = smp_processor_id();
307 if (!cpumask_test_cpu(cpu, newdev->cpumask))
308 goto out_bc;
309
310 td = &per_cpu(tick_cpu_device, cpu); 307 td = &per_cpu(tick_cpu_device, cpu);
311 curdev = td->evtdev; 308 curdev = td->evtdev;
312 309
@@ -363,6 +360,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
363 360
364 return __tick_broadcast_oneshot_control(state); 361 return __tick_broadcast_oneshot_control(state);
365} 362}
363EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control);
366 364
367#ifdef CONFIG_HOTPLUG_CPU 365#ifdef CONFIG_HOTPLUG_CPU
368/* 366/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c792429e98c6..3319e16f31e5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -197,27 +197,9 @@ static bool can_stop_full_tick(void)
197 return true; 197 return true;
198} 198}
199 199
200static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
201
202/*
203 * Re-evaluate the need for the tick on the current CPU
204 * and restart it if necessary.
205 */
206void __tick_nohz_full_check(void)
207{
208 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
209
210 if (tick_nohz_full_cpu(smp_processor_id())) {
211 if (ts->tick_stopped && !is_idle_task(current)) {
212 if (!can_stop_full_tick())
213 tick_nohz_restart_sched_tick(ts, ktime_get());
214 }
215 }
216}
217
218static void nohz_full_kick_work_func(struct irq_work *work) 200static void nohz_full_kick_work_func(struct irq_work *work)
219{ 201{
220 __tick_nohz_full_check(); 202 /* Empty, the tick restart happens on tick_nohz_irq_exit() */
221} 203}
222 204
223static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { 205static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
@@ -252,7 +234,7 @@ void tick_nohz_full_kick_cpu(int cpu)
252 234
253static void nohz_full_kick_ipi(void *info) 235static void nohz_full_kick_ipi(void *info)
254{ 236{
255 __tick_nohz_full_check(); 237 /* Empty, the tick restart happens on tick_nohz_irq_exit() */
256} 238}
257 239
258/* 240/*
@@ -276,7 +258,7 @@ void tick_nohz_full_kick_all(void)
276 * It might need the tick due to per task/process properties: 258 * It might need the tick due to per task/process properties:
277 * perf events, posix cpu timers, ... 259 * perf events, posix cpu timers, ...
278 */ 260 */
279void __tick_nohz_task_switch(struct task_struct *tsk) 261void __tick_nohz_task_switch(void)
280{ 262{
281 unsigned long flags; 263 unsigned long flags;
282 264
@@ -705,21 +687,38 @@ out:
705 return tick; 687 return tick;
706} 688}
707 689
708static void tick_nohz_full_stop_tick(struct tick_sched *ts) 690static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
691{
692 /* Update jiffies first */
693 tick_do_update_jiffies64(now);
694 update_cpu_load_nohz();
695
696 calc_load_exit_idle();
697 touch_softlockup_watchdog();
698 /*
699 * Cancel the scheduled timer and restore the tick
700 */
701 ts->tick_stopped = 0;
702 ts->idle_exittime = now;
703
704 tick_nohz_restart(ts, now);
705}
706
707static void tick_nohz_full_update_tick(struct tick_sched *ts)
709{ 708{
710#ifdef CONFIG_NO_HZ_FULL 709#ifdef CONFIG_NO_HZ_FULL
711 int cpu = smp_processor_id(); 710 int cpu = smp_processor_id();
712 711
713 if (!tick_nohz_full_cpu(cpu) || is_idle_task(current)) 712 if (!tick_nohz_full_cpu(cpu))
714 return; 713 return;
715 714
716 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) 715 if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
717 return; 716 return;
718 717
719 if (!can_stop_full_tick()) 718 if (can_stop_full_tick())
720 return; 719 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
721 720 else if (ts->tick_stopped)
722 tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); 721 tick_nohz_restart_sched_tick(ts, ktime_get());
723#endif 722#endif
724} 723}
725 724
@@ -849,7 +848,7 @@ void tick_nohz_irq_exit(void)
849 if (ts->inidle) 848 if (ts->inidle)
850 __tick_nohz_idle_enter(ts); 849 __tick_nohz_idle_enter(ts);
851 else 850 else
852 tick_nohz_full_stop_tick(ts); 851 tick_nohz_full_update_tick(ts);
853} 852}
854 853
855/** 854/**
@@ -864,23 +863,6 @@ ktime_t tick_nohz_get_sleep_length(void)
864 return ts->sleep_length; 863 return ts->sleep_length;
865} 864}
866 865
867static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
868{
869 /* Update jiffies first */
870 tick_do_update_jiffies64(now);
871 update_cpu_load_nohz();
872
873 calc_load_exit_idle();
874 touch_softlockup_watchdog();
875 /*
876 * Cancel the scheduled timer and restore the tick
877 */
878 ts->tick_stopped = 0;
879 ts->idle_exittime = now;
880
881 tick_nohz_restart(ts, now);
882}
883
884static void tick_nohz_account_idle_ticks(struct tick_sched *ts) 866static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
885{ 867{
886#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE 868#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
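
The tick-sched.c changes above remove __tick_nohz_full_check(): the nohz-full kick handlers become empty and the restart decision moves into tick_nohz_full_update_tick(), called from tick_nohz_irq_exit(). The new helper either stops the tick when it can, restarts it when it is currently stopped but no longer allowed to be, or does nothing. A trivial mock of that branch structure, with booleans standing in for can_stop_full_tick() and ts->tick_stopped:

#include <stdio.h>
#include <stdbool.h>

/* Mock of the decision made in tick_nohz_full_update_tick(). */
static const char *update_tick(bool can_stop, bool tick_stopped)
{
	if (can_stop)
		return "stop_sched_tick";
	if (tick_stopped)
		return "restart_sched_tick";	/* restart now happens on irq exit */
	return "nothing to do";
}

int main(void)
{
	printf("%s\n", update_tick(true,  false));
	printf("%s\n", update_tick(false, true));
	printf("%s\n", update_tick(false, false));
	return 0;
}
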
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 85d5bb1d67eb..86751c68e08d 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -268,10 +268,14 @@ EXPORT_SYMBOL(jiffies_to_msecs);
268 268
269unsigned int jiffies_to_usecs(const unsigned long j) 269unsigned int jiffies_to_usecs(const unsigned long j)
270{ 270{
271#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) 271 /*
 272 * Hz usually doesn't go much further than MSEC_PER_SEC.
273 * jiffies_to_usecs() and usecs_to_jiffies() depend on that.
274 */
275 BUILD_BUG_ON(HZ > USEC_PER_SEC);
276
277#if !(USEC_PER_SEC % HZ)
272 return (USEC_PER_SEC / HZ) * j; 278 return (USEC_PER_SEC / HZ) * j;
273#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
274 return (j + (HZ / USEC_PER_SEC) - 1)/(HZ / USEC_PER_SEC);
275#else 279#else
276# if BITS_PER_LONG == 32 280# if BITS_PER_LONG == 32
277 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32; 281 return (HZ_TO_USEC_MUL32 * j) >> HZ_TO_USEC_SHR32;
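
With the added BUILD_BUG_ON(HZ > USEC_PER_SEC), the old HZ > USEC_PER_SEC branch is unreachable and is dropped, leaving the exact-division fast path as a single multiply whenever USEC_PER_SEC % HZ == 0. A throwaway userspace check of that arithmetic; the HZ value here is only an example config, not anything mandated by the patch:

#include <assert.h>
#include <stdio.h>

#define USEC_PER_SEC 1000000U
#define HZ 250U			/* example value */

static unsigned int jiffies_to_usecs_fast(unsigned long j)
{
	/* valid only because HZ <= USEC_PER_SEC and USEC_PER_SEC % HZ == 0 */
	return (USEC_PER_SEC / HZ) * j;
}

int main(void)
{
	assert(USEC_PER_SEC % HZ == 0);
	/* one tick at HZ=250 is 4000 us, ten ticks are 40000 us */
	printf("%u %u\n", jiffies_to_usecs_fast(1), jiffies_to_usecs_fast(10));
	return 0;
}
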
@@ -287,26 +291,20 @@ EXPORT_SYMBOL(jiffies_to_usecs);
287 * @t: Timespec 291 * @t: Timespec
288 * @gran: Granularity in ns. 292 * @gran: Granularity in ns.
289 * 293 *
290 * Truncate a timespec to a granularity. gran must be smaller than a second. 294 * Truncate a timespec to a granularity. Always rounds down. gran must
291 * Always rounds down. 295 * not be 0 nor greater than a second (NSEC_PER_SEC, or 10^9 ns).
292 *
293 * This function should be only used for timestamps returned by
294 * current_kernel_time() or CURRENT_TIME, not with do_gettimeofday() because
295 * it doesn't handle the better resolution of the latter.
296 */ 296 */
297struct timespec timespec_trunc(struct timespec t, unsigned gran) 297struct timespec timespec_trunc(struct timespec t, unsigned gran)
298{ 298{
299 /* 299 /* Avoid division in the common cases 1 ns and 1 s. */
300 * Division is pretty slow so avoid it for common cases. 300 if (gran == 1) {
301 * Currently current_kernel_time() never returns better than
302 * jiffies resolution. Exploit that.
303 */
304 if (gran <= jiffies_to_usecs(1) * 1000) {
305 /* nothing */ 301 /* nothing */
306 } else if (gran == 1000000000) { 302 } else if (gran == NSEC_PER_SEC) {
307 t.tv_nsec = 0; 303 t.tv_nsec = 0;
308 } else { 304 } else if (gran > 1 && gran < NSEC_PER_SEC) {
309 t.tv_nsec -= t.tv_nsec % gran; 305 t.tv_nsec -= t.tv_nsec % gran;
306 } else {
307 WARN(1, "illegal file time granularity: %u", gran);
310 } 308 }
311 return t; 309 return t;
312} 310}
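
The rewritten timespec_trunc() now handles exactly three cases: gran == 1 is a no-op, gran == NSEC_PER_SEC drops the nanosecond part, anything in between rounds tv_nsec down, and everything else triggers a warning. A small userspace sketch of the same rounding, with a plain struct in place of the kernel's timespec:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts {
	long tv_sec;
	long tv_nsec;
};

static struct ts trunc_ts(struct ts t, unsigned gran)
{
	if (gran == 1) {
		/* nothing to do, full nanosecond resolution */
	} else if (gran == NSEC_PER_SEC) {
		t.tv_nsec = 0;
	} else if (gran > 1 && gran < NSEC_PER_SEC) {
		t.tv_nsec -= t.tv_nsec % gran;	/* always rounds down */
	} else {
		fprintf(stderr, "illegal granularity: %u\n", gran);
	}
	return t;
}

int main(void)
{
	struct ts t = { .tv_sec = 5, .tv_nsec = 123456789 };

	/* 1 ms granularity keeps 123000000 ns, 1 s granularity keeps 0 */
	printf("%ld\n", trunc_ts(t, 1000000).tv_nsec);
	printf("%ld\n", trunc_ts(t, NSEC_PER_SEC).tv_nsec);
	return 0;
}
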
@@ -546,7 +544,7 @@ EXPORT_SYMBOL(__usecs_to_jiffies);
546 * value to a scaled second value. 544 * value to a scaled second value.
547 */ 545 */
548static unsigned long 546static unsigned long
549__timespec_to_jiffies(unsigned long sec, long nsec) 547__timespec64_to_jiffies(u64 sec, long nsec)
550{ 548{
551 nsec = nsec + TICK_NSEC - 1; 549 nsec = nsec + TICK_NSEC - 1;
552 550
@@ -554,22 +552,27 @@ __timespec_to_jiffies(unsigned long sec, long nsec)
554 sec = MAX_SEC_IN_JIFFIES; 552 sec = MAX_SEC_IN_JIFFIES;
555 nsec = 0; 553 nsec = 0;
556 } 554 }
557 return (((u64)sec * SEC_CONVERSION) + 555 return ((sec * SEC_CONVERSION) +
558 (((u64)nsec * NSEC_CONVERSION) >> 556 (((u64)nsec * NSEC_CONVERSION) >>
559 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; 557 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
560 558
561} 559}
562 560
563unsigned long 561static unsigned long
564timespec_to_jiffies(const struct timespec *value) 562__timespec_to_jiffies(unsigned long sec, long nsec)
565{ 563{
566 return __timespec_to_jiffies(value->tv_sec, value->tv_nsec); 564 return __timespec64_to_jiffies((u64)sec, nsec);
567} 565}
568 566
569EXPORT_SYMBOL(timespec_to_jiffies); 567unsigned long
568timespec64_to_jiffies(const struct timespec64 *value)
569{
570 return __timespec64_to_jiffies(value->tv_sec, value->tv_nsec);
571}
572EXPORT_SYMBOL(timespec64_to_jiffies);
570 573
571void 574void
572jiffies_to_timespec(const unsigned long jiffies, struct timespec *value) 575jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value)
573{ 576{
574 /* 577 /*
575 * Convert jiffies to nanoseconds and separate with 578 * Convert jiffies to nanoseconds and separate with
@@ -580,7 +583,7 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
580 NSEC_PER_SEC, &rem); 583 NSEC_PER_SEC, &rem);
581 value->tv_nsec = rem; 584 value->tv_nsec = rem;
582} 585}
583EXPORT_SYMBOL(jiffies_to_timespec); 586EXPORT_SYMBOL(jiffies_to_timespec64);
584 587
585/* 588/*
586 * We could use a similar algorithm to timespec_to_jiffies (with a 589 * We could use a similar algorithm to timespec_to_jiffies (with a
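
timespec_to_jiffies() is now a thin wrapper around a timespec64-based helper; the scaled SEC_CONVERSION/NSEC_CONVERSION arithmetic is unchanged apart from taking a 64-bit seconds value. Conceptually the conversion rounds the nanosecond part up to whole ticks. A simplified, non-scaled equivalent (local struct, example HZ, and the MAX_SEC_IN_JIFFIES clamp omitted), only to show the rounding behaviour:

#include <stdio.h>
#include <stdint.h>

#define HZ 100ULL			/* example value */
#define NSEC_PER_SEC 1000000000ULL
#define TICK_NSEC (NSEC_PER_SEC / HZ)	/* 10 ms per tick at HZ=100 */

struct ts64 {
	int64_t tv_sec;
	long tv_nsec;
};

/* Round the nanosecond part up so a caller never sleeps too short. */
static unsigned long ts64_to_jiffies(const struct ts64 *v)
{
	uint64_t nsec = (uint64_t)v->tv_nsec + TICK_NSEC - 1;

	return (unsigned long)(v->tv_sec * HZ + nsec / TICK_NSEC);
}

int main(void)
{
	struct ts64 a = { .tv_sec = 2, .tv_nsec = 1 };		/* 201 ticks */
	struct ts64 b = { .tv_sec = 0, .tv_nsec = 10000000 };	/* exactly 1 */

	printf("%lu %lu\n", ts64_to_jiffies(&a), ts64_to_jiffies(&b));
	return 0;
}
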
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index bca3667a2de1..f6ee2e6b6f5d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -911,6 +911,7 @@ int do_settimeofday64(const struct timespec64 *ts)
911 struct timekeeper *tk = &tk_core.timekeeper; 911 struct timekeeper *tk = &tk_core.timekeeper;
912 struct timespec64 ts_delta, xt; 912 struct timespec64 ts_delta, xt;
913 unsigned long flags; 913 unsigned long flags;
914 int ret = 0;
914 915
915 if (!timespec64_valid_strict(ts)) 916 if (!timespec64_valid_strict(ts))
916 return -EINVAL; 917 return -EINVAL;
@@ -924,10 +925,15 @@ int do_settimeofday64(const struct timespec64 *ts)
924 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec; 925 ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
925 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec; 926 ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
926 927
928 if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
929 ret = -EINVAL;
930 goto out;
931 }
932
927 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta)); 933 tk_set_wall_to_mono(tk, timespec64_sub(tk->wall_to_monotonic, ts_delta));
928 934
929 tk_set_xtime(tk, ts); 935 tk_set_xtime(tk, ts);
930 936out:
931 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET); 937 timekeeping_update(tk, TK_CLEAR_NTP | TK_MIRROR | TK_CLOCK_WAS_SET);
932 938
933 write_seqcount_end(&tk_core.seq); 939 write_seqcount_end(&tk_core.seq);
@@ -936,7 +942,7 @@ int do_settimeofday64(const struct timespec64 *ts)
936 /* signal hrtimers about time change */ 942 /* signal hrtimers about time change */
937 clock_was_set(); 943 clock_was_set();
938 944
939 return 0; 945 return ret;
940} 946}
941EXPORT_SYMBOL(do_settimeofday64); 947EXPORT_SYMBOL(do_settimeofday64);
942 948
@@ -965,7 +971,8 @@ int timekeeping_inject_offset(struct timespec *ts)
965 971
966 /* Make sure the proposed value is valid */ 972 /* Make sure the proposed value is valid */
967 tmp = timespec64_add(tk_xtime(tk), ts64); 973 tmp = timespec64_add(tk_xtime(tk), ts64);
968 if (!timespec64_valid_strict(&tmp)) { 974 if (timespec64_compare(&tk->wall_to_monotonic, &ts64) > 0 ||
975 !timespec64_valid_strict(&tmp)) {
969 ret = -EINVAL; 976 ret = -EINVAL;
970 goto error; 977 goto error;
971 } 978 }
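
Both timekeeping hunks above reject changes that would leave wall_to_monotonic positive, i.e. a requested wall time that lies before the monotonic epoch: do_settimeofday64() now fails with -EINVAL when the delta compares below wall_to_monotonic, and timekeeping_inject_offset() applies the same comparison to the injected offset. A rough sketch of that comparison with plain structs (not the kernel's timekeeper):

#include <stdio.h>

struct ts64 {
	long long tv_sec;
	long tv_nsec;
};

static int ts64_compare(const struct ts64 *lhs, const struct ts64 *rhs)
{
	if (lhs->tv_sec < rhs->tv_sec)
		return -1;
	if (lhs->tv_sec > rhs->tv_sec)
		return 1;
	return (int)(lhs->tv_nsec - rhs->tv_nsec);
}

/*
 * wall_to_monotonic is kept non-positive (wall time runs ahead of the
 * monotonic clock).  The new value would be wall_to_monotonic - delta,
 * so a delta that compares below it would flip the result positive,
 * meaning the requested wall time lies before the monotonic epoch.
 * The hunks above reject that case with -EINVAL; this helper mirrors
 * the same comparison.
 */
static int offset_ok(const struct ts64 *wall_to_mono, const struct ts64 *delta)
{
	return ts64_compare(wall_to_mono, delta) <= 0;
}

int main(void)
{
	struct ts64 w2m   = { .tv_sec = -1000, .tv_nsec = 0 };
	struct ts64 small = { .tv_sec = 10,    .tv_nsec = 0 };
	struct ts64 back  = { .tv_sec = -2000, .tv_nsec = 0 };

	printf("forward by 10s: %s\n", offset_ok(&w2m, &small) ? "ok" : "rejected");
	printf("back before boot: %s\n", offset_ok(&w2m, &back) ? "ok" : "rejected");
	return 0;
}
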
@@ -1874,7 +1881,7 @@ struct timespec __current_kernel_time(void)
1874 return timespec64_to_timespec(tk_xtime(tk)); 1881 return timespec64_to_timespec(tk_xtime(tk));
1875} 1882}
1876 1883
1877struct timespec current_kernel_time(void) 1884struct timespec64 current_kernel_time64(void)
1878{ 1885{
1879 struct timekeeper *tk = &tk_core.timekeeper; 1886 struct timekeeper *tk = &tk_core.timekeeper;
1880 struct timespec64 now; 1887 struct timespec64 now;
@@ -1886,9 +1893,9 @@ struct timespec current_kernel_time(void)
1886 now = tk_xtime(tk); 1893 now = tk_xtime(tk);
1887 } while (read_seqcount_retry(&tk_core.seq, seq)); 1894 } while (read_seqcount_retry(&tk_core.seq, seq));
1888 1895
1889 return timespec64_to_timespec(now); 1896 return now;
1890} 1897}
1891EXPORT_SYMBOL(current_kernel_time); 1898EXPORT_SYMBOL(current_kernel_time64);
1892 1899
1893struct timespec64 get_monotonic_coarse64(void) 1900struct timespec64 get_monotonic_coarse64(void)
1894{ 1901{
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 5e097fa9faf7..84190f02b521 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -807,8 +807,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
807 spin_unlock(&base->lock); 807 spin_unlock(&base->lock);
808 base = new_base; 808 base = new_base;
809 spin_lock(&base->lock); 809 spin_lock(&base->lock);
810 timer->flags &= ~TIMER_BASEMASK; 810 WRITE_ONCE(timer->flags,
811 timer->flags |= base->cpu; 811 (timer->flags & ~TIMER_BASEMASK) | base->cpu);
812 } 812 }
813 } 813 }
814 814
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a4536e1e3e2a..129c96033e46 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -137,7 +137,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
137 (unsigned long long) ktime_to_ns(base->offset)); 137 (unsigned long long) ktime_to_ns(base->offset));
138#endif 138#endif
139 SEQ_printf(m, "active timers:\n"); 139 SEQ_printf(m, "active timers:\n");
140 print_active_timers(m, base, now); 140 print_active_timers(m, base, now + ktime_to_ns(base->offset));
141} 141}
142 142
143static void print_cpu(struct seq_file *m, int cpu, u64 now) 143static void print_cpu(struct seq_file *m, int cpu, u64 now)
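
The timer_list.c change above adds the clock base's offset to "now" before printing, so the expiry values of offset bases (CLOCK_REALTIME and friends) are compared against a now expressed in the same time domain instead of raw CLOCK_MONOTONIC. A one-shot arithmetic illustration with made-up nanosecond values:

#include <stdio.h>

int main(void)
{
	long long now_mono    = 1000;	/* "now" on CLOCK_MONOTONIC */
	long long base_offset = 5000;	/* base time minus monotonic time */
	long long expires     = 5500;	/* expiry stored in the base's domain */

	/* Mixing domains makes this already-expired timer look 4500 ns away. */
	printf("wrong: %lld ns until expiry\n", expires - now_mono);
	/* The fix converts now into the base's domain first. */
	printf("right: %lld ns until expiry\n", expires - (now_mono + base_offset));
	return 0;
}
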
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3b9a48ae153a..1153c43428f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -434,7 +434,7 @@ config UPROBE_EVENT
434 434
435config BPF_EVENTS 435config BPF_EVENTS
436 depends on BPF_SYSCALL 436 depends on BPF_SYSCALL
437 depends on KPROBE_EVENT 437 depends on KPROBE_EVENT || UPROBE_EVENT
438 bool 438 bool
439 default y 439 default y
440 help 440 help
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3e6b39b6cf9..90e72a0c3047 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -778,9 +778,6 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
778 if (likely(!bt)) 778 if (likely(!bt))
779 return; 779 return;
780 780
781 if (!error && !bio_flagged(bio, BIO_UPTODATE))
782 error = EIO;
783
784 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 781 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
785 bio->bi_rw, what, error, 0, NULL); 782 bio->bi_rw, what, error, 0, NULL);
786} 783}
@@ -887,8 +884,7 @@ static void blk_add_trace_split(void *ignore,
887 884
888 __blk_add_trace(bt, bio->bi_iter.bi_sector, 885 __blk_add_trace(bt, bio->bi_iter.bi_sector,
889 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT, 886 bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
890 !bio_flagged(bio, BIO_UPTODATE), 887 bio->bi_error, sizeof(rpdu), &rpdu);
891 sizeof(rpdu), &rpdu);
892 } 888 }
893} 889}
894 890
@@ -920,8 +916,8 @@ static void blk_add_trace_bio_remap(void *ignore,
920 r.sector_from = cpu_to_be64(from); 916 r.sector_from = cpu_to_be64(from);
921 917
922 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, 918 __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
923 bio->bi_rw, BLK_TA_REMAP, 919 bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
924 !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); 920 sizeof(r), &r);
925} 921}
926 922
927/** 923/**
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88a041adee90..0fe96c7c8803 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -81,13 +81,16 @@ static const struct bpf_func_proto bpf_probe_read_proto = {
81 81
82/* 82/*
83 * limited trace_printk() 83 * limited trace_printk()
84 * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed 84 * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
85 */ 85 */
86static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) 86static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
87{ 87{
88 char *fmt = (char *) (long) r1; 88 char *fmt = (char *) (long) r1;
89 bool str_seen = false;
89 int mod[3] = {}; 90 int mod[3] = {};
90 int fmt_cnt = 0; 91 int fmt_cnt = 0;
92 u64 unsafe_addr;
93 char buf[64];
91 int i; 94 int i;
92 95
93 /* 96 /*
@@ -114,12 +117,37 @@ static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5)
114 if (fmt[i] == 'l') { 117 if (fmt[i] == 'l') {
115 mod[fmt_cnt]++; 118 mod[fmt_cnt]++;
116 i++; 119 i++;
117 } else if (fmt[i] == 'p') { 120 } else if (fmt[i] == 'p' || fmt[i] == 's') {
118 mod[fmt_cnt]++; 121 mod[fmt_cnt]++;
119 i++; 122 i++;
120 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) 123 if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0)
121 return -EINVAL; 124 return -EINVAL;
122 fmt_cnt++; 125 fmt_cnt++;
126 if (fmt[i - 1] == 's') {
127 if (str_seen)
128 /* allow only one '%s' per fmt string */
129 return -EINVAL;
130 str_seen = true;
131
132 switch (fmt_cnt) {
133 case 1:
134 unsafe_addr = r3;
135 r3 = (long) buf;
136 break;
137 case 2:
138 unsafe_addr = r4;
139 r4 = (long) buf;
140 break;
141 case 3:
142 unsafe_addr = r5;
143 r5 = (long) buf;
144 break;
145 }
146 buf[0] = 0;
147 strncpy_from_unsafe(buf,
148 (void *) (long) unsafe_addr,
149 sizeof(buf));
150 }
123 continue; 151 continue;
124 } 152 }
125 153
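
The hunk above teaches bpf_trace_printk() to accept a single %s by redirecting the matching argument to a bounded on-stack buffer filled with strncpy_from_unsafe(). Below is a loose userspace approximation of that rule (at most one %s, the string argument routed through a fixed 64-byte buffer, at most three conversions); it is only a sketch of the validation shape, not the BPF helper or verifier behaviour:

#include <stdio.h>
#include <string.h>
#include <stdbool.h>

/*
 * Return 0 if fmt contains at most one %s and at most three conversions,
 * copying the %s argument into buf so later printing never chases an
 * unverified pointer.  Loosely mirrors the checks added above (it even
 * counts "%%" as a conversion, a simplification).
 */
static int prep_fmt_args(const char *fmt, const char *str_arg,
			 char *buf, size_t buflen)
{
	bool str_seen = false;
	int fmt_cnt = 0;

	for (size_t i = 0; fmt[i]; i++) {
		if (fmt[i] != '%')
			continue;
		if (fmt[i + 1] == 's') {
			if (str_seen)
				return -1;	/* only one %s allowed */
			str_seen = true;
			buf[0] = '\0';
			if (str_arg)
				strncpy(buf, str_arg, buflen - 1);
			buf[buflen - 1] = '\0';
		}
		if (++fmt_cnt > 3)
			return -1;		/* at most three conversions */
	}
	return 0;
}

int main(void)
{
	char buf[64];

	if (!prep_fmt_args("pid %d comm %s\n", "worker/0", buf, sizeof(buf)))
		printf("pid %d comm %s\n", 42, buf);

	if (prep_fmt_args("%s %s\n", "x", buf, sizeof(buf)))
		printf("rejected: more than one %%s\n");
	return 0;
}
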
@@ -158,6 +186,35 @@ const struct bpf_func_proto *bpf_get_trace_printk_proto(void)
158 return &bpf_trace_printk_proto; 186 return &bpf_trace_printk_proto;
159} 187}
160 188
189static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
190{
191 struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
192 struct bpf_array *array = container_of(map, struct bpf_array, map);
193 struct perf_event *event;
194
195 if (unlikely(index >= array->map.max_entries))
196 return -E2BIG;
197
198 event = (struct perf_event *)array->ptrs[index];
199 if (!event)
200 return -ENOENT;
201
202 /*
203 * we don't know if the function is run successfully by the
204 * return value. It can be judged in other places, such as
205 * eBPF programs.
206 */
207 return perf_event_read_local(event);
208}
209
210const struct bpf_func_proto bpf_perf_event_read_proto = {
211 .func = bpf_perf_event_read,
212 .gpl_only = false,
213 .ret_type = RET_INTEGER,
214 .arg1_type = ARG_CONST_MAP_PTR,
215 .arg2_type = ARG_ANYTHING,
216};
217
161static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) 218static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id)
162{ 219{
163 switch (func_id) { 220 switch (func_id) {
@@ -183,6 +240,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
183 return bpf_get_trace_printk_proto(); 240 return bpf_get_trace_printk_proto();
184 case BPF_FUNC_get_smp_processor_id: 241 case BPF_FUNC_get_smp_processor_id:
185 return &bpf_get_smp_processor_id_proto; 242 return &bpf_get_smp_processor_id_proto;
243 case BPF_FUNC_perf_event_read:
244 return &bpf_perf_event_read_proto;
186 default: 245 default:
187 return NULL; 246 return NULL;
188 } 247 }
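
The new bpf_perf_event_read() helper follows a common map-lookup shape: bound-check the index, return -E2BIG past the end, -ENOENT for an empty slot, and otherwise forward to the reader. Note that the same return channel carries both negative errno values and the counter value, which is what the comment in the hunk is warning about. A generic standalone version of that shape (plain arrays and errno constants, nothing BPF- or perf-specific):

#include <errno.h>
#include <stdio.h>

#define MAX_ENTRIES 4

static long counters[MAX_ENTRIES]        = { 100, 0, 300, 0 };
static const int populated[MAX_ENTRIES]  = { 1,   0, 1,   0 };

static long read_slot(unsigned long index)
{
	if (index >= MAX_ENTRIES)
		return -E2BIG;		/* index outside the map */
	if (!populated[index])
		return -ENOENT;		/* no event stored in this slot */
	return counters[index];		/* the value itself on success */
}

int main(void)
{
	printf("%ld %ld %ld\n", read_slot(0), read_slot(1), read_slot(9));
	return 0;
}
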
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02bece4a99ea..b0623ac785a2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -98,6 +98,13 @@ struct ftrace_pid {
98 struct pid *pid; 98 struct pid *pid;
99}; 99};
100 100
101static bool ftrace_pids_enabled(void)
102{
103 return !list_empty(&ftrace_pids);
104}
105
106static void ftrace_update_trampoline(struct ftrace_ops *ops);
107
101/* 108/*
102 * ftrace_disabled is set when an anomaly is discovered. 109 * ftrace_disabled is set when an anomaly is discovered.
103 * ftrace_disabled is much stronger than ftrace_enabled. 110 * ftrace_disabled is much stronger than ftrace_enabled.
@@ -109,7 +116,6 @@ static DEFINE_MUTEX(ftrace_lock);
109static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; 116static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end;
110static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; 117static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
111ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; 118ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
112ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
113static struct ftrace_ops global_ops; 119static struct ftrace_ops global_ops;
114static struct ftrace_ops control_ops; 120static struct ftrace_ops control_ops;
115 121
@@ -183,14 +189,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
183 if (!test_tsk_trace_trace(current)) 189 if (!test_tsk_trace_trace(current))
184 return; 190 return;
185 191
186 ftrace_pid_function(ip, parent_ip, op, regs); 192 op->saved_func(ip, parent_ip, op, regs);
187}
188
189static void set_ftrace_pid_function(ftrace_func_t func)
190{
191 /* do not set ftrace_pid_function to itself! */
192 if (func != ftrace_pid_func)
193 ftrace_pid_function = func;
194} 193}
195 194
196/** 195/**
@@ -202,7 +201,6 @@ static void set_ftrace_pid_function(ftrace_func_t func)
202void clear_ftrace_function(void) 201void clear_ftrace_function(void)
203{ 202{
204 ftrace_trace_function = ftrace_stub; 203 ftrace_trace_function = ftrace_stub;
205 ftrace_pid_function = ftrace_stub;
206} 204}
207 205
208static void control_ops_disable_all(struct ftrace_ops *ops) 206static void control_ops_disable_all(struct ftrace_ops *ops)
@@ -436,6 +434,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
436 } else 434 } else
437 add_ftrace_ops(&ftrace_ops_list, ops); 435 add_ftrace_ops(&ftrace_ops_list, ops);
438 436
437 /* Always save the function, and reset at unregistering */
438 ops->saved_func = ops->func;
439
440 if (ops->flags & FTRACE_OPS_FL_PID && ftrace_pids_enabled())
441 ops->func = ftrace_pid_func;
442
439 ftrace_update_trampoline(ops); 443 ftrace_update_trampoline(ops);
440 444
441 if (ftrace_enabled) 445 if (ftrace_enabled)
@@ -463,15 +467,28 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
463 if (ftrace_enabled) 467 if (ftrace_enabled)
464 update_ftrace_function(); 468 update_ftrace_function();
465 469
470 ops->func = ops->saved_func;
471
466 return 0; 472 return 0;
467} 473}
468 474
469static void ftrace_update_pid_func(void) 475static void ftrace_update_pid_func(void)
470{ 476{
477 bool enabled = ftrace_pids_enabled();
478 struct ftrace_ops *op;
479
471 /* Only do something if we are tracing something */ 480 /* Only do something if we are tracing something */
472 if (ftrace_trace_function == ftrace_stub) 481 if (ftrace_trace_function == ftrace_stub)
473 return; 482 return;
474 483
484 do_for_each_ftrace_op(op, ftrace_ops_list) {
485 if (op->flags & FTRACE_OPS_FL_PID) {
486 op->func = enabled ? ftrace_pid_func :
487 op->saved_func;
488 ftrace_update_trampoline(op);
489 }
490 } while_for_each_ftrace_op(op);
491
475 update_ftrace_function(); 492 update_ftrace_function();
476} 493}
477 494
@@ -613,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v)
613 goto out; 630 goto out;
614 } 631 }
615 632
633#ifdef CONFIG_FUNCTION_GRAPH_TRACER
634 avg = rec->time;
635 do_div(avg, rec->counter);
636 if (tracing_thresh && (avg < tracing_thresh))
637 goto out;
638#endif
639
616 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 640 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
617 seq_printf(m, " %-30.30s %10lu", str, rec->counter); 641 seq_printf(m, " %-30.30s %10lu", str, rec->counter);
618 642
619#ifdef CONFIG_FUNCTION_GRAPH_TRACER 643#ifdef CONFIG_FUNCTION_GRAPH_TRACER
620 seq_puts(m, " "); 644 seq_puts(m, " ");
621 avg = rec->time;
622 do_div(avg, rec->counter);
623 645
624 /* Sample standard deviation (s^2) */ 646 /* Sample standard deviation (s^2) */
625 if (rec->counter <= 1) 647 if (rec->counter <= 1)
@@ -1133,7 +1155,8 @@ static struct ftrace_ops global_ops = {
1133 .local_hash.filter_hash = EMPTY_HASH, 1155 .local_hash.filter_hash = EMPTY_HASH,
1134 INIT_OPS_HASH(global_ops) 1156 INIT_OPS_HASH(global_ops)
1135 .flags = FTRACE_OPS_FL_RECURSION_SAFE | 1157 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
1136 FTRACE_OPS_FL_INITIALIZED, 1158 FTRACE_OPS_FL_INITIALIZED |
1159 FTRACE_OPS_FL_PID,
1137}; 1160};
1138 1161
1139/* 1162/*
@@ -5023,7 +5046,9 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
5023 5046
5024static struct ftrace_ops global_ops = { 5047static struct ftrace_ops global_ops = {
5025 .func = ftrace_stub, 5048 .func = ftrace_stub,
5026 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 5049 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5050 FTRACE_OPS_FL_INITIALIZED |
5051 FTRACE_OPS_FL_PID,
5027}; 5052};
5028 5053
5029static int __init ftrace_nodyn_init(void) 5054static int __init ftrace_nodyn_init(void)
@@ -5080,11 +5105,6 @@ void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func)
5080 if (WARN_ON(tr->ops->func != ftrace_stub)) 5105 if (WARN_ON(tr->ops->func != ftrace_stub))
5081 printk("ftrace ops had %pS for function\n", 5106 printk("ftrace ops had %pS for function\n",
5082 tr->ops->func); 5107 tr->ops->func);
5083 /* Only the top level instance does pid tracing */
5084 if (!list_empty(&ftrace_pids)) {
5085 set_ftrace_pid_function(func);
5086 func = ftrace_pid_func;
5087 }
5088 } 5108 }
5089 tr->ops->func = func; 5109 tr->ops->func = func;
5090 tr->ops->private = tr; 5110 tr->ops->private = tr;
@@ -5371,7 +5391,7 @@ static void *fpid_start(struct seq_file *m, loff_t *pos)
5371{ 5391{
5372 mutex_lock(&ftrace_lock); 5392 mutex_lock(&ftrace_lock);
5373 5393
5374 if (list_empty(&ftrace_pids) && (!*pos)) 5394 if (!ftrace_pids_enabled() && (!*pos))
5375 return (void *) 1; 5395 return (void *) 1;
5376 5396
5377 return seq_list_start(&ftrace_pids, *pos); 5397 return seq_list_start(&ftrace_pids, *pos);
@@ -5610,6 +5630,7 @@ static struct ftrace_ops graph_ops = {
5610 .func = ftrace_stub, 5630 .func = ftrace_stub,
5611 .flags = FTRACE_OPS_FL_RECURSION_SAFE | 5631 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5612 FTRACE_OPS_FL_INITIALIZED | 5632 FTRACE_OPS_FL_INITIALIZED |
5633 FTRACE_OPS_FL_PID |
5613 FTRACE_OPS_FL_STUB, 5634 FTRACE_OPS_FL_STUB,
5614#ifdef FTRACE_GRAPH_TRAMP_ADDR 5635#ifdef FTRACE_GRAPH_TRAMP_ADDR
5615 .trampoline = FTRACE_GRAPH_TRAMP_ADDR, 5636 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..fc347f8b1bca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -400,6 +400,17 @@ struct rb_irq_work {
400}; 400};
401 401
402/* 402/*
403 * Structure to hold event state and handle nested events.
404 */
405struct rb_event_info {
406 u64 ts;
407 u64 delta;
408 unsigned long length;
409 struct buffer_page *tail_page;
410 int add_timestamp;
411};
412
413/*
403 * Used for which event context the event is in. 414 * Used for which event context the event is in.
404 * NMI = 0 415 * NMI = 0
405 * IRQ = 1 416 * IRQ = 1
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
1876 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; 1887 return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
1877} 1888}
1878 1889
1879static inline int
1880rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
1881 struct ring_buffer_event *event)
1882{
1883 unsigned long addr = (unsigned long)event;
1884 unsigned long index;
1885
1886 index = rb_event_index(event);
1887 addr &= PAGE_MASK;
1888
1889 return cpu_buffer->commit_page->page == (void *)addr &&
1890 rb_commit_index(cpu_buffer) == index;
1891}
1892
1893static void
1894rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
1895{
1896 unsigned long max_count;
1897
1898 /*
1899 * We only race with interrupts and NMIs on this CPU.
1900 * If we own the commit event, then we can commit
1901 * all others that interrupted us, since the interruptions
1902 * are in stack format (they finish before they come
1903 * back to us). This allows us to do a simple loop to
1904 * assign the commit to the tail.
1905 */
1906 again:
1907 max_count = cpu_buffer->nr_pages * 100;
1908
1909 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
1910 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
1911 return;
1912 if (RB_WARN_ON(cpu_buffer,
1913 rb_is_reader_page(cpu_buffer->tail_page)))
1914 return;
1915 local_set(&cpu_buffer->commit_page->page->commit,
1916 rb_page_write(cpu_buffer->commit_page));
1917 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
1918 cpu_buffer->write_stamp =
1919 cpu_buffer->commit_page->page->time_stamp;
1920 /* add barrier to keep gcc from optimizing too much */
1921 barrier();
1922 }
1923 while (rb_commit_index(cpu_buffer) !=
1924 rb_page_write(cpu_buffer->commit_page)) {
1925
1926 local_set(&cpu_buffer->commit_page->page->commit,
1927 rb_page_write(cpu_buffer->commit_page));
1928 RB_WARN_ON(cpu_buffer,
1929 local_read(&cpu_buffer->commit_page->page->commit) &
1930 ~RB_WRITE_MASK);
1931 barrier();
1932 }
1933
1934 /* again, keep gcc from optimizing */
1935 barrier();
1936
1937 /*
1938 * If an interrupt came in just after the first while loop
1939 * and pushed the tail page forward, we will be left with
1940 * a dangling commit that will never go forward.
1941 */
1942 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
1943 goto again;
1944}
1945
1946static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 1890static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1947{ 1891{
1948 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; 1892 cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
1968 iter->head = 0; 1912 iter->head = 0;
1969} 1913}
1970 1914
1971/* Slow path, do not inline */
1972static noinline struct ring_buffer_event *
1973rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1974{
1975 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
1976
1977 /* Not the first event on the page? */
1978 if (rb_event_index(event)) {
1979 event->time_delta = delta & TS_MASK;
1980 event->array[0] = delta >> TS_SHIFT;
1981 } else {
1982 /* nope, just zero it */
1983 event->time_delta = 0;
1984 event->array[0] = 0;
1985 }
1986
1987 return skip_time_extend(event);
1988}
1989
1990/**
1991 * rb_update_event - update event type and data
1992 * @event: the event to update
1993 * @type: the type of event
1994 * @length: the size of the event field in the ring buffer
1995 *
1996 * Update the type and data fields of the event. The length
1997 * is the actual size that is written to the ring buffer,
1998 * and with this, we can determine what to place into the
1999 * data field.
2000 */
2001static void
2002rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
2003 struct ring_buffer_event *event, unsigned length,
2004 int add_timestamp, u64 delta)
2005{
2006 /* Only a commit updates the timestamp */
2007 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
2008 delta = 0;
2009
2010 /*
2011 * If we need to add a timestamp, then we
2012 * add it to the start of the resevered space.
2013 */
2014 if (unlikely(add_timestamp)) {
2015 event = rb_add_time_stamp(event, delta);
2016 length -= RB_LEN_TIME_EXTEND;
2017 delta = 0;
2018 }
2019
2020 event->time_delta = delta;
2021 length -= RB_EVNT_HDR_SIZE;
2022 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
2023 event->type_len = 0;
2024 event->array[0] = length;
2025 } else
2026 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
2027}
2028
2029/* 1915/*
2030 * rb_handle_head_page - writer hit the head page 1916 * rb_handle_head_page - writer hit the head page
2031 * 1917 *
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
2184 return 0; 2070 return 0;
2185} 2071}
2186 2072
2187static unsigned rb_calculate_event_length(unsigned length)
2188{
2189 struct ring_buffer_event event; /* Used only for sizeof array */
2190
2191 /* zero length can cause confusions */
2192 if (!length)
2193 length++;
2194
2195 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
2196 length += sizeof(event.array[0]);
2197
2198 length += RB_EVNT_HDR_SIZE;
2199 length = ALIGN(length, RB_ARCH_ALIGNMENT);
2200
2201 return length;
2202}
2203
2204static inline void 2073static inline void
2205rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, 2074rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2206 struct buffer_page *tail_page, 2075 unsigned long tail, struct rb_event_info *info)
2207 unsigned long tail, unsigned long length)
2208{ 2076{
2077 struct buffer_page *tail_page = info->tail_page;
2209 struct ring_buffer_event *event; 2078 struct ring_buffer_event *event;
2079 unsigned long length = info->length;
2210 2080
2211 /* 2081 /*
2212 * Only the event that crossed the page boundary 2082 * Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
2276 */ 2146 */
2277static noinline struct ring_buffer_event * 2147static noinline struct ring_buffer_event *
2278rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, 2148rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2279 unsigned long length, unsigned long tail, 2149 unsigned long tail, struct rb_event_info *info)
2280 struct buffer_page *tail_page, u64 ts)
2281{ 2150{
2151 struct buffer_page *tail_page = info->tail_page;
2282 struct buffer_page *commit_page = cpu_buffer->commit_page; 2152 struct buffer_page *commit_page = cpu_buffer->commit_page;
2283 struct ring_buffer *buffer = cpu_buffer->buffer; 2153 struct ring_buffer *buffer = cpu_buffer->buffer;
2284 struct buffer_page *next_page; 2154 struct buffer_page *next_page;
2285 int ret; 2155 int ret;
2156 u64 ts;
2286 2157
2287 next_page = tail_page; 2158 next_page = tail_page;
2288 2159
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
2368 2239
2369 out_again: 2240 out_again:
2370 2241
2371 rb_reset_tail(cpu_buffer, tail_page, tail, length); 2242 rb_reset_tail(cpu_buffer, tail, info);
2372 2243
2373 /* fail and let the caller try again */ 2244 /* fail and let the caller try again */
2374 return ERR_PTR(-EAGAIN); 2245 return ERR_PTR(-EAGAIN);
2375 2246
2376 out_reset: 2247 out_reset:
2377 /* reset write */ 2248 /* reset write */
2378 rb_reset_tail(cpu_buffer, tail_page, tail, length); 2249 rb_reset_tail(cpu_buffer, tail, info);
2379 2250
2380 return NULL; 2251 return NULL;
2381} 2252}
2382 2253
2383static struct ring_buffer_event * 2254/* Slow path, do not inline */
2384__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, 2255static noinline struct ring_buffer_event *
2385 unsigned long length, u64 ts, 2256rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
2386 u64 delta, int add_timestamp)
2387{ 2257{
2388 struct buffer_page *tail_page; 2258 event->type_len = RINGBUF_TYPE_TIME_EXTEND;
2389 struct ring_buffer_event *event;
2390 unsigned long tail, write;
2391 2259
2392 /* 2260 /* Not the first event on the page? */
2393 * If the time delta since the last event is too big to 2261 if (rb_event_index(event)) {
2394 * hold in the time field of the event, then we append a 2262 event->time_delta = delta & TS_MASK;
2395 * TIME EXTEND event ahead of the data event. 2263 event->array[0] = delta >> TS_SHIFT;
2396 */ 2264 } else {
2397 if (unlikely(add_timestamp)) 2265 /* nope, just zero it */
2398 length += RB_LEN_TIME_EXTEND; 2266 event->time_delta = 0;
2267 event->array[0] = 0;
2268 }
2399 2269
2400 tail_page = cpu_buffer->tail_page; 2270 return skip_time_extend(event);
2401 write = local_add_return(length, &tail_page->write); 2271}
2402 2272
2403 /* set write to only the index of the write */ 2273static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2404 write &= RB_WRITE_MASK; 2274 struct ring_buffer_event *event);
2405 tail = write - length; 2275
2276/**
2277 * rb_update_event - update event type and data
2278 * @event: the event to update
2279 * @type: the type of event
2280 * @length: the size of the event field in the ring buffer
2281 *
2282 * Update the type and data fields of the event. The length
2283 * is the actual size that is written to the ring buffer,
2284 * and with this, we can determine what to place into the
2285 * data field.
2286 */
2287static void
2288rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
2289 struct ring_buffer_event *event,
2290 struct rb_event_info *info)
2291{
2292 unsigned length = info->length;
2293 u64 delta = info->delta;
2294
2295 /* Only a commit updates the timestamp */
2296 if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
2297 delta = 0;
2406 2298
2407 /* 2299 /*
2408 * If this is the first commit on the page, then it has the same 2300 * If we need to add a timestamp, then we
2409 * timestamp as the page itself. 2301 * add it to the start of the resevered space.
2410 */ 2302 */
2411 if (!tail) 2303 if (unlikely(info->add_timestamp)) {
2304 event = rb_add_time_stamp(event, delta);
2305 length -= RB_LEN_TIME_EXTEND;
2412 delta = 0; 2306 delta = 0;
2307 }
2413 2308
2414 /* See if we shot pass the end of this buffer page */ 2309 event->time_delta = delta;
2415 if (unlikely(write > BUF_PAGE_SIZE)) 2310 length -= RB_EVNT_HDR_SIZE;
2416 return rb_move_tail(cpu_buffer, length, tail, 2311 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
2417 tail_page, ts); 2312 event->type_len = 0;
2313 event->array[0] = length;
2314 } else
2315 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
2316}
2418 2317
2419 /* We reserved something on the buffer */ 2318static unsigned rb_calculate_event_length(unsigned length)
2319{
2320 struct ring_buffer_event event; /* Used only for sizeof array */
2420 2321
2421 event = __rb_page_index(tail_page, tail); 2322 /* zero length can cause confusions */
2422 kmemcheck_annotate_bitfield(event, bitfield); 2323 if (!length)
2423 rb_update_event(cpu_buffer, event, length, add_timestamp, delta); 2324 length++;
2424 2325
2425 local_inc(&tail_page->entries); 2326 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
2327 length += sizeof(event.array[0]);
2328
2329 length += RB_EVNT_HDR_SIZE;
2330 length = ALIGN(length, RB_ARCH_ALIGNMENT);
2426 2331
2427 /* 2332 /*
2428 * If this is the first commit on the page, then update 2333 * In case the time delta is larger than the 27 bits for it
2429 * its timestamp. 2334 * in the header, we need to add a timestamp. If another
2335 * event comes in when trying to discard this one to increase
2336 * the length, then the timestamp will be added in the allocated
2337 * space of this event. If length is bigger than the size needed
2338 * for the TIME_EXTEND, then padding has to be used. The events
2339 * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
2340 * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
2341 * As length is a multiple of 4, we only need to worry if it
2342 * is 12 (RB_LEN_TIME_EXTEND + 4).
2430 */ 2343 */
2431 if (!tail) 2344 if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
2432 tail_page->page->time_stamp = ts; 2345 length += RB_ALIGNMENT;
2433 2346
2434 /* account for these added bytes */ 2347 return length;
2435 local_add(length, &cpu_buffer->entries_bytes); 2348}
2436 2349
2437 return event; 2350#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2351static inline bool sched_clock_stable(void)
2352{
2353 return true;
2438} 2354}
2355#endif
2439 2356
2440static inline int 2357static inline int
2441rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, 2358rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
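
The relocated rb_calculate_event_length() above gains one rule: a computed length of RB_LEN_TIME_EXTEND + RB_ALIGNMENT is bumped by one alignment unit, so a later discard can always be replaced by either a lone TIME_EXTEND event or a TIME_EXTEND plus legal padding. A quick arithmetic check of that rule; the constant values (4-byte header, 4-byte alignment, 8-byte TIME_EXTEND) are the commonly used ones and are treated here as assumptions, and the large-event branch that reserves an extra length word is omitted:

#include <assert.h>
#include <stdio.h>

#define RB_EVNT_HDR_SIZE	4U
#define RB_ALIGNMENT		4U
#define RB_LEN_TIME_EXTEND	8U

#define ALIGN_UP(x, a)		(((x) + (a) - 1) & ~((a) - 1))

/* Simplified: shows only the new TIME_EXTEND padding rule. */
static unsigned int calc_event_length(unsigned int data_len)
{
	unsigned int length = data_len ? data_len : 1;	/* zero is confusing */

	length += RB_EVNT_HDR_SIZE;
	length = ALIGN_UP(length, RB_ALIGNMENT);

	/*
	 * 12 is neither exactly RB_LEN_TIME_EXTEND (8) nor at least
	 * RB_LEN_TIME_EXTEND + 8, the smallest size that still leaves
	 * room for legal padding, so bump it by one alignment unit.
	 */
	if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
		length += RB_ALIGNMENT;

	return length;
}

int main(void)
{
	assert(calc_event_length(3)  == 8);	/* 3 + 4 -> 7 -> aligned to 8 */
	assert(calc_event_length(8)  == 16);	/* 8 + 4 -> 12 -> bumped to 16 */
	assert(calc_event_length(12) == 16);	/* 12 + 4 -> 16, no bump needed */
	printf("ok\n");
	return 0;
}
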
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
2483 local_inc(&cpu_buffer->commits); 2400 local_inc(&cpu_buffer->commits);
2484} 2401}
2485 2402
2403static void
2404rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
2405{
2406 unsigned long max_count;
2407
2408 /*
2409 * We only race with interrupts and NMIs on this CPU.
2410 * If we own the commit event, then we can commit
2411 * all others that interrupted us, since the interruptions
2412 * are in stack format (they finish before they come
2413 * back to us). This allows us to do a simple loop to
2414 * assign the commit to the tail.
2415 */
2416 again:
2417 max_count = cpu_buffer->nr_pages * 100;
2418
2419 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
2420 if (RB_WARN_ON(cpu_buffer, !(--max_count)))
2421 return;
2422 if (RB_WARN_ON(cpu_buffer,
2423 rb_is_reader_page(cpu_buffer->tail_page)))
2424 return;
2425 local_set(&cpu_buffer->commit_page->page->commit,
2426 rb_page_write(cpu_buffer->commit_page));
2427 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
2428 cpu_buffer->write_stamp =
2429 cpu_buffer->commit_page->page->time_stamp;
2430 /* add barrier to keep gcc from optimizing too much */
2431 barrier();
2432 }
2433 while (rb_commit_index(cpu_buffer) !=
2434 rb_page_write(cpu_buffer->commit_page)) {
2435
2436 local_set(&cpu_buffer->commit_page->page->commit,
2437 rb_page_write(cpu_buffer->commit_page));
2438 RB_WARN_ON(cpu_buffer,
2439 local_read(&cpu_buffer->commit_page->page->commit) &
2440 ~RB_WRITE_MASK);
2441 barrier();
2442 }
2443
2444 /* again, keep gcc from optimizing */
2445 barrier();
2446
2447 /*
2448 * If an interrupt came in just after the first while loop
2449 * and pushed the tail page forward, we will be left with
2450 * a dangling commit that will never go forward.
2451 */
2452 if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
2453 goto again;
2454}
2455
2486static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer) 2456static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2487{ 2457{
2488 unsigned long commits; 2458 unsigned long commits;
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
2515 } 2485 }
2516} 2486}
2517 2487
2518static struct ring_buffer_event * 2488static inline void rb_event_discard(struct ring_buffer_event *event)
2519rb_reserve_next_event(struct ring_buffer *buffer,
2520 struct ring_buffer_per_cpu *cpu_buffer,
2521 unsigned long length)
2522{ 2489{
2523 struct ring_buffer_event *event; 2490 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2524 u64 ts, delta; 2491 event = skip_time_extend(event);
2525 int nr_loops = 0;
2526 int add_timestamp;
2527 u64 diff;
2528 2492
2529 rb_start_commit(cpu_buffer); 2493 /* array[0] holds the actual length for the discarded event */
2494 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2495 event->type_len = RINGBUF_TYPE_PADDING;
2496 /* time delta must be non zero */
2497 if (!event->time_delta)
2498 event->time_delta = 1;
2499}
2530 2500
2531#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP 2501static inline int
2532 /* 2502rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
2533 * Due to the ability to swap a cpu buffer from a buffer 2503 struct ring_buffer_event *event)
2534 * it is possible it was swapped before we committed. 2504{
2535 * (committing stops a swap). We check for it here and 2505 unsigned long addr = (unsigned long)event;
2536 * if it happened, we have to fail the write. 2506 unsigned long index;
2537 */
2538 barrier();
2539 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2540 local_dec(&cpu_buffer->committing);
2541 local_dec(&cpu_buffer->commits);
2542 return NULL;
2543 }
2544#endif
2545 2507
2546 length = rb_calculate_event_length(length); 2508 index = rb_event_index(event);
2547 again: 2509 addr &= PAGE_MASK;
2548 add_timestamp = 0; 2510
2549 delta = 0; 2511 return cpu_buffer->commit_page->page == (void *)addr &&
2512 rb_commit_index(cpu_buffer) == index;
2513}
2514
2515static void
2516rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2517 struct ring_buffer_event *event)
2518{
2519 u64 delta;
2550 2520
2551 /* 2521 /*
2552 * We allow for interrupts to reenter here and do a trace. 2522 * The event first in the commit queue updates the
2553 * If one does, it will cause this original code to loop 2523 * time stamp.
2554 * back here. Even with heavy interrupts happening, this
2555 * should only happen a few times in a row. If this happens
2556 * 1000 times in a row, there must be either an interrupt
2557 * storm or we have something buggy.
2558 * Bail!
2559 */ 2524 */
2560 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2525 if (rb_event_is_commit(cpu_buffer, event)) {
2561 goto out_fail; 2526 /*
2527 * A commit event that is first on a page
2528 * updates the write timestamp with the page stamp
2529 */
2530 if (!rb_event_index(event))
2531 cpu_buffer->write_stamp =
2532 cpu_buffer->commit_page->page->time_stamp;
2533 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2534 delta = event->array[0];
2535 delta <<= TS_SHIFT;
2536 delta += event->time_delta;
2537 cpu_buffer->write_stamp += delta;
2538 } else
2539 cpu_buffer->write_stamp += event->time_delta;
2540 }
2541}
2562 2542
2563 ts = rb_time_stamp(cpu_buffer->buffer); 2543static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2564 diff = ts - cpu_buffer->write_stamp; 2544 struct ring_buffer_event *event)
2545{
2546 local_inc(&cpu_buffer->entries);
2547 rb_update_write_stamp(cpu_buffer, event);
2548 rb_end_commit(cpu_buffer);
2549}
2565 2550
2566 /* make sure this diff is calculated here */ 2551static __always_inline void
2567 barrier(); 2552rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2553{
2554 bool pagebusy;
2568 2555
2569 /* Did the write stamp get updated already? */ 2556 if (buffer->irq_work.waiters_pending) {
2570 if (likely(ts >= cpu_buffer->write_stamp)) { 2557 buffer->irq_work.waiters_pending = false;
2571 delta = diff; 2558 /* irq_work_queue() supplies it's own memory barriers */
2572 if (unlikely(test_time_stamp(delta))) { 2559 irq_work_queue(&buffer->irq_work.work);
2573 int local_clock_stable = 1;
2574#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
2575 local_clock_stable = sched_clock_stable();
2576#endif
2577 WARN_ONCE(delta > (1ULL << 59),
2578 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2579 (unsigned long long)delta,
2580 (unsigned long long)ts,
2581 (unsigned long long)cpu_buffer->write_stamp,
2582 local_clock_stable ? "" :
2583 "If you just came from a suspend/resume,\n"
2584 "please switch to the trace global clock:\n"
2585 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2586 add_timestamp = 1;
2587 }
2588 } 2560 }
2589 2561
2590 event = __rb_reserve_next(cpu_buffer, length, ts, 2562 if (cpu_buffer->irq_work.waiters_pending) {
2591 delta, add_timestamp); 2563 cpu_buffer->irq_work.waiters_pending = false;
2592 if (unlikely(PTR_ERR(event) == -EAGAIN)) 2564 /* irq_work_queue() supplies it's own memory barriers */
2593 goto again; 2565 irq_work_queue(&cpu_buffer->irq_work.work);
2594 2566 }
2595 if (!event)
2596 goto out_fail;
2597 2567
2598 return event; 2568 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2599 2569
2600 out_fail: 2570 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2601 rb_end_commit(cpu_buffer); 2571 cpu_buffer->irq_work.wakeup_full = true;
2602 return NULL; 2572 cpu_buffer->irq_work.full_waiters_pending = false;
2573 /* irq_work_queue() supplies it's own memory barriers */
2574 irq_work_queue(&cpu_buffer->irq_work.work);
2575 }
2603} 2576}
2604 2577
2605/* 2578/*
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
2672} 2645}
2673 2646
2674/** 2647/**
2648 * ring_buffer_unlock_commit - commit a reserved
2649 * @buffer: The buffer to commit to
2650 * @event: The event pointer to commit.
2651 *
2652 * This commits the data to the ring buffer, and releases any locks held.
2653 *
2654 * Must be paired with ring_buffer_lock_reserve.
2655 */
2656int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2657 struct ring_buffer_event *event)
2658{
2659 struct ring_buffer_per_cpu *cpu_buffer;
2660 int cpu = raw_smp_processor_id();
2661
2662 cpu_buffer = buffer->buffers[cpu];
2663
2664 rb_commit(cpu_buffer, event);
2665
2666 rb_wakeups(buffer, cpu_buffer);
2667
2668 trace_recursive_unlock(cpu_buffer);
2669
2670 preempt_enable_notrace();
2671
2672 return 0;
2673}
2674EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2675
2676static noinline void
2677rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
2678 struct rb_event_info *info)
2679{
2680 WARN_ONCE(info->delta > (1ULL << 59),
2681 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
2682 (unsigned long long)info->delta,
2683 (unsigned long long)info->ts,
2684 (unsigned long long)cpu_buffer->write_stamp,
2685 sched_clock_stable() ? "" :
2686 "If you just came from a suspend/resume,\n"
2687 "please switch to the trace global clock:\n"
2688 " echo global > /sys/kernel/debug/tracing/trace_clock\n");
2689 info->add_timestamp = 1;
2690}
2691
2692static struct ring_buffer_event *
2693__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2694 struct rb_event_info *info)
2695{
2696 struct ring_buffer_event *event;
2697 struct buffer_page *tail_page;
2698 unsigned long tail, write;
2699
2700 /*
2701 * If the time delta since the last event is too big to
2702 * hold in the time field of the event, then we append a
2703 * TIME EXTEND event ahead of the data event.
2704 */
2705 if (unlikely(info->add_timestamp))
2706 info->length += RB_LEN_TIME_EXTEND;
2707
2708 tail_page = info->tail_page = cpu_buffer->tail_page;
2709 write = local_add_return(info->length, &tail_page->write);
2710
2711 /* set write to only the index of the write */
2712 write &= RB_WRITE_MASK;
2713 tail = write - info->length;
2714
2715 /*
2716 * If this is the first commit on the page, then it has the same
2717 * timestamp as the page itself.
2718 */
2719 if (!tail)
2720 info->delta = 0;
2721
2722 /* See if we shot pass the end of this buffer page */
2723 if (unlikely(write > BUF_PAGE_SIZE))
2724 return rb_move_tail(cpu_buffer, tail, info);
2725
2726 /* We reserved something on the buffer */
2727
2728 event = __rb_page_index(tail_page, tail);
2729 kmemcheck_annotate_bitfield(event, bitfield);
2730 rb_update_event(cpu_buffer, event, info);
2731
2732 local_inc(&tail_page->entries);
2733
2734 /*
2735 * If this is the first commit on the page, then update
2736 * its timestamp.
2737 */
2738 if (!tail)
2739 tail_page->page->time_stamp = info->ts;
2740
2741 /* account for these added bytes */
2742 local_add(info->length, &cpu_buffer->entries_bytes);
2743
2744 return event;
2745}
2746
2747static struct ring_buffer_event *
2748rb_reserve_next_event(struct ring_buffer *buffer,
2749 struct ring_buffer_per_cpu *cpu_buffer,
2750 unsigned long length)
2751{
2752 struct ring_buffer_event *event;
2753 struct rb_event_info info;
2754 int nr_loops = 0;
2755 u64 diff;
2756
2757 rb_start_commit(cpu_buffer);
2758
2759#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
2760 /*
2761 * Due to the ability to swap a cpu buffer from a buffer
2762 * it is possible it was swapped before we committed.
2763 * (committing stops a swap). We check for it here and
2764 * if it happened, we have to fail the write.
2765 */
2766 barrier();
2767 if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
2768 local_dec(&cpu_buffer->committing);
2769 local_dec(&cpu_buffer->commits);
2770 return NULL;
2771 }
2772#endif
2773
2774 info.length = rb_calculate_event_length(length);
2775 again:
2776 info.add_timestamp = 0;
2777 info.delta = 0;
2778
2779 /*
2780 * We allow for interrupts to reenter here and do a trace.
2781 * If one does, it will cause this original code to loop
2782 * back here. Even with heavy interrupts happening, this
2783 * should only happen a few times in a row. If this happens
2784 * 1000 times in a row, there must be either an interrupt
2785 * storm or we have something buggy.
2786 * Bail!
2787 */
2788 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2789 goto out_fail;
2790
2791 info.ts = rb_time_stamp(cpu_buffer->buffer);
2792 diff = info.ts - cpu_buffer->write_stamp;
2793
2794 /* make sure this diff is calculated here */
2795 barrier();
2796
2797 /* Did the write stamp get updated already? */
2798 if (likely(info.ts >= cpu_buffer->write_stamp)) {
2799 info.delta = diff;
2800 if (unlikely(test_time_stamp(info.delta)))
2801 rb_handle_timestamp(cpu_buffer, &info);
2802 }
2803
2804 event = __rb_reserve_next(cpu_buffer, &info);
2805
2806 if (unlikely(PTR_ERR(event) == -EAGAIN))
2807 goto again;
2808
2809 if (!event)
2810 goto out_fail;
2811
2812 return event;
2813
2814 out_fail:
2815 rb_end_commit(cpu_buffer);
2816 return NULL;
2817}
2818
2819/**
2675 * ring_buffer_lock_reserve - reserve a part of the buffer 2820 * ring_buffer_lock_reserve - reserve a part of the buffer
2676 * @buffer: the ring buffer to reserve from 2821 * @buffer: the ring buffer to reserve from
2677 * @length: the length of the data to reserve (excluding event header) 2822 * @length: the length of the data to reserve (excluding event header)
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
2729} 2874}
2730EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); 2875EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
2731 2876
2732static void
2733rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
2734 struct ring_buffer_event *event)
2735{
2736 u64 delta;
2737
2738 /*
2739 * The first event in the commit queue updates the
2740 * time stamp.
2741 */
2742 if (rb_event_is_commit(cpu_buffer, event)) {
2743 /*
2744 * A commit event that is first on a page
2745 * updates the write timestamp with the page stamp
2746 */
2747 if (!rb_event_index(event))
2748 cpu_buffer->write_stamp =
2749 cpu_buffer->commit_page->page->time_stamp;
2750 else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
2751 delta = event->array[0];
2752 delta <<= TS_SHIFT;
2753 delta += event->time_delta;
2754 cpu_buffer->write_stamp += delta;
2755 } else
2756 cpu_buffer->write_stamp += event->time_delta;
2757 }
2758}
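
When rb_update_write_stamp() above sees a TIME_EXTEND event, the full delta is rebuilt from two pieces: the low bits kept in the event's time_delta field and the high bits kept in array[0]. A small runnable sketch of that recombination; TS_SHIFT = 27 matches the ring buffer's definition but is treated here as an assumption:

#include <stdint.h>
#include <stdio.h>

#define TS_SHIFT 27     /* bits held by the event's time_delta field (assumed) */

int main(void)
{
        uint32_t time_delta = 0x3ffffff;  /* low bits from the event header */
        uint32_t array0     = 0x5;        /* high bits from array[0]        */
        uint64_t delta = ((uint64_t)array0 << TS_SHIFT) + time_delta;

        printf("reconstructed delta: %llu ns\n", (unsigned long long)delta);
        return 0;
}
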
2759
2760static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
2761 struct ring_buffer_event *event)
2762{
2763 local_inc(&cpu_buffer->entries);
2764 rb_update_write_stamp(cpu_buffer, event);
2765 rb_end_commit(cpu_buffer);
2766}
2767
2768static __always_inline void
2769rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
2770{
2771 bool pagebusy;
2772
2773 if (buffer->irq_work.waiters_pending) {
2774 buffer->irq_work.waiters_pending = false;
2775 /* irq_work_queue() supplies its own memory barriers */
2776 irq_work_queue(&buffer->irq_work.work);
2777 }
2778
2779 if (cpu_buffer->irq_work.waiters_pending) {
2780 cpu_buffer->irq_work.waiters_pending = false;
2781 /* irq_work_queue() supplies its own memory barriers */
2782 irq_work_queue(&cpu_buffer->irq_work.work);
2783 }
2784
2785 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
2786
2787 if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
2788 cpu_buffer->irq_work.wakeup_full = true;
2789 cpu_buffer->irq_work.full_waiters_pending = false;
2790 /* irq_work_queue() supplies its own memory barriers */
2791 irq_work_queue(&cpu_buffer->irq_work.work);
2792 }
2793}
2794
2795/**
2796 * ring_buffer_unlock_commit - commit a reserved event
2797 * @buffer: The buffer to commit to
2798 * @event: The event pointer to commit.
2799 *
2800 * This commits the data to the ring buffer, and releases any locks held.
2801 *
2802 * Must be paired with ring_buffer_lock_reserve.
2803 */
2804int ring_buffer_unlock_commit(struct ring_buffer *buffer,
2805 struct ring_buffer_event *event)
2806{
2807 struct ring_buffer_per_cpu *cpu_buffer;
2808 int cpu = raw_smp_processor_id();
2809
2810 cpu_buffer = buffer->buffers[cpu];
2811
2812 rb_commit(cpu_buffer, event);
2813
2814 rb_wakeups(buffer, cpu_buffer);
2815
2816 trace_recursive_unlock(cpu_buffer);
2817
2818 preempt_enable_notrace();
2819
2820 return 0;
2821}
2822EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
2823
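ring_buffer_unlock_commit() is the second half of the reserve/commit pairing documented above. A hedged kernel-context usage sketch; struct my_entry and the wrapper are illustrative, only the ring_buffer_* calls are the exported API:

#include <linux/ring_buffer.h>

struct my_entry {
        unsigned long ip;
        unsigned long val;
};

static int write_my_entry(struct ring_buffer *buffer,
                          unsigned long ip, unsigned long val)
{
        struct ring_buffer_event *event;
        struct my_entry *entry;

        event = ring_buffer_lock_reserve(buffer, sizeof(*entry));
        if (!event)
                return -EBUSY;          /* buffer full or recursion detected */

        entry = ring_buffer_event_data(event);
        entry->ip = ip;
        entry->val = val;

        return ring_buffer_unlock_commit(buffer, event);
}
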
2824static inline void rb_event_discard(struct ring_buffer_event *event)
2825{
2826 if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
2827 event = skip_time_extend(event);
2828
2829 /* array[0] holds the actual length for the discarded event */
2830 event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
2831 event->type_len = RINGBUF_TYPE_PADDING;
2832 /* time delta must be non zero */
2833 if (!event->time_delta)
2834 event->time_delta = 1;
2835}
2836
2837/* 2877/*
2838 * Decrement the entries to the page that an event is on. 2878 * Decrement the entries to the page that an event is on.
2839 * The event does not even need to exist, only the pointer 2879 * The event does not even need to exist, only the pointer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abcbf7ff8743..6e79408674aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
3035 if (!iter) 3035 if (!iter)
3036 return ERR_PTR(-ENOMEM); 3036 return ERR_PTR(-ENOMEM);
3037 3037
3038 iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(), 3038 iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
3039 GFP_KERNEL); 3039 GFP_KERNEL);
3040 if (!iter->buffer_iter) 3040 if (!iter->buffer_iter)
3041 goto release; 3041 goto release;
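
The hunk above replaces an open-coded kzalloc(n * size) with kcalloc(), which takes the element count and size separately and returns NULL on multiplication overflow instead of silently wrapping (it also sizes the array by nr_cpu_ids rather than num_possible_cpus()). A minimal kernel-context sketch of the preferred form; the helper name is illustrative:

#include <linux/ring_buffer.h>
#include <linux/slab.h>

/* Zeroed, overflow-checked array of per-cpu iterator pointers. */
static struct ring_buffer_iter **alloc_buffer_iters(int nr)
{
        return kcalloc(nr, sizeof(struct ring_buffer_iter *), GFP_KERNEL);
}
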
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
6990 trace_init_global_iter(&iter); 6990 trace_init_global_iter(&iter);
6991 6991
6992 for_each_tracing_cpu(cpu) { 6992 for_each_tracing_cpu(cpu) {
6993 atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); 6993 atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
6994 } 6994 }
6995 6995
6996 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; 6996 old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f060716b02ae..74bde81601a9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -444,6 +444,7 @@ enum {
444 444
445 TRACE_CONTROL_BIT, 445 TRACE_CONTROL_BIT,
446 446
447 TRACE_BRANCH_BIT,
447/* 448/*
448 * Abuse of the trace_recursion. 449 * Abuse of the trace_recursion.
449 * As we need a way to maintain state if we are tracing the function 450 * As we need a way to maintain state if we are tracing the function
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index a87b43f49eb4..e2e12ad3186f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -36,9 +36,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
36 struct trace_branch *entry; 36 struct trace_branch *entry;
37 struct ring_buffer *buffer; 37 struct ring_buffer *buffer;
38 unsigned long flags; 38 unsigned long flags;
39 int cpu, pc; 39 int pc;
40 const char *p; 40 const char *p;
41 41
42 if (current->trace_recursion & TRACE_BRANCH_BIT)
43 return;
44
42 /* 45 /*
43 * I would love to save just the ftrace_likely_data pointer, but 46 * I would love to save just the ftrace_likely_data pointer, but
44 * this code can also be used by modules. Ugly things can happen 47 * this code can also be used by modules. Ugly things can happen
@@ -49,10 +52,10 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
49 if (unlikely(!tr)) 52 if (unlikely(!tr))
50 return; 53 return;
51 54
52 local_irq_save(flags); 55 raw_local_irq_save(flags);
53 cpu = raw_smp_processor_id(); 56 current->trace_recursion |= TRACE_BRANCH_BIT;
54 data = per_cpu_ptr(tr->trace_buffer.data, cpu); 57 data = this_cpu_ptr(tr->trace_buffer.data);
55 if (atomic_inc_return(&data->disabled) != 1) 58 if (atomic_read(&data->disabled))
56 goto out; 59 goto out;
57 60
58 pc = preempt_count(); 61 pc = preempt_count();
@@ -81,8 +84,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
81 __buffer_unlock_commit(buffer, event); 84 __buffer_unlock_commit(buffer, event);
82 85
83 out: 86 out:
84 atomic_dec(&data->disabled); 87 current->trace_recursion &= ~TRACE_BRANCH_BIT;
85 local_irq_restore(flags); 88 raw_local_irq_restore(flags);
86} 89}
87 90
88static inline 91static inline
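
TRACE_BRANCH_BIT above acts as a per-task recursion guard: the probe sets the bit on entry and bails out immediately if it is already set, so branch-annotated code reached from inside the probe cannot recurse into it. A runnable userspace sketch of the same pattern using a thread-local flag:

#include <stdbool.h>
#include <stdio.h>

static _Thread_local bool in_branch_probe;

static void probe(const char *what)
{
        if (in_branch_probe)            /* already inside: do nothing */
                return;
        in_branch_probe = true;

        printf("recording branch event: %s\n", what);
        /* instrumented code called from here re-enters probe() and returns early */

        in_branch_probe = false;
}

int main(void)
{
        probe("example");
        return 0;
}
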
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 404a372ad85a..7ca09cdc20c2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -30,6 +30,7 @@
30DEFINE_MUTEX(event_mutex); 30DEFINE_MUTEX(event_mutex);
31 31
32LIST_HEAD(ftrace_events); 32LIST_HEAD(ftrace_events);
33static LIST_HEAD(ftrace_generic_fields);
33static LIST_HEAD(ftrace_common_fields); 34static LIST_HEAD(ftrace_common_fields);
34 35
35#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) 36#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
94 struct ftrace_event_field *field; 95 struct ftrace_event_field *field;
95 struct list_head *head; 96 struct list_head *head;
96 97
98 field = __find_event_field(&ftrace_generic_fields, name);
99 if (field)
100 return field;
101
97 field = __find_event_field(&ftrace_common_fields, name); 102 field = __find_event_field(&ftrace_common_fields, name);
98 if (field) 103 if (field)
99 return field; 104 return field;
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
144} 149}
145EXPORT_SYMBOL_GPL(trace_define_field); 150EXPORT_SYMBOL_GPL(trace_define_field);
146 151
152#define __generic_field(type, item, filter_type) \
153 ret = __trace_define_field(&ftrace_generic_fields, #type, \
154 #item, 0, 0, is_signed_type(type), \
155 filter_type); \
156 if (ret) \
157 return ret;
158
147#define __common_field(type, item) \ 159#define __common_field(type, item) \
148 ret = __trace_define_field(&ftrace_common_fields, #type, \ 160 ret = __trace_define_field(&ftrace_common_fields, #type, \
149 "common_" #item, \ 161 "common_" #item, \
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
153 if (ret) \ 165 if (ret) \
154 return ret; 166 return ret;
155 167
168static int trace_define_generic_fields(void)
169{
170 int ret;
171
172 __generic_field(int, cpu, FILTER_OTHER);
173 __generic_field(char *, comm, FILTER_PTR_STRING);
174
175 return ret;
176}
177
156static int trace_define_common_fields(void) 178static int trace_define_common_fields(void)
157{ 179{
158 int ret; 180 int ret;
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void)
2671 if (!entry) 2693 if (!entry)
2672 pr_warn("Could not create tracefs 'available_events' entry\n"); 2694 pr_warn("Could not create tracefs 'available_events' entry\n");
2673 2695
2696 if (trace_define_generic_fields())
2697 pr_warn("tracing: Failed to allocate generic fields");
2698
2674 if (trace_define_common_fields()) 2699 if (trace_define_common_fields())
2675 pr_warn("tracing: Failed to allocate common fields"); 2700 pr_warn("tracing: Failed to allocate common fields");
2676 2701
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d81d6f302b14..bd1bf184c5c9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
252 return match; 252 return match;
253} 253}
254 254
255/* Filter predicate for CPUs. */
256static int filter_pred_cpu(struct filter_pred *pred, void *event)
257{
258 int cpu, cmp;
259 int match = 0;
260
261 cpu = raw_smp_processor_id();
262 cmp = pred->val;
263
264 switch (pred->op) {
265 case OP_EQ:
266 match = cpu == cmp;
267 break;
268 case OP_LT:
269 match = cpu < cmp;
270 break;
271 case OP_LE:
272 match = cpu <= cmp;
273 break;
274 case OP_GT:
275 match = cpu > cmp;
276 break;
277 case OP_GE:
278 match = cpu >= cmp;
279 break;
280 default:
281 break;
282 }
283
284 return !!match == !pred->not;
285}
286
287/* Filter predicate for COMM. */
288static int filter_pred_comm(struct filter_pred *pred, void *event)
289{
290 int cmp, match;
291
292 cmp = pred->regex.match(current->comm, &pred->regex,
293 pred->regex.field_len);
294 match = cmp ^ pred->not;
295
296 return match;
297}
298
255static int filter_pred_none(struct filter_pred *pred, void *event) 299static int filter_pred_none(struct filter_pred *pred, void *event)
256{ 300{
257 return 0; 301 return 0;
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
1002 if (is_string_field(field)) { 1046 if (is_string_field(field)) {
1003 filter_build_regex(pred); 1047 filter_build_regex(pred);
1004 1048
1005 if (field->filter_type == FILTER_STATIC_STRING) { 1049 if (!strcmp(field->name, "comm")) {
1050 fn = filter_pred_comm;
1051 pred->regex.field_len = TASK_COMM_LEN;
1052 } else if (field->filter_type == FILTER_STATIC_STRING) {
1006 fn = filter_pred_string; 1053 fn = filter_pred_string;
1007 pred->regex.field_len = field->size; 1054 pred->regex.field_len = field->size;
1008 } else if (field->filter_type == FILTER_DYN_STRING) 1055 } else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
1025 } 1072 }
1026 pred->val = val; 1073 pred->val = val;
1027 1074
1028 fn = select_comparison_fn(pred->op, field->size, 1075 if (!strcmp(field->name, "cpu"))
1076 fn = filter_pred_cpu;
1077 else
1078 fn = select_comparison_fn(pred->op, field->size,
1029 field->is_signed); 1079 field->is_signed);
1030 if (!fn) { 1080 if (!fn) {
1031 parse_error(ps, FILT_ERR_INVALID_OP, 0); 1081 parse_error(ps, FILT_ERR_INVALID_OP, 0);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8968bf720c12..ca98445782ac 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
715 715
716 snprintf(nsecs_str, slen, "%03lu", nsecs_rem); 716 snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
717 trace_seq_printf(s, ".%s", nsecs_str); 717 trace_seq_printf(s, ".%s", nsecs_str);
718 len += strlen(nsecs_str); 718 len += strlen(nsecs_str) + 1;
719 } 719 }
720 720
721 trace_seq_puts(s, " us "); 721 trace_seq_puts(s, " us ");
722 722
723 /* Print remaining spaces to fit the row's width */ 723 /* Print remaining spaces to fit the row's width */
724 for (i = len; i < 7; i++) 724 for (i = len; i < 8; i++)
725 trace_seq_putc(s, ' '); 725 trace_seq_putc(s, ' ');
726} 726}
727 727
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b7d0cdd9906c..c9956440d0e6 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -165,11 +165,9 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, 165static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
166 void *addr, void *dest) 166 void *addr, void *dest)
167{ 167{
168 long ret;
169 int maxlen = get_rloc_len(*(u32 *)dest); 168 int maxlen = get_rloc_len(*(u32 *)dest);
170 u8 *dst = get_rloc_data(dest); 169 u8 *dst = get_rloc_data(dest);
171 u8 *src = addr; 170 long ret;
172 mm_segment_t old_fs = get_fs();
173 171
174 if (!maxlen) 172 if (!maxlen)
175 return; 173 return;
@@ -178,23 +176,13 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
178 * Try to get string again, since the string can be changed while 176 * Try to get string again, since the string can be changed while
179 * probing. 177 * probing.
180 */ 178 */
181 set_fs(KERNEL_DS); 179 ret = strncpy_from_unsafe(dst, addr, maxlen);
182 pagefault_disable();
183
184 do
185 ret = __copy_from_user_inatomic(dst++, src++, 1);
186 while (dst[-1] && ret == 0 && src - (u8 *)addr < maxlen);
187
188 dst[-1] = '\0';
189 pagefault_enable();
190 set_fs(old_fs);
191 180
192 if (ret < 0) { /* Failed to fetch string */ 181 if (ret < 0) { /* Failed to fetch string */
193 ((u8 *)get_rloc_data(dest))[0] = '\0'; 182 dst[0] = '\0';
194 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest)); 183 *(u32 *)dest = make_data_rloc(0, get_rloc_offs(*(u32 *)dest));
195 } else { 184 } else {
196 *(u32 *)dest = make_data_rloc(src - (u8 *)addr, 185 *(u32 *)dest = make_data_rloc(ret, get_rloc_offs(*(u32 *)dest));
197 get_rloc_offs(*(u32 *)dest));
198 } 186 }
199} 187}
200NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string)); 188NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
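
strncpy_from_unsafe() replaces the open-coded, pagefault-disabled byte loop above. A hedged kernel-context sketch of the call-site pattern; the helper name is illustrative, the header is assumed, and the return convention (copied length on success, negative on fault) is inferred from how the diff uses the value rather than asserted:

#include <linux/uaccess.h>      /* assumed location of the declaration */

static long fetch_string_example(char *dst, const void *unsafe_src, long maxlen)
{
        long ret = strncpy_from_unsafe(dst, unsafe_src, maxlen);

        if (ret < 0)            /* fault while reading the string */
                dst[0] = '\0';
        return ret;
}
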
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index dfab253727dc..8e481a84aeea 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -496,6 +496,8 @@ static const struct trace_mark {
496 char sym; 496 char sym;
497} mark[] = { 497} mark[] = {
498 MARK(1000000000ULL , '$'), /* 1 sec */ 498 MARK(1000000000ULL , '$'), /* 1 sec */
499 MARK(100000000ULL , '@'), /* 100 msec */
500 MARK(10000000ULL , '*'), /* 10 msec */
499 MARK(1000000ULL , '#'), /* 1000 usecs */ 501 MARK(1000000ULL , '#'), /* 1000 usecs */
500 MARK(100000ULL , '!'), /* 100 usecs */ 502 MARK(100000ULL , '!'), /* 100 usecs */
501 MARK(10000ULL , '+'), /* 10 usecs */ 503 MARK(10000ULL , '+'), /* 10 usecs */
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d)
508 int size = ARRAY_SIZE(mark); 510 int size = ARRAY_SIZE(mark);
509 511
510 for (i = 0; i < size; i++) { 512 for (i = 0; i < size; i++) {
511 if (d >= mark[i].val) 513 if (d > mark[i].val)
512 break; 514 break;
513 } 515 }
514 516
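
The two new thresholds above give latencies in the 10 ms to 1 s range their own markers. A runnable sketch of the lookup, mirroring the table and the '>' comparison from the diff; find_mark() is a simplified stand-in for trace_find_mark():

#include <stdio.h>

static const struct { unsigned long long val; char sym; } mark[] = {
        { 1000000000ULL, '$' }, /* 1 sec   */
        {  100000000ULL, '@' }, /* 100 ms  */
        {   10000000ULL, '*' }, /* 10 ms   */
        {    1000000ULL, '#' }, /* 1000 us */
        {     100000ULL, '!' }, /* 100 us  */
        {      10000ULL, '+' }, /* 10 us   */
};
#define NMARKS (sizeof(mark) / sizeof(mark[0]))

static char find_mark(unsigned long long d)
{
        unsigned int i;

        for (i = 0; i < NMARKS; i++)
                if (d > mark[i].val)
                        break;
        return i < NMARKS ? mark[i].sym : ' ';
}

int main(void)
{
        printf("150 ms -> '%c'\n", find_mark(150000000ULL));    /* '@' */
        printf("50 ns  -> '%c'\n", find_mark(50ULL));           /* ' ' */
        return 0;
}
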
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 419ca37e72c9..f270088e9929 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -26,7 +26,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
26} 26}
27 27
28static void 28static void
29probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) 29probe_sched_wakeup(void *ignore, struct task_struct *wakee)
30{ 30{
31 if (unlikely(!sched_ref)) 31 if (unlikely(!sched_ref))
32 return; 32 return;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 9b33dd117f3f..12cbe77b4136 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -514,7 +514,7 @@ static void wakeup_reset(struct trace_array *tr)
514} 514}
515 515
516static void 516static void
517probe_wakeup(void *ignore, struct task_struct *p, int success) 517probe_wakeup(void *ignore, struct task_struct *p)
518{ 518{
519 struct trace_array_cpu *data; 519 struct trace_array_cpu *data;
520 int cpu = smp_processor_id(); 520 int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3f34496244e9..b746399ab59c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,12 +18,6 @@
18 18
19#define STACK_TRACE_ENTRIES 500 19#define STACK_TRACE_ENTRIES 500
20 20
21#ifdef CC_USING_FENTRY
22# define fentry 1
23#else
24# define fentry 0
25#endif
26
27static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = 21static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
28 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; 22 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
29static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; 23static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
35 */ 29 */
36static struct stack_trace max_stack_trace = { 30static struct stack_trace max_stack_trace = {
37 .max_entries = STACK_TRACE_ENTRIES - 1, 31 .max_entries = STACK_TRACE_ENTRIES - 1,
38 .entries = &stack_dump_trace[1], 32 .entries = &stack_dump_trace[0],
39}; 33};
40 34
41static unsigned long max_stack_size; 35static unsigned long max_stack_size;
@@ -55,7 +49,7 @@ static inline void print_max_stack(void)
55 49
56 pr_emerg(" Depth Size Location (%d entries)\n" 50 pr_emerg(" Depth Size Location (%d entries)\n"
57 " ----- ---- --------\n", 51 " ----- ---- --------\n",
58 max_stack_trace.nr_entries - 1); 52 max_stack_trace.nr_entries);
59 53
60 for (i = 0; i < max_stack_trace.nr_entries; i++) { 54 for (i = 0; i < max_stack_trace.nr_entries; i++) {
61 if (stack_dump_trace[i] == ULONG_MAX) 55 if (stack_dump_trace[i] == ULONG_MAX)
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack)
77 unsigned long this_size, flags; unsigned long *p, *top, *start; 71 unsigned long this_size, flags; unsigned long *p, *top, *start;
78 static int tracer_frame; 72 static int tracer_frame;
79 int frame_size = ACCESS_ONCE(tracer_frame); 73 int frame_size = ACCESS_ONCE(tracer_frame);
80 int i; 74 int i, x;
81 75
82 this_size = ((unsigned long)stack) & (THREAD_SIZE-1); 76 this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
83 this_size = THREAD_SIZE - this_size; 77 this_size = THREAD_SIZE - this_size;
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack)
105 max_stack_size = this_size; 99 max_stack_size = this_size;
106 100
107 max_stack_trace.nr_entries = 0; 101 max_stack_trace.nr_entries = 0;
108 102 max_stack_trace.skip = 3;
109 if (using_ftrace_ops_list_func())
110 max_stack_trace.skip = 4;
111 else
112 max_stack_trace.skip = 3;
113 103
114 save_stack_trace(&max_stack_trace); 104 save_stack_trace(&max_stack_trace);
115 105
116 /* 106 /* Skip over the overhead of the stack tracer itself */
117 * Add the passed in ip from the function tracer. 107 for (i = 0; i < max_stack_trace.nr_entries; i++) {
118 * Searching for this on the stack will skip over 108 if (stack_dump_trace[i] == ip)
119 * most of the overhead from the stack tracer itself. 109 break;
120 */ 110 }
121 stack_dump_trace[0] = ip;
122 max_stack_trace.nr_entries++;
123 111
124 /* 112 /*
125 * Now find where in the stack these are. 113 * Now find where in the stack these are.
126 */ 114 */
127 i = 0; 115 x = 0;
128 start = stack; 116 start = stack;
129 top = (unsigned long *) 117 top = (unsigned long *)
130 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); 118 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack)
139 while (i < max_stack_trace.nr_entries) { 127 while (i < max_stack_trace.nr_entries) {
140 int found = 0; 128 int found = 0;
141 129
142 stack_dump_index[i] = this_size; 130 stack_dump_index[x] = this_size;
143 p = start; 131 p = start;
144 132
145 for (; p < top && i < max_stack_trace.nr_entries; p++) { 133 for (; p < top && i < max_stack_trace.nr_entries; p++) {
134 if (stack_dump_trace[i] == ULONG_MAX)
135 break;
146 if (*p == stack_dump_trace[i]) { 136 if (*p == stack_dump_trace[i]) {
147 this_size = stack_dump_index[i++] = 137 stack_dump_trace[x] = stack_dump_trace[i++];
138 this_size = stack_dump_index[x++] =
148 (top - p) * sizeof(unsigned long); 139 (top - p) * sizeof(unsigned long);
149 found = 1; 140 found = 1;
150 /* Start the search from here */ 141 /* Start the search from here */
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack)
156 * out what that is, then figure it out 147 * out what that is, then figure it out
157 * now. 148 * now.
158 */ 149 */
159 if (unlikely(!tracer_frame) && i == 1) { 150 if (unlikely(!tracer_frame)) {
160 tracer_frame = (p - stack) * 151 tracer_frame = (p - stack) *
161 sizeof(unsigned long); 152 sizeof(unsigned long);
162 max_stack_size -= tracer_frame; 153 max_stack_size -= tracer_frame;
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack)
168 i++; 159 i++;
169 } 160 }
170 161
162 max_stack_trace.nr_entries = x;
163 for (; x < i; x++)
164 stack_dump_trace[x] = ULONG_MAX;
165
171 if (task_stack_end_corrupted(current)) { 166 if (task_stack_end_corrupted(current)) {
172 print_max_stack(); 167 print_max_stack();
173 BUG(); 168 BUG();
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
192 if (per_cpu(trace_active, cpu)++ != 0) 187 if (per_cpu(trace_active, cpu)++ != 0)
193 goto out; 188 goto out;
194 189
195 /* 190 ip += MCOUNT_INSN_SIZE;
196 * When fentry is used, the traced function does not get
197 * its stack frame set up, and we lose the parent.
198 * The ip is pretty useless because the function tracer
199 * was called before that function set up its stack frame.
200 * In this case, we use the parent ip.
201 *
202 * By adding the return address of either the parent ip
203 * or the current ip we can disregard most of the stack usage
204 * caused by the stack tracer itself.
205 *
206 * The function tracer always reports the address of where the
207 * mcount call was, but the stack will hold the return address.
208 */
209 if (fentry)
210 ip = parent_ip;
211 else
212 ip += MCOUNT_INSN_SIZE;
213 191
214 check_stack(ip, &stack); 192 check_stack(ip, &stack);
215 193
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos)
284{ 262{
285 long n = *pos - 1; 263 long n = *pos - 1;
286 264
287 if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX) 265 if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
288 return NULL; 266 return NULL;
289 267
290 m->private = (void *)n; 268 m->private = (void *)n;
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v)
354 seq_printf(m, " Depth Size Location" 332 seq_printf(m, " Depth Size Location"
355 " (%d entries)\n" 333 " (%d entries)\n"
356 " ----- ---- --------\n", 334 " ----- ---- --------\n",
357 max_stack_trace.nr_entries - 1); 335 max_stack_trace.nr_entries);
358 336
359 if (!stack_tracer_enabled && !max_stack_size) 337 if (!stack_tracer_enabled && !max_stack_size)
360 print_disabled(m); 338 print_disabled(m);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index aa1ea7b36fa8..d2f6d0be3503 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -601,7 +601,22 @@ static int probes_seq_show(struct seq_file *m, void *v)
601 601
602 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system, 602 seq_printf(m, "%c:%s/%s", c, tu->tp.call.class->system,
603 trace_event_name(&tu->tp.call)); 603 trace_event_name(&tu->tp.call));
604 seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset); 604 seq_printf(m, " %s:", tu->filename);
605
606 /* Don't print "0x (null)" when offset is 0 */
607 if (tu->offset) {
608 seq_printf(m, "0x%p", (void *)tu->offset);
609 } else {
610 switch (sizeof(void *)) {
611 case 4:
612 seq_printf(m, "0x00000000");
613 break;
614 case 8:
615 default:
616 seq_printf(m, "0x0000000000000000");
617 break;
618 }
619 }
605 620
606 for (i = 0; i < tu->tp.nr_args; i++) 621 for (i = 0; i < tu->tp.nr_args; i++)
607 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm); 622 seq_printf(m, " %s=%s", tu->tp.args[i].name, tu->tp.args[i].comm);
@@ -1095,11 +1110,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
1095{ 1110{
1096 struct trace_event_call *call = &tu->tp.call; 1111 struct trace_event_call *call = &tu->tp.call;
1097 struct uprobe_trace_entry_head *entry; 1112 struct uprobe_trace_entry_head *entry;
1113 struct bpf_prog *prog = call->prog;
1098 struct hlist_head *head; 1114 struct hlist_head *head;
1099 void *data; 1115 void *data;
1100 int size, esize; 1116 int size, esize;
1101 int rctx; 1117 int rctx;
1102 1118
1119 if (prog && !trace_call_bpf(prog, regs))
1120 return;
1121
1103 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu)); 1122 esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
1104 1123
1105 size = esize + tu->tp.size + dsize; 1124 size = esize + tu->tp.size + dsize;
@@ -1289,6 +1308,7 @@ static int register_uprobe_event(struct trace_uprobe *tu)
1289 return -ENODEV; 1308 return -ENODEV;
1290 } 1309 }
1291 1310
1311 call->flags = TRACE_EVENT_FL_UPROBE;
1292 call->class->reg = trace_uprobe_register; 1312 call->class->reg = trace_uprobe_register;
1293 call->data = tu; 1313 call->data = tu;
1294 ret = trace_add_event_call(call); 1314 ret = trace_add_event_call(call);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 4109f8320684..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
39 cred->cap_inheritable = CAP_EMPTY_SET; 39 cred->cap_inheritable = CAP_EMPTY_SET;
40 cred->cap_permitted = CAP_FULL_SET; 40 cred->cap_permitted = CAP_FULL_SET;
41 cred->cap_effective = CAP_FULL_SET; 41 cred->cap_effective = CAP_FULL_SET;
42 cred->cap_ambient = CAP_EMPTY_SET;
42 cred->cap_bset = CAP_FULL_SET; 43 cred->cap_bset = CAP_FULL_SET;
43#ifdef CONFIG_KEYS 44#ifdef CONFIG_KEYS
44 key_put(cred->request_key_auth); 45 key_put(cred->request_key_auth);
@@ -976,8 +977,8 @@ static int userns_install(struct nsproxy *nsproxy, struct ns_common *ns)
976 if (user_ns == current_user_ns()) 977 if (user_ns == current_user_ns())
977 return -EINVAL; 978 return -EINVAL;
978 979
979 /* Threaded processes may not enter a different user namespace */ 980 /* Tasks that share a thread group must share a user namespace */
980 if (atomic_read(&current->mm->mm_users) > 1) 981 if (!thread_group_empty(current))
981 return -EINVAL; 982 return -EINVAL;
982 983
983 if (current->fs->users != 1) 984 if (current->fs->users != 1)
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
24#include <asm/irq_regs.h> 24#include <asm/irq_regs.h>
25#include <linux/kvm_para.h> 25#include <linux/kvm_para.h>
26#include <linux/perf_event.h> 26#include <linux/perf_event.h>
27#include <linux/kthread.h>
27 28
28/* 29/*
29 * The run state of the lockup detectors is controlled by the content of the 30 * The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
66#define for_each_watchdog_cpu(cpu) \ 67#define for_each_watchdog_cpu(cpu) \
67 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) 68 for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
68 69
70/*
71 * The 'watchdog_running' variable is set to 1 when the watchdog threads
72 * are registered/started and is set to 0 when the watchdog threads are
73 * unregistered/stopped, so it is an indicator of whether the threads exist.
74 */
69static int __read_mostly watchdog_running; 75static int __read_mostly watchdog_running;
76/*
77 * If a subsystem has a need to deactivate the watchdog temporarily, it
78 * can use the suspend/resume interface to achieve this. The content of
79 * the 'watchdog_suspended' variable reflects this state. Existing threads
80 * are parked/unparked by the lockup_detector_{suspend|resume} functions
81 * (see comment blocks pertaining to those functions for further details).
82 *
83 * 'watchdog_suspended' also prevents threads from being registered/started
84 * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
85 * of 'watchdog_running' cannot change while the watchdog is deactivated
86 * temporarily (see related code in 'proc' handlers).
87 */
88static int __read_mostly watchdog_suspended;
89
70static u64 __read_mostly sample_period; 90static u64 __read_mostly sample_period;
71 91
72static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 92static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
613 } 633 }
614} 634}
615 635
616void watchdog_nmi_enable_all(void)
617{
618 int cpu;
619
620 mutex_lock(&watchdog_proc_mutex);
621
622 if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
623 goto unlock;
624
625 get_online_cpus();
626 for_each_watchdog_cpu(cpu)
627 watchdog_nmi_enable(cpu);
628 put_online_cpus();
629
630unlock:
631 mutex_unlock(&watchdog_proc_mutex);
632}
633
634void watchdog_nmi_disable_all(void)
635{
636 int cpu;
637
638 mutex_lock(&watchdog_proc_mutex);
639
640 if (!watchdog_running)
641 goto unlock;
642
643 get_online_cpus();
644 for_each_watchdog_cpu(cpu)
645 watchdog_nmi_disable(cpu);
646 put_online_cpus();
647
648unlock:
649 mutex_unlock(&watchdog_proc_mutex);
650}
651#else 636#else
652static int watchdog_nmi_enable(unsigned int cpu) { return 0; } 637static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
653static void watchdog_nmi_disable(unsigned int cpu) { return; } 638static void watchdog_nmi_disable(unsigned int cpu) { return; }
654void watchdog_nmi_enable_all(void) {}
655void watchdog_nmi_disable_all(void) {}
656#endif /* CONFIG_HARDLOCKUP_DETECTOR */ 639#endif /* CONFIG_HARDLOCKUP_DETECTOR */
657 640
658static struct smp_hotplug_thread watchdog_threads = { 641static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
666 .unpark = watchdog_enable, 649 .unpark = watchdog_enable,
667}; 650};
668 651
669static void restart_watchdog_hrtimer(void *info) 652/*
653 * park all watchdog threads that are specified in 'watchdog_cpumask'
654 */
655static int watchdog_park_threads(void)
670{ 656{
671 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer); 657 int cpu, ret = 0;
672 int ret;
673 658
659 get_online_cpus();
660 for_each_watchdog_cpu(cpu) {
661 ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
662 if (ret)
663 break;
664 }
665 if (ret) {
666 for_each_watchdog_cpu(cpu)
667 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
668 }
669 put_online_cpus();
670
671 return ret;
672}
673
674/*
675 * unpark all watchdog threads that are specified in 'watchdog_cpumask'
676 */
677static void watchdog_unpark_threads(void)
678{
679 int cpu;
680
681 get_online_cpus();
682 for_each_watchdog_cpu(cpu)
683 kthread_unpark(per_cpu(softlockup_watchdog, cpu));
684 put_online_cpus();
685}
686
687/*
688 * Suspend the hard and soft lockup detector by parking the watchdog threads.
689 */
690int lockup_detector_suspend(void)
691{
692 int ret = 0;
693
694 mutex_lock(&watchdog_proc_mutex);
674 /* 695 /*
675 * No need to cancel and restart hrtimer if it is currently executing 696 * Multiple suspend requests can be active in parallel (counted by
676 * because it will reprogram itself with the new period now. 697 * the 'watchdog_suspended' variable). If the watchdog threads are
677 * We should never see it unqueued here because we are running per-cpu 698 * running, the first caller takes care that they will be parked.
678 * with interrupts disabled. 699 * The state of 'watchdog_running' cannot change while a suspend
700 * request is active (see related code in 'proc' handlers).
679 */ 701 */
680 ret = hrtimer_try_to_cancel(hrtimer); 702 if (watchdog_running && !watchdog_suspended)
681 if (ret == 1) 703 ret = watchdog_park_threads();
682 hrtimer_start(hrtimer, ns_to_ktime(sample_period), 704
683 HRTIMER_MODE_REL_PINNED); 705 if (ret == 0)
706 watchdog_suspended++;
707
708 mutex_unlock(&watchdog_proc_mutex);
709
710 return ret;
684} 711}
685 712
686static void update_watchdog(int cpu) 713/*
714 * Resume the hard and soft lockup detector by unparking the watchdog threads.
715 */
716void lockup_detector_resume(void)
687{ 717{
718 mutex_lock(&watchdog_proc_mutex);
719
720 watchdog_suspended--;
688 /* 721 /*
689 * Make sure that perf event counter will adopt to a new 722 * The watchdog threads are unparked if they were previously running
690 * sampling period. Updating the sampling period directly would 723 * and if there is no more active suspend request.
691 * be much nicer but we do not have an API for that now so
692 * let's use a big hammer.
693 * Hrtimer will adopt the new period on the next tick but this
694 * might be late already so we have to restart the timer as well.
695 */ 724 */
696 watchdog_nmi_disable(cpu); 725 if (watchdog_running && !watchdog_suspended)
697 smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1); 726 watchdog_unpark_threads();
698 watchdog_nmi_enable(cpu); 727
728 mutex_unlock(&watchdog_proc_mutex);
699} 729}
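
lockup_detector_suspend() and lockup_detector_resume() give other subsystems a way to quiesce the watchdog threads temporarily. A hedged kernel-context usage sketch; the caller and the header location are assumptions, the two lockup_detector_* calls are the functions introduced above:

#include <linux/nmi.h>          /* assumed location of the declarations */

static int do_watchdog_unfriendly_work(void)
{
        int err;

        err = lockup_detector_suspend();        /* parks the watchdog threads */
        if (err)
                return err;                     /* could not park; bail out */

        /* ... long-running work that would otherwise trip the detectors ... */

        lockup_detector_resume();               /* unparks them again */
        return 0;
}
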
700 730
701static void update_watchdog_all_cpus(void) 731static void update_watchdog_all_cpus(void)
702{ 732{
703 int cpu; 733 watchdog_park_threads();
704 734 watchdog_unpark_threads();
705 get_online_cpus();
706 for_each_watchdog_cpu(cpu)
707 update_watchdog(cpu);
708 put_online_cpus();
709} 735}
710 736
711static int watchdog_enable_all_cpus(void) 737static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
713 int err = 0; 739 int err = 0;
714 740
715 if (!watchdog_running) { 741 if (!watchdog_running) {
716 err = smpboot_register_percpu_thread(&watchdog_threads); 742 err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
743 &watchdog_cpumask);
717 if (err) 744 if (err)
718 pr_err("Failed to create watchdog threads, disabled\n"); 745 pr_err("Failed to create watchdog threads, disabled\n");
719 else { 746 else
720 if (smpboot_update_cpumask_percpu_thread(
721 &watchdog_threads, &watchdog_cpumask))
722 pr_err("Failed to set cpumask for watchdog threads\n");
723 watchdog_running = 1; 747 watchdog_running = 1;
724 }
725 } else { 748 } else {
726 /* 749 /*
727 * Enable/disable the lockup detectors or 750 * Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
787 810
788 mutex_lock(&watchdog_proc_mutex); 811 mutex_lock(&watchdog_proc_mutex);
789 812
813 if (watchdog_suspended) {
814 /* no parameter changes allowed while watchdog is suspended */
815 err = -EAGAIN;
816 goto out;
817 }
818
790 /* 819 /*
791 * If the parameter is being read return the state of the corresponding 820 * If the parameter is being read return the state of the corresponding
792 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the 821 * bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
872 901
873 mutex_lock(&watchdog_proc_mutex); 902 mutex_lock(&watchdog_proc_mutex);
874 903
904 if (watchdog_suspended) {
905 /* no parameter changes allowed while watchdog is suspended */
906 err = -EAGAIN;
907 goto out;
908 }
909
875 old = ACCESS_ONCE(watchdog_thresh); 910 old = ACCESS_ONCE(watchdog_thresh);
876 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); 911 err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
877 912
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
903 int err; 938 int err;
904 939
905 mutex_lock(&watchdog_proc_mutex); 940 mutex_lock(&watchdog_proc_mutex);
941
942 if (watchdog_suspended) {
943 /* no parameter changes allowed while watchdog is suspended */
944 err = -EAGAIN;
945 goto out;
946 }
947
906 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos); 948 err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
907 if (!err && write) { 949 if (!err && write) {
908 /* Remove impossible cpus to keep sysctl output cleaner. */ 950 /* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
920 pr_err("cpumask update failed\n"); 962 pr_err("cpumask update failed\n");
921 } 963 }
922 } 964 }
965out:
923 mutex_unlock(&watchdog_proc_mutex); 966 mutex_unlock(&watchdog_proc_mutex);
924 return err; 967 return err;
925} 968}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
932 975
933#ifdef CONFIG_NO_HZ_FULL 976#ifdef CONFIG_NO_HZ_FULL
934 if (tick_nohz_full_enabled()) { 977 if (tick_nohz_full_enabled()) {
935 if (!cpumask_empty(tick_nohz_full_mask)) 978 pr_info("Disabling watchdog on nohz_full cores by default\n");
936 pr_info("Disabling watchdog on nohz_full cores by default\n"); 979 cpumask_copy(&watchdog_cpumask, housekeeping_mask);
937 cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
938 tick_nohz_full_mask);
939 } else 980 } else
940 cpumask_copy(&watchdog_cpumask, cpu_possible_mask); 981 cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
941#else 982#else
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4c4f06176f74..ca71582fcfab 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -338,20 +338,20 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
338#include <trace/events/workqueue.h> 338#include <trace/events/workqueue.h>
339 339
340#define assert_rcu_or_pool_mutex() \ 340#define assert_rcu_or_pool_mutex() \
341 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 341 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
342 lockdep_is_held(&wq_pool_mutex), \ 342 !lockdep_is_held(&wq_pool_mutex), \
343 "sched RCU or wq_pool_mutex should be held") 343 "sched RCU or wq_pool_mutex should be held")
344 344
345#define assert_rcu_or_wq_mutex(wq) \ 345#define assert_rcu_or_wq_mutex(wq) \
346 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 346 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
347 lockdep_is_held(&wq->mutex), \ 347 !lockdep_is_held(&wq->mutex), \
348 "sched RCU or wq->mutex should be held") 348 "sched RCU or wq->mutex should be held")
349 349
350#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ 350#define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \
351 rcu_lockdep_assert(rcu_read_lock_sched_held() || \ 351 RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
352 lockdep_is_held(&wq->mutex) || \ 352 !lockdep_is_held(&wq->mutex) && \
353 lockdep_is_held(&wq_pool_mutex), \ 353 !lockdep_is_held(&wq_pool_mutex), \
354 "sched RCU, wq->mutex or wq_pool_mutex should be held") 354 "sched RCU, wq->mutex or wq_pool_mutex should be held")
355 355
356#define for_each_cpu_worker_pool(pool, cpu) \ 356#define for_each_cpu_worker_pool(pool, cpu) \
357 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ 357 for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
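
The conversion above flips the assertion's polarity: rcu_lockdep_assert() warned when its condition was false, while RCU_LOCKDEP_WARN() warns when its condition is true, so "A || B" (one of these must be held) becomes "!A && !B" (neither is held). A minimal sketch of the new form against an assumed example lock:

#include <linux/mutex.h>
#include <linux/rcupdate.h>

static DEFINE_MUTEX(my_mutex);          /* assumed example lock */

#define assert_rcu_or_my_mutex()                                        \
        RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&                       \
                         !lockdep_is_held(&my_mutex),                   \
                         "RCU or my_mutex should be held")
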
@@ -1714,9 +1714,7 @@ static struct worker *create_worker(struct worker_pool *pool)
1714 goto fail; 1714 goto fail;
1715 1715
1716 set_user_nice(worker->task, pool->attrs->nice); 1716 set_user_nice(worker->task, pool->attrs->nice);
1717 1717 kthread_bind_mask(worker->task, pool->attrs->cpumask);
1718 /* prevent userland from meddling with cpumask of workqueue workers */
1719 worker->task->flags |= PF_NO_SETAFFINITY;
1720 1718
1721 /* successful, attach the worker to the pool */ 1719 /* successful, attach the worker to the pool */
1722 worker_attach_to_pool(worker, pool); 1720 worker_attach_to_pool(worker, pool);
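
kthread_bind_mask() replaces the manual PF_NO_SETAFFINITY poking above: it pins a not-yet-running kthread to a cpumask and marks it so userspace cannot change the affinity later. A hedged kernel-context sketch of the pattern; everything except the kthread_* and wake_up_process() calls is illustrative:

#include <linux/cpumask.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static struct task_struct *start_pinned_worker(int (*fn)(void *), void *data,
                                               const struct cpumask *mask)
{
        struct task_struct *task;

        task = kthread_create(fn, data, "pinned_worker");
        if (IS_ERR(task))
                return task;

        kthread_bind_mask(task, mask);  /* affinity fixed before first run */
        wake_up_process(task);
        return task;
}
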
@@ -2614,7 +2612,7 @@ void flush_workqueue(struct workqueue_struct *wq)
2614out_unlock: 2612out_unlock:
2615 mutex_unlock(&wq->mutex); 2613 mutex_unlock(&wq->mutex);
2616} 2614}
2617EXPORT_SYMBOL_GPL(flush_workqueue); 2615EXPORT_SYMBOL(flush_workqueue);
2618 2616
2619/** 2617/**
2620 * drain_workqueue - drain a workqueue 2618 * drain_workqueue - drain a workqueue
@@ -3856,7 +3854,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
3856 } 3854 }
3857 3855
3858 wq->rescuer = rescuer; 3856 wq->rescuer = rescuer;
3859 rescuer->task->flags |= PF_NO_SETAFFINITY; 3857 kthread_bind_mask(rescuer->task, cpu_possible_mask);
3860 wake_up_process(rescuer->task); 3858 wake_up_process(rescuer->task);
3861 } 3859 }
3862 3860