author     Jiri Kosina <jkosina@suse.cz>  2014-11-20 08:42:02 -0500
committer  Jiri Kosina <jkosina@suse.cz>  2014-11-20 08:42:02 -0500
commit     a02001086bbfb4da35d1228bebc2f1b442db455f (patch)
tree       62ab47936cef06fd08657ca5b6cd1df98c19be57 /kernel
parent     eff264efeeb0898408e8c9df72d8a32621035bed (diff)
parent     fc14f9c1272f62c3e8d01300f52467c0d9af50f9 (diff)
Merge Linus' tree to be able to apply submitted patches to newer code than
current trivial.git base
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 494
-rw-r--r--  kernel/async.c | 8
-rw-r--r--  kernel/audit.c | 32
-rw-r--r--  kernel/audit.h | 1
-rw-r--r--  kernel/audit_tree.c | 7
-rw-r--r--  kernel/audit_watch.c | 4
-rw-r--r--  kernel/auditfilter.c | 60
-rw-r--r--  kernel/auditsc.c | 28
-rw-r--r--  kernel/bounds.c | 2
-rw-r--r--  kernel/bpf/Makefile | 4
-rw-r--r--  kernel/bpf/core.c | 136
-rw-r--r--  kernel/bpf/syscall.c | 606
-rw-r--r--  kernel/bpf/test_stub.c | 116
-rw-r--r--  kernel/bpf/verifier.c | 1924
-rw-r--r--  kernel/cgroup.c | 239
-rw-r--r--  kernel/compat.c | 24
-rw-r--r--  kernel/configs/tiny.config | 4
-rw-r--r--  kernel/context_tracking.c | 40
-rw-r--r--  kernel/cpu.c | 30
-rw-r--r--  kernel/cpuset.c | 24
-rw-r--r--  kernel/crash_dump.c | 1
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 6
-rw-r--r--  kernel/events/callchain.c | 6
-rw-r--r--  kernel/events/core.c | 295
-rw-r--r--  kernel/events/hw_breakpoint.c | 7
-rw-r--r--  kernel/events/uprobes.c | 15
-rw-r--r--  kernel/exit.c | 100
-rw-r--r--  kernel/fork.c | 102
-rw-r--r--  kernel/freezer.c | 9
-rw-r--r--  kernel/futex.c | 39
-rw-r--r--  kernel/gcov/Kconfig | 2
-rw-r--r--  kernel/gcov/fs.c | 3
-rw-r--r--  kernel/irq/Kconfig | 3
-rw-r--r--  kernel/irq/chip.c | 88
-rw-r--r--  kernel/irq/internals.h | 16
-rw-r--r--  kernel/irq/irqdesc.c | 42
-rw-r--r--  kernel/irq/manage.c | 32
-rw-r--r--  kernel/irq/pm.c | 159
-rw-r--r--  kernel/irq_work.c | 27
-rw-r--r--  kernel/kallsyms.c | 13
-rw-r--r--  kernel/kcmp.c | 7
-rw-r--r--  kernel/kexec.c | 1292
-rw-r--r--  kernel/kmod.c | 76
-rw-r--r--  kernel/kprobes.c | 13
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/locking/locktorture.c | 529
-rw-r--r--  kernel/locking/mcs_spinlock.h | 3
-rw-r--r--  kernel/locking/mutex.c | 416
-rw-r--r--  kernel/locking/mutex.h | 2
-rw-r--r--  kernel/locking/rtmutex.c | 2
-rw-r--r--  kernel/locking/rwsem-xadd.c | 27
-rw-r--r--  kernel/locking/semaphore.c | 12
-rw-r--r--  kernel/module.c | 25
-rw-r--r--  kernel/nsproxy.c | 15
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 24
-rw-r--r--  kernel/power/Kconfig | 4
-rw-r--r--  kernel/power/hibernate.c | 8
-rw-r--r--  kernel/power/power.h | 1
-rw-r--r--  kernel/power/process.c | 58
-rw-r--r--  kernel/power/qos.c | 27
-rw-r--r--  kernel/power/snapshot.c | 25
-rw-r--r--  kernel/power/suspend.c | 53
-rw-r--r--  kernel/power/suspend_test.c | 61
-rw-r--r--  kernel/printk/printk.c | 211
-rw-r--r--  kernel/rcu/rcutorture.c | 278
-rw-r--r--  kernel/rcu/tiny.c | 20
-rw-r--r--  kernel/rcu/tree.c | 130
-rw-r--r--  kernel/rcu/tree.h | 21
-rw-r--r--  kernel/rcu/tree_plugin.h | 459
-rw-r--r--  kernel/rcu/update.c | 345
-rw-r--r--  kernel/reboot.c | 81
-rw-r--r--  kernel/resource.c | 204
-rw-r--r--  kernel/sched/auto_group.c | 5
-rw-r--r--  kernel/sched/clock.c | 2
-rw-r--r--  kernel/sched/core.c | 377
-rw-r--r--  kernel/sched/cpudeadline.c | 4
-rw-r--r--  kernel/sched/cputime.c | 64
-rw-r--r--  kernel/sched/deadline.c | 76
-rw-r--r--  kernel/sched/debug.c | 13
-rw-r--r--  kernel/sched/fair.c | 504
-rw-r--r--  kernel/sched/idle.c | 6
-rw-r--r--  kernel/sched/proc.c | 7
-rw-r--r--  kernel/sched/rt.c | 23
-rw-r--r--  kernel/sched/sched.h | 84
-rw-r--r--  kernel/sched/stop_task.c | 2
-rw-r--r--  kernel/sched/wait.c | 36
-rw-r--r--  kernel/seccomp.c | 269
-rw-r--r--  kernel/signal.c | 46
-rw-r--r--  kernel/smp.c | 30
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/sys.c | 491
-rw-r--r--  kernel/sys_ni.c | 8
-rw-r--r--  kernel/sysctl.c | 28
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/taskstats.c | 2
-rw-r--r--  kernel/test_kprobes.c | 87
-rw-r--r--  kernel/time/alarmtimer.c | 34
-rw-r--r--  kernel/time/clockevents.c | 2
-rw-r--r--  kernel/time/hrtimer.c | 23
-rw-r--r--  kernel/time/posix-cpu-timers.c | 14
-rw-r--r--  kernel/time/posix-timers.c | 1
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-common.c | 7
-rw-r--r--  kernel/time/tick-internal.h | 7
-rw-r--r--  kernel/time/tick-oneshot.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 100
-rw-r--r--  kernel/time/time.c | 56
-rw-r--r--  kernel/time/timekeeping.c | 10
-rw-r--r--  kernel/time/timer.c | 4
-rw-r--r--  kernel/torture.c | 32
-rw-r--r--  kernel/trace/ftrace.c | 640
-rw-r--r--  kernel/trace/ring_buffer.c | 128
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 3
-rw-r--r--  kernel/trace/trace.c | 33
-rw-r--r--  kernel/trace/trace_events.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 51
-rw-r--r--  kernel/trace/trace_stack.c | 4
-rw-r--r--  kernel/trace/trace_syscalls.c | 12
-rw-r--r--  kernel/user-return-notifier.c | 4
-rw-r--r--  kernel/user_namespace.c | 6
-rw-r--r--  kernel/utsname.c | 6
-rw-r--r--  kernel/watchdog.c | 101
-rw-r--r--  kernel/workqueue.c | 5
125 files changed, 9587 insertions, 3061 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 0026cf531769..17ea6d4a9a24 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -86,7 +86,7 @@ obj-$(CONFIG_RING_BUFFER) += trace/
86obj-$(CONFIG_TRACEPOINTS) += trace/ 86obj-$(CONFIG_TRACEPOINTS) += trace/
87obj-$(CONFIG_IRQ_WORK) += irq_work.o 87obj-$(CONFIG_IRQ_WORK) += irq_work.o
88obj-$(CONFIG_CPU_PM) += cpu_pm.o 88obj-$(CONFIG_CPU_PM) += cpu_pm.o
89obj-$(CONFIG_NET) += bpf/ 89obj-$(CONFIG_BPF) += bpf/
90 90
91obj-$(CONFIG_PERF_EVENTS) += events/ 91obj-$(CONFIG_PERF_EVENTS) += events/
92 92
@@ -105,7 +105,7 @@ targets += config_data.gz
105$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE 105$(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
106 $(call if_changed,gzip) 106 $(call if_changed,gzip)
107 107
108 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/bin2c; echo "MAGIC_END;") 108 filechk_ikconfiggz = (echo "static const char kernel_config_data[] __used = MAGIC_START"; cat $< | scripts/basic/bin2c; echo "MAGIC_END;")
109targets += config_data.h 109targets += config_data.h
110$(obj)/config_data.h: $(obj)/config_data.gz FORCE 110$(obj)/config_data.h: $(obj)/config_data.gz FORCE
111 $(call filechk,ikconfiggz) 111 $(call filechk,ikconfiggz)
diff --git a/kernel/acct.c b/kernel/acct.c
index a1844f14c6d6..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -59,6 +59,7 @@
59#include <asm/div64.h> 59#include <asm/div64.h>
60#include <linux/blkdev.h> /* sector_div */ 60#include <linux/blkdev.h> /* sector_div */
61#include <linux/pid_namespace.h> 61#include <linux/pid_namespace.h>
62#include <linux/fs_pin.h>
62 63
63/* 64/*
64 * These constants control the amount of freespace that suspend and 65 * These constants control the amount of freespace that suspend and
@@ -75,172 +76,190 @@ int acct_parm[3] = {4, 2, 30};
75/* 76/*
76 * External references and all of the globals. 77 * External references and all of the globals.
77 */ 78 */
78static void do_acct_process(struct bsd_acct_struct *acct, 79static void do_acct_process(struct bsd_acct_struct *acct);
79 struct pid_namespace *ns, struct file *);
80 80
81/*
82 * This structure is used so that all the data protected by lock
83 * can be placed in the same cache line as the lock. This primes
84 * the cache line to have the data after getting the lock.
85 */
86struct bsd_acct_struct { 81struct bsd_acct_struct {
82 struct fs_pin pin;
83 struct mutex lock;
87 int active; 84 int active;
88 unsigned long needcheck; 85 unsigned long needcheck;
89 struct file *file; 86 struct file *file;
90 struct pid_namespace *ns; 87 struct pid_namespace *ns;
91 struct list_head list; 88 struct work_struct work;
89 struct completion done;
92}; 90};
93 91
94static DEFINE_SPINLOCK(acct_lock);
95static LIST_HEAD(acct_list);
96
97/* 92/*
98 * Check the amount of free space and suspend/resume accordingly. 93 * Check the amount of free space and suspend/resume accordingly.
99 */ 94 */
100static int check_free_space(struct bsd_acct_struct *acct, struct file *file) 95static int check_free_space(struct bsd_acct_struct *acct)
101{ 96{
102 struct kstatfs sbuf; 97 struct kstatfs sbuf;
103 int res; 98
104 int act; 99 if (time_is_before_jiffies(acct->needcheck))
105 u64 resume;
106 u64 suspend;
107
108 spin_lock(&acct_lock);
109 res = acct->active;
110 if (!file || time_is_before_jiffies(acct->needcheck))
111 goto out; 100 goto out;
112 spin_unlock(&acct_lock);
113 101
114 /* May block */ 102 /* May block */
115 if (vfs_statfs(&file->f_path, &sbuf)) 103 if (vfs_statfs(&acct->file->f_path, &sbuf))
116 return res;
117 suspend = sbuf.f_blocks * SUSPEND;
118 resume = sbuf.f_blocks * RESUME;
119
120 do_div(suspend, 100);
121 do_div(resume, 100);
122
123 if (sbuf.f_bavail <= suspend)
124 act = -1;
125 else if (sbuf.f_bavail >= resume)
126 act = 1;
127 else
128 act = 0;
129
130 /*
131 * If some joker switched acct->file under us we'ld better be
132 * silent and _not_ touch anything.
133 */
134 spin_lock(&acct_lock);
135 if (file != acct->file) {
136 if (act)
137 res = act > 0;
138 goto out; 104 goto out;
139 }
140 105
141 if (acct->active) { 106 if (acct->active) {
142 if (act < 0) { 107 u64 suspend = sbuf.f_blocks * SUSPEND;
108 do_div(suspend, 100);
109 if (sbuf.f_bavail <= suspend) {
143 acct->active = 0; 110 acct->active = 0;
144 printk(KERN_INFO "Process accounting paused\n"); 111 pr_info("Process accounting paused\n");
145 } 112 }
146 } else { 113 } else {
147 if (act > 0) { 114 u64 resume = sbuf.f_blocks * RESUME;
115 do_div(resume, 100);
116 if (sbuf.f_bavail >= resume) {
148 acct->active = 1; 117 acct->active = 1;
149 printk(KERN_INFO "Process accounting resumed\n"); 118 pr_info("Process accounting resumed\n");
150 } 119 }
151 } 120 }
152 121
153 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 122 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
154 res = acct->active;
155out: 123out:
156 spin_unlock(&acct_lock); 124 return acct->active;
125}
126
127static struct bsd_acct_struct *acct_get(struct pid_namespace *ns)
128{
129 struct bsd_acct_struct *res;
130again:
131 smp_rmb();
132 rcu_read_lock();
133 res = ACCESS_ONCE(ns->bacct);
134 if (!res) {
135 rcu_read_unlock();
136 return NULL;
137 }
138 if (!atomic_long_inc_not_zero(&res->pin.count)) {
139 rcu_read_unlock();
140 cpu_relax();
141 goto again;
142 }
143 rcu_read_unlock();
144 mutex_lock(&res->lock);
145 if (!res->ns) {
146 mutex_unlock(&res->lock);
147 pin_put(&res->pin);
148 goto again;
149 }
157 return res; 150 return res;
158} 151}
159 152
160/* 153static void close_work(struct work_struct *work)
161 * Close the old accounting file (if currently open) and then replace 154{
162 * it with file (if non-NULL). 155 struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
163 * 156 struct file *file = acct->file;
164 * NOTE: acct_lock MUST be held on entry and exit. 157 if (file->f_op->flush)
165 */ 158 file->f_op->flush(file, NULL);
166static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, 159 __fput_sync(file);
167 struct pid_namespace *ns) 160 complete(&acct->done);
161}
162
163static void acct_kill(struct bsd_acct_struct *acct,
164 struct bsd_acct_struct *new)
168{ 165{
169 struct file *old_acct = NULL; 166 if (acct) {
170 struct pid_namespace *old_ns = NULL; 167 struct pid_namespace *ns = acct->ns;
171 168 do_acct_process(acct);
172 if (acct->file) { 169 INIT_WORK(&acct->work, close_work);
173 old_acct = acct->file; 170 init_completion(&acct->done);
174 old_ns = acct->ns; 171 schedule_work(&acct->work);
175 acct->active = 0; 172 wait_for_completion(&acct->done);
176 acct->file = NULL; 173 pin_remove(&acct->pin);
174 ns->bacct = new;
177 acct->ns = NULL; 175 acct->ns = NULL;
178 list_del(&acct->list); 176 atomic_long_dec(&acct->pin.count);
177 mutex_unlock(&acct->lock);
178 pin_put(&acct->pin);
179 } 179 }
180 if (file) { 180}
181 acct->file = file; 181
182 acct->ns = ns; 182static void acct_pin_kill(struct fs_pin *pin)
183 acct->needcheck = jiffies + ACCT_TIMEOUT*HZ; 183{
184 acct->active = 1; 184 struct bsd_acct_struct *acct;
185 list_add(&acct->list, &acct_list); 185 acct = container_of(pin, struct bsd_acct_struct, pin);
186 } 186 mutex_lock(&acct->lock);
187 if (old_acct) { 187 if (!acct->ns) {
188 mnt_unpin(old_acct->f_path.mnt); 188 mutex_unlock(&acct->lock);
189 spin_unlock(&acct_lock); 189 pin_put(pin);
190 do_acct_process(acct, old_ns, old_acct); 190 acct = NULL;
191 filp_close(old_acct, NULL);
192 spin_lock(&acct_lock);
193 } 191 }
192 acct_kill(acct, NULL);
194} 193}
195 194
196static int acct_on(struct filename *pathname) 195static int acct_on(struct filename *pathname)
197{ 196{
198 struct file *file; 197 struct file *file;
199 struct vfsmount *mnt; 198 struct vfsmount *mnt, *internal;
200 struct pid_namespace *ns; 199 struct pid_namespace *ns = task_active_pid_ns(current);
201 struct bsd_acct_struct *acct = NULL; 200 struct bsd_acct_struct *acct, *old;
201 int err;
202
203 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
204 if (!acct)
205 return -ENOMEM;
202 206
203 /* Difference from BSD - they don't do O_APPEND */ 207 /* Difference from BSD - they don't do O_APPEND */
204 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 208 file = file_open_name(pathname, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
205 if (IS_ERR(file)) 209 if (IS_ERR(file)) {
210 kfree(acct);
206 return PTR_ERR(file); 211 return PTR_ERR(file);
212 }
207 213
208 if (!S_ISREG(file_inode(file)->i_mode)) { 214 if (!S_ISREG(file_inode(file)->i_mode)) {
215 kfree(acct);
209 filp_close(file, NULL); 216 filp_close(file, NULL);
210 return -EACCES; 217 return -EACCES;
211 } 218 }
212 219
213 if (!file->f_op->write) { 220 if (!file->f_op->write) {
221 kfree(acct);
214 filp_close(file, NULL); 222 filp_close(file, NULL);
215 return -EIO; 223 return -EIO;
216 } 224 }
217 225 internal = mnt_clone_internal(&file->f_path);
218 ns = task_active_pid_ns(current); 226 if (IS_ERR(internal)) {
219 if (ns->bacct == NULL) { 227 kfree(acct);
220 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); 228 filp_close(file, NULL);
221 if (acct == NULL) { 229 return PTR_ERR(internal);
222 filp_close(file, NULL);
223 return -ENOMEM;
224 }
225 } 230 }
226 231 err = mnt_want_write(internal);
227 spin_lock(&acct_lock); 232 if (err) {
228 if (ns->bacct == NULL) { 233 mntput(internal);
229 ns->bacct = acct; 234 kfree(acct);
230 acct = NULL; 235 filp_close(file, NULL);
236 return err;
231 } 237 }
232
233 mnt = file->f_path.mnt; 238 mnt = file->f_path.mnt;
234 mnt_pin(mnt); 239 file->f_path.mnt = internal;
235 acct_file_reopen(ns->bacct, file, ns); 240
236 spin_unlock(&acct_lock); 241 atomic_long_set(&acct->pin.count, 1);
237 242 acct->pin.kill = acct_pin_kill;
238 mntput(mnt); /* it's pinned, now give up active reference */ 243 acct->file = file;
239 kfree(acct); 244 acct->needcheck = jiffies;
240 245 acct->ns = ns;
246 mutex_init(&acct->lock);
247 mutex_lock_nested(&acct->lock, 1); /* nobody has seen it yet */
248 pin_insert(&acct->pin, mnt);
249
250 old = acct_get(ns);
251 if (old)
252 acct_kill(old, acct);
253 else
254 ns->bacct = acct;
255 mutex_unlock(&acct->lock);
256 mnt_drop_write(mnt);
257 mntput(mnt);
241 return 0; 258 return 0;
242} 259}
243 260
261static DEFINE_MUTEX(acct_on_mutex);
262
244/** 263/**
245 * sys_acct - enable/disable process accounting 264 * sys_acct - enable/disable process accounting
246 * @name: file name for accounting records or NULL to shutdown accounting 265 * @name: file name for accounting records or NULL to shutdown accounting
@@ -261,80 +280,23 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
261 280
262 if (name) { 281 if (name) {
263 struct filename *tmp = getname(name); 282 struct filename *tmp = getname(name);
283
264 if (IS_ERR(tmp)) 284 if (IS_ERR(tmp))
265 return PTR_ERR(tmp); 285 return PTR_ERR(tmp);
286 mutex_lock(&acct_on_mutex);
266 error = acct_on(tmp); 287 error = acct_on(tmp);
288 mutex_unlock(&acct_on_mutex);
267 putname(tmp); 289 putname(tmp);
268 } else { 290 } else {
269 struct bsd_acct_struct *acct; 291 acct_kill(acct_get(task_active_pid_ns(current)), NULL);
270
271 acct = task_active_pid_ns(current)->bacct;
272 if (acct == NULL)
273 return 0;
274
275 spin_lock(&acct_lock);
276 acct_file_reopen(acct, NULL, NULL);
277 spin_unlock(&acct_lock);
278 } 292 }
279 293
280 return error; 294 return error;
281} 295}
282 296
283/**
284 * acct_auto_close - turn off a filesystem's accounting if it is on
285 * @m: vfsmount being shut down
286 *
287 * If the accounting is turned on for a file in the subtree pointed to
288 * to by m, turn accounting off. Done when m is about to die.
289 */
290void acct_auto_close_mnt(struct vfsmount *m)
291{
292 struct bsd_acct_struct *acct;
293
294 spin_lock(&acct_lock);
295restart:
296 list_for_each_entry(acct, &acct_list, list)
297 if (acct->file && acct->file->f_path.mnt == m) {
298 acct_file_reopen(acct, NULL, NULL);
299 goto restart;
300 }
301 spin_unlock(&acct_lock);
302}
303
304/**
305 * acct_auto_close - turn off a filesystem's accounting if it is on
306 * @sb: super block for the filesystem
307 *
308 * If the accounting is turned on for a file in the filesystem pointed
309 * to by sb, turn accounting off.
310 */
311void acct_auto_close(struct super_block *sb)
312{
313 struct bsd_acct_struct *acct;
314
315 spin_lock(&acct_lock);
316restart:
317 list_for_each_entry(acct, &acct_list, list)
318 if (acct->file && acct->file->f_path.dentry->d_sb == sb) {
319 acct_file_reopen(acct, NULL, NULL);
320 goto restart;
321 }
322 spin_unlock(&acct_lock);
323}
324
325void acct_exit_ns(struct pid_namespace *ns) 297void acct_exit_ns(struct pid_namespace *ns)
326{ 298{
327 struct bsd_acct_struct *acct = ns->bacct; 299 acct_kill(acct_get(ns), NULL);
328
329 if (acct == NULL)
330 return;
331
332 spin_lock(&acct_lock);
333 if (acct->file != NULL)
334 acct_file_reopen(acct, NULL, NULL);
335 spin_unlock(&acct_lock);
336
337 kfree(acct);
338} 300}
339 301
340/* 302/*
@@ -376,7 +338,7 @@ static comp_t encode_comp_t(unsigned long value)
376 return exp; 338 return exp;
377} 339}
378 340
379#if ACCT_VERSION==1 || ACCT_VERSION==2 341#if ACCT_VERSION == 1 || ACCT_VERSION == 2
380/* 342/*
381 * encode an u64 into a comp2_t (24 bits) 343 * encode an u64 into a comp2_t (24 bits)
382 * 344 *
@@ -389,7 +351,7 @@ static comp_t encode_comp_t(unsigned long value)
389#define MANTSIZE2 20 /* 20 bit mantissa. */ 351#define MANTSIZE2 20 /* 20 bit mantissa. */
390#define EXPSIZE2 5 /* 5 bit base 2 exponent. */ 352#define EXPSIZE2 5 /* 5 bit base 2 exponent. */
391#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */ 353#define MAXFRACT2 ((1ul << MANTSIZE2) - 1) /* Maximum fractional value. */
392#define MAXEXP2 ((1 <<EXPSIZE2) - 1) /* Maximum exponent. */ 354#define MAXEXP2 ((1 << EXPSIZE2) - 1) /* Maximum exponent. */
393 355
394static comp2_t encode_comp2_t(u64 value) 356static comp2_t encode_comp2_t(u64 value)
395{ 357{
@@ -420,7 +382,7 @@ static comp2_t encode_comp2_t(u64 value)
420} 382}
421#endif 383#endif
422 384
423#if ACCT_VERSION==3 385#if ACCT_VERSION == 3
424/* 386/*
425 * encode an u64 into a 32 bit IEEE float 387 * encode an u64 into a 32 bit IEEE float
426 */ 388 */
@@ -429,8 +391,9 @@ static u32 encode_float(u64 value)
429 unsigned exp = 190; 391 unsigned exp = 190;
430 unsigned u; 392 unsigned u;
431 393
432 if (value==0) return 0; 394 if (value == 0)
433 while ((s64)value > 0){ 395 return 0;
396 while ((s64)value > 0) {
434 value <<= 1; 397 value <<= 1;
435 exp--; 398 exp--;
436 } 399 }
@@ -448,116 +411,116 @@ static u32 encode_float(u64 value)
448 * do_exit() or when switching to a different output file. 411 * do_exit() or when switching to a different output file.
449 */ 412 */
450 413
451/* 414static void fill_ac(acct_t *ac)
452 * do_acct_process does all actual work. Caller holds the reference to file.
453 */
454static void do_acct_process(struct bsd_acct_struct *acct,
455 struct pid_namespace *ns, struct file *file)
456{ 415{
457 struct pacct_struct *pacct = &current->signal->pacct; 416 struct pacct_struct *pacct = &current->signal->pacct;
458 acct_t ac;
459 mm_segment_t fs;
460 unsigned long flim;
461 u64 elapsed, run_time; 417 u64 elapsed, run_time;
462 struct tty_struct *tty; 418 struct tty_struct *tty;
463 const struct cred *orig_cred;
464
465 /* Perform file operations on behalf of whoever enabled accounting */
466 orig_cred = override_creds(file->f_cred);
467
468 /*
469 * First check to see if there is enough free_space to continue
470 * the process accounting system.
471 */
472 if (!check_free_space(acct, file))
473 goto out;
474 419
475 /* 420 /*
476 * Fill the accounting struct with the needed info as recorded 421 * Fill the accounting struct with the needed info as recorded
477 * by the different kernel functions. 422 * by the different kernel functions.
478 */ 423 */
479 memset(&ac, 0, sizeof(acct_t)); 424 memset(ac, 0, sizeof(acct_t));
480 425
481 ac.ac_version = ACCT_VERSION | ACCT_BYTEORDER; 426 ac->ac_version = ACCT_VERSION | ACCT_BYTEORDER;
482 strlcpy(ac.ac_comm, current->comm, sizeof(ac.ac_comm)); 427 strlcpy(ac->ac_comm, current->comm, sizeof(ac->ac_comm));
483 428
484 /* calculate run_time in nsec*/ 429 /* calculate run_time in nsec*/
485 run_time = ktime_get_ns(); 430 run_time = ktime_get_ns();
486 run_time -= current->group_leader->start_time; 431 run_time -= current->group_leader->start_time;
487 /* convert nsec -> AHZ */ 432 /* convert nsec -> AHZ */
488 elapsed = nsec_to_AHZ(run_time); 433 elapsed = nsec_to_AHZ(run_time);
489#if ACCT_VERSION==3 434#if ACCT_VERSION == 3
490 ac.ac_etime = encode_float(elapsed); 435 ac->ac_etime = encode_float(elapsed);
491#else 436#else
492 ac.ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ? 437 ac->ac_etime = encode_comp_t(elapsed < (unsigned long) -1l ?
493 (unsigned long) elapsed : (unsigned long) -1l); 438 (unsigned long) elapsed : (unsigned long) -1l);
494#endif 439#endif
495#if ACCT_VERSION==1 || ACCT_VERSION==2 440#if ACCT_VERSION == 1 || ACCT_VERSION == 2
496 { 441 {
497 /* new enlarged etime field */ 442 /* new enlarged etime field */
498 comp2_t etime = encode_comp2_t(elapsed); 443 comp2_t etime = encode_comp2_t(elapsed);
499 ac.ac_etime_hi = etime >> 16; 444
500 ac.ac_etime_lo = (u16) etime; 445 ac->ac_etime_hi = etime >> 16;
446 ac->ac_etime_lo = (u16) etime;
501 } 447 }
502#endif 448#endif
503 do_div(elapsed, AHZ); 449 do_div(elapsed, AHZ);
504 ac.ac_btime = get_seconds() - elapsed; 450 ac->ac_btime = get_seconds() - elapsed;
505 /* we really need to bite the bullet and change layout */
506 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
507 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
508#if ACCT_VERSION==2 451#if ACCT_VERSION==2
509 ac.ac_ahz = AHZ; 452 ac->ac_ahz = AHZ;
510#endif
511#if ACCT_VERSION==1 || ACCT_VERSION==2
512 /* backward-compatible 16 bit fields */
513 ac.ac_uid16 = ac.ac_uid;
514 ac.ac_gid16 = ac.ac_gid;
515#endif
516#if ACCT_VERSION==3
517 ac.ac_pid = task_tgid_nr_ns(current, ns);
518 rcu_read_lock();
519 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
520 rcu_read_unlock();
521#endif 453#endif
522 454
523 spin_lock_irq(&current->sighand->siglock); 455 spin_lock_irq(&current->sighand->siglock);
524 tty = current->signal->tty; /* Safe as we hold the siglock */ 456 tty = current->signal->tty; /* Safe as we hold the siglock */
525 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 457 ac->ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
526 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 458 ac->ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
527 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 459 ac->ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
528 ac.ac_flag = pacct->ac_flag; 460 ac->ac_flag = pacct->ac_flag;
529 ac.ac_mem = encode_comp_t(pacct->ac_mem); 461 ac->ac_mem = encode_comp_t(pacct->ac_mem);
530 ac.ac_minflt = encode_comp_t(pacct->ac_minflt); 462 ac->ac_minflt = encode_comp_t(pacct->ac_minflt);
531 ac.ac_majflt = encode_comp_t(pacct->ac_majflt); 463 ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
532 ac.ac_exitcode = pacct->ac_exitcode; 464 ac->ac_exitcode = pacct->ac_exitcode;
533 spin_unlock_irq(&current->sighand->siglock); 465 spin_unlock_irq(&current->sighand->siglock);
534 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 466}
535 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 467/*
536 ac.ac_swaps = encode_comp_t(0); 468 * do_acct_process does all actual work. Caller holds the reference to file.
469 */
470static void do_acct_process(struct bsd_acct_struct *acct)
471{
472 acct_t ac;
473 unsigned long flim;
474 const struct cred *orig_cred;
475 struct file *file = acct->file;
537 476
538 /* 477 /*
539 * Get freeze protection. If the fs is frozen, just skip the write 478 * Accounting records are not subject to resource limits.
540 * as we could deadlock the system otherwise.
541 */ 479 */
542 if (!file_start_write_trylock(file)) 480 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
543 goto out; 481 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
482 /* Perform file operations on behalf of whoever enabled accounting */
483 orig_cred = override_creds(file->f_cred);
484
544 /* 485 /*
545 * Kernel segment override to datasegment and write it 486 * First check to see if there is enough free_space to continue
546 * to the accounting file. 487 * the process accounting system.
547 */ 488 */
548 fs = get_fs(); 489 if (!check_free_space(acct))
549 set_fs(KERNEL_DS); 490 goto out;
491
492 fill_ac(&ac);
493 /* we really need to bite the bullet and change layout */
494 ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
495 ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
496#if ACCT_VERSION == 1 || ACCT_VERSION == 2
497 /* backward-compatible 16 bit fields */
498 ac.ac_uid16 = ac.ac_uid;
499 ac.ac_gid16 = ac.ac_gid;
500#endif
501#if ACCT_VERSION == 3
502 {
503 struct pid_namespace *ns = acct->ns;
504
505 ac.ac_pid = task_tgid_nr_ns(current, ns);
506 rcu_read_lock();
507 ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
508 ns);
509 rcu_read_unlock();
510 }
511#endif
550 /* 512 /*
551 * Accounting records are not subject to resource limits. 513 * Get freeze protection. If the fs is frozen, just skip the write
514 * as we could deadlock the system otherwise.
552 */ 515 */
553 flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 516 if (file_start_write_trylock(file)) {
554 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; 517 /* it's been opened O_APPEND, so position is irrelevant */
555 file->f_op->write(file, (char *)&ac, 518 loff_t pos = 0;
556 sizeof(acct_t), &file->f_pos); 519 __kernel_write(file, (char *)&ac, sizeof(acct_t), &pos);
557 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; 520 file_end_write(file);
558 set_fs(fs); 521 }
559 file_end_write(file);
560out: 522out:
523 current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
561 revert_creds(orig_cred); 524 revert_creds(orig_cred);
562} 525}
563 526
@@ -574,6 +537,7 @@ void acct_collect(long exitcode, int group_dead)
574 537
575 if (group_dead && current->mm) { 538 if (group_dead && current->mm) {
576 struct vm_area_struct *vma; 539 struct vm_area_struct *vma;
540
577 down_read(&current->mm->mmap_sem); 541 down_read(&current->mm->mmap_sem);
578 vma = current->mm->mmap; 542 vma = current->mm->mmap;
579 while (vma) { 543 while (vma) {
@@ -605,34 +569,20 @@ void acct_collect(long exitcode, int group_dead)
605 spin_unlock_irq(&current->sighand->siglock); 569 spin_unlock_irq(&current->sighand->siglock);
606} 570}
607 571
608static void acct_process_in_ns(struct pid_namespace *ns) 572static void slow_acct_process(struct pid_namespace *ns)
609{ 573{
610 struct file *file = NULL; 574 for ( ; ns; ns = ns->parent) {
611 struct bsd_acct_struct *acct; 575 struct bsd_acct_struct *acct = acct_get(ns);
612 576 if (acct) {
613 acct = ns->bacct; 577 do_acct_process(acct);
614 /* 578 mutex_unlock(&acct->lock);
615 * accelerate the common fastpath: 579 pin_put(&acct->pin);
616 */ 580 }
617 if (!acct || !acct->file)
618 return;
619
620 spin_lock(&acct_lock);
621 file = acct->file;
622 if (unlikely(!file)) {
623 spin_unlock(&acct_lock);
624 return;
625 } 581 }
626 get_file(file);
627 spin_unlock(&acct_lock);
628
629 do_acct_process(acct, ns, file);
630 fput(file);
631} 582}
632 583
633/** 584/**
634 * acct_process - now just a wrapper around acct_process_in_ns, 585 * acct_process
635 * which in turn is a wrapper around do_acct_process.
636 * 586 *
637 * handles process accounting for an exiting task 587 * handles process accounting for an exiting task
638 */ 588 */
@@ -645,6 +595,10 @@ void acct_process(void)
645 * alive and holds its namespace, which in turn holds 595 * alive and holds its namespace, which in turn holds
646 * its parent. 596 * its parent.
647 */ 597 */
648 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) 598 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) {
649 acct_process_in_ns(ns); 599 if (ns->bacct)
600 break;
601 }
602 if (unlikely(ns))
603 slow_acct_process(ns);
650} 604}
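
A note on the kernel/acct.c rework above: acct_kill() no longer closes the accounting file inline. It queues close_work() on the system workqueue and blocks on a completion until that worker has flushed and closed the file, so the potentially sleeping close always runs from clean worker context. Below is a minimal, hypothetical sketch of that defer-and-wait pattern (the struct and function names are invented for illustration; the real code is in the hunk above):

    #include <linux/workqueue.h>
    #include <linux/completion.h>

    /* Hypothetical carrier for the defer-and-wait pattern used by
     * acct_kill()/close_work() above. */
    struct deferred_close {
            struct work_struct work;
            struct completion done;
    };

    static void deferred_close_fn(struct work_struct *work)
    {
            struct deferred_close *dc =
                    container_of(work, struct deferred_close, work);

            /* ... flush and release the resource from worker context ... */
            complete(&dc->done);
    }

    static void close_synchronously(struct deferred_close *dc)
    {
            INIT_WORK(&dc->work, deferred_close_fn);
            init_completion(&dc->done);
            schedule_work(&dc->work);
            wait_for_completion(&dc->done);  /* returns once the worker ran */
    }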
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
115 115
116 /* 1) run (and print duration) */ 116 /* 1) run (and print duration) */
117 if (initcall_debug && system_state == SYSTEM_BOOTING) { 117 if (initcall_debug && system_state == SYSTEM_BOOTING) {
118 printk(KERN_DEBUG "calling %lli_%pF @ %i\n", 118 pr_debug("calling %lli_%pF @ %i\n",
119 (long long)entry->cookie, 119 (long long)entry->cookie,
120 entry->func, task_pid_nr(current)); 120 entry->func, task_pid_nr(current));
121 calltime = ktime_get(); 121 calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
124 if (initcall_debug && system_state == SYSTEM_BOOTING) { 124 if (initcall_debug && system_state == SYSTEM_BOOTING) {
125 rettime = ktime_get(); 125 rettime = ktime_get();
126 delta = ktime_sub(rettime, calltime); 126 delta = ktime_sub(rettime, calltime);
127 printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n", 127 pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
128 (long long)entry->cookie, 128 (long long)entry->cookie,
129 entry->func, 129 entry->func,
130 (long long)ktime_to_ns(delta) >> 10); 130 (long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
285 ktime_t uninitialized_var(starttime), delta, endtime; 285 ktime_t uninitialized_var(starttime), delta, endtime;
286 286
287 if (initcall_debug && system_state == SYSTEM_BOOTING) { 287 if (initcall_debug && system_state == SYSTEM_BOOTING) {
288 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 288 pr_debug("async_waiting @ %i\n", task_pid_nr(current));
289 starttime = ktime_get(); 289 starttime = ktime_get();
290 } 290 }
291 291
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
295 endtime = ktime_get(); 295 endtime = ktime_get();
296 delta = ktime_sub(endtime, starttime); 296 delta = ktime_sub(endtime, starttime);
297 297
298 printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n", 298 pr_debug("async_continuing @ %i after %lli usec\n",
299 task_pid_nr(current), 299 task_pid_nr(current),
300 (long long)ktime_to_ns(delta) >> 10); 300 (long long)ktime_to_ns(delta) >> 10);
301 } 301 }
diff --git a/kernel/audit.c b/kernel/audit.c
index ba2ff5a5c600..cebb11db4d34 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -126,7 +126,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
126 126
127/* The netlink socket. */ 127/* The netlink socket. */
128static struct sock *audit_sock; 128static struct sock *audit_sock;
129int audit_net_id; 129static int audit_net_id;
130 130
131/* Hash for inode-based rules */ 131/* Hash for inode-based rules */
132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; 132struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
@@ -724,7 +724,7 @@ static int audit_get_feature(struct sk_buff *skb)
724 724
725 seq = nlmsg_hdr(skb)->nlmsg_seq; 725 seq = nlmsg_hdr(skb)->nlmsg_seq;
726 726
727 audit_send_reply(skb, seq, AUDIT_GET, 0, 0, &af, sizeof(af)); 727 audit_send_reply(skb, seq, AUDIT_GET_FEATURE, 0, 0, &af, sizeof(af));
728 728
729 return 0; 729 return 0;
730} 730}
@@ -739,7 +739,7 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
739 739
740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE); 740 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_FEATURE_CHANGE);
741 audit_log_task_info(ab, current); 741 audit_log_task_info(ab, current);
742 audit_log_format(ab, "feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d", 742 audit_log_format(ab, " feature=%s old=%u new=%u old_lock=%u new_lock=%u res=%d",
743 audit_feature_names[which], !!old_feature, !!new_feature, 743 audit_feature_names[which], !!old_feature, !!new_feature,
744 !!old_lock, !!new_lock, res); 744 !!old_lock, !!new_lock, res);
745 audit_log_end(ab); 745 audit_log_end(ab);
@@ -750,7 +750,7 @@ static int audit_set_feature(struct sk_buff *skb)
750 struct audit_features *uaf; 750 struct audit_features *uaf;
751 int i; 751 int i;
752 752
753 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > sizeof(audit_feature_names)/sizeof(audit_feature_names[0])); 753 BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
754 uaf = nlmsg_data(nlmsg_hdr(skb)); 754 uaf = nlmsg_data(nlmsg_hdr(skb));
755 755
756 /* if there is ever a version 2 we should handle that here */ 756 /* if there is ever a version 2 we should handle that here */
@@ -1301,19 +1301,9 @@ err:
1301 */ 1301 */
1302unsigned int audit_serial(void) 1302unsigned int audit_serial(void)
1303{ 1303{
1304 static DEFINE_SPINLOCK(serial_lock); 1304 static atomic_t serial = ATOMIC_INIT(0);
1305 static unsigned int serial = 0;
1306 1305
1307 unsigned long flags; 1306 return atomic_add_return(1, &serial);
1308 unsigned int ret;
1309
1310 spin_lock_irqsave(&serial_lock, flags);
1311 do {
1312 ret = ++serial;
1313 } while (unlikely(!ret));
1314 spin_unlock_irqrestore(&serial_lock, flags);
1315
1316 return ret;
1317} 1307}
1318 1308
1319static inline void audit_get_stamp(struct audit_context *ctx, 1309static inline void audit_get_stamp(struct audit_context *ctx,
@@ -1681,7 +1671,7 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
1681 } 1671 }
1682} 1672}
1683 1673
1684void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name) 1674static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
1685{ 1675{
1686 kernel_cap_t *perm = &name->fcap.permitted; 1676 kernel_cap_t *perm = &name->fcap.permitted;
1687 kernel_cap_t *inh = &name->fcap.inheritable; 1677 kernel_cap_t *inh = &name->fcap.inheritable;
@@ -1860,7 +1850,7 @@ EXPORT_SYMBOL(audit_log_task_context);
1860void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk) 1850void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1861{ 1851{
1862 const struct cred *cred; 1852 const struct cred *cred;
1863 char name[sizeof(tsk->comm)]; 1853 char comm[sizeof(tsk->comm)];
1864 struct mm_struct *mm = tsk->mm; 1854 struct mm_struct *mm = tsk->mm;
1865 char *tty; 1855 char *tty;
1866 1856
@@ -1894,9 +1884,8 @@ void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk)
1894 from_kgid(&init_user_ns, cred->fsgid), 1884 from_kgid(&init_user_ns, cred->fsgid),
1895 tty, audit_get_sessionid(tsk)); 1885 tty, audit_get_sessionid(tsk));
1896 1886
1897 get_task_comm(name, tsk);
1898 audit_log_format(ab, " comm="); 1887 audit_log_format(ab, " comm=");
1899 audit_log_untrustedstring(ab, name); 1888 audit_log_untrustedstring(ab, get_task_comm(comm, tsk));
1900 1889
1901 if (mm) { 1890 if (mm) {
1902 down_read(&mm->mmap_sem); 1891 down_read(&mm->mmap_sem);
@@ -1959,6 +1948,7 @@ void audit_log_end(struct audit_buffer *ab)
1959 } else { 1948 } else {
1960 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 1949 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1961 1950
1951 nlh->nlmsg_len = ab->skb->len;
1962 kauditd_send_multicast_skb(ab->skb); 1952 kauditd_send_multicast_skb(ab->skb);
1963 1953
1964 /* 1954 /*
@@ -1970,7 +1960,7 @@ void audit_log_end(struct audit_buffer *ab)
1970 * protocol between the kaudit kernel subsystem and the auditd 1960 * protocol between the kaudit kernel subsystem and the auditd
1971 * userspace code. 1961 * userspace code.
1972 */ 1962 */
1973 nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN; 1963 nlh->nlmsg_len -= NLMSG_HDRLEN;
1974 1964
1975 if (audit_pid) { 1965 if (audit_pid) {
1976 skb_queue_tail(&audit_skb_queue, ab->skb); 1966 skb_queue_tail(&audit_skb_queue, ab->skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 7bb65730c890..3cdffad5a1d9 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -222,7 +222,6 @@ extern void audit_copy_inode(struct audit_names *name,
222 const struct inode *inode); 222 const struct inode *inode);
223extern void audit_log_cap(struct audit_buffer *ab, char *prefix, 223extern void audit_log_cap(struct audit_buffer *ab, char *prefix,
224 kernel_cap_t *cap); 224 kernel_cap_t *cap);
225extern void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name);
226extern void audit_log_name(struct audit_context *context, 225extern void audit_log_name(struct audit_context *context,
227 struct audit_names *n, struct path *path, 226 struct audit_names *n, struct path *path,
228 int record_num, int *call_panic); 227 int record_num, int *call_panic);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 135944a7b28a..80f29e015570 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -154,6 +154,7 @@ static struct audit_chunk *alloc_chunk(int count)
154 chunk->owners[i].index = i; 154 chunk->owners[i].index = i;
155 } 155 }
156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); 156 fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch);
157 chunk->mark.mask = FS_IN_IGNORED;
157 return chunk; 158 return chunk;
158} 159}
159 160
@@ -449,7 +450,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
449 return 0; 450 return 0;
450} 451}
451 452
452static void audit_log_remove_rule(struct audit_krule *rule) 453static void audit_tree_log_remove_rule(struct audit_krule *rule)
453{ 454{
454 struct audit_buffer *ab; 455 struct audit_buffer *ab;
455 456
@@ -457,7 +458,7 @@ static void audit_log_remove_rule(struct audit_krule *rule)
457 if (unlikely(!ab)) 458 if (unlikely(!ab))
458 return; 459 return;
459 audit_log_format(ab, "op="); 460 audit_log_format(ab, "op=");
460 audit_log_string(ab, "remove rule"); 461 audit_log_string(ab, "remove_rule");
461 audit_log_format(ab, " dir="); 462 audit_log_format(ab, " dir=");
462 audit_log_untrustedstring(ab, rule->tree->pathname); 463 audit_log_untrustedstring(ab, rule->tree->pathname);
463 audit_log_key(ab, rule->filterkey); 464 audit_log_key(ab, rule->filterkey);
@@ -476,7 +477,7 @@ static void kill_rules(struct audit_tree *tree)
476 list_del_init(&rule->rlist); 477 list_del_init(&rule->rlist);
477 if (rule->tree) { 478 if (rule->tree) {
478 /* not a half-baked one */ 479 /* not a half-baked one */
479 audit_log_remove_rule(rule); 480 audit_tree_log_remove_rule(rule);
480 rule->tree = NULL; 481 rule->tree = NULL;
481 list_del_rcu(&entry->list); 482 list_del_rcu(&entry->list);
482 list_del(&entry->rule.list); 483 list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 70b4554d2fbe..ad9c1682f616 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -314,7 +314,7 @@ static void audit_update_watch(struct audit_parent *parent,
314 &nentry->rule.list); 314 &nentry->rule.list);
315 } 315 }
316 316
317 audit_watch_log_rule_change(r, owatch, "updated rules"); 317 audit_watch_log_rule_change(r, owatch, "updated_rules");
318 318
319 call_rcu(&oentry->rcu, audit_free_rule_rcu); 319 call_rcu(&oentry->rcu, audit_free_rule_rcu);
320 } 320 }
@@ -342,7 +342,7 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 342 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 343 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
344 e = container_of(r, struct audit_entry, rule); 344 e = container_of(r, struct audit_entry, rule);
345 audit_watch_log_rule_change(r, w, "remove rule"); 345 audit_watch_log_rule_change(r, w, "remove_rule");
346 list_del(&r->rlist); 346 list_del(&r->rlist);
347 list_del(&r->list); 347 list_del(&r->list);
348 list_del_rcu(&e->list); 348 list_del_rcu(&e->list);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 8e9bc9c3dbb7..3598e13f2a65 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -71,6 +71,24 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
71 71
72DEFINE_MUTEX(audit_filter_mutex); 72DEFINE_MUTEX(audit_filter_mutex);
73 73
74static void audit_free_lsm_field(struct audit_field *f)
75{
76 switch (f->type) {
77 case AUDIT_SUBJ_USER:
78 case AUDIT_SUBJ_ROLE:
79 case AUDIT_SUBJ_TYPE:
80 case AUDIT_SUBJ_SEN:
81 case AUDIT_SUBJ_CLR:
82 case AUDIT_OBJ_USER:
83 case AUDIT_OBJ_ROLE:
84 case AUDIT_OBJ_TYPE:
85 case AUDIT_OBJ_LEV_LOW:
86 case AUDIT_OBJ_LEV_HIGH:
87 kfree(f->lsm_str);
88 security_audit_rule_free(f->lsm_rule);
89 }
90}
91
74static inline void audit_free_rule(struct audit_entry *e) 92static inline void audit_free_rule(struct audit_entry *e)
75{ 93{
76 int i; 94 int i;
@@ -80,11 +98,8 @@ static inline void audit_free_rule(struct audit_entry *e)
80 if (erule->watch) 98 if (erule->watch)
81 audit_put_watch(erule->watch); 99 audit_put_watch(erule->watch);
82 if (erule->fields) 100 if (erule->fields)
83 for (i = 0; i < erule->field_count; i++) { 101 for (i = 0; i < erule->field_count; i++)
84 struct audit_field *f = &erule->fields[i]; 102 audit_free_lsm_field(&erule->fields[i]);
85 kfree(f->lsm_str);
86 security_audit_rule_free(f->lsm_rule);
87 }
88 kfree(erule->fields); 103 kfree(erule->fields);
89 kfree(erule->filterkey); 104 kfree(erule->filterkey);
90 kfree(e); 105 kfree(e);
@@ -106,7 +121,7 @@ static inline struct audit_entry *audit_init_entry(u32 field_count)
106 if (unlikely(!entry)) 121 if (unlikely(!entry))
107 return NULL; 122 return NULL;
108 123
109 fields = kzalloc(sizeof(*fields) * field_count, GFP_KERNEL); 124 fields = kcalloc(field_count, sizeof(*fields), GFP_KERNEL);
110 if (unlikely(!fields)) { 125 if (unlikely(!fields)) {
111 kfree(entry); 126 kfree(entry);
112 return NULL; 127 return NULL;
@@ -148,7 +163,7 @@ static inline int audit_to_inode(struct audit_krule *krule,
148 struct audit_field *f) 163 struct audit_field *f)
149{ 164{
150 if (krule->listnr != AUDIT_FILTER_EXIT || 165 if (krule->listnr != AUDIT_FILTER_EXIT ||
151 krule->watch || krule->inode_f || krule->tree || 166 krule->inode_f || krule->watch || krule->tree ||
152 (f->op != Audit_equal && f->op != Audit_not_equal)) 167 (f->op != Audit_equal && f->op != Audit_not_equal))
153 return -EINVAL; 168 return -EINVAL;
154 169
@@ -160,7 +175,7 @@ static __u32 *classes[AUDIT_SYSCALL_CLASSES];
160 175
161int __init audit_register_class(int class, unsigned *list) 176int __init audit_register_class(int class, unsigned *list)
162{ 177{
163 __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL); 178 __u32 *p = kcalloc(AUDIT_BITMASK_SIZE, sizeof(__u32), GFP_KERNEL);
164 if (!p) 179 if (!p)
165 return -ENOMEM; 180 return -ENOMEM;
166 while (*list != ~0U) { 181 while (*list != ~0U) {
@@ -422,10 +437,6 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
422 437
423 f->type = data->fields[i]; 438 f->type = data->fields[i];
424 f->val = data->values[i]; 439 f->val = data->values[i];
425 f->uid = INVALID_UID;
426 f->gid = INVALID_GID;
427 f->lsm_str = NULL;
428 f->lsm_rule = NULL;
429 440
430 /* Support legacy tests for a valid loginuid */ 441 /* Support legacy tests for a valid loginuid */
431 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) { 442 if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
@@ -1053,30 +1064,27 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
1053 int err = 0; 1064 int err = 0;
1054 struct audit_entry *entry; 1065 struct audit_entry *entry;
1055 1066
1067 entry = audit_data_to_entry(data, datasz);
1068 if (IS_ERR(entry))
1069 return PTR_ERR(entry);
1070
1056 switch (type) { 1071 switch (type) {
1057 case AUDIT_ADD_RULE: 1072 case AUDIT_ADD_RULE:
1058 entry = audit_data_to_entry(data, datasz);
1059 if (IS_ERR(entry))
1060 return PTR_ERR(entry);
1061
1062 err = audit_add_rule(entry); 1073 err = audit_add_rule(entry);
1063 audit_log_rule_change("add rule", &entry->rule, !err); 1074 audit_log_rule_change("add_rule", &entry->rule, !err);
1064 if (err)
1065 audit_free_rule(entry);
1066 break; 1075 break;
1067 case AUDIT_DEL_RULE: 1076 case AUDIT_DEL_RULE:
1068 entry = audit_data_to_entry(data, datasz);
1069 if (IS_ERR(entry))
1070 return PTR_ERR(entry);
1071
1072 err = audit_del_rule(entry); 1077 err = audit_del_rule(entry);
1073 audit_log_rule_change("remove rule", &entry->rule, !err); 1078 audit_log_rule_change("remove_rule", &entry->rule, !err);
1074 audit_free_rule(entry);
1075 break; 1079 break;
1076 default: 1080 default:
1077 return -EINVAL; 1081 err = -EINVAL;
1082 WARN_ON(1);
1078 } 1083 }
1079 1084
1085 if (err || type == AUDIT_DEL_RULE)
1086 audit_free_rule(entry);
1087
1080 return err; 1088 return err;
1081} 1089}
1082 1090
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 7208c1df248d..e420a0c41b5f 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -67,6 +67,7 @@
67#include <linux/binfmts.h> 67#include <linux/binfmts.h>
68#include <linux/highmem.h> 68#include <linux/highmem.h>
69#include <linux/syscalls.h> 69#include <linux/syscalls.h>
70#include <asm/syscall.h>
70#include <linux/capability.h> 71#include <linux/capability.h>
71#include <linux/fs_struct.h> 72#include <linux/fs_struct.h>
72#include <linux/compat.h> 73#include <linux/compat.h>
@@ -125,14 +126,6 @@ struct audit_tree_refs {
125 struct audit_chunk *c[31]; 126 struct audit_chunk *c[31];
126}; 127};
127 128
128static inline int open_arg(int flags, int mask)
129{
130 int n = ACC_MODE(flags);
131 if (flags & (O_TRUNC | O_CREAT))
132 n |= AUDIT_PERM_WRITE;
133 return n & mask;
134}
135
136static int audit_match_perm(struct audit_context *ctx, int mask) 129static int audit_match_perm(struct audit_context *ctx, int mask)
137{ 130{
138 unsigned n; 131 unsigned n;
@@ -1505,7 +1498,6 @@ void __audit_free(struct task_struct *tsk)
1505 1498
1506/** 1499/**
1507 * audit_syscall_entry - fill in an audit record at syscall entry 1500 * audit_syscall_entry - fill in an audit record at syscall entry
1508 * @arch: architecture type
1509 * @major: major syscall type (function) 1501 * @major: major syscall type (function)
1510 * @a1: additional syscall register 1 1502 * @a1: additional syscall register 1
1511 * @a2: additional syscall register 2 1503 * @a2: additional syscall register 2
@@ -1520,9 +1512,8 @@ void __audit_free(struct task_struct *tsk)
1520 * will only be written if another part of the kernel requests that it 1512 * will only be written if another part of the kernel requests that it
1521 * be written). 1513 * be written).
1522 */ 1514 */
1523void __audit_syscall_entry(int arch, int major, 1515void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
1524 unsigned long a1, unsigned long a2, 1516 unsigned long a3, unsigned long a4)
1525 unsigned long a3, unsigned long a4)
1526{ 1517{
1527 struct task_struct *tsk = current; 1518 struct task_struct *tsk = current;
1528 struct audit_context *context = tsk->audit_context; 1519 struct audit_context *context = tsk->audit_context;
@@ -1536,7 +1527,7 @@ void __audit_syscall_entry(int arch, int major,
1536 if (!audit_enabled) 1527 if (!audit_enabled)
1537 return; 1528 return;
1538 1529
1539 context->arch = arch; 1530 context->arch = syscall_get_arch();
1540 context->major = major; 1531 context->major = major;
1541 context->argv[0] = a1; 1532 context->argv[0] = a1;
1542 context->argv[1] = a2; 1533 context->argv[1] = a2;
@@ -2433,6 +2424,7 @@ static void audit_log_task(struct audit_buffer *ab)
2433 kgid_t gid; 2424 kgid_t gid;
2434 unsigned int sessionid; 2425 unsigned int sessionid;
2435 struct mm_struct *mm = current->mm; 2426 struct mm_struct *mm = current->mm;
2427 char comm[sizeof(current->comm)];
2436 2428
2437 auid = audit_get_loginuid(current); 2429 auid = audit_get_loginuid(current);
2438 sessionid = audit_get_sessionid(current); 2430 sessionid = audit_get_sessionid(current);
@@ -2445,7 +2437,7 @@ static void audit_log_task(struct audit_buffer *ab)
2445 sessionid); 2437 sessionid);
2446 audit_log_task_context(ab); 2438 audit_log_task_context(ab);
2447 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current)); 2439 audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
2448 audit_log_untrustedstring(ab, current->comm); 2440 audit_log_untrustedstring(ab, get_task_comm(comm, current));
2449 if (mm) { 2441 if (mm) {
2450 down_read(&mm->mmap_sem); 2442 down_read(&mm->mmap_sem);
2451 if (mm->exe_file) 2443 if (mm->exe_file)
@@ -2488,11 +2480,9 @@ void __audit_seccomp(unsigned long syscall, long signr, int code)
2488 if (unlikely(!ab)) 2480 if (unlikely(!ab))
2489 return; 2481 return;
2490 audit_log_task(ab); 2482 audit_log_task(ab);
2491 audit_log_format(ab, " sig=%ld", signr); 2483 audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x",
2492 audit_log_format(ab, " syscall=%ld", syscall); 2484 signr, syscall_get_arch(), syscall, is_compat_task(),
2493 audit_log_format(ab, " compat=%d", is_compat_task()); 2485 KSTK_EIP(current), code);
2494 audit_log_format(ab, " ip=0x%lx", KSTK_EIP(current));
2495 audit_log_format(ab, " code=0x%x", code);
2496 audit_log_end(ab); 2486 audit_log_end(ab);
2497} 2487}
2498 2488
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 9fd4246b04b8..e1d1d1952bfa 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,7 +9,6 @@
9#include <linux/page-flags.h> 9#include <linux/page-flags.h>
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h>
13#include <linux/log2.h> 12#include <linux/log2.h>
14#include <linux/spinlock_types.h> 13#include <linux/spinlock_types.h>
15 14
@@ -18,7 +17,6 @@ void foo(void)
18 /* The enum constants to put into include/generated/bounds.h */ 17 /* The enum constants to put into include/generated/bounds.h */
19 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
20 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
21 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
22#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
23 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); 21 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
24#endif 22#endif
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..0daf7f6ae7df 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
1obj-y := core.o 1obj-y := core.o
2obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o
3ifdef CONFIG_TEST_BPF
4obj-$(CONFIG_BPF_SYSCALL) += test_stub.o
5endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..d6594e457a25 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,14 @@
20 * Andi Kleen - Fix a few bad bugs and races. 20 * Andi Kleen - Fix a few bad bugs and races.
21 * Kris Katterjohn - Added many additional checks in bpf_check_classic() 21 * Kris Katterjohn - Added many additional checks in bpf_check_classic()
22 */ 22 */
23
23#include <linux/filter.h> 24#include <linux/filter.h>
24#include <linux/skbuff.h> 25#include <linux/skbuff.h>
26#include <linux/vmalloc.h>
27#include <linux/random.h>
28#include <linux/moduleloader.h>
25#include <asm/unaligned.h> 29#include <asm/unaligned.h>
30#include <linux/bpf.h>
26 31
27/* Registers */ 32/* Registers */
28#define BPF_R0 regs[BPF_REG_0] 33#define BPF_R0 regs[BPF_REG_0]
@@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
63 return NULL; 68 return NULL;
64} 69}
65 70
71struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
72{
73 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
74 gfp_extra_flags;
75 struct bpf_prog_aux *aux;
76 struct bpf_prog *fp;
77
78 size = round_up(size, PAGE_SIZE);
79 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
80 if (fp == NULL)
81 return NULL;
82
83 aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
84 if (aux == NULL) {
85 vfree(fp);
86 return NULL;
87 }
88
89 fp->pages = size / PAGE_SIZE;
90 fp->aux = aux;
91
92 return fp;
93}
94EXPORT_SYMBOL_GPL(bpf_prog_alloc);
95
96struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
97 gfp_t gfp_extra_flags)
98{
99 gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
100 gfp_extra_flags;
101 struct bpf_prog *fp;
102
103 BUG_ON(fp_old == NULL);
104
105 size = round_up(size, PAGE_SIZE);
106 if (size <= fp_old->pages * PAGE_SIZE)
107 return fp_old;
108
109 fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
110 if (fp != NULL) {
111 memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
112 fp->pages = size / PAGE_SIZE;
113
114 /* We keep fp->aux from fp_old around in the new
115 * reallocated structure.
116 */
117 fp_old->aux = NULL;
118 __bpf_prog_free(fp_old);
119 }
120
121 return fp;
122}
123EXPORT_SYMBOL_GPL(bpf_prog_realloc);
124
125void __bpf_prog_free(struct bpf_prog *fp)
126{
127 kfree(fp->aux);
128 vfree(fp);
129}
130EXPORT_SYMBOL_GPL(__bpf_prog_free);
131
132#ifdef CONFIG_BPF_JIT
133struct bpf_binary_header *
134bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
135 unsigned int alignment,
136 bpf_jit_fill_hole_t bpf_fill_ill_insns)
137{
138 struct bpf_binary_header *hdr;
139 unsigned int size, hole, start;
140
141 /* Most of BPF filters are really small, but if some of them
142 * fill a page, allow at least 128 extra bytes to insert a
143 * random section of illegal instructions.
144 */
145 size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
146 hdr = module_alloc(size);
147 if (hdr == NULL)
148 return NULL;
149
150 /* Fill space with illegal/arch-dep instructions. */
151 bpf_fill_ill_insns(hdr, size);
152
153 hdr->pages = size / PAGE_SIZE;
154 hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
155 PAGE_SIZE - sizeof(*hdr));
156 start = (prandom_u32() % hole) & ~(alignment - 1);
157
158 /* Leave a random number of instructions before BPF code. */
159 *image_ptr = &hdr->image[start];
160
161 return hdr;
162}
163
164void bpf_jit_binary_free(struct bpf_binary_header *hdr)
165{
166 module_free(NULL, hdr);
167}
168#endif /* CONFIG_BPF_JIT */
169
66 170/* Base function for offset calculation. Needs to go into .text section,
67 171 * therefore keeping it non-static as well; will also be used by JITs
68 172 * anyway later on, so do not let the compiler omit it.
@@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
180 284 [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
181 285 [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
182 286 [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
287 [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
183 288 };
184 289 void *ptr;
185 290 int off;
@@ -239,6 +344,10 @@ select_insn:
239 344 ALU64_MOV_K:
240 345 DST = IMM;
241 346 CONT;
347 LD_IMM_DW:
348 DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
349 insn++;
350 CONT;
242 351 ALU64_ARSH_X:
243 352 (*(s64 *) &DST) >>= SRC;
244 353 CONT;
@@ -523,12 +632,35 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
523 632
524 633 /* Probe if internal BPF can be JITed */
525 634 bpf_int_jit_compile(fp);
635 /* Lock whole bpf_prog as read-only */
636 bpf_prog_lock_ro(fp);
526 637}
527 638EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
528 639
529/* free internal BPF program */
640static void bpf_prog_free_deferred(struct work_struct *work)
641{
642 struct bpf_prog_aux *aux;
643
644 aux = container_of(work, struct bpf_prog_aux, work);
645 bpf_jit_free(aux->prog);
646}
647
648/* Free internal BPF program */
530 649void bpf_prog_free(struct bpf_prog *fp)
531 650{
532 bpf_jit_free(fp);
651 struct bpf_prog_aux *aux = fp->aux;
652
653 INIT_WORK(&aux->work, bpf_prog_free_deferred);
654 aux->prog = fp;
655 schedule_work(&aux->work);
533 656}
534 657EXPORT_SYMBOL_GPL(bpf_prog_free);
658
659/* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call
660 * skb_copy_bits(), so provide a weak definition of it for NET-less config.
661 */
662int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
663 int len)
664{
665 return -EFAULT;
666}
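The LD_IMM_DW handler added above consumes two consecutive instructions and glues their 32-bit imm fields into one 64-bit constant. A minimal user-space sketch of the same bit manipulation (the example values are arbitrary):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* the two imm fields of a BPF_LD | BPF_IMM | BPF_DW instruction pair */
	int32_t imm_lo = 0x11223344;	/* insn[0].imm: low 32 bits */
	int32_t imm_hi = 0x55667788;	/* insn[1].imm: high 32 bits */

	/* same expression as the interpreter: zero-extend each half, then combine */
	uint64_t dst = (uint64_t)(uint32_t)imm_lo |
		       ((uint64_t)(uint32_t)imm_hi) << 32;

	printf("0x%016llx\n", (unsigned long long)dst);	/* prints 0x5566778811223344 */
	return 0;
}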
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..ba61c8c16032
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,606 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/bpf.h>
13#include <linux/syscalls.h>
14#include <linux/slab.h>
15#include <linux/anon_inodes.h>
16#include <linux/file.h>
17#include <linux/license.h>
18#include <linux/filter.h>
19
20static LIST_HEAD(bpf_map_types);
21
22static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
23{
24 struct bpf_map_type_list *tl;
25 struct bpf_map *map;
26
27 list_for_each_entry(tl, &bpf_map_types, list_node) {
28 if (tl->type == attr->map_type) {
29 map = tl->ops->map_alloc(attr);
30 if (IS_ERR(map))
31 return map;
32 map->ops = tl->ops;
33 map->map_type = attr->map_type;
34 return map;
35 }
36 }
37 return ERR_PTR(-EINVAL);
38}
39
40/* boot time registration of different map implementations */
41void bpf_register_map_type(struct bpf_map_type_list *tl)
42{
43 list_add(&tl->list_node, &bpf_map_types);
44}
45
46/* called from workqueue */
47static void bpf_map_free_deferred(struct work_struct *work)
48{
49 struct bpf_map *map = container_of(work, struct bpf_map, work);
50
51 /* implementation dependent freeing */
52 map->ops->map_free(map);
53}
54
55/* decrement map refcnt and schedule it for freeing via workqueue
56 * (underlying map implementation ops->map_free() might sleep)
57 */
58void bpf_map_put(struct bpf_map *map)
59{
60 if (atomic_dec_and_test(&map->refcnt)) {
61 INIT_WORK(&map->work, bpf_map_free_deferred);
62 schedule_work(&map->work);
63 }
64}
65
66static int bpf_map_release(struct inode *inode, struct file *filp)
67{
68 struct bpf_map *map = filp->private_data;
69
70 bpf_map_put(map);
71 return 0;
72}
73
74static const struct file_operations bpf_map_fops = {
75 .release = bpf_map_release,
76};
77
78/* helper macro to check that unused fields of 'union bpf_attr' are zero */
79#define CHECK_ATTR(CMD) \
80 memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
81 sizeof(attr->CMD##_LAST_FIELD), 0, \
82 sizeof(*attr) - \
83 offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
84 sizeof(attr->CMD##_LAST_FIELD)) != NULL
85
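To make the macro concrete: with BPF_MAP_CREATE_LAST_FIELD defined as max_entries just below, CHECK_ATTR(BPF_MAP_CREATE) should expand to roughly the following (mechanical expansion shown for illustration only), i.e. the command is rejected whenever any byte past the last field it understands is non-zero:

	memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
		   sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
		   sizeof(attr->max_entries)) != NULL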
86#define BPF_MAP_CREATE_LAST_FIELD max_entries
87/* called via syscall */
88static int map_create(union bpf_attr *attr)
89{
90 struct bpf_map *map;
91 int err;
92
93 err = CHECK_ATTR(BPF_MAP_CREATE);
94 if (err)
95 return -EINVAL;
96
97 /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
98 map = find_and_alloc_map(attr);
99 if (IS_ERR(map))
100 return PTR_ERR(map);
101
102 atomic_set(&map->refcnt, 1);
103
104 err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
105
106 if (err < 0)
107 /* failed to allocate fd */
108 goto free_map;
109
110 return err;
111
112free_map:
113 map->ops->map_free(map);
114 return err;
115}
116
117/* if error is returned, fd is released.
118 * On success caller should complete fd access with matching fdput()
119 */
120struct bpf_map *bpf_map_get(struct fd f)
121{
122 struct bpf_map *map;
123
124 if (!f.file)
125 return ERR_PTR(-EBADF);
126
127 if (f.file->f_op != &bpf_map_fops) {
128 fdput(f);
129 return ERR_PTR(-EINVAL);
130 }
131
132 map = f.file->private_data;
133
134 return map;
135}
136
137/* helper to convert user pointers passed inside __aligned_u64 fields */
138static void __user *u64_to_ptr(__u64 val)
139{
140 return (void __user *) (unsigned long) val;
141}
142
143/* last field in 'union bpf_attr' used by this command */
144#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
145
146static int map_lookup_elem(union bpf_attr *attr)
147{
148 void __user *ukey = u64_to_ptr(attr->key);
149 void __user *uvalue = u64_to_ptr(attr->value);
150 int ufd = attr->map_fd;
151 struct fd f = fdget(ufd);
152 struct bpf_map *map;
153 void *key, *value;
154 int err;
155
156 if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
157 return -EINVAL;
158
159 map = bpf_map_get(f);
160 if (IS_ERR(map))
161 return PTR_ERR(map);
162
163 err = -ENOMEM;
164 key = kmalloc(map->key_size, GFP_USER);
165 if (!key)
166 goto err_put;
167
168 err = -EFAULT;
169 if (copy_from_user(key, ukey, map->key_size) != 0)
170 goto free_key;
171
172 err = -ESRCH;
173 rcu_read_lock();
174 value = map->ops->map_lookup_elem(map, key);
175 if (!value)
176 goto err_unlock;
177
178 err = -EFAULT;
179 if (copy_to_user(uvalue, value, map->value_size) != 0)
180 goto err_unlock;
181
182 err = 0;
183
184err_unlock:
185 rcu_read_unlock();
186free_key:
187 kfree(key);
188err_put:
189 fdput(f);
190 return err;
191}
192
193#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
194
195static int map_update_elem(union bpf_attr *attr)
196{
197 void __user *ukey = u64_to_ptr(attr->key);
198 void __user *uvalue = u64_to_ptr(attr->value);
199 int ufd = attr->map_fd;
200 struct fd f = fdget(ufd);
201 struct bpf_map *map;
202 void *key, *value;
203 int err;
204
205 if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
206 return -EINVAL;
207
208 map = bpf_map_get(f);
209 if (IS_ERR(map))
210 return PTR_ERR(map);
211
212 err = -ENOMEM;
213 key = kmalloc(map->key_size, GFP_USER);
214 if (!key)
215 goto err_put;
216
217 err = -EFAULT;
218 if (copy_from_user(key, ukey, map->key_size) != 0)
219 goto free_key;
220
221 err = -ENOMEM;
222 value = kmalloc(map->value_size, GFP_USER);
223 if (!value)
224 goto free_key;
225
226 err = -EFAULT;
227 if (copy_from_user(value, uvalue, map->value_size) != 0)
228 goto free_value;
229
230	/* eBPF programs that use maps run under rcu_read_lock(),
231 * therefore all map accessors rely on this fact, so do the same here
232 */
233 rcu_read_lock();
234 err = map->ops->map_update_elem(map, key, value);
235 rcu_read_unlock();
236
237free_value:
238 kfree(value);
239free_key:
240 kfree(key);
241err_put:
242 fdput(f);
243 return err;
244}
245
246#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
247
248static int map_delete_elem(union bpf_attr *attr)
249{
250 void __user *ukey = u64_to_ptr(attr->key);
251 int ufd = attr->map_fd;
252 struct fd f = fdget(ufd);
253 struct bpf_map *map;
254 void *key;
255 int err;
256
257 if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
258 return -EINVAL;
259
260 map = bpf_map_get(f);
261 if (IS_ERR(map))
262 return PTR_ERR(map);
263
264 err = -ENOMEM;
265 key = kmalloc(map->key_size, GFP_USER);
266 if (!key)
267 goto err_put;
268
269 err = -EFAULT;
270 if (copy_from_user(key, ukey, map->key_size) != 0)
271 goto free_key;
272
273 rcu_read_lock();
274 err = map->ops->map_delete_elem(map, key);
275 rcu_read_unlock();
276
277free_key:
278 kfree(key);
279err_put:
280 fdput(f);
281 return err;
282}
283
284/* last field in 'union bpf_attr' used by this command */
285#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
286
287static int map_get_next_key(union bpf_attr *attr)
288{
289 void __user *ukey = u64_to_ptr(attr->key);
290 void __user *unext_key = u64_to_ptr(attr->next_key);
291 int ufd = attr->map_fd;
292 struct fd f = fdget(ufd);
293 struct bpf_map *map;
294 void *key, *next_key;
295 int err;
296
297 if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
298 return -EINVAL;
299
300 map = bpf_map_get(f);
301 if (IS_ERR(map))
302 return PTR_ERR(map);
303
304 err = -ENOMEM;
305 key = kmalloc(map->key_size, GFP_USER);
306 if (!key)
307 goto err_put;
308
309 err = -EFAULT;
310 if (copy_from_user(key, ukey, map->key_size) != 0)
311 goto free_key;
312
313 err = -ENOMEM;
314 next_key = kmalloc(map->key_size, GFP_USER);
315 if (!next_key)
316 goto free_key;
317
318 rcu_read_lock();
319 err = map->ops->map_get_next_key(map, key, next_key);
320 rcu_read_unlock();
321 if (err)
322 goto free_next_key;
323
324 err = -EFAULT;
325 if (copy_to_user(unext_key, next_key, map->key_size) != 0)
326 goto free_next_key;
327
328 err = 0;
329
330free_next_key:
331 kfree(next_key);
332free_key:
333 kfree(key);
334err_put:
335 fdput(f);
336 return err;
337}
338
339static LIST_HEAD(bpf_prog_types);
340
341static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
342{
343 struct bpf_prog_type_list *tl;
344
345 list_for_each_entry(tl, &bpf_prog_types, list_node) {
346 if (tl->type == type) {
347 prog->aux->ops = tl->ops;
348 prog->aux->prog_type = type;
349 return 0;
350 }
351 }
352 return -EINVAL;
353}
354
355void bpf_register_prog_type(struct bpf_prog_type_list *tl)
356{
357 list_add(&tl->list_node, &bpf_prog_types);
358}
359
360/* fixup insn->imm field of bpf_call instructions:
361 * if (insn->imm == BPF_FUNC_map_lookup_elem)
362 * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
363 * else if (insn->imm == BPF_FUNC_map_update_elem)
364 * insn->imm = bpf_map_update_elem - __bpf_call_base;
365 * else ...
366 *
367 * this function is called after eBPF program passed verification
368 */
369static void fixup_bpf_calls(struct bpf_prog *prog)
370{
371 const struct bpf_func_proto *fn;
372 int i;
373
374 for (i = 0; i < prog->len; i++) {
375 struct bpf_insn *insn = &prog->insnsi[i];
376
377 if (insn->code == (BPF_JMP | BPF_CALL)) {
378 /* we reach here when program has bpf_call instructions
379 * and it passed bpf_check(), means that
380 * ops->get_func_proto must have been supplied, check it
381 */
382 BUG_ON(!prog->aux->ops->get_func_proto);
383
384 fn = prog->aux->ops->get_func_proto(insn->imm);
385 /* all functions that have prototype and verifier allowed
386 * programs to call them, must be real in-kernel functions
387 */
388 BUG_ON(!fn->func);
389 insn->imm = fn->func - __bpf_call_base;
390 }
391 }
392}
393
394/* drop refcnt on maps used by eBPF program and free auxiliary data */
395static void free_used_maps(struct bpf_prog_aux *aux)
396{
397 int i;
398
399 for (i = 0; i < aux->used_map_cnt; i++)
400 bpf_map_put(aux->used_maps[i]);
401
402 kfree(aux->used_maps);
403}
404
405void bpf_prog_put(struct bpf_prog *prog)
406{
407 if (atomic_dec_and_test(&prog->aux->refcnt)) {
408 free_used_maps(prog->aux);
409 bpf_prog_free(prog);
410 }
411}
412
413static int bpf_prog_release(struct inode *inode, struct file *filp)
414{
415 struct bpf_prog *prog = filp->private_data;
416
417 bpf_prog_put(prog);
418 return 0;
419}
420
421static const struct file_operations bpf_prog_fops = {
422 .release = bpf_prog_release,
423};
424
425static struct bpf_prog *get_prog(struct fd f)
426{
427 struct bpf_prog *prog;
428
429 if (!f.file)
430 return ERR_PTR(-EBADF);
431
432 if (f.file->f_op != &bpf_prog_fops) {
433 fdput(f);
434 return ERR_PTR(-EINVAL);
435 }
436
437 prog = f.file->private_data;
438
439 return prog;
440}
441
442/* called by sockets/tracing/seccomp before attaching program to an event
443 * pairs with bpf_prog_put()
444 */
445struct bpf_prog *bpf_prog_get(u32 ufd)
446{
447 struct fd f = fdget(ufd);
448 struct bpf_prog *prog;
449
450 prog = get_prog(f);
451
452 if (IS_ERR(prog))
453 return prog;
454
455 atomic_inc(&prog->aux->refcnt);
456 fdput(f);
457 return prog;
458}
459
460/* last field in 'union bpf_attr' used by this command */
461#define BPF_PROG_LOAD_LAST_FIELD log_buf
462
463static int bpf_prog_load(union bpf_attr *attr)
464{
465 enum bpf_prog_type type = attr->prog_type;
466 struct bpf_prog *prog;
467 int err;
468 char license[128];
469 bool is_gpl;
470
471 if (CHECK_ATTR(BPF_PROG_LOAD))
472 return -EINVAL;
473
474 /* copy eBPF program license from user space */
475 if (strncpy_from_user(license, u64_to_ptr(attr->license),
476 sizeof(license) - 1) < 0)
477 return -EFAULT;
478 license[sizeof(license) - 1] = 0;
479
480 /* eBPF programs must be GPL compatible to use GPL-ed functions */
481 is_gpl = license_is_gpl_compatible(license);
482
483 if (attr->insn_cnt >= BPF_MAXINSNS)
484 return -EINVAL;
485
486 /* plain bpf_prog allocation */
487 prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
488 if (!prog)
489 return -ENOMEM;
490
491 prog->len = attr->insn_cnt;
492
493 err = -EFAULT;
494 if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
495 prog->len * sizeof(struct bpf_insn)) != 0)
496 goto free_prog;
497
498 prog->orig_prog = NULL;
499 prog->jited = false;
500
501 atomic_set(&prog->aux->refcnt, 1);
502 prog->aux->is_gpl_compatible = is_gpl;
503
504 /* find program type: socket_filter vs tracing_filter */
505 err = find_prog_type(type, prog);
506 if (err < 0)
507 goto free_prog;
508
509 /* run eBPF verifier */
510 err = bpf_check(prog, attr);
511
512 if (err < 0)
513 goto free_used_maps;
514
515 /* fixup BPF_CALL->imm field */
516 fixup_bpf_calls(prog);
517
518 /* eBPF program is ready to be JITed */
519 bpf_prog_select_runtime(prog);
520
521 err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
522
523 if (err < 0)
524 /* failed to allocate fd */
525 goto free_used_maps;
526
527 return err;
528
529free_used_maps:
530 free_used_maps(prog->aux);
531free_prog:
532 bpf_prog_free(prog);
533 return err;
534}
535
536SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
537{
538 union bpf_attr attr = {};
539 int err;
540
541 /* the syscall is limited to root temporarily. This restriction will be
542 * lifted when security audit is clean. Note that eBPF+tracing must have
543 * this restriction, since it may pass kernel data to user space
544 */
545 if (!capable(CAP_SYS_ADMIN))
546 return -EPERM;
547
548 if (!access_ok(VERIFY_READ, uattr, 1))
549 return -EFAULT;
550
551 if (size > PAGE_SIZE) /* silly large */
552 return -E2BIG;
553
554 /* If we're handed a bigger struct than we know of,
555 * ensure all the unknown bits are 0 - i.e. new
556 * user-space does not rely on any kernel feature
557	 * extensions we don't know about yet.
558 */
559 if (size > sizeof(attr)) {
560 unsigned char __user *addr;
561 unsigned char __user *end;
562 unsigned char val;
563
564 addr = (void __user *)uattr + sizeof(attr);
565 end = (void __user *)uattr + size;
566
567 for (; addr < end; addr++) {
568 err = get_user(val, addr);
569 if (err)
570 return err;
571 if (val)
572 return -E2BIG;
573 }
574 size = sizeof(attr);
575 }
576
577 /* copy attributes from user space, may be less than sizeof(bpf_attr) */
578 if (copy_from_user(&attr, uattr, size) != 0)
579 return -EFAULT;
580
581 switch (cmd) {
582 case BPF_MAP_CREATE:
583 err = map_create(&attr);
584 break;
585 case BPF_MAP_LOOKUP_ELEM:
586 err = map_lookup_elem(&attr);
587 break;
588 case BPF_MAP_UPDATE_ELEM:
589 err = map_update_elem(&attr);
590 break;
591 case BPF_MAP_DELETE_ELEM:
592 err = map_delete_elem(&attr);
593 break;
594 case BPF_MAP_GET_NEXT_KEY:
595 err = map_get_next_key(&attr);
596 break;
597 case BPF_PROG_LOAD:
598 err = bpf_prog_load(&attr);
599 break;
600 default:
601 err = -EINVAL;
602 break;
603 }
604
605 return err;
606}
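Putting the command handlers above together, user space would drive this interface roughly as sketched below. This is only an illustration under assumptions: it presumes a uapi <linux/bpf.h> exposing union bpf_attr and the BPF_* command/map-type constants used in this file, a wired-up __NR_bpf syscall number, and a map type whose ops implement lookup/update (the test stub in the next file only implements alloc/free); none of that is part of this diff.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>		/* union bpf_attr, BPF_MAP_CREATE, ... (assumed uapi header) */

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);	/* __NR_bpf assumed to be defined */
}

int main(void)
{
	union bpf_attr attr;
	int map_fd, key = 1, value = 42, out;

	/* BPF_MAP_CREATE: the unused tail of bpf_attr must be zero (see CHECK_ATTR) */
	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_UNSPEC;	/* placeholder; full map types come in later patches */
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(value);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0) {
		perror("BPF_MAP_CREATE");
		return 1;
	}

	/* BPF_MAP_UPDATE_ELEM: key/value are passed as 64-bit user pointers */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &value;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		perror("BPF_MAP_UPDATE_ELEM");

	/* BPF_MAP_LOOKUP_ELEM: the kernel copies the element back into 'out' */
	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (unsigned long) &key;
	attr.value = (unsigned long) &out;
	if (sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
		printf("value = %d\n", out);

	close(map_fd);
	return 0;
}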
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 */
7#include <linux/kernel.h>
8#include <linux/types.h>
9#include <linux/slab.h>
10#include <linux/err.h>
11#include <linux/bpf.h>
12
13/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
14 * to be used by user space verifier testsuite
15 */
16struct bpf_context {
17 u64 arg1;
18 u64 arg2;
19};
20
21static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
22{
23 return 0;
24}
25
26static struct bpf_func_proto test_funcs[] = {
27 [BPF_FUNC_unspec] = {
28 .func = test_func,
29 .gpl_only = true,
30 .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
31 .arg1_type = ARG_CONST_MAP_PTR,
32 .arg2_type = ARG_PTR_TO_MAP_KEY,
33 },
34};
35
36static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
37{
38 if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
39 return NULL;
40 return &test_funcs[func_id];
41}
42
43static const struct bpf_context_access {
44 int size;
45 enum bpf_access_type type;
46} test_ctx_access[] = {
47 [offsetof(struct bpf_context, arg1)] = {
48 FIELD_SIZEOF(struct bpf_context, arg1),
49 BPF_READ
50 },
51 [offsetof(struct bpf_context, arg2)] = {
52 FIELD_SIZEOF(struct bpf_context, arg2),
53 BPF_READ
54 },
55};
56
57static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
58{
59 const struct bpf_context_access *access;
60
61 if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
62 return false;
63
64 access = &test_ctx_access[off];
65 if (access->size == size && (access->type & type))
66 return true;
67
68 return false;
69}
70
71static struct bpf_verifier_ops test_ops = {
72 .get_func_proto = test_func_proto,
73 .is_valid_access = test_is_valid_access,
74};
75
76static struct bpf_prog_type_list tl_prog = {
77 .ops = &test_ops,
78 .type = BPF_PROG_TYPE_UNSPEC,
79};
80
81static struct bpf_map *test_map_alloc(union bpf_attr *attr)
82{
83 struct bpf_map *map;
84
85 map = kzalloc(sizeof(*map), GFP_USER);
86 if (!map)
87 return ERR_PTR(-ENOMEM);
88
89 map->key_size = attr->key_size;
90 map->value_size = attr->value_size;
91 map->max_entries = attr->max_entries;
92 return map;
93}
94
95static void test_map_free(struct bpf_map *map)
96{
97 kfree(map);
98}
99
100static struct bpf_map_ops test_map_ops = {
101 .map_alloc = test_map_alloc,
102 .map_free = test_map_free,
103};
104
105static struct bpf_map_type_list tl_map = {
106 .ops = &test_map_ops,
107 .type = BPF_MAP_TYPE_UNSPEC,
108};
109
110static int __init register_test_ops(void)
111{
112 bpf_register_map_type(&tl_map);
113 bpf_register_prog_type(&tl_prog);
114 return 0;
115}
116late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..9f81818f2941
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,1924 @@
1/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
2 *
3 * This program is free software; you can redistribute it and/or
4 * modify it under the terms of version 2 of the GNU General Public
5 * License as published by the Free Software Foundation.
6 *
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
11 */
12#include <linux/kernel.h>
13#include <linux/types.h>
14#include <linux/slab.h>
15#include <linux/bpf.h>
16#include <linux/filter.h>
17#include <net/netlink.h>
18#include <linux/file.h>
19#include <linux/vmalloc.h>
20
21/* bpf_check() is a static code analyzer that walks eBPF program
22 * instruction by instruction and updates register/stack state.
23 * All paths of conditional branches are analyzed until 'bpf_exit' insn.
24 *
25 * The first pass is depth-first-search to check that the program is a DAG.
26 * It rejects the following programs:
27 * - larger than BPF_MAXINSNS insns
28 * - if loop is present (detected via back-edge)
29 * - unreachable insns exist (shouldn't be a forest. program = one function)
30 * - out of bounds or malformed jumps
31 * The second pass is all possible path descent from the 1st insn.
32 * Since it's analyzing all paths through the program, the length of the
33 * analysis is limited to 32k insn, which may be hit even if the total number of
34 * insn is less than 4K, but there are too many branches that change stack/regs.
35 * Number of 'branches to be analyzed' is limited to 1k
36 *
37 * On entry to each instruction, each register has a type, and the instruction
38 * changes the types of the registers depending on instruction semantics.
39 * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
40 * copied to R1.
41 *
42 * All registers are 64-bit.
43 * R0 - return register
44 * R1-R5 argument passing registers
45 * R6-R9 callee saved registers
46 * R10 - frame pointer read-only
47 *
48 * At the start of BPF program the register R1 contains a pointer to bpf_context
49 * and has type PTR_TO_CTX.
50 *
51 * Verifier tracks arithmetic operations on pointers in case:
52 * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
53 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
54 * 1st insn copies R10 (which has FRAME_PTR) type into R1
55 * and 2nd arithmetic instruction is pattern matched to recognize
56 * that it wants to construct a pointer to some element within stack.
57 * So after 2nd insn, the register R1 has type PTR_TO_STACK
58 * (and -20 constant is saved for further stack bounds checking).
59 * Meaning that this reg is a pointer to stack plus known immediate constant.
60 *
61 * Most of the time the registers have UNKNOWN_VALUE type, which
62 * means the register has some value, but it's not a valid pointer.
63 * (like pointer plus pointer becomes UNKNOWN_VALUE type)
64 *
65 * When verifier sees load or store instructions the type of base register
66 * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
67 * types recognized by check_mem_access() function.
68 *
69 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
70 * and the range of [ptr, ptr + map's value_size) is accessible.
71 *
72 * registers used to pass values to function calls are checked against
73 * function argument constraints.
74 *
75 * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
76 * It means that the register type passed to this function must be
77 * PTR_TO_STACK and it will be used inside the function as
78 * 'pointer to map element key'
79 *
80 * For example the argument constraints for bpf_map_lookup_elem():
81 * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
82 * .arg1_type = ARG_CONST_MAP_PTR,
83 * .arg2_type = ARG_PTR_TO_MAP_KEY,
84 *
85 * ret_type says that this function returns 'pointer to map elem value or null'
86 * function expects 1st argument to be a const pointer to 'struct bpf_map' and
87 * 2nd argument should be a pointer to stack, which will be used inside
88 * the helper function as a pointer to map element key.
89 *
90 * On the kernel side the helper function looks like:
91 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
92 * {
93 * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
94 * void *key = (void *) (unsigned long) r2;
95 * void *value;
96 *
97 * here kernel can access 'key' and 'map' pointers safely, knowing that
98 * [key, key + map->key_size) bytes are valid and were initialized on
99 * the stack of eBPF program.
100 * }
101 *
102 * Corresponding eBPF program may look like:
103 * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
104 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
105 * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
106 * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
107 * here verifier looks at prototype of map_lookup_elem() and sees:
108 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
109 * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
110 *
111 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
112 * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
113 * and were initialized prior to this call.
114 * If it's ok, then verifier allows this BPF_CALL insn and looks at
115 * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
116 * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
117 * returns either a pointer to the map value or NULL.
118 *
119 * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
120 * insn, the register holding that pointer in the true branch changes state to
121 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
122 * branch. See check_cond_jmp_op().
123 *
124 * After the call R0 is set to return type of the function and registers R1-R5
125 * are set to NOT_INIT to indicate that they are no longer readable.
126 */
127
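As a concrete version of the lookup sequence walked through in the comment above, a program this verifier is meant to accept could be assembled with the kernel's instruction-building macros (BPF_MOV64_REG() and friends from linux/filter.h) roughly as follows. This is a sketch only: MAP_FD is a placeholder for the file descriptor of an already-created eBPF map, and the map is assumed to have a 4-byte key and an at-least-8-byte value so both stores stay in bounds.

#include <linux/filter.h>	/* kernel-side instruction macros (BPF_MOV64_REG() etc.) */

#define MAP_FD 42	/* placeholder: fd of an already-created eBPF map */

static const struct bpf_insn lookup_prog[] = {
	BPF_ST_MEM(BPF_W, BPF_REG_10, -4, 0),	/* init 4-byte key at fp-4 (slot becomes STACK_MISC) */
	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),	/* r2 = fp            -> R2 is FRAME_PTR */
	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4),	/* r2 += -4           -> R2 is PTR_TO_STACK */
	BPF_LD_MAP_FD(BPF_REG_1, MAP_FD),	/* r1 = pseudo map fd -> R1 is CONST_PTR_TO_MAP */
	BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),	/* NULL check narrows R0 per check_cond_jmp_op() */
	BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),	/* fall-through only: R0 is PTR_TO_MAP_VALUE here */
	BPF_MOV64_IMM(BPF_REG_0, 0),
	BPF_EXIT_INSN(),
};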
128/* types of values stored in eBPF registers */
129enum bpf_reg_type {
130 NOT_INIT = 0, /* nothing was written into register */
131 UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
132 PTR_TO_CTX, /* reg points to bpf_context */
133 CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
134 PTR_TO_MAP_VALUE, /* reg points to map element value */
135 PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
136 FRAME_PTR, /* reg == frame_pointer */
137 PTR_TO_STACK, /* reg == frame_pointer + imm */
138 CONST_IMM, /* constant integer value */
139};
140
141struct reg_state {
142 enum bpf_reg_type type;
143 union {
144 /* valid when type == CONST_IMM | PTR_TO_STACK */
145 int imm;
146
147 /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
148 * PTR_TO_MAP_VALUE_OR_NULL
149 */
150 struct bpf_map *map_ptr;
151 };
152};
153
154enum bpf_stack_slot_type {
155 STACK_INVALID, /* nothing was stored in this stack slot */
156 STACK_SPILL, /* 1st byte of register spilled into stack */
157 STACK_SPILL_PART, /* other 7 bytes of register spill */
158 STACK_MISC /* BPF program wrote some data into this slot */
159};
160
161struct bpf_stack_slot {
162 enum bpf_stack_slot_type stype;
163 struct reg_state reg_st;
164};
165
166/* state of the program:
167 * type of all registers and stack info
168 */
169struct verifier_state {
170 struct reg_state regs[MAX_BPF_REG];
171 struct bpf_stack_slot stack[MAX_BPF_STACK];
172};
173
174/* linked list of verifier states used to prune search */
175struct verifier_state_list {
176 struct verifier_state state;
177 struct verifier_state_list *next;
178};
179
180/* verifier_state + insn_idx are pushed to stack when branch is encountered */
181struct verifier_stack_elem {
182	/* verifier state is 'st'
183 * before processing instruction 'insn_idx'
184 * and after processing instruction 'prev_insn_idx'
185 */
186 struct verifier_state st;
187 int insn_idx;
188 int prev_insn_idx;
189 struct verifier_stack_elem *next;
190};
191
192#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
193
194/* single container for all structs
195 * one verifier_env per bpf_check() call
196 */
197struct verifier_env {
198 struct bpf_prog *prog; /* eBPF program being verified */
199 struct verifier_stack_elem *head; /* stack of verifier states to be processed */
200 int stack_size; /* number of states to be processed */
201 struct verifier_state cur_state; /* current verifier state */
202 struct verifier_state_list **explored_states; /* search pruning optimization */
203	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of maps used by eBPF program */
204 u32 used_map_cnt; /* number of used maps */
205};
206
207/* verbose verifier prints what it's seeing
208 * bpf_check() is called under lock, so no race to access these global vars
209 */
210static u32 log_level, log_size, log_len;
211static char *log_buf;
212
213static DEFINE_MUTEX(bpf_verifier_lock);
214
215/* log_level controls verbosity level of eBPF verifier.
216 * verbose() is used to dump the verification trace to the log, so the user
217 * can figure out what's wrong with the program
218 */
219static void verbose(const char *fmt, ...)
220{
221 va_list args;
222
223 if (log_level == 0 || log_len >= log_size - 1)
224 return;
225
226 va_start(args, fmt);
227 log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
228 va_end(args);
229}
230
231/* string representation of 'enum bpf_reg_type' */
232static const char * const reg_type_str[] = {
233 [NOT_INIT] = "?",
234 [UNKNOWN_VALUE] = "inv",
235 [PTR_TO_CTX] = "ctx",
236 [CONST_PTR_TO_MAP] = "map_ptr",
237 [PTR_TO_MAP_VALUE] = "map_value",
238 [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
239 [FRAME_PTR] = "fp",
240 [PTR_TO_STACK] = "fp",
241 [CONST_IMM] = "imm",
242};
243
244static void print_verifier_state(struct verifier_env *env)
245{
246 enum bpf_reg_type t;
247 int i;
248
249 for (i = 0; i < MAX_BPF_REG; i++) {
250 t = env->cur_state.regs[i].type;
251 if (t == NOT_INIT)
252 continue;
253 verbose(" R%d=%s", i, reg_type_str[t]);
254 if (t == CONST_IMM || t == PTR_TO_STACK)
255 verbose("%d", env->cur_state.regs[i].imm);
256 else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
257 t == PTR_TO_MAP_VALUE_OR_NULL)
258 verbose("(ks=%d,vs=%d)",
259 env->cur_state.regs[i].map_ptr->key_size,
260 env->cur_state.regs[i].map_ptr->value_size);
261 }
262 for (i = 0; i < MAX_BPF_STACK; i++) {
263 if (env->cur_state.stack[i].stype == STACK_SPILL)
264 verbose(" fp%d=%s", -MAX_BPF_STACK + i,
265 reg_type_str[env->cur_state.stack[i].reg_st.type]);
266 }
267 verbose("\n");
268}
269
270static const char *const bpf_class_string[] = {
271 [BPF_LD] = "ld",
272 [BPF_LDX] = "ldx",
273 [BPF_ST] = "st",
274 [BPF_STX] = "stx",
275 [BPF_ALU] = "alu",
276 [BPF_JMP] = "jmp",
277 [BPF_RET] = "BUG",
278 [BPF_ALU64] = "alu64",
279};
280
281static const char *const bpf_alu_string[] = {
282 [BPF_ADD >> 4] = "+=",
283 [BPF_SUB >> 4] = "-=",
284 [BPF_MUL >> 4] = "*=",
285 [BPF_DIV >> 4] = "/=",
286 [BPF_OR >> 4] = "|=",
287 [BPF_AND >> 4] = "&=",
288 [BPF_LSH >> 4] = "<<=",
289 [BPF_RSH >> 4] = ">>=",
290 [BPF_NEG >> 4] = "neg",
291 [BPF_MOD >> 4] = "%=",
292 [BPF_XOR >> 4] = "^=",
293 [BPF_MOV >> 4] = "=",
294 [BPF_ARSH >> 4] = "s>>=",
295 [BPF_END >> 4] = "endian",
296};
297
298static const char *const bpf_ldst_string[] = {
299 [BPF_W >> 3] = "u32",
300 [BPF_H >> 3] = "u16",
301 [BPF_B >> 3] = "u8",
302 [BPF_DW >> 3] = "u64",
303};
304
305static const char *const bpf_jmp_string[] = {
306 [BPF_JA >> 4] = "jmp",
307 [BPF_JEQ >> 4] = "==",
308 [BPF_JGT >> 4] = ">",
309 [BPF_JGE >> 4] = ">=",
310 [BPF_JSET >> 4] = "&",
311 [BPF_JNE >> 4] = "!=",
312 [BPF_JSGT >> 4] = "s>",
313 [BPF_JSGE >> 4] = "s>=",
314 [BPF_CALL >> 4] = "call",
315 [BPF_EXIT >> 4] = "exit",
316};
317
318static void print_bpf_insn(struct bpf_insn *insn)
319{
320 u8 class = BPF_CLASS(insn->code);
321
322 if (class == BPF_ALU || class == BPF_ALU64) {
323 if (BPF_SRC(insn->code) == BPF_X)
324 verbose("(%02x) %sr%d %s %sr%d\n",
325 insn->code, class == BPF_ALU ? "(u32) " : "",
326 insn->dst_reg,
327 bpf_alu_string[BPF_OP(insn->code) >> 4],
328 class == BPF_ALU ? "(u32) " : "",
329 insn->src_reg);
330 else
331 verbose("(%02x) %sr%d %s %s%d\n",
332 insn->code, class == BPF_ALU ? "(u32) " : "",
333 insn->dst_reg,
334 bpf_alu_string[BPF_OP(insn->code) >> 4],
335 class == BPF_ALU ? "(u32) " : "",
336 insn->imm);
337 } else if (class == BPF_STX) {
338 if (BPF_MODE(insn->code) == BPF_MEM)
339 verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
340 insn->code,
341 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
342 insn->dst_reg,
343 insn->off, insn->src_reg);
344 else if (BPF_MODE(insn->code) == BPF_XADD)
345 verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
346 insn->code,
347 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
348 insn->dst_reg, insn->off,
349 insn->src_reg);
350 else
351 verbose("BUG_%02x\n", insn->code);
352 } else if (class == BPF_ST) {
353 if (BPF_MODE(insn->code) != BPF_MEM) {
354 verbose("BUG_st_%02x\n", insn->code);
355 return;
356 }
357 verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
358 insn->code,
359 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
360 insn->dst_reg,
361 insn->off, insn->imm);
362 } else if (class == BPF_LDX) {
363 if (BPF_MODE(insn->code) != BPF_MEM) {
364 verbose("BUG_ldx_%02x\n", insn->code);
365 return;
366 }
367 verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
368 insn->code, insn->dst_reg,
369 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
370 insn->src_reg, insn->off);
371 } else if (class == BPF_LD) {
372 if (BPF_MODE(insn->code) == BPF_ABS) {
373 verbose("(%02x) r0 = *(%s *)skb[%d]\n",
374 insn->code,
375 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
376 insn->imm);
377 } else if (BPF_MODE(insn->code) == BPF_IND) {
378 verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
379 insn->code,
380 bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
381 insn->src_reg, insn->imm);
382 } else if (BPF_MODE(insn->code) == BPF_IMM) {
383 verbose("(%02x) r%d = 0x%x\n",
384 insn->code, insn->dst_reg, insn->imm);
385 } else {
386 verbose("BUG_ld_%02x\n", insn->code);
387 return;
388 }
389 } else if (class == BPF_JMP) {
390 u8 opcode = BPF_OP(insn->code);
391
392 if (opcode == BPF_CALL) {
393 verbose("(%02x) call %d\n", insn->code, insn->imm);
394 } else if (insn->code == (BPF_JMP | BPF_JA)) {
395 verbose("(%02x) goto pc%+d\n",
396 insn->code, insn->off);
397 } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
398 verbose("(%02x) exit\n", insn->code);
399 } else if (BPF_SRC(insn->code) == BPF_X) {
400 verbose("(%02x) if r%d %s r%d goto pc%+d\n",
401 insn->code, insn->dst_reg,
402 bpf_jmp_string[BPF_OP(insn->code) >> 4],
403 insn->src_reg, insn->off);
404 } else {
405 verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
406 insn->code, insn->dst_reg,
407 bpf_jmp_string[BPF_OP(insn->code) >> 4],
408 insn->imm, insn->off);
409 }
410 } else {
411 verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
412 }
413}
414
415static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
416{
417 struct verifier_stack_elem *elem;
418 int insn_idx;
419
420 if (env->head == NULL)
421 return -1;
422
423 memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
424 insn_idx = env->head->insn_idx;
425 if (prev_insn_idx)
426 *prev_insn_idx = env->head->prev_insn_idx;
427 elem = env->head->next;
428 kfree(env->head);
429 env->head = elem;
430 env->stack_size--;
431 return insn_idx;
432}
433
434static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
435 int prev_insn_idx)
436{
437 struct verifier_stack_elem *elem;
438
439 elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
440 if (!elem)
441 goto err;
442
443 memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
444 elem->insn_idx = insn_idx;
445 elem->prev_insn_idx = prev_insn_idx;
446 elem->next = env->head;
447 env->head = elem;
448 env->stack_size++;
449 if (env->stack_size > 1024) {
450 verbose("BPF program is too complex\n");
451 goto err;
452 }
453 return &elem->st;
454err:
455 /* pop all elements and return */
456 while (pop_stack(env, NULL) >= 0);
457 return NULL;
458}
459
460#define CALLER_SAVED_REGS 6
461static const int caller_saved[CALLER_SAVED_REGS] = {
462 BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
463};
464
465static void init_reg_state(struct reg_state *regs)
466{
467 int i;
468
469 for (i = 0; i < MAX_BPF_REG; i++) {
470 regs[i].type = NOT_INIT;
471 regs[i].imm = 0;
472 regs[i].map_ptr = NULL;
473 }
474
475 /* frame pointer */
476 regs[BPF_REG_FP].type = FRAME_PTR;
477
478 /* 1st arg to a function */
479 regs[BPF_REG_1].type = PTR_TO_CTX;
480}
481
482static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
483{
484 BUG_ON(regno >= MAX_BPF_REG);
485 regs[regno].type = UNKNOWN_VALUE;
486 regs[regno].imm = 0;
487 regs[regno].map_ptr = NULL;
488}
489
490enum reg_arg_type {
491 SRC_OP, /* register is used as source operand */
492 DST_OP, /* register is used as destination operand */
493 DST_OP_NO_MARK /* same as above, check only, don't mark */
494};
495
496static int check_reg_arg(struct reg_state *regs, u32 regno,
497 enum reg_arg_type t)
498{
499 if (regno >= MAX_BPF_REG) {
500 verbose("R%d is invalid\n", regno);
501 return -EINVAL;
502 }
503
504 if (t == SRC_OP) {
505 /* check whether register used as source operand can be read */
506 if (regs[regno].type == NOT_INIT) {
507 verbose("R%d !read_ok\n", regno);
508 return -EACCES;
509 }
510 } else {
511 /* check whether register used as dest operand can be written to */
512 if (regno == BPF_REG_FP) {
513 verbose("frame pointer is read only\n");
514 return -EACCES;
515 }
516 if (t == DST_OP)
517 mark_reg_unknown_value(regs, regno);
518 }
519 return 0;
520}
521
522static int bpf_size_to_bytes(int bpf_size)
523{
524 if (bpf_size == BPF_W)
525 return 4;
526 else if (bpf_size == BPF_H)
527 return 2;
528 else if (bpf_size == BPF_B)
529 return 1;
530 else if (bpf_size == BPF_DW)
531 return 8;
532 else
533 return -EINVAL;
534}
535
536/* check_stack_read/write functions track spill/fill of registers,
537 * stack boundary and alignment are checked in check_mem_access()
538 */
539static int check_stack_write(struct verifier_state *state, int off, int size,
540 int value_regno)
541{
542 struct bpf_stack_slot *slot;
543 int i;
544
545 if (value_regno >= 0 &&
546 (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
547 state->regs[value_regno].type == PTR_TO_STACK ||
548 state->regs[value_regno].type == PTR_TO_CTX)) {
549
550 /* register containing pointer is being spilled into stack */
551 if (size != 8) {
552 verbose("invalid size of register spill\n");
553 return -EACCES;
554 }
555
556 slot = &state->stack[MAX_BPF_STACK + off];
557 slot->stype = STACK_SPILL;
558 /* save register state */
559 slot->reg_st = state->regs[value_regno];
560 for (i = 1; i < 8; i++) {
561 slot = &state->stack[MAX_BPF_STACK + off + i];
562 slot->stype = STACK_SPILL_PART;
563 slot->reg_st.type = UNKNOWN_VALUE;
564 slot->reg_st.map_ptr = NULL;
565 }
566 } else {
567
568 /* regular write of data into stack */
569 for (i = 0; i < size; i++) {
570 slot = &state->stack[MAX_BPF_STACK + off + i];
571 slot->stype = STACK_MISC;
572 slot->reg_st.type = UNKNOWN_VALUE;
573 slot->reg_st.map_ptr = NULL;
574 }
575 }
576 return 0;
577}
578
579static int check_stack_read(struct verifier_state *state, int off, int size,
580 int value_regno)
581{
582 int i;
583 struct bpf_stack_slot *slot;
584
585 slot = &state->stack[MAX_BPF_STACK + off];
586
587 if (slot->stype == STACK_SPILL) {
588 if (size != 8) {
589 verbose("invalid size of register spill\n");
590 return -EACCES;
591 }
592 for (i = 1; i < 8; i++) {
593 if (state->stack[MAX_BPF_STACK + off + i].stype !=
594 STACK_SPILL_PART) {
595 verbose("corrupted spill memory\n");
596 return -EACCES;
597 }
598 }
599
600 if (value_regno >= 0)
601 /* restore register state from stack */
602 state->regs[value_regno] = slot->reg_st;
603 return 0;
604 } else {
605 for (i = 0; i < size; i++) {
606 if (state->stack[MAX_BPF_STACK + off + i].stype !=
607 STACK_MISC) {
608 verbose("invalid read from stack off %d+%d size %d\n",
609 off, i, size);
610 return -EACCES;
611 }
612 }
613 if (value_regno >= 0)
614 /* have read misc data from the stack */
615 mark_reg_unknown_value(state->regs, value_regno);
616 return 0;
617 }
618}
619
620/* check read/write into map element returned by bpf_map_lookup_elem() */
621static int check_map_access(struct verifier_env *env, u32 regno, int off,
622 int size)
623{
624 struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
625
626 if (off < 0 || off + size > map->value_size) {
627 verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
628 map->value_size, off, size);
629 return -EACCES;
630 }
631 return 0;
632}
633
634/* check access to 'struct bpf_context' fields */
635static int check_ctx_access(struct verifier_env *env, int off, int size,
636 enum bpf_access_type t)
637{
638 if (env->prog->aux->ops->is_valid_access &&
639 env->prog->aux->ops->is_valid_access(off, size, t))
640 return 0;
641
642 verbose("invalid bpf_context access off=%d size=%d\n", off, size);
643 return -EACCES;
644}
645
646/* check whether memory at (regno + off) is accessible for t = (read | write)
647 * if t==write, value_regno is a register whose value is stored into memory
648 * if t==read, value_regno is a register which will receive the value from memory
649 * if t==write && value_regno==-1, some unknown value is stored into memory
650 * if t==read && value_regno==-1, don't care what we read from memory
651 */
652static int check_mem_access(struct verifier_env *env, u32 regno, int off,
653 int bpf_size, enum bpf_access_type t,
654 int value_regno)
655{
656 struct verifier_state *state = &env->cur_state;
657 int size, err = 0;
658
659 size = bpf_size_to_bytes(bpf_size);
660 if (size < 0)
661 return size;
662
663 if (off % size != 0) {
664 verbose("misaligned access off %d size %d\n", off, size);
665 return -EACCES;
666 }
667
668 if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
669 err = check_map_access(env, regno, off, size);
670 if (!err && t == BPF_READ && value_regno >= 0)
671 mark_reg_unknown_value(state->regs, value_regno);
672
673 } else if (state->regs[regno].type == PTR_TO_CTX) {
674 err = check_ctx_access(env, off, size, t);
675 if (!err && t == BPF_READ && value_regno >= 0)
676 mark_reg_unknown_value(state->regs, value_regno);
677
678 } else if (state->regs[regno].type == FRAME_PTR) {
679 if (off >= 0 || off < -MAX_BPF_STACK) {
680 verbose("invalid stack off=%d size=%d\n", off, size);
681 return -EACCES;
682 }
683 if (t == BPF_WRITE)
684 err = check_stack_write(state, off, size, value_regno);
685 else
686 err = check_stack_read(state, off, size, value_regno);
687 } else {
688 verbose("R%d invalid mem access '%s'\n",
689 regno, reg_type_str[state->regs[regno].type]);
690 return -EACCES;
691 }
692 return err;
693}
694
695static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
696{
697 struct reg_state *regs = env->cur_state.regs;
698 int err;
699
700 if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
701 insn->imm != 0) {
702 verbose("BPF_XADD uses reserved fields\n");
703 return -EINVAL;
704 }
705
706 /* check src1 operand */
707 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
708 if (err)
709 return err;
710
711 /* check src2 operand */
712 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
713 if (err)
714 return err;
715
716 /* check whether atomic_add can read the memory */
717 err = check_mem_access(env, insn->dst_reg, insn->off,
718 BPF_SIZE(insn->code), BPF_READ, -1);
719 if (err)
720 return err;
721
722 /* check whether atomic_add can write into the same memory */
723 return check_mem_access(env, insn->dst_reg, insn->off,
724 BPF_SIZE(insn->code), BPF_WRITE, -1);
725}
726
727/* when register 'regno' is passed into function that will read 'access_size'
728 * bytes from that pointer, make sure that it's within stack boundary
729 * and all elements of stack are initialized
730 */
731static int check_stack_boundary(struct verifier_env *env,
732 int regno, int access_size)
733{
734 struct verifier_state *state = &env->cur_state;
735 struct reg_state *regs = state->regs;
736 int off, i;
737
738 if (regs[regno].type != PTR_TO_STACK)
739 return -EACCES;
740
741 off = regs[regno].imm;
742 if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
743 access_size <= 0) {
744 verbose("invalid stack type R%d off=%d access_size=%d\n",
745 regno, off, access_size);
746 return -EACCES;
747 }
748
749 for (i = 0; i < access_size; i++) {
750 if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
751 verbose("invalid indirect read from stack off %d+%d size %d\n",
752 off, i, access_size);
753 return -EACCES;
754 }
755 }
756 return 0;
757}
758
759static int check_func_arg(struct verifier_env *env, u32 regno,
760 enum bpf_arg_type arg_type, struct bpf_map **mapp)
761{
762 struct reg_state *reg = env->cur_state.regs + regno;
763 enum bpf_reg_type expected_type;
764 int err = 0;
765
766 if (arg_type == ARG_ANYTHING)
767 return 0;
768
769 if (reg->type == NOT_INIT) {
770 verbose("R%d !read_ok\n", regno);
771 return -EACCES;
772 }
773
774 if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
775 arg_type == ARG_PTR_TO_MAP_VALUE) {
776 expected_type = PTR_TO_STACK;
777 } else if (arg_type == ARG_CONST_STACK_SIZE) {
778 expected_type = CONST_IMM;
779 } else if (arg_type == ARG_CONST_MAP_PTR) {
780 expected_type = CONST_PTR_TO_MAP;
781 } else {
782 verbose("unsupported arg_type %d\n", arg_type);
783 return -EFAULT;
784 }
785
786 if (reg->type != expected_type) {
787 verbose("R%d type=%s expected=%s\n", regno,
788 reg_type_str[reg->type], reg_type_str[expected_type]);
789 return -EACCES;
790 }
791
792 if (arg_type == ARG_CONST_MAP_PTR) {
793 /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
794 *mapp = reg->map_ptr;
795
796 } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
797 /* bpf_map_xxx(..., map_ptr, ..., key) call:
798 * check that [key, key + map->key_size) are within
799 * stack limits and initialized
800 */
801 if (!*mapp) {
802 /* in function declaration map_ptr must come before
803 * map_key, so that it's verified and known before
804 * we have to check map_key here. Otherwise it means
805 * that kernel subsystem misconfigured verifier
806 */
807 verbose("invalid map_ptr to access map->key\n");
808 return -EACCES;
809 }
810 err = check_stack_boundary(env, regno, (*mapp)->key_size);
811
812 } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
813 /* bpf_map_xxx(..., map_ptr, ..., value) call:
814 * check [value, value + map->value_size) validity
815 */
816 if (!*mapp) {
817 /* kernel subsystem misconfigured verifier */
818 verbose("invalid map_ptr to access map->value\n");
819 return -EACCES;
820 }
821 err = check_stack_boundary(env, regno, (*mapp)->value_size);
822
823 } else if (arg_type == ARG_CONST_STACK_SIZE) {
824 /* bpf_xxx(..., buf, len) call will access 'len' bytes
825 * from stack pointer 'buf'. Check it
826 * note: regno == len, regno - 1 == buf
827 */
828 if (regno == 0) {
829 /* kernel subsystem misconfigured verifier */
830 verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
831 return -EACCES;
832 }
833 err = check_stack_boundary(env, regno - 1, reg->imm);
834 }
835
836 return err;
837}
838
839static int check_call(struct verifier_env *env, int func_id)
840{
841 struct verifier_state *state = &env->cur_state;
842 const struct bpf_func_proto *fn = NULL;
843 struct reg_state *regs = state->regs;
844 struct bpf_map *map = NULL;
845 struct reg_state *reg;
846 int i, err;
847
848 /* find function prototype */
849 if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
850 verbose("invalid func %d\n", func_id);
851 return -EINVAL;
852 }
853
854 if (env->prog->aux->ops->get_func_proto)
855 fn = env->prog->aux->ops->get_func_proto(func_id);
856
857 if (!fn) {
858 verbose("unknown func %d\n", func_id);
859 return -EINVAL;
860 }
861
862 /* eBPF programs must be GPL compatible to use GPL-ed functions */
863 if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
864 verbose("cannot call GPL only function from proprietary program\n");
865 return -EINVAL;
866 }
867
868 /* check args */
869 err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
870 if (err)
871 return err;
872 err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
873 if (err)
874 return err;
875 err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
876 if (err)
877 return err;
878 err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
879 if (err)
880 return err;
881 err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
882 if (err)
883 return err;
884
885 /* reset caller saved regs */
886 for (i = 0; i < CALLER_SAVED_REGS; i++) {
887 reg = regs + caller_saved[i];
888 reg->type = NOT_INIT;
889 reg->imm = 0;
890 }
891
892 /* update return register */
893 if (fn->ret_type == RET_INTEGER) {
894 regs[BPF_REG_0].type = UNKNOWN_VALUE;
895 } else if (fn->ret_type == RET_VOID) {
896 regs[BPF_REG_0].type = NOT_INIT;
897 } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
898 regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
899 /* remember map_ptr, so that check_map_access()
900 * can check 'value_size' boundary of memory access
901 * to map element returned from bpf_map_lookup_elem()
902 */
903 if (map == NULL) {
904 verbose("kernel subsystem misconfigured verifier\n");
905 return -EINVAL;
906 }
907 regs[BPF_REG_0].map_ptr = map;
908 } else {
909 verbose("unknown return type %d of func %d\n",
910 fn->ret_type, func_id);
911 return -EINVAL;
912 }
913 return 0;
914}
915
916/* check validity of 32-bit and 64-bit arithmetic operations */
917static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
918{
919 u8 opcode = BPF_OP(insn->code);
920 int err;
921
922 if (opcode == BPF_END || opcode == BPF_NEG) {
923 if (opcode == BPF_NEG) {
924 if (BPF_SRC(insn->code) != 0 ||
925 insn->src_reg != BPF_REG_0 ||
926 insn->off != 0 || insn->imm != 0) {
927 verbose("BPF_NEG uses reserved fields\n");
928 return -EINVAL;
929 }
930 } else {
931 if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
932 (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
933 verbose("BPF_END uses reserved fields\n");
934 return -EINVAL;
935 }
936 }
937
938 /* check src operand */
939 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
940 if (err)
941 return err;
942
943 /* check dest operand */
944 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
945 if (err)
946 return err;
947
948 } else if (opcode == BPF_MOV) {
949
950 if (BPF_SRC(insn->code) == BPF_X) {
951 if (insn->imm != 0 || insn->off != 0) {
952 verbose("BPF_MOV uses reserved fields\n");
953 return -EINVAL;
954 }
955
956 /* check src operand */
957 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
958 if (err)
959 return err;
960 } else {
961 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
962 verbose("BPF_MOV uses reserved fields\n");
963 return -EINVAL;
964 }
965 }
966
967 /* check dest operand */
968 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
969 if (err)
970 return err;
971
972 if (BPF_SRC(insn->code) == BPF_X) {
973 if (BPF_CLASS(insn->code) == BPF_ALU64) {
974 /* case: R1 = R2
975 * copy register state to dest reg
976 */
977 regs[insn->dst_reg] = regs[insn->src_reg];
978 } else {
979 regs[insn->dst_reg].type = UNKNOWN_VALUE;
980 regs[insn->dst_reg].map_ptr = NULL;
981 }
982 } else {
983 /* case: R = imm
984 * remember the value we stored into this reg
985 */
986 regs[insn->dst_reg].type = CONST_IMM;
987 regs[insn->dst_reg].imm = insn->imm;
988 }
989
990 } else if (opcode > BPF_END) {
991 verbose("invalid BPF_ALU opcode %x\n", opcode);
992 return -EINVAL;
993
994 } else { /* all other ALU ops: and, sub, xor, add, ... */
995
996 bool stack_relative = false;
997
998 if (BPF_SRC(insn->code) == BPF_X) {
999 if (insn->imm != 0 || insn->off != 0) {
1000 verbose("BPF_ALU uses reserved fields\n");
1001 return -EINVAL;
1002 }
1003 /* check src1 operand */
1004 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1005 if (err)
1006 return err;
1007 } else {
1008 if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
1009 verbose("BPF_ALU uses reserved fields\n");
1010 return -EINVAL;
1011 }
1012 }
1013
1014 /* check src2 operand */
1015 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1016 if (err)
1017 return err;
1018
1019 if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
1020 BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
1021 verbose("div by zero\n");
1022 return -EINVAL;
1023 }
1024
1025 /* pattern match 'bpf_add Rx, imm' instruction */
1026 if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
1027 regs[insn->dst_reg].type == FRAME_PTR &&
1028 BPF_SRC(insn->code) == BPF_K)
1029 stack_relative = true;
1030
1031 /* check dest operand */
1032 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1033 if (err)
1034 return err;
1035
1036 if (stack_relative) {
1037 regs[insn->dst_reg].type = PTR_TO_STACK;
1038 regs[insn->dst_reg].imm = insn->imm;
1039 }
1040 }
1041
1042 return 0;
1043}
1044
1045static int check_cond_jmp_op(struct verifier_env *env,
1046 struct bpf_insn *insn, int *insn_idx)
1047{
1048 struct reg_state *regs = env->cur_state.regs;
1049 struct verifier_state *other_branch;
1050 u8 opcode = BPF_OP(insn->code);
1051 int err;
1052
1053 if (opcode > BPF_EXIT) {
1054 verbose("invalid BPF_JMP opcode %x\n", opcode);
1055 return -EINVAL;
1056 }
1057
1058 if (BPF_SRC(insn->code) == BPF_X) {
1059 if (insn->imm != 0) {
1060 verbose("BPF_JMP uses reserved fields\n");
1061 return -EINVAL;
1062 }
1063
1064 /* check src1 operand */
1065 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1066 if (err)
1067 return err;
1068 } else {
1069 if (insn->src_reg != BPF_REG_0) {
1070 verbose("BPF_JMP uses reserved fields\n");
1071 return -EINVAL;
1072 }
1073 }
1074
1075 /* check src2 operand */
1076 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1077 if (err)
1078 return err;
1079
1080 /* detect if R == 0 where R was initialized to zero earlier */
1081 if (BPF_SRC(insn->code) == BPF_K &&
1082 (opcode == BPF_JEQ || opcode == BPF_JNE) &&
1083 regs[insn->dst_reg].type == CONST_IMM &&
1084 regs[insn->dst_reg].imm == insn->imm) {
1085 if (opcode == BPF_JEQ) {
1086 /* if (imm == imm) goto pc+off;
1087 * only follow the goto, ignore fall-through
1088 */
1089 *insn_idx += insn->off;
1090 return 0;
1091 } else {
1092 /* if (imm != imm) goto pc+off;
1093 * only follow fall-through branch, since
1094 * that's where the program will go
1095 */
1096 return 0;
1097 }
1098 }
1099
1100 other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
1101 if (!other_branch)
1102 return -EFAULT;
1103
1104 /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
1105 if (BPF_SRC(insn->code) == BPF_K &&
1106 insn->imm == 0 && (opcode == BPF_JEQ ||
1107 opcode == BPF_JNE) &&
1108 regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
1109 if (opcode == BPF_JEQ) {
1110 /* next fallthrough insn can access memory via
1111 * this register
1112 */
1113 regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1114			 * branch target cannot access it, since reg == 0
1115 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1116 other_branch->regs[insn->dst_reg].imm = 0;
1117 } else {
1118 other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
1119 regs[insn->dst_reg].type = CONST_IMM;
1120 regs[insn->dst_reg].imm = 0;
1121 }
1122 } else if (BPF_SRC(insn->code) == BPF_K &&
1123 (opcode == BPF_JEQ || opcode == BPF_JNE)) {
1124
1125 if (opcode == BPF_JEQ) {
1126 /* detect if (R == imm) goto
1127 * and in the target state recognize that R = imm
1128 */
1129 other_branch->regs[insn->dst_reg].type = CONST_IMM;
1130 other_branch->regs[insn->dst_reg].imm = insn->imm;
1131 } else {
1132 /* detect if (R != imm) goto
1133 * and in the fall-through state recognize that R = imm
1134 */
1135 regs[insn->dst_reg].type = CONST_IMM;
1136 regs[insn->dst_reg].imm = insn->imm;
1137 }
1138 }
1139 if (log_level)
1140 print_verifier_state(env);
1141 return 0;
1142}
1143
1144/* return the map pointer stored inside BPF_LD_IMM64 instruction */
1145static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
1146{
1147 u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
1148
1149 return (struct bpf_map *) (unsigned long) imm64;
1150}
1151
1152/* verify BPF_LD_IMM64 instruction */
1153static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
1154{
1155 struct reg_state *regs = env->cur_state.regs;
1156 int err;
1157
1158 if (BPF_SIZE(insn->code) != BPF_DW) {
1159 verbose("invalid BPF_LD_IMM insn\n");
1160 return -EINVAL;
1161 }
1162 if (insn->off != 0) {
1163 verbose("BPF_LD_IMM64 uses reserved fields\n");
1164 return -EINVAL;
1165 }
1166
1167 err = check_reg_arg(regs, insn->dst_reg, DST_OP);
1168 if (err)
1169 return err;
1170
1171 if (insn->src_reg == 0)
1172 /* generic move 64-bit immediate into a register */
1173 return 0;
1174
1175 /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
1176 BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
1177
1178 regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
1179 regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
1180 return 0;
1181}
1182
1183/* non-recursive DFS pseudo code
1184 * 1 procedure DFS-iterative(G,v):
1185 * 2 label v as discovered
1186 * 3 let S be a stack
1187 * 4 S.push(v)
1188 * 5 while S is not empty
1189 * 6 t <- S.pop()
1190 * 7 if t is what we're looking for:
1191 * 8 return t
1192 * 9 for all edges e in G.adjacentEdges(t) do
1193 * 10 if edge e is already labelled
1194 * 11 continue with the next edge
1195 * 12 w <- G.adjacentVertex(t,e)
1196 * 13 if vertex w is not discovered and not explored
1197 * 14 label e as tree-edge
1198 * 15 label w as discovered
1199 * 16 S.push(w)
1200 * 17 continue at 5
1201 * 18 else if vertex w is discovered
1202 * 19 label e as back-edge
1203 * 20 else
1204 * 21 // vertex w is explored
1205 * 22 label e as forward- or cross-edge
1206 * 23 label t as explored
1207 * 24 S.pop()
1208 *
1209 * convention:
1210 * 0x10 - discovered
1211 * 0x11 - discovered and fall-through edge labelled
1212 * 0x12 - discovered and fall-through and branch edges labelled
1213 * 0x20 - explored
1214 */
1215
1216enum {
1217 DISCOVERED = 0x10,
1218 EXPLORED = 0x20,
1219 FALLTHROUGH = 1,
1220 BRANCH = 2,
1221};
1222
1223#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
1224
1225static int *insn_stack; /* stack of insns to process */
1226static int cur_stack; /* current stack index */
1227static int *insn_state;
1228
1229/* t, w, e - match pseudo-code above:
1230 * t - index of current instruction
1231 * w - next instruction
1232 * e - edge
1233 */
1234static int push_insn(int t, int w, int e, struct verifier_env *env)
1235{
1236 if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
1237 return 0;
1238
1239 if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
1240 return 0;
1241
1242 if (w < 0 || w >= env->prog->len) {
1243 verbose("jump out of range from insn %d to %d\n", t, w);
1244 return -EINVAL;
1245 }
1246
1247 if (e == BRANCH)
1248 /* mark branch target for state pruning */
1249 env->explored_states[w] = STATE_LIST_MARK;
1250
1251 if (insn_state[w] == 0) {
1252 /* tree-edge */
1253 insn_state[t] = DISCOVERED | e;
1254 insn_state[w] = DISCOVERED;
1255 if (cur_stack >= env->prog->len)
1256 return -E2BIG;
1257 insn_stack[cur_stack++] = w;
1258 return 1;
1259 } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
1260 verbose("back-edge from insn %d to %d\n", t, w);
1261 return -EINVAL;
1262 } else if (insn_state[w] == EXPLORED) {
1263 /* forward- or cross-edge */
1264 insn_state[t] = DISCOVERED | e;
1265 } else {
1266 verbose("insn state internal bug\n");
1267 return -EFAULT;
1268 }
1269 return 0;
1270}
1271
1272/* non-recursive depth-first-search to detect loops in BPF program
1273 * loop == back-edge in directed graph
1274 */
1275static int check_cfg(struct verifier_env *env)
1276{
1277 struct bpf_insn *insns = env->prog->insnsi;
1278 int insn_cnt = env->prog->len;
1279 int ret = 0;
1280 int i, t;
1281
1282 insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1283 if (!insn_state)
1284 return -ENOMEM;
1285
1286 insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
1287 if (!insn_stack) {
1288 kfree(insn_state);
1289 return -ENOMEM;
1290 }
1291
1292 insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
1293 insn_stack[0] = 0; /* 0 is the first instruction */
1294 cur_stack = 1;
1295
1296peek_stack:
1297 if (cur_stack == 0)
1298 goto check_state;
1299 t = insn_stack[cur_stack - 1];
1300
1301 if (BPF_CLASS(insns[t].code) == BPF_JMP) {
1302 u8 opcode = BPF_OP(insns[t].code);
1303
1304 if (opcode == BPF_EXIT) {
1305 goto mark_explored;
1306 } else if (opcode == BPF_CALL) {
1307 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1308 if (ret == 1)
1309 goto peek_stack;
1310 else if (ret < 0)
1311 goto err_free;
1312 } else if (opcode == BPF_JA) {
1313 if (BPF_SRC(insns[t].code) != BPF_K) {
1314 ret = -EINVAL;
1315 goto err_free;
1316 }
1317 /* unconditional jump with single edge */
1318 ret = push_insn(t, t + insns[t].off + 1,
1319 FALLTHROUGH, env);
1320 if (ret == 1)
1321 goto peek_stack;
1322 else if (ret < 0)
1323 goto err_free;
1324 /* tell verifier to check for equivalent states
1325 * after every call and jump
1326 */
1327 env->explored_states[t + 1] = STATE_LIST_MARK;
1328 } else {
1329 /* conditional jump with two edges */
1330 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1331 if (ret == 1)
1332 goto peek_stack;
1333 else if (ret < 0)
1334 goto err_free;
1335
1336 ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
1337 if (ret == 1)
1338 goto peek_stack;
1339 else if (ret < 0)
1340 goto err_free;
1341 }
1342 } else {
1343 /* all other non-branch instructions with single
1344 * fall-through edge
1345 */
1346 ret = push_insn(t, t + 1, FALLTHROUGH, env);
1347 if (ret == 1)
1348 goto peek_stack;
1349 else if (ret < 0)
1350 goto err_free;
1351 }
1352
1353mark_explored:
1354 insn_state[t] = EXPLORED;
1355 if (cur_stack-- <= 0) {
1356 verbose("pop stack internal bug\n");
1357 ret = -EFAULT;
1358 goto err_free;
1359 }
1360 goto peek_stack;
1361
1362check_state:
1363 for (i = 0; i < insn_cnt; i++) {
1364 if (insn_state[i] != EXPLORED) {
1365 verbose("unreachable insn %d\n", i);
1366 ret = -EINVAL;
1367 goto err_free;
1368 }
1369 }
1370 ret = 0; /* cfg looks good */
1371
1372err_free:
1373 kfree(insn_state);
1374 kfree(insn_stack);
1375 return ret;
1376}
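
The same colouring scheme can be exercised outside the kernel. Below is an editor-added toy, not part of this file: an iterative DFS over a four-node graph containing one loop (3 -> 1), using the DISCOVERED/EXPLORED idea from the pseudo-code above in simplified form; it reports the back-edge just as check_cfg() would reject it. Names prefixed 'toy_' are invented for the sketch.

#include <stdio.h>

#define TOY_N 4

static const int toy_edges[TOY_N][2] = {	/* up to two successors per node, -1 = none */
	{ 1, -1 },	/* 0 -> 1           */
	{ 2, -1 },	/* 1 -> 2           */
	{ 3, -1 },	/* 2 -> 3           */
	{ 1, -1 },	/* 3 -> 1  (loop!)  */
};

int main(void)
{
	int state[TOY_N] = { 0 };	/* 0 = unseen, 1 = discovered (on stack), 2 = explored */
	int stack[TOY_N], top = 0;
	int t, w, i, pushed;

	state[0] = 1;
	stack[top++] = 0;
	while (top) {
		t = stack[top - 1];
		pushed = 0;
		for (i = 0; i < 2 && toy_edges[t][i] >= 0; i++) {
			w = toy_edges[t][i];
			if (state[w] == 1) {	/* discovered == still on the stack == ancestor */
				printf("back-edge %d -> %d: loop rejected\n", t, w);
				return 1;
			}
			if (state[w] == 0) {	/* tree-edge: descend */
				state[w] = 1;
				stack[top++] = w;
				pushed = 1;
				break;
			}
			/* state[w] == 2: forward- or cross-edge, nothing to do */
		}
		if (!pushed) {
			state[t] = 2;	/* explored */
			top--;
		}
	}
	printf("no back-edges: CFG is loop-free\n");
	return 0;
}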
1377
1378/* compare two verifier states
1379 *
1380 * all states stored in state_list are known to be valid, since
1381 * verifier reached 'bpf_exit' instruction through them
1382 *
1383 * this function is called when the verifier explores different branches of
1384 * execution popped from the state stack. If it sees an old state that has
1385 * a more strict register state and a more strict stack state, then this execution
1386 * branch doesn't need to be explored further, since the verifier already
1387 * concluded that the more strict state leads to a valid finish.
1388 *
1389 * Therefore two states are equivalent if register state is more conservative
1390 * and explored stack state is more conservative than the current one.
1391 * Example:
1392 * explored current
1393 * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
1394 * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
1395 *
1396 * In other words, if the current stack state (the one being explored) has more
1397 * valid slots than the old one that already passed validation, it means
1398 * the verifier can stop exploring and conclude that the current state is valid too.
1399 *
1400 * Similarly with registers. If the explored state has an invalid register type
1401 * whereas the register type in the current state is meaningful, it means that
1402 * the current state will reach the 'bpf_exit' instruction safely.
1403 */
1404static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
1405{
1406 int i;
1407
1408 for (i = 0; i < MAX_BPF_REG; i++) {
1409 if (memcmp(&old->regs[i], &cur->regs[i],
1410 sizeof(old->regs[0])) != 0) {
1411 if (old->regs[i].type == NOT_INIT ||
1412 (old->regs[i].type == UNKNOWN_VALUE &&
1413 cur->regs[i].type != NOT_INIT))
1414 continue;
1415 return false;
1416 }
1417 }
1418
1419 for (i = 0; i < MAX_BPF_STACK; i++) {
1420 if (memcmp(&old->stack[i], &cur->stack[i],
1421 sizeof(old->stack[0])) != 0) {
1422 if (old->stack[i].stype == STACK_INVALID)
1423 continue;
1424 return false;
1425 }
1426 }
1427 return true;
1428}
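
To make the asymmetry of this check concrete, here is an editor-added sketch reduced to a single stack slot: an explored STACK_INVALID slot is "more conservative" and therefore matches any current slot, while the reverse comparison fails. The toy_* enum only loosely mirrors the real slot types.

#include <assert.h>
#include <stdbool.h>

enum toy_stype { TOY_STACK_INVALID, TOY_STACK_MISC, TOY_STACK_SPILL };

/* "explored is at least as conservative as current" for one slot */
static bool toy_slot_equal(enum toy_stype explored, enum toy_stype cur)
{
	return explored == cur || explored == TOY_STACK_INVALID;
}

int main(void)
{
	/* (slot=INV) explored  vs (slot=MISC) current -> prune the search */
	assert(toy_slot_equal(TOY_STACK_INVALID, TOY_STACK_MISC));
	/* (slot=MISC) explored vs (slot=INV) current  -> keep exploring */
	assert(!toy_slot_equal(TOY_STACK_MISC, TOY_STACK_INVALID));
	return 0;
}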
1429
1430static int is_state_visited(struct verifier_env *env, int insn_idx)
1431{
1432 struct verifier_state_list *new_sl;
1433 struct verifier_state_list *sl;
1434
1435 sl = env->explored_states[insn_idx];
1436 if (!sl)
1437 /* this 'insn_idx' instruction wasn't marked, so we will not
1438 * be doing state search here
1439 */
1440 return 0;
1441
1442 while (sl != STATE_LIST_MARK) {
1443 if (states_equal(&sl->state, &env->cur_state))
1444 /* reached equivalent register/stack state,
1445 * prune the search
1446 */
1447 return 1;
1448 sl = sl->next;
1449 }
1450
1451 /* there were no equivalent states, remember current one.
1452 * technically the current state is not proven to be safe yet,
1453 * but it will either reach bpf_exit (which means it's safe) or
1454 * it will be rejected. Since there are no loops, we won't be
1455 * seeing this 'insn_idx' instruction again on the way to bpf_exit
1456 */
1457 new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
1458 if (!new_sl)
1459 return -ENOMEM;
1460
1461 /* add new state to the head of linked list */
1462 memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
1463 new_sl->next = env->explored_states[insn_idx];
1464 env->explored_states[insn_idx] = new_sl;
1465 return 0;
1466}
1467
1468static int do_check(struct verifier_env *env)
1469{
1470 struct verifier_state *state = &env->cur_state;
1471 struct bpf_insn *insns = env->prog->insnsi;
1472 struct reg_state *regs = state->regs;
1473 int insn_cnt = env->prog->len;
1474 int insn_idx, prev_insn_idx = 0;
1475 int insn_processed = 0;
1476 bool do_print_state = false;
1477
1478 init_reg_state(regs);
1479 insn_idx = 0;
1480 for (;;) {
1481 struct bpf_insn *insn;
1482 u8 class;
1483 int err;
1484
1485 if (insn_idx >= insn_cnt) {
1486 verbose("invalid insn idx %d insn_cnt %d\n",
1487 insn_idx, insn_cnt);
1488 return -EFAULT;
1489 }
1490
1491 insn = &insns[insn_idx];
1492 class = BPF_CLASS(insn->code);
1493
1494 if (++insn_processed > 32768) {
1495 verbose("BPF program is too large. Processed %d insn\n",
1496 insn_processed);
1497 return -E2BIG;
1498 }
1499
1500 err = is_state_visited(env, insn_idx);
1501 if (err < 0)
1502 return err;
1503 if (err == 1) {
1504 /* found equivalent state, can prune the search */
1505 if (log_level) {
1506 if (do_print_state)
1507 verbose("\nfrom %d to %d: safe\n",
1508 prev_insn_idx, insn_idx);
1509 else
1510 verbose("%d: safe\n", insn_idx);
1511 }
1512 goto process_bpf_exit;
1513 }
1514
1515 if (log_level && do_print_state) {
1516 verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
1517 print_verifier_state(env);
1518 do_print_state = false;
1519 }
1520
1521 if (log_level) {
1522 verbose("%d: ", insn_idx);
1523 print_bpf_insn(insn);
1524 }
1525
1526 if (class == BPF_ALU || class == BPF_ALU64) {
1527 err = check_alu_op(regs, insn);
1528 if (err)
1529 return err;
1530
1531 } else if (class == BPF_LDX) {
1532 if (BPF_MODE(insn->code) != BPF_MEM ||
1533 insn->imm != 0) {
1534 verbose("BPF_LDX uses reserved fields\n");
1535 return -EINVAL;
1536 }
1537 /* check src operand */
1538 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1539 if (err)
1540 return err;
1541
1542 err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
1543 if (err)
1544 return err;
1545
1546 /* check that memory (src_reg + off) is readable,
1547 * the state of dst_reg will be updated by this func
1548 */
1549 err = check_mem_access(env, insn->src_reg, insn->off,
1550 BPF_SIZE(insn->code), BPF_READ,
1551 insn->dst_reg);
1552 if (err)
1553 return err;
1554
1555 } else if (class == BPF_STX) {
1556 if (BPF_MODE(insn->code) == BPF_XADD) {
1557 err = check_xadd(env, insn);
1558 if (err)
1559 return err;
1560 insn_idx++;
1561 continue;
1562 }
1563
1564 if (BPF_MODE(insn->code) != BPF_MEM ||
1565 insn->imm != 0) {
1566 verbose("BPF_STX uses reserved fields\n");
1567 return -EINVAL;
1568 }
1569 /* check src1 operand */
1570 err = check_reg_arg(regs, insn->src_reg, SRC_OP);
1571 if (err)
1572 return err;
1573 /* check src2 operand */
1574 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1575 if (err)
1576 return err;
1577
1578 /* check that memory (dst_reg + off) is writeable */
1579 err = check_mem_access(env, insn->dst_reg, insn->off,
1580 BPF_SIZE(insn->code), BPF_WRITE,
1581 insn->src_reg);
1582 if (err)
1583 return err;
1584
1585 } else if (class == BPF_ST) {
1586 if (BPF_MODE(insn->code) != BPF_MEM ||
1587 insn->src_reg != BPF_REG_0) {
1588 verbose("BPF_ST uses reserved fields\n");
1589 return -EINVAL;
1590 }
1591 /* check src operand */
1592 err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
1593 if (err)
1594 return err;
1595
1596 /* check that memory (dst_reg + off) is writeable */
1597 err = check_mem_access(env, insn->dst_reg, insn->off,
1598 BPF_SIZE(insn->code), BPF_WRITE,
1599 -1);
1600 if (err)
1601 return err;
1602
1603 } else if (class == BPF_JMP) {
1604 u8 opcode = BPF_OP(insn->code);
1605
1606 if (opcode == BPF_CALL) {
1607 if (BPF_SRC(insn->code) != BPF_K ||
1608 insn->off != 0 ||
1609 insn->src_reg != BPF_REG_0 ||
1610 insn->dst_reg != BPF_REG_0) {
1611 verbose("BPF_CALL uses reserved fields\n");
1612 return -EINVAL;
1613 }
1614
1615 err = check_call(env, insn->imm);
1616 if (err)
1617 return err;
1618
1619 } else if (opcode == BPF_JA) {
1620 if (BPF_SRC(insn->code) != BPF_K ||
1621 insn->imm != 0 ||
1622 insn->src_reg != BPF_REG_0 ||
1623 insn->dst_reg != BPF_REG_0) {
1624 verbose("BPF_JA uses reserved fields\n");
1625 return -EINVAL;
1626 }
1627
1628 insn_idx += insn->off + 1;
1629 continue;
1630
1631 } else if (opcode == BPF_EXIT) {
1632 if (BPF_SRC(insn->code) != BPF_K ||
1633 insn->imm != 0 ||
1634 insn->src_reg != BPF_REG_0 ||
1635 insn->dst_reg != BPF_REG_0) {
1636 verbose("BPF_EXIT uses reserved fields\n");
1637 return -EINVAL;
1638 }
1639
1640 /* eBPF calling convention is such that R0 is used
1641 * to return the value from eBPF program.
1642 * Make sure that it's readable at this time
1643 * of bpf_exit, which means that program wrote
1644 * something into it earlier
1645 */
1646 err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
1647 if (err)
1648 return err;
1649
1650process_bpf_exit:
1651 insn_idx = pop_stack(env, &prev_insn_idx);
1652 if (insn_idx < 0) {
1653 break;
1654 } else {
1655 do_print_state = true;
1656 continue;
1657 }
1658 } else {
1659 err = check_cond_jmp_op(env, insn, &insn_idx);
1660 if (err)
1661 return err;
1662 }
1663 } else if (class == BPF_LD) {
1664 u8 mode = BPF_MODE(insn->code);
1665
1666 if (mode == BPF_ABS || mode == BPF_IND) {
1667 verbose("LD_ABS is not supported yet\n");
1668 return -EINVAL;
1669 } else if (mode == BPF_IMM) {
1670 err = check_ld_imm(env, insn);
1671 if (err)
1672 return err;
1673
1674 insn_idx++;
1675 } else {
1676 verbose("invalid BPF_LD mode\n");
1677 return -EINVAL;
1678 }
1679 } else {
1680 verbose("unknown insn class %d\n", class);
1681 return -EINVAL;
1682 }
1683
1684 insn_idx++;
1685 }
1686
1687 return 0;
1688}
1689
1690/* look for pseudo eBPF instructions that access map FDs and
1691 * replace them with actual map pointers
1692 */
1693static int replace_map_fd_with_map_ptr(struct verifier_env *env)
1694{
1695 struct bpf_insn *insn = env->prog->insnsi;
1696 int insn_cnt = env->prog->len;
1697 int i, j;
1698
1699 for (i = 0; i < insn_cnt; i++, insn++) {
1700 if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
1701 struct bpf_map *map;
1702 struct fd f;
1703
1704 if (i == insn_cnt - 1 || insn[1].code != 0 ||
1705 insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
1706 insn[1].off != 0) {
1707 verbose("invalid bpf_ld_imm64 insn\n");
1708 return -EINVAL;
1709 }
1710
1711 if (insn->src_reg == 0)
1712 /* valid generic load 64-bit imm */
1713 goto next_insn;
1714
1715 if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
1716 verbose("unrecognized bpf_ld_imm64 insn\n");
1717 return -EINVAL;
1718 }
1719
1720 f = fdget(insn->imm);
1721
1722 map = bpf_map_get(f);
1723 if (IS_ERR(map)) {
1724 verbose("fd %d is not pointing to valid bpf_map\n",
1725 insn->imm);
1726 fdput(f);
1727 return PTR_ERR(map);
1728 }
1729
1730 /* store map pointer inside BPF_LD_IMM64 instruction */
1731 insn[0].imm = (u32) (unsigned long) map;
1732 insn[1].imm = ((u64) (unsigned long) map) >> 32;
1733
1734 /* check whether we recorded this map already */
1735 for (j = 0; j < env->used_map_cnt; j++)
1736 if (env->used_maps[j] == map) {
1737 fdput(f);
1738 goto next_insn;
1739 }
1740
1741 if (env->used_map_cnt >= MAX_USED_MAPS) {
1742 fdput(f);
1743 return -E2BIG;
1744 }
1745
1746 /* remember this map */
1747 env->used_maps[env->used_map_cnt++] = map;
1748
1749 /* hold the map. If the program is rejected by verifier,
1750 * the map will be released by release_maps() or it
1751 * will be used by the valid program until it's unloaded
1752 * and all maps are released in free_bpf_prog_info()
1753 */
1754 atomic_inc(&map->refcnt);
1755
1756 fdput(f);
1757next_insn:
1758 insn++;
1759 i++;
1760 }
1761 }
1762
1763 /* now all pseudo BPF_LD_IMM64 instructions load valid
1764 * 'struct bpf_map *' into a register instead of user map_fd.
1765 * These pointers will be used later by verifier to validate map access.
1766 */
1767 return 0;
1768}
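
For completeness, a hedged sketch (editor's addition) of the two-instruction pseudo ld_imm64 that a user-space loader emits and that the loop above rewrites: the first half carries BPF_PSEUDO_MAP_FD in src_reg and the map fd in imm, the second half must be all zeroes (its imm is the high 32 bits, which are zero for an fd). Raw struct bpf_insn field initializers per uapi/linux/bpf.h; BPF_REG_1 as the destination is an arbitrary choice.

#include <linux/bpf.h>	/* struct bpf_insn, BPF_PSEUDO_MAP_FD, BPF_REG_* */

static void emit_pseudo_ld_map_fd(struct bpf_insn insn[2], int map_fd)
{
	insn[0] = (struct bpf_insn) {
		.code    = BPF_LD | BPF_DW | BPF_IMM,
		.dst_reg = BPF_REG_1,
		.src_reg = BPF_PSEUDO_MAP_FD,	/* imm holds a map fd, not a plain constant */
		.off     = 0,
		.imm     = map_fd,		/* low 32 bits of the immediate */
	};
	insn[1] = (struct bpf_insn) { 0 };	/* second half: code/regs/off/imm must be 0 */
}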
1769
1770/* drop refcnt of maps used by the rejected program */
1771static void release_maps(struct verifier_env *env)
1772{
1773 int i;
1774
1775 for (i = 0; i < env->used_map_cnt; i++)
1776 bpf_map_put(env->used_maps[i]);
1777}
1778
1779/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
1780static void convert_pseudo_ld_imm64(struct verifier_env *env)
1781{
1782 struct bpf_insn *insn = env->prog->insnsi;
1783 int insn_cnt = env->prog->len;
1784 int i;
1785
1786 for (i = 0; i < insn_cnt; i++, insn++)
1787 if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
1788 insn->src_reg = 0;
1789}
1790
1791static void free_states(struct verifier_env *env)
1792{
1793 struct verifier_state_list *sl, *sln;
1794 int i;
1795
1796 if (!env->explored_states)
1797 return;
1798
1799 for (i = 0; i < env->prog->len; i++) {
1800 sl = env->explored_states[i];
1801
1802 if (sl)
1803 while (sl != STATE_LIST_MARK) {
1804 sln = sl->next;
1805 kfree(sl);
1806 sl = sln;
1807 }
1808 }
1809
1810 kfree(env->explored_states);
1811}
1812
1813int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
1814{
1815 char __user *log_ubuf = NULL;
1816 struct verifier_env *env;
1817 int ret = -EINVAL;
1818
1819 if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
1820 return -E2BIG;
1821
1822 /* 'struct verifier_env' can be global, but since it's not small,
1823 * allocate/free it every time bpf_check() is called
1824 */
1825 env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
1826 if (!env)
1827 return -ENOMEM;
1828
1829 env->prog = prog;
1830
1831 /* grab the mutex to protect a few globals used by the verifier */
1832 mutex_lock(&bpf_verifier_lock);
1833
1834 if (attr->log_level || attr->log_buf || attr->log_size) {
1835 /* user requested verbose verifier output
1836 * and supplied buffer to store the verification trace
1837 */
1838 log_level = attr->log_level;
1839 log_ubuf = (char __user *) (unsigned long) attr->log_buf;
1840 log_size = attr->log_size;
1841 log_len = 0;
1842
1843 ret = -EINVAL;
1844 /* log_* values have to be sane */
1845 if (log_size < 128 || log_size > UINT_MAX >> 8 ||
1846 log_level == 0 || log_ubuf == NULL)
1847 goto free_env;
1848
1849 ret = -ENOMEM;
1850 log_buf = vmalloc(log_size);
1851 if (!log_buf)
1852 goto free_env;
1853 } else {
1854 log_level = 0;
1855 }
1856
1857 ret = replace_map_fd_with_map_ptr(env);
1858 if (ret < 0)
1859 goto skip_full_check;
1860
1861 env->explored_states = kcalloc(prog->len,
1862 sizeof(struct verifier_state_list *),
1863 GFP_USER);
1864 ret = -ENOMEM;
1865 if (!env->explored_states)
1866 goto skip_full_check;
1867
1868 ret = check_cfg(env);
1869 if (ret < 0)
1870 goto skip_full_check;
1871
1872 ret = do_check(env);
1873
1874skip_full_check:
1875 while (pop_stack(env, NULL) >= 0);
1876 free_states(env);
1877
1878 if (log_level && log_len >= log_size - 1) {
1879 BUG_ON(log_len >= log_size);
1880 /* verifier log exceeded user supplied buffer */
1881 ret = -ENOSPC;
1882 /* fall through to return what was recorded */
1883 }
1884
1885 /* copy verifier log back to user space including trailing zero */
1886 if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
1887 ret = -EFAULT;
1888 goto free_log_buf;
1889 }
1890
1891 if (ret == 0 && env->used_map_cnt) {
1892 /* if program passed verifier, update used_maps in bpf_prog_info */
1893 prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
1894 sizeof(env->used_maps[0]),
1895 GFP_KERNEL);
1896
1897 if (!prog->aux->used_maps) {
1898 ret = -ENOMEM;
1899 goto free_log_buf;
1900 }
1901
1902 memcpy(prog->aux->used_maps, env->used_maps,
1903 sizeof(env->used_maps[0]) * env->used_map_cnt);
1904 prog->aux->used_map_cnt = env->used_map_cnt;
1905
1906 /* program is valid. Convert pseudo bpf_ld_imm64 into generic
1907 * bpf_ld_imm64 instructions
1908 */
1909 convert_pseudo_ld_imm64(env);
1910 }
1911
1912free_log_buf:
1913 if (log_level)
1914 vfree(log_buf);
1915free_env:
1916 if (!prog->aux->used_maps)
1917 /* if we didn't copy map pointers into bpf_prog_info, release
1918 * them now. Otherwise free_bpf_prog_info() will release them.
1919 */
1920 release_maps(env);
1921 kfree(env);
1922 mutex_unlock(&bpf_verifier_lock);
1923 return ret;
1924}
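
Rounding off the verifier listing, an editor-added user-space sketch of the log path bpf_check() implements: pass log_level, log_buf and log_size through the BPF_PROG_LOAD attributes and print the trace when loading fails. Field names follow uapi/linux/bpf.h at this point in time; the raw syscall(__NR_bpf, ...) invocation and BPF_PROG_TYPE_UNSPEC as the program type are assumptions (no libbpf wrapper existed yet), so treat this as a sketch rather than a reference loader.

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static char verifier_log[65536];	/* must be >= 128 bytes, see bpf_check() */

static int load_prog(const struct bpf_insn *insns, int insn_cnt)
{
	union bpf_attr attr;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_UNSPEC;		/* handled by test_stub.c in this tree */
	attr.insns     = (unsigned long) insns;
	attr.insn_cnt  = insn_cnt;
	attr.license   = (unsigned long) "GPL";
	attr.log_level = 1;				/* ask for the verification trace */
	attr.log_buf   = (unsigned long) verifier_log;
	attr.log_size  = sizeof(verifier_log);

	fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
	if (fd < 0)
		fprintf(stderr, "verifier said:\n%s\n", verifier_log);
	return fd;
}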
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7dc8788cfd52..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
185static struct cftype cgroup_dfl_base_files[]; 185static struct cftype cgroup_dfl_base_files[];
186static struct cftype cgroup_legacy_base_files[]; 186static struct cftype cgroup_legacy_base_files[];
187 187
188static void cgroup_put(struct cgroup *cgrp);
189static int rebind_subsystems(struct cgroup_root *dst_root, 188static int rebind_subsystems(struct cgroup_root *dst_root,
190 unsigned int ss_mask); 189 unsigned int ss_mask);
191static int cgroup_destroy_locked(struct cgroup *cgrp); 190static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
195static void kill_css(struct cgroup_subsys_state *css); 194static void kill_css(struct cgroup_subsys_state *css);
196static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], 195static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
197 bool is_add); 196 bool is_add);
198static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
199 197
200/* IDR wrappers which synchronize using cgroup_idr_lock */ 198/* IDR wrappers which synchronize using cgroup_idr_lock */
201static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, 199static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
331 return false; 329 return false;
332} 330}
333 331
334static int cgroup_is_releasable(const struct cgroup *cgrp)
335{
336 const int bits =
337 (1 << CGRP_RELEASABLE) |
338 (1 << CGRP_NOTIFY_ON_RELEASE);
339 return (cgrp->flags & bits) == bits;
340}
341
342static int notify_on_release(const struct cgroup *cgrp) 332static int notify_on_release(const struct cgroup *cgrp)
343{ 333{
344 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 334 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp)
394 ; \ 384 ; \
395 else 385 else
396 386
397/* the list of cgroups eligible for automatic release. Protected by
398 * release_list_lock */
399static LIST_HEAD(release_list);
400static DEFINE_RAW_SPINLOCK(release_list_lock);
401static void cgroup_release_agent(struct work_struct *work); 387static void cgroup_release_agent(struct work_struct *work);
402static DECLARE_WORK(release_agent_work, cgroup_release_agent);
403static void check_for_release(struct cgroup *cgrp); 388static void check_for_release(struct cgroup *cgrp);
404 389
405/* 390/*
@@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
498 return key; 483 return key;
499} 484}
500 485
501static void put_css_set_locked(struct css_set *cset, bool taskexit) 486static void put_css_set_locked(struct css_set *cset)
502{ 487{
503 struct cgrp_cset_link *link, *tmp_link; 488 struct cgrp_cset_link *link, *tmp_link;
504 struct cgroup_subsys *ss; 489 struct cgroup_subsys *ss;
@@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
524 /* @cgrp can't go away while we're holding css_set_rwsem */ 509 /* @cgrp can't go away while we're holding css_set_rwsem */
525 if (list_empty(&cgrp->cset_links)) { 510 if (list_empty(&cgrp->cset_links)) {
526 cgroup_update_populated(cgrp, false); 511 cgroup_update_populated(cgrp, false);
527 if (notify_on_release(cgrp)) { 512 check_for_release(cgrp);
528 if (taskexit)
529 set_bit(CGRP_RELEASABLE, &cgrp->flags);
530 check_for_release(cgrp);
531 }
532 } 513 }
533 514
534 kfree(link); 515 kfree(link);
@@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
537 kfree_rcu(cset, rcu_head); 518 kfree_rcu(cset, rcu_head);
538} 519}
539 520
540static void put_css_set(struct css_set *cset, bool taskexit) 521static void put_css_set(struct css_set *cset)
541{ 522{
542 /* 523 /*
543 * Ensure that the refcount doesn't hit zero while any readers 524 * Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
548 return; 529 return;
549 530
550 down_write(&css_set_rwsem); 531 down_write(&css_set_rwsem);
551 put_css_set_locked(cset, taskexit); 532 put_css_set_locked(cset);
552 up_write(&css_set_rwsem); 533 up_write(&css_set_rwsem);
553} 534}
554 535
@@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
969 * knows that the cgroup won't be removed, as cgroup_rmdir() 950 * knows that the cgroup won't be removed, as cgroup_rmdir()
970 * needs that mutex. 951 * needs that mutex.
971 * 952 *
972 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
973 * (usually) take cgroup_mutex. These are the two most performance
974 * critical pieces of code here. The exception occurs on cgroup_exit(),
975 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
976 * is taken, and if the cgroup count is zero, a usermode call made
977 * to the release agent with the name of the cgroup (path relative to
978 * the root of cgroup file system) as the argument.
979 *
980 * A cgroup can only be deleted if both its 'count' of using tasks 953 * A cgroup can only be deleted if both its 'count' of using tasks
981 * is zero, and its list of 'children' cgroups is empty. Since all 954 * is zero, and its list of 'children' cgroups is empty. Since all
982 * tasks in the system use _some_ cgroup, and since there is always at 955 * tasks in the system use _some_ cgroup, and since there is always at
@@ -1035,6 +1008,11 @@ static void cgroup_get(struct cgroup *cgrp)
1035 css_get(&cgrp->self); 1008 css_get(&cgrp->self);
1036} 1009}
1037 1010
1011static bool cgroup_tryget(struct cgroup *cgrp)
1012{
1013 return css_tryget(&cgrp->self);
1014}
1015
1038static void cgroup_put(struct cgroup *cgrp) 1016static void cgroup_put(struct cgroup *cgrp)
1039{ 1017{
1040 css_put(&cgrp->self); 1018 css_put(&cgrp->self);
@@ -1147,7 +1125,8 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn)
1147 * protection against removal. Ensure @cgrp stays accessible and 1125 * protection against removal. Ensure @cgrp stays accessible and
1148 * break the active_ref protection. 1126 * break the active_ref protection.
1149 */ 1127 */
1150 cgroup_get(cgrp); 1128 if (!cgroup_tryget(cgrp))
1129 return NULL;
1151 kernfs_break_active_protection(kn); 1130 kernfs_break_active_protection(kn);
1152 1131
1153 mutex_lock(&cgroup_mutex); 1132 mutex_lock(&cgroup_mutex);
@@ -1581,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1581 INIT_LIST_HEAD(&cgrp->self.sibling); 1560 INIT_LIST_HEAD(&cgrp->self.sibling);
1582 INIT_LIST_HEAD(&cgrp->self.children); 1561 INIT_LIST_HEAD(&cgrp->self.children);
1583 INIT_LIST_HEAD(&cgrp->cset_links); 1562 INIT_LIST_HEAD(&cgrp->cset_links);
1584 INIT_LIST_HEAD(&cgrp->release_list);
1585 INIT_LIST_HEAD(&cgrp->pidlists); 1563 INIT_LIST_HEAD(&cgrp->pidlists);
1586 mutex_init(&cgrp->pidlist_mutex); 1564 mutex_init(&cgrp->pidlist_mutex);
1587 cgrp->self.cgroup = cgrp; 1565 cgrp->self.cgroup = cgrp;
@@ -1591,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
1591 INIT_LIST_HEAD(&cgrp->e_csets[ssid]); 1569 INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
1592 1570
1593 init_waitqueue_head(&cgrp->offline_waitq); 1571 init_waitqueue_head(&cgrp->offline_waitq);
1572 INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
1594} 1573}
1595 1574
1596static void init_cgroup_root(struct cgroup_root *root, 1575static void init_cgroup_root(struct cgroup_root *root,
@@ -1628,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1628 goto out; 1607 goto out;
1629 root_cgrp->id = ret; 1608 root_cgrp->id = ret;
1630 1609
1631 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1610 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1611 GFP_KERNEL);
1632 if (ret) 1612 if (ret)
1633 goto out; 1613 goto out;
1634 1614
@@ -2046,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
2046 * task. As trading it for new_cset is protected by cgroup_mutex, 2026 * task. As trading it for new_cset is protected by cgroup_mutex,
2047 * we're safe to drop it here; it will be freed under RCU. 2027 * we're safe to drop it here; it will be freed under RCU.
2048 */ 2028 */
2049 set_bit(CGRP_RELEASABLE, &old_cgrp->flags); 2029 put_css_set_locked(old_cset);
2050 put_css_set_locked(old_cset, false);
2051} 2030}
2052 2031
2053/** 2032/**
@@ -2068,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
2068 cset->mg_src_cgrp = NULL; 2047 cset->mg_src_cgrp = NULL;
2069 cset->mg_dst_cset = NULL; 2048 cset->mg_dst_cset = NULL;
2070 list_del_init(&cset->mg_preload_node); 2049 list_del_init(&cset->mg_preload_node);
2071 put_css_set_locked(cset, false); 2050 put_css_set_locked(cset);
2072 } 2051 }
2073 up_write(&css_set_rwsem); 2052 up_write(&css_set_rwsem);
2074} 2053}
@@ -2162,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2162 if (src_cset == dst_cset) { 2141 if (src_cset == dst_cset) {
2163 src_cset->mg_src_cgrp = NULL; 2142 src_cset->mg_src_cgrp = NULL;
2164 list_del_init(&src_cset->mg_preload_node); 2143 list_del_init(&src_cset->mg_preload_node);
2165 put_css_set(src_cset, false); 2144 put_css_set(src_cset);
2166 put_css_set(dst_cset, false); 2145 put_css_set(dst_cset);
2167 continue; 2146 continue;
2168 } 2147 }
2169 2148
@@ -2172,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
2172 if (list_empty(&dst_cset->mg_preload_node)) 2151 if (list_empty(&dst_cset->mg_preload_node))
2173 list_add(&dst_cset->mg_preload_node, &csets); 2152 list_add(&dst_cset->mg_preload_node, &csets);
2174 else 2153 else
2175 put_css_set(dst_cset, false); 2154 put_css_set(dst_cset);
2176 } 2155 }
2177 2156
2178 list_splice_tail(&csets, preloaded_csets); 2157 list_splice_tail(&csets, preloaded_csets);
@@ -3271,8 +3250,17 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
3271{ 3250{
3272 struct cftype *cft; 3251 struct cftype *cft;
3273 3252
3274 for (cft = cfts; cft && cft->name[0] != '\0'; cft++) 3253 /*
3275 cft->flags |= __CFTYPE_NOT_ON_DFL; 3254 * If legacy_files_on_dfl, we want to show the legacy files on the
3255 * dfl hierarchy but iff the target subsystem hasn't been updated
3256 * for the dfl hierarchy yet.
3257 */
3258 if (!cgroup_legacy_files_on_dfl ||
3259 ss->dfl_cftypes != ss->legacy_cftypes) {
3260 for (cft = cfts; cft && cft->name[0] != '\0'; cft++)
3261 cft->flags |= __CFTYPE_NOT_ON_DFL;
3262 }
3263
3276 return cgroup_add_cftypes(ss, cfts); 3264 return cgroup_add_cftypes(ss, cfts);
3277} 3265}
3278 3266
@@ -3970,7 +3958,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
3970 3958
3971 l = cgroup_pidlist_find_create(cgrp, type); 3959 l = cgroup_pidlist_find_create(cgrp, type);
3972 if (!l) { 3960 if (!l) {
3973 mutex_unlock(&cgrp->pidlist_mutex);
3974 pidlist_free(array); 3961 pidlist_free(array);
3975 return -ENOMEM; 3962 return -ENOMEM;
3976 } 3963 }
@@ -4159,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
4159static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, 4146static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
4160 struct cftype *cft, u64 val) 4147 struct cftype *cft, u64 val)
4161{ 4148{
4162 clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
4163 if (val) 4149 if (val)
4164 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); 4150 set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
4165 else 4151 else
@@ -4337,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work)
4337 /* cgroup free path */ 4323 /* cgroup free path */
4338 atomic_dec(&cgrp->root->nr_cgrps); 4324 atomic_dec(&cgrp->root->nr_cgrps);
4339 cgroup_pidlist_destroy_all(cgrp); 4325 cgroup_pidlist_destroy_all(cgrp);
4326 cancel_work_sync(&cgrp->release_agent_work);
4340 4327
4341 if (cgroup_parent(cgrp)) { 4328 if (cgroup_parent(cgrp)) {
4342 /* 4329 /*
@@ -4387,6 +4374,15 @@ static void css_release_work_fn(struct work_struct *work)
4387 /* cgroup release path */ 4374 /* cgroup release path */
4388 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); 4375 cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
4389 cgrp->id = -1; 4376 cgrp->id = -1;
4377
4378 /*
4379 * There are two control paths which try to determine
4380 * cgroup from dentry without going through kernfs -
4381 * cgroupstats_build() and css_tryget_online_from_dir().
4382 * Those are supported by RCU protecting clearing of
4383 * cgrp->kn->priv backpointer.
4384 */
4385 RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL);
4390 } 4386 }
4391 4387
4392 mutex_unlock(&cgroup_mutex); 4388 mutex_unlock(&cgroup_mutex);
@@ -4487,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4487 4483
4488 init_and_link_css(css, ss, cgrp); 4484 init_and_link_css(css, ss, cgrp);
4489 4485
4490 err = percpu_ref_init(&css->refcnt, css_release); 4486 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4491 if (err) 4487 if (err)
4492 goto err_free_css; 4488 goto err_free_css;
4493 4489
@@ -4543,6 +4539,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4543 struct cftype *base_files; 4539 struct cftype *base_files;
4544 int ssid, ret; 4540 int ssid, ret;
4545 4541
4542 /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable.
4543 */
4544 if (strchr(name, '\n'))
4545 return -EINVAL;
4546
4546 parent = cgroup_kn_lock_live(parent_kn); 4547 parent = cgroup_kn_lock_live(parent_kn);
4547 if (!parent) 4548 if (!parent)
4548 return -ENODEV; 4549 return -ENODEV;
@@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4555 goto out_unlock; 4556 goto out_unlock;
4556 } 4557 }
4557 4558
4558 ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4559 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4559 if (ret) 4560 if (ret)
4560 goto out_free_cgrp; 4561 goto out_free_cgrp;
4561 4562
@@ -4785,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
4785 for_each_css(css, ssid, cgrp) 4786 for_each_css(css, ssid, cgrp)
4786 kill_css(css); 4787 kill_css(css);
4787 4788
4788 /* CSS_ONLINE is clear, remove from ->release_list for the last time */
4789 raw_spin_lock(&release_list_lock);
4790 if (!list_empty(&cgrp->release_list))
4791 list_del_init(&cgrp->release_list);
4792 raw_spin_unlock(&release_list_lock);
4793
4794 /* 4789 /*
4795 * Remove @cgrp directory along with the base files. @cgrp has an 4790 * Remove @cgrp directory along with the base files. @cgrp has an
4796 * extra ref on its kn. 4791 * extra ref on its kn.
4797 */ 4792 */
4798 kernfs_remove(cgrp->kn); 4793 kernfs_remove(cgrp->kn);
4799 4794
4800 set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
4801 check_for_release(cgroup_parent(cgrp)); 4795 check_for_release(cgroup_parent(cgrp));
4802 4796
4803 /* put the base reference */ 4797 /* put the base reference */
@@ -4814,23 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
4814 cgrp = cgroup_kn_lock_live(kn); 4808 cgrp = cgroup_kn_lock_live(kn);
4815 if (!cgrp) 4809 if (!cgrp)
4816 return 0; 4810 return 0;
4817 cgroup_get(cgrp); /* for @kn->priv clearing */
4818 4811
4819 ret = cgroup_destroy_locked(cgrp); 4812 ret = cgroup_destroy_locked(cgrp);
4820 4813
4821 cgroup_kn_unlock(kn); 4814 cgroup_kn_unlock(kn);
4822
4823 /*
4824 * There are two control paths which try to determine cgroup from
4825 * dentry without going through kernfs - cgroupstats_build() and
4826 * css_tryget_online_from_dir(). Those are supported by RCU
4827 * protecting clearing of cgrp->kn->priv backpointer, which should
4828 * happen after all files under it have been removed.
4829 */
4830 if (!ret)
4831 RCU_INIT_POINTER(*(void __rcu __force **)&kn->priv, NULL);
4832
4833 cgroup_put(cgrp);
4834 return ret; 4815 return ret;
4835} 4816}
4836 4817
@@ -5034,12 +5015,9 @@ core_initcall(cgroup_wq_init);
5034 * - Print task's cgroup paths into seq_file, one line for each hierarchy 5015 * - Print task's cgroup paths into seq_file, one line for each hierarchy
5035 * - Used for /proc/<pid>/cgroup. 5016 * - Used for /proc/<pid>/cgroup.
5036 */ 5017 */
5037 5018int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
5038/* TODO: Use a proper seq_file iterator */ 5019 struct pid *pid, struct task_struct *tsk)
5039int proc_cgroup_show(struct seq_file *m, void *v)
5040{ 5020{
5041 struct pid *pid;
5042 struct task_struct *tsk;
5043 char *buf, *path; 5021 char *buf, *path;
5044 int retval; 5022 int retval;
5045 struct cgroup_root *root; 5023 struct cgroup_root *root;
@@ -5049,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5049 if (!buf) 5027 if (!buf)
5050 goto out; 5028 goto out;
5051 5029
5052 retval = -ESRCH;
5053 pid = m->private;
5054 tsk = get_pid_task(pid, PIDTYPE_PID);
5055 if (!tsk)
5056 goto out_free;
5057
5058 retval = 0;
5059
5060 mutex_lock(&cgroup_mutex); 5030 mutex_lock(&cgroup_mutex);
5061 down_read(&css_set_rwsem); 5031 down_read(&css_set_rwsem);
5062 5032
@@ -5086,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
5086 seq_putc(m, '\n'); 5056 seq_putc(m, '\n');
5087 } 5057 }
5088 5058
5059 retval = 0;
5089out_unlock: 5060out_unlock:
5090 up_read(&css_set_rwsem); 5061 up_read(&css_set_rwsem);
5091 mutex_unlock(&cgroup_mutex); 5062 mutex_unlock(&cgroup_mutex);
5092 put_task_struct(tsk);
5093out_free:
5094 kfree(buf); 5063 kfree(buf);
5095out: 5064out:
5096 return retval; 5065 return retval;
@@ -5161,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child)
5161 int i; 5130 int i;
5162 5131
5163 /* 5132 /*
5164 * This may race against cgroup_enable_task_cg_links(). As that 5133 * This may race against cgroup_enable_task_cg_lists(). As that
5165 * function sets use_task_css_set_links before grabbing 5134 * function sets use_task_css_set_links before grabbing
5166 * tasklist_lock and we just went through tasklist_lock to add 5135 * tasklist_lock and we just went through tasklist_lock to add
5167 * @child, it's guaranteed that either we see the set 5136 * @child, it's guaranteed that either we see the set
@@ -5176,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child)
5176 * when implementing operations which need to migrate all tasks of 5145 * when implementing operations which need to migrate all tasks of
5177 * a cgroup to another. 5146 * a cgroup to another.
5178 * 5147 *
5179 * Note that if we lose to cgroup_enable_task_cg_links(), @child 5148 * Note that if we lose to cgroup_enable_task_cg_lists(), @child
5180 * will remain in init_css_set. This is safe because all tasks are 5149 * will remain in init_css_set. This is safe because all tasks are
5181 * in the init_css_set before cg_links is enabled and there's no 5150 * in the init_css_set before cg_links is enabled and there's no
5182 * operation which transfers all tasks out of init_css_set. 5151 * operation which transfers all tasks out of init_css_set.
@@ -5260,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk)
5260 } 5229 }
5261 5230
5262 if (put_cset) 5231 if (put_cset)
5263 put_css_set(cset, true); 5232 put_css_set(cset);
5264} 5233}
5265 5234
5266static void check_for_release(struct cgroup *cgrp) 5235static void check_for_release(struct cgroup *cgrp)
5267{ 5236{
5268 if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) && 5237 if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
5269 !css_has_online_children(&cgrp->self)) { 5238 !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
5270 /* 5239 schedule_work(&cgrp->release_agent_work);
5271 * Control Group is currently removeable. If it's not
5272 * already queued for a userspace notification, queue
5273 * it now
5274 */
5275 int need_schedule_work = 0;
5276
5277 raw_spin_lock(&release_list_lock);
5278 if (!cgroup_is_dead(cgrp) &&
5279 list_empty(&cgrp->release_list)) {
5280 list_add(&cgrp->release_list, &release_list);
5281 need_schedule_work = 1;
5282 }
5283 raw_spin_unlock(&release_list_lock);
5284 if (need_schedule_work)
5285 schedule_work(&release_agent_work);
5286 }
5287} 5240}
5288 5241
5289/* 5242/*
@@ -5311,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp)
5311 */ 5264 */
5312static void cgroup_release_agent(struct work_struct *work) 5265static void cgroup_release_agent(struct work_struct *work)
5313{ 5266{
5314 BUG_ON(work != &release_agent_work); 5267 struct cgroup *cgrp =
5268 container_of(work, struct cgroup, release_agent_work);
5269 char *pathbuf = NULL, *agentbuf = NULL, *path;
5270 char *argv[3], *envp[3];
5271
5315 mutex_lock(&cgroup_mutex); 5272 mutex_lock(&cgroup_mutex);
5316 raw_spin_lock(&release_list_lock); 5273
5317 while (!list_empty(&release_list)) { 5274 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
5318 char *argv[3], *envp[3]; 5275 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
5319 int i; 5276 if (!pathbuf || !agentbuf)
5320 char *pathbuf = NULL, *agentbuf = NULL, *path; 5277 goto out;
5321 struct cgroup *cgrp = list_entry(release_list.next, 5278
5322 struct cgroup, 5279 path = cgroup_path(cgrp, pathbuf, PATH_MAX);
5323 release_list); 5280 if (!path)
5324 list_del_init(&cgrp->release_list); 5281 goto out;
5325 raw_spin_unlock(&release_list_lock); 5282
5326 pathbuf = kmalloc(PATH_MAX, GFP_KERNEL); 5283 argv[0] = agentbuf;
5327 if (!pathbuf) 5284 argv[1] = path;
5328 goto continue_free; 5285 argv[2] = NULL;
5329 path = cgroup_path(cgrp, pathbuf, PATH_MAX); 5286
5330 if (!path) 5287 /* minimal command environment */
5331 goto continue_free; 5288 envp[0] = "HOME=/";
5332 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 5289 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5333 if (!agentbuf) 5290 envp[2] = NULL;
5334 goto continue_free; 5291
5335
5336 i = 0;
5337 argv[i++] = agentbuf;
5338 argv[i++] = path;
5339 argv[i] = NULL;
5340
5341 i = 0;
5342 /* minimal command environment */
5343 envp[i++] = "HOME=/";
5344 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
5345 envp[i] = NULL;
5346
5347 /* Drop the lock while we invoke the usermode helper,
5348 * since the exec could involve hitting disk and hence
5349 * be a slow process */
5350 mutex_unlock(&cgroup_mutex);
5351 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5352 mutex_lock(&cgroup_mutex);
5353 continue_free:
5354 kfree(pathbuf);
5355 kfree(agentbuf);
5356 raw_spin_lock(&release_list_lock);
5357 }
5358 raw_spin_unlock(&release_list_lock);
5359 mutex_unlock(&cgroup_mutex); 5292 mutex_unlock(&cgroup_mutex);
5293 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
5294 goto out_free;
5295out:
5296 mutex_unlock(&cgroup_mutex);
5297out_free:
5298 kfree(agentbuf);
5299 kfree(pathbuf);
5360} 5300}
5361 5301
5362static int __init cgroup_disable(char *str) 5302static int __init cgroup_disable(char *str)
@@ -5416,7 +5356,7 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
5416 /* 5356 /*
5417 * This path doesn't originate from kernfs and @kn could already 5357 * This path doesn't originate from kernfs and @kn could already
5418 * have been or be removed at any point. @kn->priv is RCU 5358 * have been or be removed at any point. @kn->priv is RCU
5419 * protected for this access. See cgroup_rmdir() for details. 5359 * protected for this access. See css_release_work_fn() for details.
5420 */ 5360 */
5421 cgrp = rcu_dereference(kn->priv); 5361 cgrp = rcu_dereference(kn->priv);
5422 if (cgrp) 5362 if (cgrp)
@@ -5544,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
5544 5484
5545static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) 5485static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
5546{ 5486{
5547 return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); 5487 return (!cgroup_has_tasks(css->cgroup) &&
5488 !css_has_online_children(&css->cgroup->self));
5548} 5489}
5549 5490
5550static struct cftype debug_files[] = { 5491static struct cftype debug_files[] = {
diff --git a/kernel/compat.c b/kernel/compat.c
index 633394f442f8..ebb3c369d03d 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -226,7 +226,7 @@ static long compat_nanosleep_restart(struct restart_block *restart)
226 ret = hrtimer_nanosleep_restart(restart); 226 ret = hrtimer_nanosleep_restart(restart);
227 set_fs(oldfs); 227 set_fs(oldfs);
228 228
229 if (ret) { 229 if (ret == -ERESTART_RESTARTBLOCK) {
230 rmtp = restart->nanosleep.compat_rmtp; 230 rmtp = restart->nanosleep.compat_rmtp;
231 231
232 if (rmtp && compat_put_timespec(&rmt, rmtp)) 232 if (rmtp && compat_put_timespec(&rmt, rmtp))
@@ -256,7 +256,26 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
256 HRTIMER_MODE_REL, CLOCK_MONOTONIC); 256 HRTIMER_MODE_REL, CLOCK_MONOTONIC);
257 set_fs(oldfs); 257 set_fs(oldfs);
258 258
259 if (ret) { 259 /*
260 * hrtimer_nanosleep() can only return 0 or
261 * -ERESTART_RESTARTBLOCK here because:
262 *
263 * - we call it with HRTIMER_MODE_REL and therefore exclude the
264 * -ERESTARTNOHAND return path.
265 *
266 * - we supply the rmtp argument from the task stack (due to
267 * the necessary compat conversion), so the update cannot
268 * fail, which excludes the -EFAULT return path as well. If
269 * it fails nevertheless we have a bigger problem and won't
270 * reach this place anymore.
271 *
272 * - if the return value is 0, we do not have to update rmtp
273 * because there is no remaining time.
274 *
275 * We check for -ERESTART_RESTARTBLOCK nevertheless if the
276 * core implementation decides to return random nonsense.
277 */
278 if (ret == -ERESTART_RESTARTBLOCK) {
260 struct restart_block *restart 279 struct restart_block *restart
261 = &current_thread_info()->restart_block; 280 = &current_thread_info()->restart_block;
262 281
@@ -266,7 +285,6 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
266 if (rmtp && compat_put_timespec(&rmt, rmtp)) 285 if (rmtp && compat_put_timespec(&rmt, rmtp))
267 return -EFAULT; 286 return -EFAULT;
268 } 287 }
269
270 return ret; 288 return ret;
271} 289}
272 290
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
1CONFIG_CC_OPTIMIZE_FOR_SIZE=y
2CONFIG_KERNEL_XZ=y
3CONFIG_OPTIMIZE_INLINING=y
4CONFIG_SLOB=y
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 5664985c46a0..937ecdfdf258 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -107,46 +107,6 @@ void context_tracking_user_enter(void)
107} 107}
108NOKPROBE_SYMBOL(context_tracking_user_enter); 108NOKPROBE_SYMBOL(context_tracking_user_enter);
109 109
110#ifdef CONFIG_PREEMPT
111/**
112 * preempt_schedule_context - preempt_schedule called by tracing
113 *
114 * The tracing infrastructure uses preempt_enable_notrace to prevent
115 * recursion and tracing preempt enabling caused by the tracing
116 * infrastructure itself. But as tracing can happen in areas coming
117 * from userspace or just about to enter userspace, a preempt enable
118 * can occur before user_exit() is called. This will cause the scheduler
119 * to be called when the system is still in usermode.
120 *
121 * To prevent this, the preempt_enable_notrace will use this function
122 * instead of preempt_schedule() to exit user context if needed before
123 * calling the scheduler.
124 */
125asmlinkage __visible void __sched notrace preempt_schedule_context(void)
126{
127 enum ctx_state prev_ctx;
128
129 if (likely(!preemptible()))
130 return;
131
132 /*
133 * Need to disable preemption in case user_exit() is traced
134 * and the tracer calls preempt_enable_notrace() causing
135 * an infinite recursion.
136 */
137 preempt_disable_notrace();
138 prev_ctx = exception_enter();
139 preempt_enable_no_resched_notrace();
140
141 preempt_schedule();
142
143 preempt_disable_notrace();
144 exception_exit(prev_ctx);
145 preempt_enable_notrace();
146}
147EXPORT_SYMBOL_GPL(preempt_schedule_context);
148#endif /* CONFIG_PREEMPT */
149
150/** 110/**
151 * context_tracking_user_exit - Inform the context tracking that the CPU is 111 * context_tracking_user_exit - Inform the context tracking that the CPU is
152 * exiting userspace mode and entering the kernel. 112 * exiting userspace mode and entering the kernel.
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 81e2a388a0f6..90a3d017b90c 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ static struct {
64 * an ongoing cpu hotplug operation. 64 * an ongoing cpu hotplug operation.
65 */ 65 */
66 int refcount; 66 int refcount;
67 /* And allows lockless put_online_cpus(). */
68 atomic_t puts_pending;
67 69
68#ifdef CONFIG_DEBUG_LOCK_ALLOC 70#ifdef CONFIG_DEBUG_LOCK_ALLOC
69 struct lockdep_map dep_map; 71 struct lockdep_map dep_map;
@@ -79,6 +81,8 @@ static struct {
79 81
80/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */ 82/* Lockdep annotations for get/put_online_cpus() and cpu_hotplug_begin/end() */
81#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map) 83#define cpuhp_lock_acquire_read() lock_map_acquire_read(&cpu_hotplug.dep_map)
84#define cpuhp_lock_acquire_tryread() \
85 lock_map_acquire_tryread(&cpu_hotplug.dep_map)
82#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map) 86#define cpuhp_lock_acquire() lock_map_acquire(&cpu_hotplug.dep_map)
83#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map) 87#define cpuhp_lock_release() lock_map_release(&cpu_hotplug.dep_map)
84 88
@@ -91,15 +95,31 @@ void get_online_cpus(void)
91 mutex_lock(&cpu_hotplug.lock); 95 mutex_lock(&cpu_hotplug.lock);
92 cpu_hotplug.refcount++; 96 cpu_hotplug.refcount++;
93 mutex_unlock(&cpu_hotplug.lock); 97 mutex_unlock(&cpu_hotplug.lock);
94
95} 98}
96EXPORT_SYMBOL_GPL(get_online_cpus); 99EXPORT_SYMBOL_GPL(get_online_cpus);
97 100
101bool try_get_online_cpus(void)
102{
103 if (cpu_hotplug.active_writer == current)
104 return true;
105 if (!mutex_trylock(&cpu_hotplug.lock))
106 return false;
107 cpuhp_lock_acquire_tryread();
108 cpu_hotplug.refcount++;
109 mutex_unlock(&cpu_hotplug.lock);
110 return true;
111}
112EXPORT_SYMBOL_GPL(try_get_online_cpus);
113
98void put_online_cpus(void) 114void put_online_cpus(void)
99{ 115{
100 if (cpu_hotplug.active_writer == current) 116 if (cpu_hotplug.active_writer == current)
101 return; 117 return;
102 mutex_lock(&cpu_hotplug.lock); 118 if (!mutex_trylock(&cpu_hotplug.lock)) {
119 atomic_inc(&cpu_hotplug.puts_pending);
120 cpuhp_lock_release();
121 return;
122 }
103 123
104 if (WARN_ON(!cpu_hotplug.refcount)) 124 if (WARN_ON(!cpu_hotplug.refcount))
105 cpu_hotplug.refcount++; /* try to fix things up */ 125 cpu_hotplug.refcount++; /* try to fix things up */
@@ -141,6 +161,12 @@ void cpu_hotplug_begin(void)
141 cpuhp_lock_acquire(); 161 cpuhp_lock_acquire();
142 for (;;) { 162 for (;;) {
143 mutex_lock(&cpu_hotplug.lock); 163 mutex_lock(&cpu_hotplug.lock);
164 if (atomic_read(&cpu_hotplug.puts_pending)) {
165 int delta;
166
167 delta = atomic_xchg(&cpu_hotplug.puts_pending, 0);
168 cpu_hotplug.refcount -= delta;
169 }
144 if (likely(!cpu_hotplug.refcount)) 170 if (likely(!cpu_hotplug.refcount))
145 break; 171 break;
146 __set_current_state(TASK_UNINTERRUPTIBLE); 172 __set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 22874d7cf2c0..1f107c74087b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
365 struct task_struct *tsk) 365 struct task_struct *tsk)
366{ 366{
367 if (is_spread_page(cs)) 367 if (is_spread_page(cs))
368 tsk->flags |= PF_SPREAD_PAGE; 368 task_set_spread_page(tsk);
369 else 369 else
370 tsk->flags &= ~PF_SPREAD_PAGE; 370 task_clear_spread_page(tsk);
371
371 if (is_spread_slab(cs)) 372 if (is_spread_slab(cs))
372 tsk->flags |= PF_SPREAD_SLAB; 373 task_set_spread_slab(tsk);
373 else 374 else
374 tsk->flags &= ~PF_SPREAD_SLAB; 375 task_clear_spread_slab(tsk);
375} 376}
376 377
377/* 378/*
@@ -2729,10 +2730,9 @@ void __cpuset_memory_pressure_bump(void)
2729 * and we take cpuset_mutex, keeping cpuset_attach() from changing it 2730 * and we take cpuset_mutex, keeping cpuset_attach() from changing it
2730 * anyway. 2731 * anyway.
2731 */ 2732 */
2732int proc_cpuset_show(struct seq_file *m, void *unused_v) 2733int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
2734 struct pid *pid, struct task_struct *tsk)
2733{ 2735{
2734 struct pid *pid;
2735 struct task_struct *tsk;
2736 char *buf, *p; 2736 char *buf, *p;
2737 struct cgroup_subsys_state *css; 2737 struct cgroup_subsys_state *css;
2738 int retval; 2738 int retval;
@@ -2742,24 +2742,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
2742 if (!buf) 2742 if (!buf)
2743 goto out; 2743 goto out;
2744 2744
2745 retval = -ESRCH;
2746 pid = m->private;
2747 tsk = get_pid_task(pid, PIDTYPE_PID);
2748 if (!tsk)
2749 goto out_free;
2750
2751 retval = -ENAMETOOLONG; 2745 retval = -ENAMETOOLONG;
2752 rcu_read_lock(); 2746 rcu_read_lock();
2753 css = task_css(tsk, cpuset_cgrp_id); 2747 css = task_css(tsk, cpuset_cgrp_id);
2754 p = cgroup_path(css->cgroup, buf, PATH_MAX); 2748 p = cgroup_path(css->cgroup, buf, PATH_MAX);
2755 rcu_read_unlock(); 2749 rcu_read_unlock();
2756 if (!p) 2750 if (!p)
2757 goto out_put_task; 2751 goto out_free;
2758 seq_puts(m, p); 2752 seq_puts(m, p);
2759 seq_putc(m, '\n'); 2753 seq_putc(m, '\n');
2760 retval = 0; 2754 retval = 0;
2761out_put_task:
2762 put_task_struct(tsk);
2763out_free: 2755out_free:
2764 kfree(buf); 2756 kfree(buf);
2765out: 2757out:
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. 18 * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
19 */ 19 */
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21EXPORT_SYMBOL_GPL(elfcorehdr_addr);
21 22
22/* 23/*
23 * stores the size of elf header of crash image 24 * stores the size of elf header of crash image
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
index 70a504601dc3..b20d544f20c2 100644
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -52,11 +52,11 @@ static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
52 52
53 bp->bph_length = 1; 53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) { 54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0) 55 if (strncasecmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT; 56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0) 57 else if (strncasecmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT; 58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0) 59 else if (strncasecmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT; 60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else 61 else
62 return KDB_ARGCOUNT; 62 return KDB_ARGCOUNT;
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
index 97b67df8fbfe..d659487254d5 100644
--- a/kernel/events/callchain.c
+++ b/kernel/events/callchain.c
@@ -52,7 +52,7 @@ static void release_callchain_buffers(void)
52 struct callchain_cpus_entries *entries; 52 struct callchain_cpus_entries *entries;
53 53
54 entries = callchain_cpus_entries; 54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL); 55 RCU_INIT_POINTER(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); 56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57} 57}
58 58
@@ -137,7 +137,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
137 int cpu; 137 int cpu;
138 struct callchain_cpus_entries *entries; 138 struct callchain_cpus_entries *entries;
139 139
140 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); 140 *rctx = get_recursion_context(this_cpu_ptr(callchain_recursion));
141 if (*rctx == -1) 141 if (*rctx == -1)
142 return NULL; 142 return NULL;
143 143
@@ -153,7 +153,7 @@ static struct perf_callchain_entry *get_callchain_entry(int *rctx)
153static void 153static void
154put_callchain_entry(int rctx) 154put_callchain_entry(int rctx)
155{ 155{
156 put_recursion_context(__get_cpu_var(callchain_recursion), rctx); 156 put_recursion_context(this_cpu_ptr(callchain_recursion), rctx);
157} 157}
158 158
159struct perf_callchain_entry * 159struct perf_callchain_entry *
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1cf24b3e42ec..2b02c9fda790 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -41,11 +41,14 @@
41#include <linux/cgroup.h> 41#include <linux/cgroup.h>
42#include <linux/module.h> 42#include <linux/module.h>
43#include <linux/mman.h> 43#include <linux/mman.h>
44#include <linux/compat.h>
44 45
45#include "internal.h" 46#include "internal.h"
46 47
47#include <asm/irq_regs.h> 48#include <asm/irq_regs.h>
48 49
50static struct workqueue_struct *perf_wq;
51
49struct remote_function_call { 52struct remote_function_call {
50 struct task_struct *p; 53 struct task_struct *p;
51 int (*func)(void *info); 54 int (*func)(void *info);
@@ -119,6 +122,13 @@ static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
119 return data.ret; 122 return data.ret;
120} 123}
121 124
125#define EVENT_OWNER_KERNEL ((void *) -1)
126
127static bool is_kernel_event(struct perf_event *event)
128{
129 return event->owner == EVENT_OWNER_KERNEL;
130}
131
122#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ 132#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
123 PERF_FLAG_FD_OUTPUT |\ 133 PERF_FLAG_FD_OUTPUT |\
124 PERF_FLAG_PID_CGROUP |\ 134 PERF_FLAG_PID_CGROUP |\
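
EVENT_OWNER_KERNEL is a sentinel pointer value, not a new flag field: (void *)-1 can never alias a real task_struct, so kernel-created events can be told apart from user-owned ones without growing struct perf_event. The idiom in isolation (generic C with a hypothetical struct, not perf code):

#include <stdbool.h>
#include <stdio.h>

#define OWNER_KERNEL    ((void *)-1)    /* sentinel, never a valid pointer */

struct event {                          /* hypothetical, not struct perf_event */
        void *owner;                    /* NULL, OWNER_KERNEL, or a task */
};

static bool is_kernel_owned(const struct event *e)
{
        return e->owner == OWNER_KERNEL;
}

int main(void)
{
        struct event user = { .owner = &user };
        struct event kern = { .owner = OWNER_KERNEL };

        printf("%d %d\n", is_kernel_owned(&user), is_kernel_owned(&kern));
        return 0;
}
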
@@ -239,7 +249,7 @@ static void perf_duration_warn(struct irq_work *w)
239 u64 avg_local_sample_len; 249 u64 avg_local_sample_len;
240 u64 local_samples_len; 250 u64 local_samples_len;
241 251
242 local_samples_len = __get_cpu_var(running_sample_length); 252 local_samples_len = __this_cpu_read(running_sample_length);
243 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; 253 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
244 254
245 printk_ratelimited(KERN_WARNING 255 printk_ratelimited(KERN_WARNING
@@ -261,10 +271,10 @@ void perf_sample_event_took(u64 sample_len_ns)
261 return; 271 return;
262 272
263 /* decay the counter by 1 average sample */ 273 /* decay the counter by 1 average sample */
264 local_samples_len = __get_cpu_var(running_sample_length); 274 local_samples_len = __this_cpu_read(running_sample_length);
265 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; 275 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
266 local_samples_len += sample_len_ns; 276 local_samples_len += sample_len_ns;
267 __get_cpu_var(running_sample_length) = local_samples_len; 277 __this_cpu_write(running_sample_length, local_samples_len);
268 278
269 /* 279 /*
270 * note: this will be biased artifically low until we have 280 * note: this will be biased artifically low until we have
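
The __get_cpu_var() conversions in this file (and in kernel/events/callchain.c above) follow the tree-wide switch to the explicit per-CPU accessors: __this_cpu_read()/__this_cpu_write() for scalar updates and this_cpu_ptr() where an address is needed. A hedged kernel-style sketch with a hypothetical per-CPU counter:

#include <linux/percpu.h>

static DEFINE_PER_CPU(u64, sample_len);         /* hypothetical counter */

/* Decay-and-add on this CPU's copy; caller has preemption disabled. */
static void sample_account(u64 len_ns)
{
        u64 v = __this_cpu_read(sample_len);

        v -= v / 8;
        v += len_ns;
        __this_cpu_write(sample_len, v);
}

/* Address of this CPU's copy, e.g. to pass to list or hash helpers. */
static u64 *sample_slot(void)
{
        return this_cpu_ptr(&sample_len);
}
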
@@ -391,14 +401,9 @@ perf_cgroup_match(struct perf_event *event)
391 event->cgrp->css.cgroup); 401 event->cgrp->css.cgroup);
392} 402}
393 403
394static inline void perf_put_cgroup(struct perf_event *event)
395{
396 css_put(&event->cgrp->css);
397}
398
399static inline void perf_detach_cgroup(struct perf_event *event) 404static inline void perf_detach_cgroup(struct perf_event *event)
400{ 405{
401 perf_put_cgroup(event); 406 css_put(&event->cgrp->css);
402 event->cgrp = NULL; 407 event->cgrp = NULL;
403} 408}
404 409
@@ -877,7 +882,7 @@ static DEFINE_PER_CPU(struct list_head, rotation_list);
877static void perf_pmu_rotate_start(struct pmu *pmu) 882static void perf_pmu_rotate_start(struct pmu *pmu)
878{ 883{
879 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); 884 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
880 struct list_head *head = &__get_cpu_var(rotation_list); 885 struct list_head *head = this_cpu_ptr(&rotation_list);
881 886
882 WARN_ON(!irqs_disabled()); 887 WARN_ON(!irqs_disabled());
883 888
@@ -901,13 +906,23 @@ static void put_ctx(struct perf_event_context *ctx)
901 } 906 }
902} 907}
903 908
904static void unclone_ctx(struct perf_event_context *ctx) 909/*
910 * This must be done under the ctx->lock, such as to serialize against
911 * context_equiv(), therefore we cannot call put_ctx() since that might end up
912 * calling scheduler related locks and ctx->lock nests inside those.
913 */
914static __must_check struct perf_event_context *
915unclone_ctx(struct perf_event_context *ctx)
905{ 916{
906 if (ctx->parent_ctx) { 917 struct perf_event_context *parent_ctx = ctx->parent_ctx;
907 put_ctx(ctx->parent_ctx); 918
919 lockdep_assert_held(&ctx->lock);
920
921 if (parent_ctx)
908 ctx->parent_ctx = NULL; 922 ctx->parent_ctx = NULL;
909 }
910 ctx->generation++; 923 ctx->generation++;
924
925 return parent_ctx;
911} 926}
912 927
913static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) 928static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
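
unclone_ctx() now hands the detached parent context back to its callers instead of dropping the reference itself, because put_ctx() can end up in scheduler locks that must not nest inside ctx->lock; every caller drops the reference only after unlocking. The same "detach under the lock, release after unlocking" shape in plain pthreads (hypothetical types, not perf code):

#include <pthread.h>
#include <stdlib.h>

struct ctx {
        pthread_mutex_t lock;
        struct ctx *parent;             /* reference owned by this ctx */
};

/* Stand-in for put_ctx(): may sleep, take other locks, or free. */
static void ctx_put(struct ctx *c)
{
        free(c);
}

/* Detach the parent while holding c->lock, drop the reference later. */
static void ctx_unclone(struct ctx *c)
{
        struct ctx *parent;

        pthread_mutex_lock(&c->lock);
        parent = c->parent;
        c->parent = NULL;
        pthread_mutex_unlock(&c->lock);

        if (parent)
                ctx_put(parent);        /* safe: c->lock no longer held */
}

int main(void)
{
        struct ctx *parent = calloc(1, sizeof(*parent));
        struct ctx child = { .parent = parent };

        pthread_mutex_init(&child.lock, NULL);
        ctx_unclone(&child);
        return 0;
}
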
@@ -1374,6 +1389,45 @@ out:
1374 perf_event__header_size(tmp); 1389 perf_event__header_size(tmp);
1375} 1390}
1376 1391
1392/*
1393 * User event without the task.
1394 */
1395static bool is_orphaned_event(struct perf_event *event)
1396{
1397 return event && !is_kernel_event(event) && !event->owner;
1398}
1399
1400/*
1401 * Event has a parent but parent's task finished and it's
 1402 * alive only because of children holding a reference.
 1402 * alive only because of children holding a reference.
1403 */
1404static bool is_orphaned_child(struct perf_event *event)
1405{
1406 return is_orphaned_event(event->parent);
1407}
1408
1409static void orphans_remove_work(struct work_struct *work);
1410
1411static void schedule_orphans_remove(struct perf_event_context *ctx)
1412{
1413 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1414 return;
1415
1416 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1417 get_ctx(ctx);
1418 ctx->orphans_remove_sched = true;
1419 }
1420}
1421
1422static int __init perf_workqueue_init(void)
1423{
1424 perf_wq = create_singlethread_workqueue("perf");
1425 WARN(!perf_wq, "failed to create perf workqueue\n");
1426 return perf_wq ? 0 : -1;
1427}
1428
1429core_initcall(perf_workqueue_init);
1430
1377static inline int 1431static inline int
1378event_filter_match(struct perf_event *event) 1432event_filter_match(struct perf_event *event)
1379{ 1433{
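
The orphan reaping added above is a conventional delayed-work arrangement: a singlethread workqueue created from a core_initcall, a delayed_work embedded in the context, and queue_delayed_work() gated by an "already scheduled" flag plus a reference on the containing object so it cannot disappear while queued. A hedged kernel-style sketch with hypothetical names:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *cleanup_wq;     /* hypothetical */

struct my_ctx {
        struct delayed_work reap_work;
        bool reap_scheduled;
};

static void reap_fn(struct work_struct *work)
{
        struct my_ctx *ctx = container_of(work, struct my_ctx,
                                          reap_work.work);

        /* ... reap whatever became orphaned ... */
        ctx->reap_scheduled = false;
}

static void my_ctx_init(struct my_ctx *ctx)
{
        INIT_DELAYED_WORK(&ctx->reap_work, reap_fn);
        ctx->reap_scheduled = false;
}

static void schedule_reap(struct my_ctx *ctx)
{
        if (ctx->reap_scheduled || !cleanup_wq)
                return;
        if (queue_delayed_work(cleanup_wq, &ctx->reap_work, 1))
                ctx->reap_scheduled = true;
}

static int __init cleanup_wq_init(void)
{
        cleanup_wq = create_singlethread_workqueue("cleanup");
        return cleanup_wq ? 0 : -ENOMEM;
}
core_initcall(cleanup_wq_init);
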
@@ -1423,6 +1477,9 @@ event_sched_out(struct perf_event *event,
1423 if (event->attr.exclusive || !cpuctx->active_oncpu) 1477 if (event->attr.exclusive || !cpuctx->active_oncpu)
1424 cpuctx->exclusive = 0; 1478 cpuctx->exclusive = 0;
1425 1479
1480 if (is_orphaned_child(event))
1481 schedule_orphans_remove(ctx);
1482
1426 perf_pmu_enable(event->pmu); 1483 perf_pmu_enable(event->pmu);
1427} 1484}
1428 1485
@@ -1523,6 +1580,11 @@ retry:
1523 */ 1580 */
1524 if (ctx->is_active) { 1581 if (ctx->is_active) {
1525 raw_spin_unlock_irq(&ctx->lock); 1582 raw_spin_unlock_irq(&ctx->lock);
1583 /*
1584 * Reload the task pointer, it might have been changed by
1585 * a concurrent perf_event_context_sched_out().
1586 */
1587 task = ctx->task;
1526 goto retry; 1588 goto retry;
1527 } 1589 }
1528 1590
@@ -1725,6 +1787,9 @@ event_sched_in(struct perf_event *event,
1725 if (event->attr.exclusive) 1787 if (event->attr.exclusive)
1726 cpuctx->exclusive = 1; 1788 cpuctx->exclusive = 1;
1727 1789
1790 if (is_orphaned_child(event))
1791 schedule_orphans_remove(ctx);
1792
1728out: 1793out:
1729 perf_pmu_enable(event->pmu); 1794 perf_pmu_enable(event->pmu);
1730 1795
@@ -1966,6 +2031,11 @@ retry:
1966 */ 2031 */
1967 if (ctx->is_active) { 2032 if (ctx->is_active) {
1968 raw_spin_unlock_irq(&ctx->lock); 2033 raw_spin_unlock_irq(&ctx->lock);
2034 /*
2035 * Reload the task pointer, it might have been changed by
2036 * a concurrent perf_event_context_sched_out().
2037 */
2038 task = ctx->task;
1969 goto retry; 2039 goto retry;
1970 } 2040 }
1971 2041
@@ -2199,6 +2269,9 @@ static void ctx_sched_out(struct perf_event_context *ctx,
2199static int context_equiv(struct perf_event_context *ctx1, 2269static int context_equiv(struct perf_event_context *ctx1,
2200 struct perf_event_context *ctx2) 2270 struct perf_event_context *ctx2)
2201{ 2271{
2272 lockdep_assert_held(&ctx1->lock);
2273 lockdep_assert_held(&ctx2->lock);
2274
2202 /* Pinning disables the swap optimization */ 2275 /* Pinning disables the swap optimization */
2203 if (ctx1->pin_count || ctx2->pin_count) 2276 if (ctx1->pin_count || ctx2->pin_count)
2204 return 0; 2277 return 0;
@@ -2320,7 +2393,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2320 next_parent = rcu_dereference(next_ctx->parent_ctx); 2393 next_parent = rcu_dereference(next_ctx->parent_ctx);
2321 2394
2322 /* If neither context have a parent context; they cannot be clones. */ 2395 /* If neither context have a parent context; they cannot be clones. */
2323 if (!parent || !next_parent) 2396 if (!parent && !next_parent)
2324 goto unlock; 2397 goto unlock;
2325 2398
2326 if (next_parent == ctx || next_ctx == parent || next_parent == parent) { 2399 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
@@ -2389,7 +2462,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
2389 * to check if we have to switch out PMU state. 2462 * to check if we have to switch out PMU state.
2390 * cgroup event are system-wide mode only 2463 * cgroup event are system-wide mode only
2391 */ 2464 */
2392 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2465 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2393 perf_cgroup_sched_out(task, next); 2466 perf_cgroup_sched_out(task, next);
2394} 2467}
2395 2468
@@ -2632,11 +2705,11 @@ void __perf_event_task_sched_in(struct task_struct *prev,
2632 * to check if we have to switch in PMU state. 2705 * to check if we have to switch in PMU state.
2633 * cgroup event are system-wide mode only 2706 * cgroup event are system-wide mode only
2634 */ 2707 */
2635 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2708 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2636 perf_cgroup_sched_in(prev, task); 2709 perf_cgroup_sched_in(prev, task);
2637 2710
2638 /* check for system-wide branch_stack events */ 2711 /* check for system-wide branch_stack events */
2639 if (atomic_read(&__get_cpu_var(perf_branch_stack_events))) 2712 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
2640 perf_branch_stack_sched_in(prev, task); 2713 perf_branch_stack_sched_in(prev, task);
2641} 2714}
2642 2715
@@ -2891,7 +2964,7 @@ bool perf_event_can_stop_tick(void)
2891 2964
2892void perf_event_task_tick(void) 2965void perf_event_task_tick(void)
2893{ 2966{
2894 struct list_head *head = &__get_cpu_var(rotation_list); 2967 struct list_head *head = this_cpu_ptr(&rotation_list);
2895 struct perf_cpu_context *cpuctx, *tmp; 2968 struct perf_cpu_context *cpuctx, *tmp;
2896 struct perf_event_context *ctx; 2969 struct perf_event_context *ctx;
2897 int throttled; 2970 int throttled;
@@ -2932,6 +3005,7 @@ static int event_enable_on_exec(struct perf_event *event,
2932 */ 3005 */
2933static void perf_event_enable_on_exec(struct perf_event_context *ctx) 3006static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2934{ 3007{
3008 struct perf_event_context *clone_ctx = NULL;
2935 struct perf_event *event; 3009 struct perf_event *event;
2936 unsigned long flags; 3010 unsigned long flags;
2937 int enabled = 0; 3011 int enabled = 0;
@@ -2963,7 +3037,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2963 * Unclone this context if we enabled any event. 3037 * Unclone this context if we enabled any event.
2964 */ 3038 */
2965 if (enabled) 3039 if (enabled)
2966 unclone_ctx(ctx); 3040 clone_ctx = unclone_ctx(ctx);
2967 3041
2968 raw_spin_unlock(&ctx->lock); 3042 raw_spin_unlock(&ctx->lock);
2969 3043
@@ -2973,6 +3047,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2973 perf_event_context_sched_in(ctx, ctx->task); 3047 perf_event_context_sched_in(ctx, ctx->task);
2974out: 3048out:
2975 local_irq_restore(flags); 3049 local_irq_restore(flags);
3050
3051 if (clone_ctx)
3052 put_ctx(clone_ctx);
2976} 3053}
2977 3054
2978void perf_event_exec(void) 3055void perf_event_exec(void)
@@ -3067,6 +3144,7 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
3067 INIT_LIST_HEAD(&ctx->flexible_groups); 3144 INIT_LIST_HEAD(&ctx->flexible_groups);
3068 INIT_LIST_HEAD(&ctx->event_list); 3145 INIT_LIST_HEAD(&ctx->event_list);
3069 atomic_set(&ctx->refcount, 1); 3146 atomic_set(&ctx->refcount, 1);
3147 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3070} 3148}
3071 3149
3072static struct perf_event_context * 3150static struct perf_event_context *
@@ -3124,7 +3202,7 @@ errout:
3124static struct perf_event_context * 3202static struct perf_event_context *
3125find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) 3203find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3126{ 3204{
3127 struct perf_event_context *ctx; 3205 struct perf_event_context *ctx, *clone_ctx = NULL;
3128 struct perf_cpu_context *cpuctx; 3206 struct perf_cpu_context *cpuctx;
3129 unsigned long flags; 3207 unsigned long flags;
3130 int ctxn, err; 3208 int ctxn, err;
@@ -3158,9 +3236,12 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3158retry: 3236retry:
3159 ctx = perf_lock_task_context(task, ctxn, &flags); 3237 ctx = perf_lock_task_context(task, ctxn, &flags);
3160 if (ctx) { 3238 if (ctx) {
3161 unclone_ctx(ctx); 3239 clone_ctx = unclone_ctx(ctx);
3162 ++ctx->pin_count; 3240 ++ctx->pin_count;
3163 raw_spin_unlock_irqrestore(&ctx->lock, flags); 3241 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3242
3243 if (clone_ctx)
3244 put_ctx(clone_ctx);
3164 } else { 3245 } else {
3165 ctx = alloc_perf_context(pmu, task); 3246 ctx = alloc_perf_context(pmu, task);
3166 err = -ENOMEM; 3247 err = -ENOMEM;
@@ -3312,16 +3393,12 @@ static void free_event(struct perf_event *event)
3312} 3393}
3313 3394
3314/* 3395/*
3315 * Called when the last reference to the file is gone. 3396 * Remove user event from the owner task.
3316 */ 3397 */
3317static void put_event(struct perf_event *event) 3398static void perf_remove_from_owner(struct perf_event *event)
3318{ 3399{
3319 struct perf_event_context *ctx = event->ctx;
3320 struct task_struct *owner; 3400 struct task_struct *owner;
3321 3401
3322 if (!atomic_long_dec_and_test(&event->refcount))
3323 return;
3324
3325 rcu_read_lock(); 3402 rcu_read_lock();
3326 owner = ACCESS_ONCE(event->owner); 3403 owner = ACCESS_ONCE(event->owner);
3327 /* 3404 /*
@@ -3354,6 +3431,20 @@ static void put_event(struct perf_event *event)
3354 mutex_unlock(&owner->perf_event_mutex); 3431 mutex_unlock(&owner->perf_event_mutex);
3355 put_task_struct(owner); 3432 put_task_struct(owner);
3356 } 3433 }
3434}
3435
3436/*
3437 * Called when the last reference to the file is gone.
3438 */
3439static void put_event(struct perf_event *event)
3440{
3441 struct perf_event_context *ctx = event->ctx;
3442
3443 if (!atomic_long_dec_and_test(&event->refcount))
3444 return;
3445
3446 if (!is_kernel_event(event))
3447 perf_remove_from_owner(event);
3357 3448
3358 WARN_ON_ONCE(ctx->parent_ctx); 3449 WARN_ON_ONCE(ctx->parent_ctx);
3359 /* 3450 /*
@@ -3388,6 +3479,42 @@ static int perf_release(struct inode *inode, struct file *file)
3388 return 0; 3479 return 0;
3389} 3480}
3390 3481
3482/*
 3483 * Remove all orphaned events from the context.
3484 */
3485static void orphans_remove_work(struct work_struct *work)
3486{
3487 struct perf_event_context *ctx;
3488 struct perf_event *event, *tmp;
3489
3490 ctx = container_of(work, struct perf_event_context,
3491 orphans_remove.work);
3492
3493 mutex_lock(&ctx->mutex);
3494 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3495 struct perf_event *parent_event = event->parent;
3496
3497 if (!is_orphaned_child(event))
3498 continue;
3499
3500 perf_remove_from_context(event, true);
3501
3502 mutex_lock(&parent_event->child_mutex);
3503 list_del_init(&event->child_list);
3504 mutex_unlock(&parent_event->child_mutex);
3505
3506 free_event(event);
3507 put_event(parent_event);
3508 }
3509
3510 raw_spin_lock_irq(&ctx->lock);
3511 ctx->orphans_remove_sched = false;
3512 raw_spin_unlock_irq(&ctx->lock);
3513 mutex_unlock(&ctx->mutex);
3514
3515 put_ctx(ctx);
3516}
3517
3391u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) 3518u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3392{ 3519{
3393 struct perf_event *child; 3520 struct perf_event *child;
@@ -3485,6 +3612,19 @@ static int perf_event_read_one(struct perf_event *event,
3485 return n * sizeof(u64); 3612 return n * sizeof(u64);
3486} 3613}
3487 3614
3615static bool is_event_hup(struct perf_event *event)
3616{
3617 bool no_children;
3618
3619 if (event->state != PERF_EVENT_STATE_EXIT)
3620 return false;
3621
3622 mutex_lock(&event->child_mutex);
3623 no_children = list_empty(&event->child_list);
3624 mutex_unlock(&event->child_mutex);
3625 return no_children;
3626}
3627
3488/* 3628/*
3489 * Read the performance event - simple non blocking version for now 3629 * Read the performance event - simple non blocking version for now
3490 */ 3630 */
@@ -3526,7 +3666,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3526{ 3666{
3527 struct perf_event *event = file->private_data; 3667 struct perf_event *event = file->private_data;
3528 struct ring_buffer *rb; 3668 struct ring_buffer *rb;
3529 unsigned int events = POLL_HUP; 3669 unsigned int events = POLLHUP;
3670
3671 poll_wait(file, &event->waitq, wait);
3672
3673 if (is_event_hup(event))
3674 return events;
3530 3675
3531 /* 3676 /*
3532 * Pin the event->rb by taking event->mmap_mutex; otherwise 3677 * Pin the event->rb by taking event->mmap_mutex; otherwise
@@ -3537,9 +3682,6 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
3537 if (rb) 3682 if (rb)
3538 events = atomic_xchg(&rb->poll, 0); 3683 events = atomic_xchg(&rb->poll, 0);
3539 mutex_unlock(&event->mmap_mutex); 3684 mutex_unlock(&event->mmap_mutex);
3540
3541 poll_wait(file, &event->waitq, wait);
3542
3543 return events; 3685 return events;
3544} 3686}
3545 3687
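
perf_poll() gets two fixes: POLL_HUP (an si_code for SIGPOLL) was being returned where the POLLHUP poll(2) bit was intended, and poll_wait() is now called before the early return so the caller is still registered on the waitqueue when the event has already hung up. From userspace the result is an ordinary hang-up indication; a minimal poll(2) consumer (hypothetical helper, any readable descriptor):

#include <poll.h>
#include <stdio.h>
#include <unistd.h>

/* Block until @fd has data or has hung up. */
static int wait_for_event(int fd)
{
        struct pollfd pfd = { .fd = fd, .events = POLLIN };

        if (poll(&pfd, 1, -1) < 0)
                return -1;
        if (pfd.revents & POLLHUP)
                return 0;               /* source is gone, stop reading */
        if (pfd.revents & POLLIN)
                return 1;               /* data available */
        return -1;
}

int main(void)
{
        printf("stdin: %d\n", wait_for_event(STDIN_FILENO));
        return 0;
}
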
@@ -3717,6 +3859,26 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3717 return 0; 3859 return 0;
3718} 3860}
3719 3861
3862#ifdef CONFIG_COMPAT
3863static long perf_compat_ioctl(struct file *file, unsigned int cmd,
3864 unsigned long arg)
3865{
3866 switch (_IOC_NR(cmd)) {
3867 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
3868 case _IOC_NR(PERF_EVENT_IOC_ID):
 3869 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
3870 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
3871 cmd &= ~IOCSIZE_MASK;
3872 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
3873 }
3874 break;
3875 }
3876 return perf_ioctl(file, cmd, arg);
3877}
3878#else
3879# define perf_compat_ioctl NULL
3880#endif
3881
3720int perf_event_task_enable(void) 3882int perf_event_task_enable(void)
3721{ 3883{
3722 struct perf_event *event; 3884 struct perf_event *event;
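
The new compat handler only touches commands whose argument is a pointer: a 32-bit caller on a 64-bit kernel encodes a 4-byte argument size into the command word, while the native perf_ioctl() switch expects 8, so just the size bits are rewritten before forwarding. The layout being patched is the ordinary _IOC encoding, which can be inspected from userspace with a hypothetical pointer-carrying ioctl:

#include <stdint.h>
#include <stdio.h>
#include <linux/ioctl.h>

/* Hypothetical ioctl definitions: native vs. 32-bit-pointer encoding. */
#define DEMO_SET_FILTER         _IOW('$', 6, void *)
#define DEMO_SET_FILTER32       _IOW('$', 6, uint32_t)

int main(void)
{
        unsigned int cmd = DEMO_SET_FILTER32;   /* as sent by a 32-bit task */

        printf("before: nr=%u size=%u\n", _IOC_NR(cmd), _IOC_SIZE(cmd));

        if (_IOC_SIZE(cmd) == sizeof(uint32_t)) {       /* compat fixup */
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(void *) << IOCSIZE_SHIFT;
        }
        printf("after:  size=%u, %s the native encoding\n", _IOC_SIZE(cmd),
               cmd == DEMO_SET_FILTER ? "matches" : "differs from");
        return 0;
}
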
@@ -4222,7 +4384,7 @@ static const struct file_operations perf_fops = {
4222 .read = perf_read, 4384 .read = perf_read,
4223 .poll = perf_poll, 4385 .poll = perf_poll,
4224 .unlocked_ioctl = perf_ioctl, 4386 .unlocked_ioctl = perf_ioctl,
4225 .compat_ioctl = perf_ioctl, 4387 .compat_ioctl = perf_compat_ioctl,
4226 .mmap = perf_mmap, 4388 .mmap = perf_mmap,
4227 .fasync = perf_fasync, 4389 .fasync = perf_fasync,
4228}; 4390};
@@ -5671,7 +5833,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5671 struct perf_sample_data *data, 5833 struct perf_sample_data *data,
5672 struct pt_regs *regs) 5834 struct pt_regs *regs)
5673{ 5835{
5674 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5836 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5675 struct perf_event *event; 5837 struct perf_event *event;
5676 struct hlist_head *head; 5838 struct hlist_head *head;
5677 5839
@@ -5690,7 +5852,7 @@ end:
5690 5852
5691int perf_swevent_get_recursion_context(void) 5853int perf_swevent_get_recursion_context(void)
5692{ 5854{
5693 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5855 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5694 5856
5695 return get_recursion_context(swhash->recursion); 5857 return get_recursion_context(swhash->recursion);
5696} 5858}
@@ -5698,7 +5860,7 @@ EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5698 5860
5699inline void perf_swevent_put_recursion_context(int rctx) 5861inline void perf_swevent_put_recursion_context(int rctx)
5700{ 5862{
5701 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5863 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5702 5864
5703 put_recursion_context(swhash->recursion, rctx); 5865 put_recursion_context(swhash->recursion, rctx);
5704} 5866}
@@ -5727,7 +5889,7 @@ static void perf_swevent_read(struct perf_event *event)
5727 5889
5728static int perf_swevent_add(struct perf_event *event, int flags) 5890static int perf_swevent_add(struct perf_event *event, int flags)
5729{ 5891{
5730 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); 5892 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
5731 struct hw_perf_event *hwc = &event->hw; 5893 struct hw_perf_event *hwc = &event->hw;
5732 struct hlist_head *head; 5894 struct hlist_head *head;
5733 5895
@@ -5783,7 +5945,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
5783 if (!hlist) 5945 if (!hlist)
5784 return; 5946 return;
5785 5947
5786 rcu_assign_pointer(swhash->swevent_hlist, NULL); 5948 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
5787 kfree_rcu(hlist, rcu_head); 5949 kfree_rcu(hlist, rcu_head);
5788} 5950}
5789 5951
@@ -5909,11 +6071,6 @@ static int perf_swevent_init(struct perf_event *event)
5909 return 0; 6071 return 0;
5910} 6072}
5911 6073
5912static int perf_swevent_event_idx(struct perf_event *event)
5913{
5914 return 0;
5915}
5916
5917static struct pmu perf_swevent = { 6074static struct pmu perf_swevent = {
5918 .task_ctx_nr = perf_sw_context, 6075 .task_ctx_nr = perf_sw_context,
5919 6076
@@ -5923,8 +6080,6 @@ static struct pmu perf_swevent = {
5923 .start = perf_swevent_start, 6080 .start = perf_swevent_start,
5924 .stop = perf_swevent_stop, 6081 .stop = perf_swevent_stop,
5925 .read = perf_swevent_read, 6082 .read = perf_swevent_read,
5926
5927 .event_idx = perf_swevent_event_idx,
5928}; 6083};
5929 6084
5930#ifdef CONFIG_EVENT_TRACING 6085#ifdef CONFIG_EVENT_TRACING
@@ -6042,8 +6197,6 @@ static struct pmu perf_tracepoint = {
6042 .start = perf_swevent_start, 6197 .start = perf_swevent_start,
6043 .stop = perf_swevent_stop, 6198 .stop = perf_swevent_stop,
6044 .read = perf_swevent_read, 6199 .read = perf_swevent_read,
6045
6046 .event_idx = perf_swevent_event_idx,
6047}; 6200};
6048 6201
6049static inline void perf_tp_register(void) 6202static inline void perf_tp_register(void)
@@ -6269,8 +6422,6 @@ static struct pmu perf_cpu_clock = {
6269 .start = cpu_clock_event_start, 6422 .start = cpu_clock_event_start,
6270 .stop = cpu_clock_event_stop, 6423 .stop = cpu_clock_event_stop,
6271 .read = cpu_clock_event_read, 6424 .read = cpu_clock_event_read,
6272
6273 .event_idx = perf_swevent_event_idx,
6274}; 6425};
6275 6426
6276/* 6427/*
@@ -6349,8 +6500,6 @@ static struct pmu perf_task_clock = {
6349 .start = task_clock_event_start, 6500 .start = task_clock_event_start,
6350 .stop = task_clock_event_stop, 6501 .stop = task_clock_event_stop,
6351 .read = task_clock_event_read, 6502 .read = task_clock_event_read,
6352
6353 .event_idx = perf_swevent_event_idx,
6354}; 6503};
6355 6504
6356static void perf_pmu_nop_void(struct pmu *pmu) 6505static void perf_pmu_nop_void(struct pmu *pmu)
@@ -6380,7 +6529,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
6380 6529
6381static int perf_event_idx_default(struct perf_event *event) 6530static int perf_event_idx_default(struct perf_event *event)
6382{ 6531{
6383 return event->hw.idx + 1; 6532 return 0;
6384} 6533}
6385 6534
6386/* 6535/*
@@ -7366,6 +7515,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7366 goto err; 7515 goto err;
7367 } 7516 }
7368 7517
7518 /* Mark owner so we could distinguish it from user events. */
7519 event->owner = EVENT_OWNER_KERNEL;
7520
7369 account_event(event); 7521 account_event(event);
7370 7522
7371 ctx = find_get_context(event->pmu, task, cpu); 7523 ctx = find_get_context(event->pmu, task, cpu);
@@ -7453,6 +7605,12 @@ static void sync_child_event(struct perf_event *child_event,
7453 mutex_unlock(&parent_event->child_mutex); 7605 mutex_unlock(&parent_event->child_mutex);
7454 7606
7455 /* 7607 /*
7608 * Make sure user/parent get notified, that we just
7609 * lost one event.
7610 */
7611 perf_event_wakeup(parent_event);
7612
7613 /*
7456 * Release the parent event, if this was the last 7614 * Release the parent event, if this was the last
7457 * reference to it. 7615 * reference to it.
7458 */ 7616 */
@@ -7486,13 +7644,16 @@ __perf_event_exit_task(struct perf_event *child_event,
7486 if (child_event->parent) { 7644 if (child_event->parent) {
7487 sync_child_event(child_event, child); 7645 sync_child_event(child_event, child);
7488 free_event(child_event); 7646 free_event(child_event);
7647 } else {
7648 child_event->state = PERF_EVENT_STATE_EXIT;
7649 perf_event_wakeup(child_event);
7489 } 7650 }
7490} 7651}
7491 7652
7492static void perf_event_exit_task_context(struct task_struct *child, int ctxn) 7653static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7493{ 7654{
7494 struct perf_event *child_event, *next; 7655 struct perf_event *child_event, *next;
7495 struct perf_event_context *child_ctx, *parent_ctx; 7656 struct perf_event_context *child_ctx, *clone_ctx = NULL;
7496 unsigned long flags; 7657 unsigned long flags;
7497 7658
7498 if (likely(!child->perf_event_ctxp[ctxn])) { 7659 if (likely(!child->perf_event_ctxp[ctxn])) {
@@ -7519,28 +7680,16 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7519 child->perf_event_ctxp[ctxn] = NULL; 7680 child->perf_event_ctxp[ctxn] = NULL;
7520 7681
7521 /* 7682 /*
7522 * In order to avoid freeing: child_ctx->parent_ctx->task
7523 * under perf_event_context::lock, grab another reference.
7524 */
7525 parent_ctx = child_ctx->parent_ctx;
7526 if (parent_ctx)
7527 get_ctx(parent_ctx);
7528
7529 /*
7530 * If this context is a clone; unclone it so it can't get 7683 * If this context is a clone; unclone it so it can't get
7531 * swapped to another process while we're removing all 7684 * swapped to another process while we're removing all
7532 * the events from it. 7685 * the events from it.
7533 */ 7686 */
7534 unclone_ctx(child_ctx); 7687 clone_ctx = unclone_ctx(child_ctx);
7535 update_context_time(child_ctx); 7688 update_context_time(child_ctx);
7536 raw_spin_unlock_irqrestore(&child_ctx->lock, flags); 7689 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7537 7690
7538 /* 7691 if (clone_ctx)
7539 * Now that we no longer hold perf_event_context::lock, drop 7692 put_ctx(clone_ctx);
7540 * our extra child_ctx->parent_ctx reference.
7541 */
7542 if (parent_ctx)
7543 put_ctx(parent_ctx);
7544 7693
7545 /* 7694 /*
7546 * Report the task dead after unscheduling the events so that we 7695 * Report the task dead after unscheduling the events so that we
@@ -7669,6 +7818,7 @@ inherit_event(struct perf_event *parent_event,
7669 struct perf_event *group_leader, 7818 struct perf_event *group_leader,
7670 struct perf_event_context *child_ctx) 7819 struct perf_event_context *child_ctx)
7671{ 7820{
7821 enum perf_event_active_state parent_state = parent_event->state;
7672 struct perf_event *child_event; 7822 struct perf_event *child_event;
7673 unsigned long flags; 7823 unsigned long flags;
7674 7824
@@ -7689,7 +7839,8 @@ inherit_event(struct perf_event *parent_event,
7689 if (IS_ERR(child_event)) 7839 if (IS_ERR(child_event))
7690 return child_event; 7840 return child_event;
7691 7841
7692 if (!atomic_long_inc_not_zero(&parent_event->refcount)) { 7842 if (is_orphaned_event(parent_event) ||
7843 !atomic_long_inc_not_zero(&parent_event->refcount)) {
7693 free_event(child_event); 7844 free_event(child_event);
7694 return NULL; 7845 return NULL;
7695 } 7846 }
@@ -7701,7 +7852,7 @@ inherit_event(struct perf_event *parent_event,
7701 * not its attr.disabled bit. We hold the parent's mutex, 7852 * not its attr.disabled bit. We hold the parent's mutex,
7702 * so we won't race with perf_event_{en, dis}able_family. 7853 * so we won't race with perf_event_{en, dis}able_family.
7703 */ 7854 */
7704 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) 7855 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
7705 child_event->state = PERF_EVENT_STATE_INACTIVE; 7856 child_event->state = PERF_EVENT_STATE_INACTIVE;
7706 else 7857 else
7707 child_event->state = PERF_EVENT_STATE_OFF; 7858 child_event->state = PERF_EVENT_STATE_OFF;
@@ -7917,8 +8068,10 @@ int perf_event_init_task(struct task_struct *child)
7917 8068
7918 for_each_task_context_nr(ctxn) { 8069 for_each_task_context_nr(ctxn) {
7919 ret = perf_event_init_context(child, ctxn); 8070 ret = perf_event_init_context(child, ctxn);
7920 if (ret) 8071 if (ret) {
8072 perf_event_free_task(child);
7921 return ret; 8073 return ret;
8074 }
7922 } 8075 }
7923 8076
7924 return 0; 8077 return 0;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 1559fb0b9296..9803a6600d49 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -605,11 +605,6 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags)
605 bp->hw.state = PERF_HES_STOPPED; 605 bp->hw.state = PERF_HES_STOPPED;
606} 606}
607 607
608static int hw_breakpoint_event_idx(struct perf_event *bp)
609{
610 return 0;
611}
612
613static struct pmu perf_breakpoint = { 608static struct pmu perf_breakpoint = {
614 .task_ctx_nr = perf_sw_context, /* could eventually get its own */ 609 .task_ctx_nr = perf_sw_context, /* could eventually get its own */
615 610
@@ -619,8 +614,6 @@ static struct pmu perf_breakpoint = {
619 .start = hw_breakpoint_start, 614 .start = hw_breakpoint_start,
620 .stop = hw_breakpoint_stop, 615 .stop = hw_breakpoint_stop,
621 .read = hw_breakpoint_pmu_read, 616 .read = hw_breakpoint_pmu_read,
622
623 .event_idx = hw_breakpoint_event_idx,
624}; 617};
625 618
626int __init init_hw_breakpoint(void) 619int __init init_hw_breakpoint(void)
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 6f3254e8c137..1d0af8a2c646 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -167,6 +167,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
167 /* For mmu_notifiers */ 167 /* For mmu_notifiers */
168 const unsigned long mmun_start = addr; 168 const unsigned long mmun_start = addr;
169 const unsigned long mmun_end = addr + PAGE_SIZE; 169 const unsigned long mmun_end = addr + PAGE_SIZE;
170 struct mem_cgroup *memcg;
171
172 err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
173 if (err)
174 return err;
170 175
171 /* For try_to_free_swap() and munlock_vma_page() below */ 176 /* For try_to_free_swap() and munlock_vma_page() below */
172 lock_page(page); 177 lock_page(page);
@@ -179,6 +184,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
179 184
180 get_page(kpage); 185 get_page(kpage);
181 page_add_new_anon_rmap(kpage, vma, addr); 186 page_add_new_anon_rmap(kpage, vma, addr);
187 mem_cgroup_commit_charge(kpage, memcg, false);
188 lru_cache_add_active_or_unevictable(kpage, vma);
182 189
183 if (!PageAnon(page)) { 190 if (!PageAnon(page)) {
184 dec_mm_counter(mm, MM_FILEPAGES); 191 dec_mm_counter(mm, MM_FILEPAGES);
@@ -200,6 +207,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
200 207
201 err = 0; 208 err = 0;
202 unlock: 209 unlock:
210 mem_cgroup_cancel_charge(kpage, memcg);
203 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); 211 mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
204 unlock_page(page); 212 unlock_page(page);
205 return err; 213 return err;
@@ -315,18 +323,11 @@ retry:
315 if (!new_page) 323 if (!new_page)
316 goto put_old; 324 goto put_old;
317 325
318 if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
319 goto put_new;
320
321 __SetPageUptodate(new_page); 326 __SetPageUptodate(new_page);
322 copy_highpage(new_page, old_page); 327 copy_highpage(new_page, old_page);
323 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); 328 copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
324 329
325 ret = __replace_page(vma, vaddr, old_page, new_page); 330 ret = __replace_page(vma, vaddr, old_page, new_page);
326 if (ret)
327 mem_cgroup_uncharge_page(new_page);
328
329put_new:
330 page_cache_release(new_page); 331 page_cache_release(new_page);
331put_old: 332put_old:
332 put_page(old_page); 333 put_page(old_page);
diff --git a/kernel/exit.c b/kernel/exit.c
index e5c4668f1799..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -59,7 +59,7 @@
59#include <asm/pgtable.h> 59#include <asm/pgtable.h>
60#include <asm/mmu_context.h> 60#include <asm/mmu_context.h>
61 61
62static void exit_mm(struct task_struct * tsk); 62static void exit_mm(struct task_struct *tsk);
63 63
64static void __unhash_process(struct task_struct *p, bool group_dead) 64static void __unhash_process(struct task_struct *p, bool group_dead)
65{ 65{
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads but the group leader
122 * as they die, so they can be added into the process-wide totals
123 * when those are taken. The group leader stays around as a zombie as
124 * long as there are other threads. When it gets reaped, the exit.c
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */
129 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock);
131 sig->utime += utime;
132 sig->stime += stime;
133 sig->gtime += task_gtime(tsk);
134 sig->min_flt += tsk->min_flt;
135 sig->maj_flt += tsk->maj_flt;
136 sig->nvcsw += tsk->nvcsw;
137 sig->nivcsw += tsk->nivcsw;
138 sig->inblock += task_io_get_inblock(tsk);
139 sig->oublock += task_io_get_oublock(tsk);
140 task_io_accounting_add(&sig->ioac, &tsk->ioac);
141 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 142 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 143 __unhash_process(tsk, group_dead);
144 write_sequnlock(&sig->stats_lock);
144 145
145 /* 146 /*
146 * Do this under ->siglock, we can race with another thread 147 * Do this under ->siglock, we can race with another thread
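
The new sig->stats_lock (a seqlock, initialized in copy_signal() in the kernel/fork.c hunks below) exists so readers of the accumulated totals can use a retry loop instead of taking siglock, while exit paths such as this one and wait_task_zombie() bracket their updates with write_seqlock()/write_sequnlock(). A hedged sketch of the writer and reader halves over a hypothetical accounting block:

#include <linux/types.h>
#include <linux/seqlock.h>

struct stats {                          /* hypothetical accounting block */
        seqlock_t lock;
        u64 utime, stime;
};

static void stats_init(struct stats *s)
{
        seqlock_init(&s->lock);
        s->utime = s->stime = 0;
}

static void stats_add(struct stats *s, u64 ut, u64 st)
{
        write_seqlock(&s->lock);        /* writers serialize on the lock */
        s->utime += ut;
        s->stime += st;
        write_sequnlock(&s->lock);
}

static void stats_read(struct stats *s, u64 *ut, u64 *st)
{
        unsigned int seq;

        do {                            /* lockless reader, retry on change */
                seq = read_seqbegin(&s->lock);
                *ut = s->utime;
                *st = s->stime;
        } while (read_seqretry(&s->lock, seq));
}
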
@@ -151,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk)
151 spin_unlock(&sighand->siglock); 152 spin_unlock(&sighand->siglock);
152 153
153 __cleanup_sighand(sighand); 154 __cleanup_sighand(sighand);
154 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 155 clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
155 if (group_dead) { 156 if (group_dead) {
156 flush_sigqueue(&sig->shared_pending); 157 flush_sigqueue(&sig->shared_pending);
157 tty_kref_put(tty); 158 tty_kref_put(tty);
@@ -168,7 +169,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
168} 169}
169 170
170 171
171void release_task(struct task_struct * p) 172void release_task(struct task_struct *p)
172{ 173{
173 struct task_struct *leader; 174 struct task_struct *leader;
174 int zap_leader; 175 int zap_leader;
@@ -192,7 +193,8 @@ repeat:
192 */ 193 */
193 zap_leader = 0; 194 zap_leader = 0;
194 leader = p->group_leader; 195 leader = p->group_leader;
195 if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { 196 if (leader != p && thread_group_empty(leader)
197 && leader->exit_state == EXIT_ZOMBIE) {
196 /* 198 /*
197 * If we were the last child thread and the leader has 199 * If we were the last child thread and the leader has
198 * exited already, and the leader's parent ignores SIGCHLD, 200 * exited already, and the leader's parent ignores SIGCHLD,
@@ -241,7 +243,8 @@ struct pid *session_of_pgrp(struct pid *pgrp)
241 * 243 *
242 * "I ask you, have you ever known what it is to be an orphan?" 244 * "I ask you, have you ever known what it is to be an orphan?"
243 */ 245 */
244static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) 246static int will_become_orphaned_pgrp(struct pid *pgrp,
247 struct task_struct *ignored_task)
245{ 248{
246 struct task_struct *p; 249 struct task_struct *p;
247 250
@@ -294,9 +297,9 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
294 struct task_struct *ignored_task = tsk; 297 struct task_struct *ignored_task = tsk;
295 298
296 if (!parent) 299 if (!parent)
297 /* exit: our father is in a different pgrp than 300 /* exit: our father is in a different pgrp than
298 * we are and we were the only connection outside. 301 * we are and we were the only connection outside.
299 */ 302 */
300 parent = tsk->real_parent; 303 parent = tsk->real_parent;
301 else 304 else
302 /* reparent: our child is in a different pgrp than 305 /* reparent: our child is in a different pgrp than
@@ -405,7 +408,7 @@ assign_new_owner:
405 * Turn us into a lazy TLB process if we 408 * Turn us into a lazy TLB process if we
406 * aren't already.. 409 * aren't already..
407 */ 410 */
408static void exit_mm(struct task_struct * tsk) 411static void exit_mm(struct task_struct *tsk)
409{ 412{
410 struct mm_struct *mm = tsk->mm; 413 struct mm_struct *mm = tsk->mm;
411 struct core_state *core_state; 414 struct core_state *core_state;
@@ -425,6 +428,7 @@ static void exit_mm(struct task_struct * tsk)
425 core_state = mm->core_state; 428 core_state = mm->core_state;
426 if (core_state) { 429 if (core_state) {
427 struct core_thread self; 430 struct core_thread self;
431
428 up_read(&mm->mmap_sem); 432 up_read(&mm->mmap_sem);
429 433
430 self.task = tsk; 434 self.task = tsk;
@@ -455,6 +459,7 @@ static void exit_mm(struct task_struct * tsk)
455 task_unlock(tsk); 459 task_unlock(tsk);
456 mm_update_next_owner(mm); 460 mm_update_next_owner(mm);
457 mmput(mm); 461 mmput(mm);
462 clear_thread_flag(TIF_MEMDIE);
458} 463}
459 464
460/* 465/*
@@ -565,6 +570,7 @@ static void forget_original_parent(struct task_struct *father)
565 570
566 list_for_each_entry_safe(p, n, &father->children, sibling) { 571 list_for_each_entry_safe(p, n, &father->children, sibling) {
567 struct task_struct *t = p; 572 struct task_struct *t = p;
573
568 do { 574 do {
569 t->real_parent = reaper; 575 t->real_parent = reaper;
570 if (t->parent == father) { 576 if (t->parent == father) {
@@ -598,7 +604,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
598 /* 604 /*
599 * This does two things: 605 * This does two things:
600 * 606 *
601 * A. Make init inherit all the child processes 607 * A. Make init inherit all the child processes
602 * B. Check to see if any process groups have become orphaned 608 * B. Check to see if any process groups have become orphaned
603 * as a result of our exiting, and if they have any stopped 609 * as a result of our exiting, and if they have any stopped
604 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) 610 * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2)
@@ -648,9 +654,8 @@ static void check_stack_usage(void)
648 654
649 spin_lock(&low_water_lock); 655 spin_lock(&low_water_lock);
650 if (free < lowest_to_date) { 656 if (free < lowest_to_date) {
651 printk(KERN_WARNING "%s (%d) used greatest stack depth: " 657 pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n",
652 "%lu bytes left\n", 658 current->comm, task_pid_nr(current), free);
653 current->comm, task_pid_nr(current), free);
654 lowest_to_date = free; 659 lowest_to_date = free;
655 } 660 }
656 spin_unlock(&low_water_lock); 661 spin_unlock(&low_water_lock);
@@ -663,6 +668,7 @@ void do_exit(long code)
663{ 668{
664 struct task_struct *tsk = current; 669 struct task_struct *tsk = current;
665 int group_dead; 670 int group_dead;
671 TASKS_RCU(int tasks_rcu_i);
666 672
667 profile_task_exit(tsk); 673 profile_task_exit(tsk);
668 674
@@ -691,8 +697,7 @@ void do_exit(long code)
691 * leave this task alone and wait for reboot. 697 * leave this task alone and wait for reboot.
692 */ 698 */
693 if (unlikely(tsk->flags & PF_EXITING)) { 699 if (unlikely(tsk->flags & PF_EXITING)) {
694 printk(KERN_ALERT 700 pr_alert("Fixing recursive fault but reboot is needed!\n");
695 "Fixing recursive fault but reboot is needed!\n");
696 /* 701 /*
697 * We can do this unlocked here. The futex code uses 702 * We can do this unlocked here. The futex code uses
698 * this flag just to verify whether the pi state 703 * this flag just to verify whether the pi state
@@ -716,9 +721,9 @@ void do_exit(long code)
716 raw_spin_unlock_wait(&tsk->pi_lock); 721 raw_spin_unlock_wait(&tsk->pi_lock);
717 722
718 if (unlikely(in_atomic())) 723 if (unlikely(in_atomic()))
719 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 724 pr_info("note: %s[%d] exited with preempt_count %d\n",
720 current->comm, task_pid_nr(current), 725 current->comm, task_pid_nr(current),
721 preempt_count()); 726 preempt_count());
722 727
723 acct_update_integrals(tsk); 728 acct_update_integrals(tsk);
724 /* sync mm's RSS info before statistics gathering */ 729 /* sync mm's RSS info before statistics gathering */
@@ -772,6 +777,7 @@ void do_exit(long code)
772 */ 777 */
773 flush_ptrace_hw_breakpoint(tsk); 778 flush_ptrace_hw_breakpoint(tsk);
774 779
780 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
775 exit_notify(tsk, group_dead); 781 exit_notify(tsk, group_dead);
776 proc_exit_connector(tsk); 782 proc_exit_connector(tsk);
777#ifdef CONFIG_NUMA 783#ifdef CONFIG_NUMA
@@ -811,6 +817,7 @@ void do_exit(long code)
811 if (tsk->nr_dirtied) 817 if (tsk->nr_dirtied)
812 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 818 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
813 exit_rcu(); 819 exit_rcu();
820 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
814 821
815 /* 822 /*
816 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed 823 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
@@ -836,7 +843,6 @@ void do_exit(long code)
836 for (;;) 843 for (;;)
837 cpu_relax(); /* For when BUG is null */ 844 cpu_relax(); /* For when BUG is null */
838} 845}
839
840EXPORT_SYMBOL_GPL(do_exit); 846EXPORT_SYMBOL_GPL(do_exit);
841 847
842void complete_and_exit(struct completion *comp, long code) 848void complete_and_exit(struct completion *comp, long code)
@@ -846,7 +852,6 @@ void complete_and_exit(struct completion *comp, long code)
846 852
847 do_exit(code); 853 do_exit(code);
848} 854}
849
850EXPORT_SYMBOL(complete_and_exit); 855EXPORT_SYMBOL(complete_and_exit);
851 856
852SYSCALL_DEFINE1(exit, int, error_code) 857SYSCALL_DEFINE1(exit, int, error_code)
@@ -869,6 +874,7 @@ do_group_exit(int exit_code)
869 exit_code = sig->group_exit_code; 874 exit_code = sig->group_exit_code;
870 else if (!thread_group_empty(current)) { 875 else if (!thread_group_empty(current)) {
871 struct sighand_struct *const sighand = current->sighand; 876 struct sighand_struct *const sighand = current->sighand;
877
872 spin_lock_irq(&sighand->siglock); 878 spin_lock_irq(&sighand->siglock);
873 if (signal_group_exit(sig)) 879 if (signal_group_exit(sig))
874 /* Another thread got here before we took the lock. */ 880 /* Another thread got here before we took the lock. */
@@ -1033,14 +1039,15 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1033 * as other threads in the parent group can be right 1039 * as other threads in the parent group can be right
1034 * here reaping other children at the same time. 1040 * here reaping other children at the same time.
1035 * 1041 *
1036 * We use thread_group_cputime_adjusted() to get times for the thread 1042 * We use thread_group_cputime_adjusted() to get times for
1037 * group, which consolidates times for all threads in the 1043 * the thread group, which consolidates times for all threads
1038 * group including the group leader. 1044 * in the group including the group leader.
1039 */ 1045 */
1040 thread_group_cputime_adjusted(p, &tgutime, &tgstime); 1046 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1041 spin_lock_irq(&p->real_parent->sighand->siglock); 1047 spin_lock_irq(&p->real_parent->sighand->siglock);
1042 psig = p->real_parent->signal; 1048 psig = p->real_parent->signal;
1043 sig = p->signal; 1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock);
1044 psig->cutime += tgutime + sig->cutime; 1051 psig->cutime += tgutime + sig->cutime;
1045 psig->cstime += tgstime + sig->cstime; 1052 psig->cstime += tgstime + sig->cstime;
1046 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1053 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1063,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1063 psig->cmaxrss = maxrss; 1070 psig->cmaxrss = maxrss;
1064 task_io_accounting_add(&psig->ioac, &p->ioac); 1071 task_io_accounting_add(&psig->ioac, &p->ioac);
1065 task_io_accounting_add(&psig->ioac, &sig->ioac); 1072 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock);
1066 spin_unlock_irq(&p->real_parent->sighand->siglock); 1074 spin_unlock_irq(&p->real_parent->sighand->siglock);
1067 } 1075 }
1068 1076
@@ -1417,6 +1425,7 @@ static int do_wait_thread(struct wait_opts *wo, struct task_struct *tsk)
1417 1425
1418 list_for_each_entry(p, &tsk->children, sibling) { 1426 list_for_each_entry(p, &tsk->children, sibling) {
1419 int ret = wait_consider_task(wo, 0, p); 1427 int ret = wait_consider_task(wo, 0, p);
1428
1420 if (ret) 1429 if (ret)
1421 return ret; 1430 return ret;
1422 } 1431 }
@@ -1430,6 +1439,7 @@ static int ptrace_do_wait(struct wait_opts *wo, struct task_struct *tsk)
1430 1439
1431 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { 1440 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1432 int ret = wait_consider_task(wo, 1, p); 1441 int ret = wait_consider_task(wo, 1, p);
1442
1433 if (ret) 1443 if (ret)
1434 return ret; 1444 return ret;
1435 } 1445 }
diff --git a/kernel/fork.c b/kernel/fork.c
index fbd3497b221f..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -374,12 +380,11 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
374 */ 380 */
375 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); 381 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
376 382
377 mm->locked_vm = 0; 383 mm->total_vm = oldmm->total_vm;
378 mm->mmap = NULL; 384 mm->shared_vm = oldmm->shared_vm;
379 mm->vmacache_seqnum = 0; 385 mm->exec_vm = oldmm->exec_vm;
380 mm->map_count = 0; 386 mm->stack_vm = oldmm->stack_vm;
381 cpumask_clear(mm_cpumask(mm)); 387
382 mm->mm_rb = RB_ROOT;
383 rb_link = &mm->mm_rb.rb_node; 388 rb_link = &mm->mm_rb.rb_node;
384 rb_parent = NULL; 389 rb_parent = NULL;
385 pprev = &mm->mmap; 390 pprev = &mm->mmap;
@@ -430,7 +435,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
430 atomic_dec(&inode->i_writecount); 435 atomic_dec(&inode->i_writecount);
431 mutex_lock(&mapping->i_mmap_mutex); 436 mutex_lock(&mapping->i_mmap_mutex);
432 if (tmp->vm_flags & VM_SHARED) 437 if (tmp->vm_flags & VM_SHARED)
433 mapping->i_mmap_writable++; 438 atomic_inc(&mapping->i_mmap_writable);
434 flush_dcache_mmap_lock(mapping); 439 flush_dcache_mmap_lock(mapping);
435 /* insert tmp into the share list, just after mpnt */ 440 /* insert tmp into the share list, just after mpnt */
436 if (unlikely(tmp->vm_flags & VM_NONLINEAR)) 441 if (unlikely(tmp->vm_flags & VM_NONLINEAR))
@@ -536,19 +541,37 @@ static void mm_init_aio(struct mm_struct *mm)
536#endif 541#endif
537} 542}
538 543
544static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
545{
546#ifdef CONFIG_MEMCG
547 mm->owner = p;
548#endif
549}
550
539static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) 551static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
540{ 552{
553 mm->mmap = NULL;
554 mm->mm_rb = RB_ROOT;
555 mm->vmacache_seqnum = 0;
541 atomic_set(&mm->mm_users, 1); 556 atomic_set(&mm->mm_users, 1);
542 atomic_set(&mm->mm_count, 1); 557 atomic_set(&mm->mm_count, 1);
543 init_rwsem(&mm->mmap_sem); 558 init_rwsem(&mm->mmap_sem);
544 INIT_LIST_HEAD(&mm->mmlist); 559 INIT_LIST_HEAD(&mm->mmlist);
545 mm->core_state = NULL; 560 mm->core_state = NULL;
546 atomic_long_set(&mm->nr_ptes, 0); 561 atomic_long_set(&mm->nr_ptes, 0);
562 mm->map_count = 0;
563 mm->locked_vm = 0;
564 mm->pinned_vm = 0;
547 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); 565 memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
548 spin_lock_init(&mm->page_table_lock); 566 spin_lock_init(&mm->page_table_lock);
567 mm_init_cpumask(mm);
549 mm_init_aio(mm); 568 mm_init_aio(mm);
550 mm_init_owner(mm, p); 569 mm_init_owner(mm, p);
570 mmu_notifier_mm_init(mm);
551 clear_tlb_flush_pending(mm); 571 clear_tlb_flush_pending(mm);
572#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
573 mm->pmd_huge_pte = NULL;
574#endif
552 575
553 if (current->mm) { 576 if (current->mm) {
554 mm->flags = current->mm->flags & MMF_INIT_MASK; 577 mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -558,11 +581,17 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
558 mm->def_flags = 0; 581 mm->def_flags = 0;
559 } 582 }
560 583
561 if (likely(!mm_alloc_pgd(mm))) { 584 if (mm_alloc_pgd(mm))
562 mmu_notifier_mm_init(mm); 585 goto fail_nopgd;
563 return mm;
564 }
565 586
587 if (init_new_context(p, mm))
588 goto fail_nocontext;
589
590 return mm;
591
592fail_nocontext:
593 mm_free_pgd(mm);
594fail_nopgd:
566 free_mm(mm); 595 free_mm(mm);
567 return NULL; 596 return NULL;
568} 597}
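
mm_init() now performs init_new_context() itself and unwinds with the usual layered labels: fail_nocontext undoes the pgd allocation and fail_nopgd only frees the mm, so each exit path tears down exactly what succeeded before it. The same shape with hypothetical resources in plain C:

#include <stdlib.h>

struct thing {
        void *pgd;                      /* first allocation  */
        void *ctx;                      /* second allocation */
};

/* Takes ownership of @t; on failure frees it and returns NULL. */
static struct thing *thing_init(struct thing *t)
{
        t->pgd = malloc(64);
        if (!t->pgd)
                goto fail_nopgd;

        t->ctx = malloc(64);
        if (!t->ctx)
                goto fail_noctx;

        return t;

fail_noctx:
        free(t->pgd);                   /* undo step 1 only */
fail_nopgd:
        free(t);                        /* mirror of free_mm() */
        return NULL;
}

int main(void)
{
        struct thing *t = calloc(1, sizeof(*t));

        if (t)
                t = thing_init(t);
        if (t) {
                free(t->ctx);
                free(t->pgd);
                free(t);
        }
        return 0;
}
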
@@ -578,9 +607,8 @@ static void check_mm(struct mm_struct *mm)
578 printk(KERN_ALERT "BUG: Bad rss-counter state " 607 printk(KERN_ALERT "BUG: Bad rss-counter state "
579 "mm:%p idx:%d val:%ld\n", mm, i, x); 608 "mm:%p idx:%d val:%ld\n", mm, i, x);
580 } 609 }
581
582#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS 610#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
583 VM_BUG_ON(mm->pmd_huge_pte); 611 VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
584#endif 612#endif
585} 613}
586 614
@@ -596,7 +624,6 @@ struct mm_struct *mm_alloc(void)
596 return NULL; 624 return NULL;
597 625
598 memset(mm, 0, sizeof(*mm)); 626 memset(mm, 0, sizeof(*mm));
599 mm_init_cpumask(mm);
600 return mm_init(mm, current); 627 return mm_init(mm, current);
601} 628}
602 629
@@ -828,17 +855,10 @@ static struct mm_struct *dup_mm(struct task_struct *tsk)
828 goto fail_nomem; 855 goto fail_nomem;
829 856
830 memcpy(mm, oldmm, sizeof(*mm)); 857 memcpy(mm, oldmm, sizeof(*mm));
831 mm_init_cpumask(mm);
832 858
833#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
834 mm->pmd_huge_pte = NULL;
835#endif
836 if (!mm_init(mm, tsk)) 859 if (!mm_init(mm, tsk))
837 goto fail_nomem; 860 goto fail_nomem;
838 861
839 if (init_new_context(tsk, mm))
840 goto fail_nocontext;
841
842 dup_mm_exe_file(oldmm, mm); 862 dup_mm_exe_file(oldmm, mm);
843 863
844 err = dup_mmap(mm, oldmm); 864 err = dup_mmap(mm, oldmm);
@@ -860,15 +880,6 @@ free_pt:
860 880
861fail_nomem: 881fail_nomem:
862 return NULL; 882 return NULL;
863
864fail_nocontext:
865 /*
866 * If init_new_context() failed, we cannot use mmput() to free the mm
867 * because it calls destroy_context()
868 */
869 mm_free_pgd(mm);
870 free_mm(mm);
871 return NULL;
872} 883}
873 884
874static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) 885static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
@@ -1062,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1062 sig->curr_target = tsk; 1073 sig->curr_target = tsk;
1063 init_sigpending(&sig->shared_pending); 1074 init_sigpending(&sig->shared_pending);
1064 INIT_LIST_HEAD(&sig->posix_timers); 1075 INIT_LIST_HEAD(&sig->posix_timers);
1076 seqlock_init(&sig->stats_lock);
1065 1077
1066 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1078 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1067 sig->real_timer.function = it_real_fn; 1079 sig->real_timer.function = it_real_fn;
@@ -1099,7 +1111,7 @@ static void copy_seccomp(struct task_struct *p)
1099 * needed because this new task is not yet running and cannot 1111 * needed because this new task is not yet running and cannot
1100 * be racing exec. 1112 * be racing exec.
1101 */ 1113 */
1102 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 1114 assert_spin_locked(&current->sighand->siglock);
1103 1115
1104 /* Ref-count the new filter user, and assign it. */ 1116 /* Ref-count the new filter user, and assign it. */
1105 get_seccomp_filter(current); 1117 get_seccomp_filter(current);
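
The seccomp assertion is changed because spin_is_locked() is hard-wired to return false on uniprocessor builds without spinlock debugging, so the open-coded BUG_ON fired even with siglock correctly held; assert_spin_locked() is the helper intended for this check and behaves sensibly in that configuration. A minimal kernel-style sketch of the intended contract (hypothetical helper):

#include <linux/spinlock.h>

/* Caller must hold @lock; assert_spin_locked() copes with UP configs
 * where open-coded spin_is_locked() checks cannot. */
static void counter_bump_locked(spinlock_t *lock, unsigned long *counter)
{
        assert_spin_locked(lock);
        (*counter)++;
}
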
@@ -1140,13 +1152,6 @@ static void rt_mutex_init_task(struct task_struct *p)
1140#endif 1152#endif
1141} 1153}
1142 1154
1143#ifdef CONFIG_MEMCG
1144void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1145{
1146 mm->owner = p;
1147}
1148#endif /* CONFIG_MEMCG */
1149
1150/* 1155/*
1151 * Initialize POSIX timer handling for a single task. 1156 * Initialize POSIX timer handling for a single task.
1152 */ 1157 */
@@ -1346,10 +1351,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1346#ifdef CONFIG_DEBUG_MUTEXES 1351#ifdef CONFIG_DEBUG_MUTEXES
1347 p->blocked_on = NULL; /* not blocked yet */ 1352 p->blocked_on = NULL; /* not blocked yet */
1348#endif 1353#endif
1349#ifdef CONFIG_MEMCG
1350 p->memcg_batch.do_batch = 0;
1351 p->memcg_batch.memcg = NULL;
1352#endif
1353#ifdef CONFIG_BCACHE 1354#ifdef CONFIG_BCACHE
1354 p->sequential_io = 0; 1355 p->sequential_io = 0;
1355 p->sequential_io_avg = 0; 1356 p->sequential_io_avg = 0;
@@ -1365,8 +1366,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1365 goto bad_fork_cleanup_policy; 1366 goto bad_fork_cleanup_policy;
1366 retval = audit_alloc(p); 1367 retval = audit_alloc(p);
1367 if (retval) 1368 if (retval)
1368 goto bad_fork_cleanup_policy; 1369 goto bad_fork_cleanup_perf;
1369 /* copy all the process information */ 1370 /* copy all the process information */
1371 shm_init_task(p);
1370 retval = copy_semundo(clone_flags, p); 1372 retval = copy_semundo(clone_flags, p);
1371 if (retval) 1373 if (retval)
1372 goto bad_fork_cleanup_audit; 1374 goto bad_fork_cleanup_audit;
@@ -1570,8 +1572,9 @@ bad_fork_cleanup_semundo:
1570 exit_sem(p); 1572 exit_sem(p);
1571bad_fork_cleanup_audit: 1573bad_fork_cleanup_audit:
1572 audit_free(p); 1574 audit_free(p);
1573bad_fork_cleanup_policy: 1575bad_fork_cleanup_perf:
1574 perf_event_free_task(p); 1576 perf_event_free_task(p);
1577bad_fork_cleanup_policy:
1575#ifdef CONFIG_NUMA 1578#ifdef CONFIG_NUMA
1576 mpol_put(p->mempolicy); 1579 mpol_put(p->mempolicy);
1577bad_fork_cleanup_threadgroup_lock: 1580bad_fork_cleanup_threadgroup_lock:
@@ -1918,6 +1921,11 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
1918 */ 1921 */
1919 exit_sem(current); 1922 exit_sem(current);
1920 } 1923 }
1924 if (unshare_flags & CLONE_NEWIPC) {
1925 /* Orphan segments in old ns (see sem above). */
1926 exit_shm(current);
1927 shm_init_task(current);
1928 }
1921 1929
1922 if (new_nsproxy) 1930 if (new_nsproxy)
1923 switch_task_namespaces(current, new_nsproxy); 1931 switch_task_namespaces(current, new_nsproxy);
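The CLONE_NEWIPC branch added above mirrors what unshare() already does for semaphores: exit_shm() orphans the caller's SysV shared-memory segments in the old IPC namespace and shm_init_task() resets the per-task shm list for the new one. As a rough, hedged illustration of the syscall path this hunk sits on (the bookkeeping itself is internal and not directly observable), the userspace sketch below unshares the IPC namespace after creating a segment; it needs root and uses only the documented unshare()/shmget() interfaces:

    /* sketch: the unshare(CLONE_NEWIPC) path extended by the hunk above */
    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/ipc.h>
    #include <sys/shm.h>

    int main(void)
    {
            /* Create a segment in the current IPC namespace. */
            int id = shmget(0x1234, 4096, IPC_CREAT | 0600);
            printf("old ns: shmget -> %d\n", id);

            /*
             * Switch to a fresh IPC namespace. With the change above the
             * kernel also runs exit_shm()/shm_init_task() at this point, so
             * the task's references to old-namespace segments are dropped,
             * just as exit_sem() already drops semaphore undo state.
             */
            if (unshare(CLONE_NEWIPC) < 0) {
                    perror("unshare");
                    return 1;
            }

            /* Keys are per-namespace, so the old segment is not visible. */
            id = shmget(0x1234, 4096, 0600);
            printf("new ns: shmget -> %d\n", id);
            return 0;
    }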
diff --git a/kernel/freezer.c b/kernel/freezer.c
index aa6a8aadb911..a8900a3bc27a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -42,6 +42,9 @@ bool freezing_slow_path(struct task_struct *p)
42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK)) 42 if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
43 return false; 43 return false;
44 44
45 if (test_thread_flag(TIF_MEMDIE))
46 return false;
47
45 if (pm_nosig_freezing || cgroup_freezing(p)) 48 if (pm_nosig_freezing || cgroup_freezing(p))
46 return true; 49 return true;
47 50
@@ -147,12 +150,6 @@ void __thaw_task(struct task_struct *p)
147{ 150{
148 unsigned long flags; 151 unsigned long flags;
149 152
150 /*
151 * Clear freezing and kick @p if FROZEN. Clearing is guaranteed to
152 * be visible to @p as waking up implies wmb. Waking up inside
153 * freezer_lock also prevents wakeups from leaking outside
154 * refrigerator.
155 */
156 spin_lock_irqsave(&freezer_lock, flags); 153 spin_lock_irqsave(&freezer_lock, flags);
157 if (frozen(p)) 154 if (frozen(p))
158 wake_up_process(p); 155 wake_up_process(p);
diff --git a/kernel/futex.c b/kernel/futex.c
index d3a9d946d0b7..63678b573d61 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -143,9 +143,8 @@
143 * 143 *
144 * Where (A) orders the waiters increment and the futex value read through 144 * Where (A) orders the waiters increment and the futex value read through
145 * atomic operations (see hb_waiters_inc) and where (B) orders the write 145 * atomic operations (see hb_waiters_inc) and where (B) orders the write
146 * to futex and the waiters read -- this is done by the barriers in 146 * to futex and the waiters read -- this is done by the barriers for both
147 * get_futex_key_refs(), through either ihold or atomic_inc, depending on the 147 * shared and private futexes in get_futex_key_refs().
148 * futex type.
149 * 148 *
150 * This yields the following case (where X:=waiters, Y:=futex): 149 * This yields the following case (where X:=waiters, Y:=futex):
151 * 150 *
@@ -343,12 +342,21 @@ static void get_futex_key_refs(union futex_key *key)
343 case FUT_OFF_MMSHARED: 342 case FUT_OFF_MMSHARED:
344 futex_get_mm(key); /* implies MB (B) */ 343 futex_get_mm(key); /* implies MB (B) */
345 break; 344 break;
345 default:
346 /*
347 * Private futexes do not hold reference on an inode or
348 * mm, therefore the only purpose of calling get_futex_key_refs
 349	 * is that we need the barrier for the lockless waiter check.
350 */
351 smp_mb(); /* explicit MB (B) */
346 } 352 }
347} 353}
348 354
349/* 355/*
350 * Drop a reference to the resource addressed by a key. 356 * Drop a reference to the resource addressed by a key.
351 * The hash bucket spinlock must not be held. 357 * The hash bucket spinlock must not be held. This is
358 * a no-op for private futexes, see comment in the get
359 * counterpart.
352 */ 360 */
353static void drop_futex_key_refs(union futex_key *key) 361static void drop_futex_key_refs(union futex_key *key)
354{ 362{
@@ -639,8 +647,14 @@ static struct futex_pi_state * alloc_pi_state(void)
639 return pi_state; 647 return pi_state;
640} 648}
641 649
650/*
651 * Must be called with the hb lock held.
652 */
642static void free_pi_state(struct futex_pi_state *pi_state) 653static void free_pi_state(struct futex_pi_state *pi_state)
643{ 654{
655 if (!pi_state)
656 return;
657
644 if (!atomic_dec_and_test(&pi_state->refcount)) 658 if (!atomic_dec_and_test(&pi_state->refcount))
645 return; 659 return;
646 660
@@ -1519,15 +1533,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1519 } 1533 }
1520 1534
1521retry: 1535retry:
1522 if (pi_state != NULL) {
1523 /*
1524 * We will have to lookup the pi_state again, so free this one
1525 * to keep the accounting correct.
1526 */
1527 free_pi_state(pi_state);
1528 pi_state = NULL;
1529 }
1530
1531 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); 1536 ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1532 if (unlikely(ret != 0)) 1537 if (unlikely(ret != 0))
1533 goto out; 1538 goto out;
@@ -1617,6 +1622,8 @@ retry_private:
1617 case 0: 1622 case 0:
1618 break; 1623 break;
1619 case -EFAULT: 1624 case -EFAULT:
1625 free_pi_state(pi_state);
1626 pi_state = NULL;
1620 double_unlock_hb(hb1, hb2); 1627 double_unlock_hb(hb1, hb2);
1621 hb_waiters_dec(hb2); 1628 hb_waiters_dec(hb2);
1622 put_futex_key(&key2); 1629 put_futex_key(&key2);
@@ -1632,6 +1639,8 @@ retry_private:
1632 * exit to complete. 1639 * exit to complete.
1633 * - The user space value changed. 1640 * - The user space value changed.
1634 */ 1641 */
1642 free_pi_state(pi_state);
1643 pi_state = NULL;
1635 double_unlock_hb(hb1, hb2); 1644 double_unlock_hb(hb1, hb2);
1636 hb_waiters_dec(hb2); 1645 hb_waiters_dec(hb2);
1637 put_futex_key(&key2); 1646 put_futex_key(&key2);
@@ -1708,6 +1717,7 @@ retry_private:
1708 } 1717 }
1709 1718
1710out_unlock: 1719out_unlock:
1720 free_pi_state(pi_state);
1711 double_unlock_hb(hb1, hb2); 1721 double_unlock_hb(hb1, hb2);
1712 hb_waiters_dec(hb2); 1722 hb_waiters_dec(hb2);
1713 1723
@@ -1725,8 +1735,6 @@ out_put_keys:
1725out_put_key1: 1735out_put_key1:
1726 put_futex_key(&key1); 1736 put_futex_key(&key1);
1727out: 1737out:
1728 if (pi_state != NULL)
1729 free_pi_state(pi_state);
1730 return ret ? ret : task_count; 1738 return ret ? ret : task_count;
1731} 1739}
1732 1740
@@ -2592,6 +2600,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2592 * shared futexes. We need to compare the keys: 2600 * shared futexes. We need to compare the keys:
2593 */ 2601 */
2594 if (match_futex(&q.key, &key2)) { 2602 if (match_futex(&q.key, &key2)) {
2603 queue_unlock(hb);
2595 ret = -EINVAL; 2604 ret = -EINVAL;
2596 goto out_put_keys; 2605 goto out_put_keys;
2597 } 2606 }
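The default: case added to get_futex_key_refs() gives private futexes the same (B) barrier that shared futexes already pick up from ihold()/atomic_inc(), so the futex-word write is ordered against the waiter-count read for every futex type. The pattern that ordering protects is the ordinary wait/wake pair on a process-private futex; the sketch below is only that userspace pair, with a hand-rolled futex() wrapper (glibc exports no futex() function), and does not exercise the kernel internals directly:

    /* sketch: private-futex wait/wake pair; futex() wrapper is hand-rolled */
    #include <linux/futex.h>
    #include <pthread.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int futex(uint32_t *uaddr, int op, uint32_t val)
    {
            return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
    }

    static uint32_t flag;   /* 0 = not ready, 1 = ready */

    static void *waiter(void *arg)
    {
            /*
             * Sleep only while flag is still 0; a stale read is harmless
             * because FUTEX_WAIT re-checks the value under the hb lock.
             */
            while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
                    futex(&flag, FUTEX_WAIT_PRIVATE, 0);
            puts("waiter: woken");
            return NULL;
    }

    int main(void)
    {
            pthread_t t;

            pthread_create(&t, NULL, waiter, NULL);
            sleep(1);
            __atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
            futex(&flag, FUTEX_WAKE_PRIVATE, 1);    /* wake one waiter */
            pthread_join(t, NULL);
            return 0;
    }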
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..3b7408759bdf 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
35config GCOV_PROFILE_ALL 35config GCOV_PROFILE_ALL
36 bool "Profile entire Kernel" 36 bool "Profile entire Kernel"
37 depends on GCOV_KERNEL 37 depends on GCOV_KERNEL
38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE 38 depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM || ARM64
39 default n 39 default n
40 ---help--- 40 ---help---
41 This options activates profiling for the entire kernel. 41 This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index 15ff01a76379..edf67c493a8e 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -784,8 +784,7 @@ static __init int gcov_fs_init(void)
784 784
785err_remove: 785err_remove:
786 pr_err("init failed\n"); 786 pr_err("init failed\n");
787 if (root_node.dentry) 787 debugfs_remove(root_node.dentry);
788 debugfs_remove(root_node.dentry);
789 788
790 return rc; 789 return rc;
791} 790}
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..225086b2652e 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP
55config IRQ_DOMAIN 55config IRQ_DOMAIN
56 bool 56 bool
57 57
58config HANDLE_DOMAIN_IRQ
59 bool
60
58config IRQ_DOMAIN_DEBUG 61config IRQ_DOMAIN_DEBUG
59 bool "Expose hardware/virtual IRQ mapping via debugfs" 62 bool "Expose hardware/virtual IRQ mapping via debugfs"
60 depends on IRQ_DOMAIN && DEBUG_FS 63 depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index a2b28a2fd7b1..e5202f00cabc 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -342,6 +342,31 @@ static bool irq_check_poll(struct irq_desc *desc)
342 return irq_wait_for_poll(desc); 342 return irq_wait_for_poll(desc);
343} 343}
344 344
345static bool irq_may_run(struct irq_desc *desc)
346{
347 unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
348
349 /*
350 * If the interrupt is not in progress and is not an armed
351 * wakeup interrupt, proceed.
352 */
353 if (!irqd_has_set(&desc->irq_data, mask))
354 return true;
355
356 /*
357 * If the interrupt is an armed wakeup source, mark it pending
358 * and suspended, disable it and notify the pm core about the
359 * event.
360 */
361 if (irq_pm_check_wakeup(desc))
362 return false;
363
364 /*
365 * Handle a potential concurrent poll on a different core.
366 */
367 return irq_check_poll(desc);
368}
369
345/** 370/**
346 * handle_simple_irq - Simple and software-decoded IRQs. 371 * handle_simple_irq - Simple and software-decoded IRQs.
347 * @irq: the interrupt number 372 * @irq: the interrupt number
@@ -359,9 +384,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
359{ 384{
360 raw_spin_lock(&desc->lock); 385 raw_spin_lock(&desc->lock);
361 386
362 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 387 if (!irq_may_run(desc))
363 if (!irq_check_poll(desc)) 388 goto out_unlock;
364 goto out_unlock;
365 389
366 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 390 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
367 kstat_incr_irqs_this_cpu(irq, desc); 391 kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +436,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
412 raw_spin_lock(&desc->lock); 436 raw_spin_lock(&desc->lock);
413 mask_ack_irq(desc); 437 mask_ack_irq(desc);
414 438
415 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 439 if (!irq_may_run(desc))
416 if (!irq_check_poll(desc)) 440 goto out_unlock;
417 goto out_unlock;
418 441
419 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 442 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
420 kstat_incr_irqs_this_cpu(irq, desc); 443 kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +508,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
485 508
486 raw_spin_lock(&desc->lock); 509 raw_spin_lock(&desc->lock);
487 510
488 if (unlikely(irqd_irq_inprogress(&desc->irq_data))) 511 if (!irq_may_run(desc))
489 if (!irq_check_poll(desc)) 512 goto out;
490 goto out;
491 513
492 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 514 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
493 kstat_incr_irqs_this_cpu(irq, desc); 515 kstat_incr_irqs_this_cpu(irq, desc);
@@ -517,6 +539,7 @@ out:
517 chip->irq_eoi(&desc->irq_data); 539 chip->irq_eoi(&desc->irq_data);
518 raw_spin_unlock(&desc->lock); 540 raw_spin_unlock(&desc->lock);
519} 541}
542EXPORT_SYMBOL_GPL(handle_fasteoi_irq);
520 543
521/** 544/**
522 * handle_edge_irq - edge type IRQ handler 545 * handle_edge_irq - edge type IRQ handler
@@ -540,19 +563,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
540 raw_spin_lock(&desc->lock); 563 raw_spin_lock(&desc->lock);
541 564
542 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 565 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
566
567 if (!irq_may_run(desc)) {
568 desc->istate |= IRQS_PENDING;
569 mask_ack_irq(desc);
570 goto out_unlock;
571 }
572
543 /* 573 /*
544 * If we're currently running this IRQ, or its disabled, 574 * If its disabled or no action available then mask it and get
545 * we shouldn't process the IRQ. Mark it pending, handle 575 * out of here.
546 * the necessary masking and go out
547 */ 576 */
548 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 577 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
549 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 578 desc->istate |= IRQS_PENDING;
550 if (!irq_check_poll(desc)) { 579 mask_ack_irq(desc);
551 desc->istate |= IRQS_PENDING; 580 goto out_unlock;
552 mask_ack_irq(desc);
553 goto out_unlock;
554 }
555 } 581 }
582
556 kstat_incr_irqs_this_cpu(irq, desc); 583 kstat_incr_irqs_this_cpu(irq, desc);
557 584
558 /* Start handling the irq */ 585 /* Start handling the irq */
@@ -601,18 +628,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
601 raw_spin_lock(&desc->lock); 628 raw_spin_lock(&desc->lock);
602 629
603 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); 630 desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
631
632 if (!irq_may_run(desc)) {
633 desc->istate |= IRQS_PENDING;
634 goto out_eoi;
635 }
636
604 /* 637 /*
605 * If we're currently running this IRQ, or its disabled, 638 * If its disabled or no action available then mask it and get
606 * we shouldn't process the IRQ. Mark it pending, handle 639 * out of here.
607 * the necessary masking and go out
608 */ 640 */
609 if (unlikely(irqd_irq_disabled(&desc->irq_data) || 641 if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
610 irqd_irq_inprogress(&desc->irq_data) || !desc->action)) { 642 desc->istate |= IRQS_PENDING;
611 if (!irq_check_poll(desc)) { 643 goto out_eoi;
612 desc->istate |= IRQS_PENDING;
613 goto out_eoi;
614 }
615 } 644 }
645
616 kstat_incr_irqs_this_cpu(irq, desc); 646 kstat_incr_irqs_this_cpu(irq, desc);
617 647
618 do { 648 do {
@@ -669,7 +699,7 @@ void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
669{ 699{
670 struct irq_chip *chip = irq_desc_get_chip(desc); 700 struct irq_chip *chip = irq_desc_get_chip(desc);
671 struct irqaction *action = desc->action; 701 struct irqaction *action = desc->action;
672 void *dev_id = __this_cpu_ptr(action->percpu_dev_id); 702 void *dev_id = raw_cpu_ptr(action->percpu_dev_id);
673 irqreturn_t res; 703 irqreturn_t res;
674 704
675 kstat_incr_irqs_this_cpu(irq, desc); 705 kstat_incr_irqs_this_cpu(irq, desc);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..4332d766619d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
63 63
64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, 64extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
65 unsigned long flags); 65 unsigned long flags);
66extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp); 66extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
67extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume); 67extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
68 68
69extern int irq_startup(struct irq_desc *desc, bool resend); 69extern int irq_startup(struct irq_desc *desc, bool resend);
70extern void irq_shutdown(struct irq_desc *desc); 70extern void irq_shutdown(struct irq_desc *desc);
@@ -194,3 +194,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
194 __this_cpu_inc(*desc->kstat_irqs); 194 __this_cpu_inc(*desc->kstat_irqs);
195 __this_cpu_inc(kstat.irqs_sum); 195 __this_cpu_inc(kstat.irqs_sum);
196} 196}
197
198#ifdef CONFIG_PM_SLEEP
199bool irq_pm_check_wakeup(struct irq_desc *desc);
200void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
201void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
202#else
203static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
204static inline void
205irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
206static inline void
207irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
208#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..a1782f88f0af 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
16#include <linux/bitmap.h> 16#include <linux/bitmap.h>
17#include <linux/irqdomain.h>
17 18
18#include "internals.h" 19#include "internals.h"
19 20
@@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq)
336} 337}
337EXPORT_SYMBOL_GPL(generic_handle_irq); 338EXPORT_SYMBOL_GPL(generic_handle_irq);
338 339
340#ifdef CONFIG_HANDLE_DOMAIN_IRQ
341/**
342 * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
343 * @domain: The domain where to perform the lookup
344 * @hwirq: The HW irq number to convert to a logical one
345 * @lookup: Whether to perform the domain lookup or not
346 * @regs: Register file coming from the low-level handling code
347 *
348 * Returns: 0 on success, or -EINVAL if conversion has failed
349 */
350int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
351 bool lookup, struct pt_regs *regs)
352{
353 struct pt_regs *old_regs = set_irq_regs(regs);
354 unsigned int irq = hwirq;
355 int ret = 0;
356
357 irq_enter();
358
359#ifdef CONFIG_IRQ_DOMAIN
360 if (lookup)
361 irq = irq_find_mapping(domain, hwirq);
362#endif
363
364 /*
365 * Some hardware gives randomly wrong interrupts. Rather
366 * than crashing, do something sensible.
367 */
368 if (unlikely(!irq || irq >= nr_irqs)) {
369 ack_bad_irq(irq);
370 ret = -EINVAL;
371 } else {
372 generic_handle_irq(irq);
373 }
374
375 irq_exit();
376 set_irq_regs(old_regs);
377 return ret;
378}
379#endif
380
339/* Dynamic interrupt handling */ 381/* Dynamic interrupt handling */
340 382
341/** 383/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..0a9104b4608b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
382} 382}
383#endif 383#endif
384 384
385void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend) 385void __disable_irq(struct irq_desc *desc, unsigned int irq)
386{ 386{
387 if (suspend) {
388 if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
389 return;
390 desc->istate |= IRQS_SUSPENDED;
391 }
392
393 if (!desc->depth++) 387 if (!desc->depth++)
394 irq_disable(desc); 388 irq_disable(desc);
395} 389}
@@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq)
401 395
402 if (!desc) 396 if (!desc)
403 return -EINVAL; 397 return -EINVAL;
404 __disable_irq(desc, irq, false); 398 __disable_irq(desc, irq);
405 irq_put_desc_busunlock(desc, flags); 399 irq_put_desc_busunlock(desc, flags);
406 return 0; 400 return 0;
407} 401}
@@ -442,20 +436,8 @@ void disable_irq(unsigned int irq)
442} 436}
443EXPORT_SYMBOL(disable_irq); 437EXPORT_SYMBOL(disable_irq);
444 438
445void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume) 439void __enable_irq(struct irq_desc *desc, unsigned int irq)
446{ 440{
447 if (resume) {
448 if (!(desc->istate & IRQS_SUSPENDED)) {
449 if (!desc->action)
450 return;
451 if (!(desc->action->flags & IRQF_FORCE_RESUME))
452 return;
453 /* Pretend that it got disabled ! */
454 desc->depth++;
455 }
456 desc->istate &= ~IRQS_SUSPENDED;
457 }
458
459 switch (desc->depth) { 441 switch (desc->depth) {
460 case 0: 442 case 0:
461 err_out: 443 err_out:
@@ -497,7 +479,7 @@ void enable_irq(unsigned int irq)
497 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq)) 479 KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
498 goto out; 480 goto out;
499 481
500 __enable_irq(desc, irq, false); 482 __enable_irq(desc, irq);
501out: 483out:
502 irq_put_desc_busunlock(desc, flags); 484 irq_put_desc_busunlock(desc, flags);
503} 485}
@@ -1218,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1218 new->irq = irq; 1200 new->irq = irq;
1219 *old_ptr = new; 1201 *old_ptr = new;
1220 1202
1203 irq_pm_install_action(desc, new);
1204
1221 /* Reset broken irq detection when installing new handler */ 1205 /* Reset broken irq detection when installing new handler */
1222 desc->irq_count = 0; 1206 desc->irq_count = 0;
1223 desc->irqs_unhandled = 0; 1207 desc->irqs_unhandled = 0;
@@ -1228,7 +1212,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
1228 */ 1212 */
1229 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) { 1213 if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
1230 desc->istate &= ~IRQS_SPURIOUS_DISABLED; 1214 desc->istate &= ~IRQS_SPURIOUS_DISABLED;
1231 __enable_irq(desc, irq, false); 1215 __enable_irq(desc, irq);
1232 } 1216 }
1233 1217
1234 raw_spin_unlock_irqrestore(&desc->lock, flags); 1218 raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1336 /* Found it - now remove it from the list of entries: */ 1320 /* Found it - now remove it from the list of entries: */
1337 *action_ptr = action->next; 1321 *action_ptr = action->next;
1338 1322
1323 irq_pm_remove_action(desc, action);
1324
1339 /* If this was the last handler, shut down the IRQ line: */ 1325 /* If this was the last handler, shut down the IRQ line: */
1340 if (!desc->action) { 1326 if (!desc->action) {
1341 irq_shutdown(desc); 1327 irq_shutdown(desc);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/suspend.h>
12#include <linux/syscore_ops.h> 13#include <linux/syscore_ops.h>
13 14
14#include "internals.h" 15#include "internals.h"
15 16
17bool irq_pm_check_wakeup(struct irq_desc *desc)
18{
19 if (irqd_is_wakeup_armed(&desc->irq_data)) {
20 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
21 desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
22 desc->depth++;
23 irq_disable(desc);
24 pm_system_wakeup();
25 return true;
26 }
27 return false;
28}
29
30/*
31 * Called from __setup_irq() with desc->lock held after @action has
32 * been installed in the action chain.
33 */
34void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
35{
36 desc->nr_actions++;
37
38 if (action->flags & IRQF_FORCE_RESUME)
39 desc->force_resume_depth++;
40
41 WARN_ON_ONCE(desc->force_resume_depth &&
42 desc->force_resume_depth != desc->nr_actions);
43
44 if (action->flags & IRQF_NO_SUSPEND)
45 desc->no_suspend_depth++;
46
47 WARN_ON_ONCE(desc->no_suspend_depth &&
48 desc->no_suspend_depth != desc->nr_actions);
49}
50
51/*
52 * Called from __free_irq() with desc->lock held after @action has
53 * been removed from the action chain.
54 */
55void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
56{
57 desc->nr_actions--;
58
59 if (action->flags & IRQF_FORCE_RESUME)
60 desc->force_resume_depth--;
61
62 if (action->flags & IRQF_NO_SUSPEND)
63 desc->no_suspend_depth--;
64}
65
66static bool suspend_device_irq(struct irq_desc *desc, int irq)
67{
68 if (!desc->action || desc->no_suspend_depth)
69 return false;
70
71 if (irqd_is_wakeup_set(&desc->irq_data)) {
72 irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
73 /*
74 * We return true here to force the caller to issue
75 * synchronize_irq(). We need to make sure that the
76 * IRQD_WAKEUP_ARMED is visible before we return from
77 * suspend_device_irqs().
78 */
79 return true;
80 }
81
82 desc->istate |= IRQS_SUSPENDED;
83 __disable_irq(desc, irq);
84
85 /*
86 * Hardware which has no wakeup source configuration facility
87 * requires that the non wakeup interrupts are masked at the
88 * chip level. The chip implementation indicates that with
89 * IRQCHIP_MASK_ON_SUSPEND.
90 */
91 if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
92 mask_irq(desc);
93 return true;
94}
95
16/** 96/**
17 * suspend_device_irqs - disable all currently enabled interrupt lines 97 * suspend_device_irqs - disable all currently enabled interrupt lines
18 * 98 *
19 * During system-wide suspend or hibernation device drivers need to be prevented 99 * During system-wide suspend or hibernation device drivers need to be
20 * from receiving interrupts and this function is provided for this purpose. 100 * prevented from receiving interrupts and this function is provided
21 * It marks all interrupt lines in use, except for the timer ones, as disabled 101 * for this purpose.
22 * and sets the IRQS_SUSPENDED flag for each of them. 102 *
103 * So we disable all interrupts and mark them IRQS_SUSPENDED except
104 * for those which are unused, those which are marked as not
105 * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
106 * set and those which are marked as active wakeup sources.
107 *
108 * The active wakeup sources are handled by the flow handler entry
109 * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
110 * interrupt and notifies the pm core about the wakeup.
23 */ 111 */
24void suspend_device_irqs(void) 112void suspend_device_irqs(void)
25{ 113{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
28 116
29 for_each_irq_desc(irq, desc) { 117 for_each_irq_desc(irq, desc) {
30 unsigned long flags; 118 unsigned long flags;
119 bool sync;
31 120
32 raw_spin_lock_irqsave(&desc->lock, flags); 121 raw_spin_lock_irqsave(&desc->lock, flags);
33 __disable_irq(desc, irq, true); 122 sync = suspend_device_irq(desc, irq);
34 raw_spin_unlock_irqrestore(&desc->lock, flags); 123 raw_spin_unlock_irqrestore(&desc->lock, flags);
35 }
36 124
37 for_each_irq_desc(irq, desc) 125 if (sync)
38 if (desc->istate & IRQS_SUSPENDED)
39 synchronize_irq(irq); 126 synchronize_irq(irq);
127 }
40} 128}
41EXPORT_SYMBOL_GPL(suspend_device_irqs); 129EXPORT_SYMBOL_GPL(suspend_device_irqs);
42 130
131static void resume_irq(struct irq_desc *desc, int irq)
132{
133 irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
134
135 if (desc->istate & IRQS_SUSPENDED)
136 goto resume;
137
138 /* Force resume the interrupt? */
139 if (!desc->force_resume_depth)
140 return;
141
142 /* Pretend that it got disabled ! */
143 desc->depth++;
144resume:
145 desc->istate &= ~IRQS_SUSPENDED;
146 __enable_irq(desc, irq);
147}
148
43static void resume_irqs(bool want_early) 149static void resume_irqs(bool want_early)
44{ 150{
45 struct irq_desc *desc; 151 struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
54 continue; 160 continue;
55 161
56 raw_spin_lock_irqsave(&desc->lock, flags); 162 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 163 resume_irq(desc, irq);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 164 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 165 }
60} 166}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
93 resume_irqs(false); 199 resume_irqs(false);
94} 200}
95EXPORT_SYMBOL_GPL(resume_device_irqs); 201EXPORT_SYMBOL_GPL(resume_device_irqs);
96
97/**
98 * check_wakeup_irqs - check if any wake-up interrupts are pending
99 */
100int check_wakeup_irqs(void)
101{
102 struct irq_desc *desc;
103 int irq;
104
105 for_each_irq_desc(irq, desc) {
106 /*
107 * Only interrupts which are marked as wakeup source
108 * and have not been disabled before the suspend check
109 * can abort suspend.
110 */
111 if (irqd_is_wakeup_set(&desc->irq_data)) {
112 if (desc->depth == 1 && desc->istate & IRQS_PENDING)
113 return -EBUSY;
114 continue;
115 }
116 /*
117 * Check the non wakeup interrupts whether they need
118 * to be masked before finally going into suspend
119 * state. That's for hardware which has no wakeup
120 * source configuration facility. The chip
121 * implementation indicates that with
122 * IRQCHIP_MASK_ON_SUSPEND.
123 */
124 if (desc->istate & IRQS_SUSPENDED &&
125 irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
126 mask_irq(desc);
127 }
128
129 return 0;
130}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..3ab9048483fa 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -95,11 +95,11 @@ bool irq_work_queue(struct irq_work *work)
95 95
96 /* If the work is "lazy", handle it from next tick if any */ 96 /* If the work is "lazy", handle it from next tick if any */
97 if (work->flags & IRQ_WORK_LAZY) { 97 if (work->flags & IRQ_WORK_LAZY) {
98 if (llist_add(&work->llnode, &__get_cpu_var(lazy_list)) && 98 if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
99 tick_nohz_tick_stopped()) 99 tick_nohz_tick_stopped())
100 arch_irq_work_raise(); 100 arch_irq_work_raise();
101 } else { 101 } else {
102 if (llist_add(&work->llnode, &__get_cpu_var(raised_list))) 102 if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
103 arch_irq_work_raise(); 103 arch_irq_work_raise();
104 } 104 }
105 105
@@ -113,10 +113,12 @@ bool irq_work_needs_cpu(void)
113{ 113{
114 struct llist_head *raised, *lazy; 114 struct llist_head *raised, *lazy;
115 115
116 raised = &__get_cpu_var(raised_list); 116 raised = this_cpu_ptr(&raised_list);
117 lazy = &__get_cpu_var(lazy_list); 117 lazy = this_cpu_ptr(&lazy_list);
118 if (llist_empty(raised) && llist_empty(lazy)) 118
119 return false; 119 if (llist_empty(raised) || arch_irq_work_has_interrupt())
120 if (llist_empty(lazy))
121 return false;
120 122
121 /* All work should have been flushed before going offline */ 123 /* All work should have been flushed before going offline */
122 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 124 WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -166,11 +168,20 @@ static void irq_work_run_list(struct llist_head *list)
166 */ 168 */
167void irq_work_run(void) 169void irq_work_run(void)
168{ 170{
169 irq_work_run_list(&__get_cpu_var(raised_list)); 171 irq_work_run_list(this_cpu_ptr(&raised_list));
170 irq_work_run_list(&__get_cpu_var(lazy_list)); 172 irq_work_run_list(this_cpu_ptr(&lazy_list));
171} 173}
172EXPORT_SYMBOL_GPL(irq_work_run); 174EXPORT_SYMBOL_GPL(irq_work_run);
173 175
176void irq_work_tick(void)
177{
178 struct llist_head *raised = &__get_cpu_var(raised_list);
179
180 if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
181 irq_work_run_list(raised);
182 irq_work_run_list(&__get_cpu_var(lazy_list));
183}
184
174/* 185/*
175 * Synchronize against the irq_work @entry, ensures the entry is not 186 * Synchronize against the irq_work @entry, ensures the entry is not
176 * currently in use. 187 * currently in use.
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index cb0cf37dac3a..5c5987f10819 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -364,7 +364,7 @@ static int __sprint_symbol(char *buffer, unsigned long address,
364 address += symbol_offset; 364 address += symbol_offset;
365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer); 365 name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
366 if (!name) 366 if (!name)
367 return sprintf(buffer, "0x%lx", address); 367 return sprintf(buffer, "0x%lx", address - symbol_offset);
368 368
369 if (name != buffer) 369 if (name != buffer)
370 strcpy(buffer, name); 370 strcpy(buffer, name);
@@ -565,19 +565,12 @@ static int kallsyms_open(struct inode *inode, struct file *file)
565 * using get_symbol_offset for every symbol. 565 * using get_symbol_offset for every symbol.
566 */ 566 */
567 struct kallsym_iter *iter; 567 struct kallsym_iter *iter;
568 int ret; 568 iter = __seq_open_private(file, &kallsyms_op, sizeof(*iter));
569
570 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
571 if (!iter) 569 if (!iter)
572 return -ENOMEM; 570 return -ENOMEM;
573 reset_iter(iter, 0); 571 reset_iter(iter, 0);
574 572
575 ret = seq_open(file, &kallsyms_op); 573 return 0;
576 if (ret == 0)
577 ((struct seq_file *)file->private_data)->private = iter;
578 else
579 kfree(iter);
580 return ret;
581} 574}
582 575
583#ifdef CONFIG_KGDB_KDB 576#ifdef CONFIG_KGDB_KDB
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index e30ac0fe61c3..0aa69ea1d8fd 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
44 */ 44 */
45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) 45static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
46{ 46{
47 long ret; 47 long t1, t2;
48 48
49 ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); 49 t1 = kptr_obfuscate((long)v1, type);
50 t2 = kptr_obfuscate((long)v2, type);
50 51
51 return (ret < 0) | ((ret > 0) << 1); 52 return (t1 < t2) | ((t1 > t2) << 1);
52} 53}
53 54
54/* The caller must have pinned the task */ 55/* The caller must have pinned the task */
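The kcmp_ptr() rewrite drops the subtraction because the difference of two obfuscated longs can overflow and change sign, yielding the wrong ordering for values that are far apart; comparing them directly cannot. A small, self-contained demonstration of that failure mode (plain userspace C, nothing kcmp-specific):

    /* sketch: why (a - b) is not a safe three-way comparison for longs */
    #include <limits.h>
    #include <stdio.h>

    static int cmp_by_sub(long a, long b)
    {
            long d = a - b;                 /* may overflow: undefined, usually wraps */
            return (d < 0) | ((d > 0) << 1); /* 1 = less, 2 = greater (old kcmp_ptr) */
    }

    static int cmp_direct(long a, long b)
    {
            return (a < b) | ((a > b) << 1); /* what kcmp_ptr() now does */
    }

    int main(void)
    {
            long a = LONG_MIN + 1, b = 2;   /* a is clearly less than b */

            printf("by subtraction: %d\n", cmp_by_sub(a, b)); /* often 2 (wrong) */
            printf("direct compare: %d\n", cmp_direct(a, b)); /* 1 (correct) */
            return 0;
    }

With typical wrap-around behaviour the subtraction path reports "greater" for a value that is actually smaller, which is exactly the class of bug the two-comparison form avoids.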
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 4b8f0c925884..2abf9f6e9a61 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -6,6 +6,8 @@
6 * Version 2. See the file COPYING for more details. 6 * Version 2. See the file COPYING for more details.
7 */ 7 */
8 8
9#define pr_fmt(fmt) "kexec: " fmt
10
9#include <linux/capability.h> 11#include <linux/capability.h>
10#include <linux/mm.h> 12#include <linux/mm.h>
11#include <linux/file.h> 13#include <linux/file.h>
@@ -40,6 +42,9 @@
40#include <asm/io.h> 42#include <asm/io.h>
41#include <asm/sections.h> 43#include <asm/sections.h>
42 44
45#include <crypto/hash.h>
46#include <crypto/sha.h>
47
43/* Per cpu memory for storing cpu states in case of system crash. */ 48/* Per cpu memory for storing cpu states in case of system crash. */
44note_buf_t __percpu *crash_notes; 49note_buf_t __percpu *crash_notes;
45 50
@@ -52,6 +57,17 @@ size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
52/* Flag to indicate we are going to kexec a new kernel */ 57/* Flag to indicate we are going to kexec a new kernel */
53bool kexec_in_progress = false; 58bool kexec_in_progress = false;
54 59
60/*
61 * Declare these symbols weak so that if architecture provides a purgatory,
62 * these will be overridden.
63 */
64char __weak kexec_purgatory[0];
65size_t __weak kexec_purgatory_size = 0;
66
67#ifdef CONFIG_KEXEC_FILE
68static int kexec_calculate_store_digests(struct kimage *image);
69#endif
70
55/* Location of the reserved area for the crash kernel */ 71/* Location of the reserved area for the crash kernel */
56struct resource crashk_res = { 72struct resource crashk_res = {
57 .name = "Crash kernel", 73 .name = "Crash kernel",
@@ -125,45 +141,27 @@ static struct page *kimage_alloc_page(struct kimage *image,
125 gfp_t gfp_mask, 141 gfp_t gfp_mask,
126 unsigned long dest); 142 unsigned long dest);
127 143
128static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, 144static int copy_user_segment_list(struct kimage *image,
129 unsigned long nr_segments, 145 unsigned long nr_segments,
130 struct kexec_segment __user *segments) 146 struct kexec_segment __user *segments)
131{ 147{
148 int ret;
132 size_t segment_bytes; 149 size_t segment_bytes;
133 struct kimage *image;
134 unsigned long i;
135 int result;
136
137 /* Allocate a controlling structure */
138 result = -ENOMEM;
139 image = kzalloc(sizeof(*image), GFP_KERNEL);
140 if (!image)
141 goto out;
142
143 image->head = 0;
144 image->entry = &image->head;
145 image->last_entry = &image->head;
146 image->control_page = ~0; /* By default this does not apply */
147 image->start = entry;
148 image->type = KEXEC_TYPE_DEFAULT;
149
150 /* Initialize the list of control pages */
151 INIT_LIST_HEAD(&image->control_pages);
152
153 /* Initialize the list of destination pages */
154 INIT_LIST_HEAD(&image->dest_pages);
155
156 /* Initialize the list of unusable pages */
157 INIT_LIST_HEAD(&image->unuseable_pages);
158 150
159 /* Read in the segments */ 151 /* Read in the segments */
160 image->nr_segments = nr_segments; 152 image->nr_segments = nr_segments;
161 segment_bytes = nr_segments * sizeof(*segments); 153 segment_bytes = nr_segments * sizeof(*segments);
162 result = copy_from_user(image->segment, segments, segment_bytes); 154 ret = copy_from_user(image->segment, segments, segment_bytes);
163 if (result) { 155 if (ret)
164 result = -EFAULT; 156 ret = -EFAULT;
165 goto out; 157
166 } 158 return ret;
159}
160
161static int sanity_check_segment_list(struct kimage *image)
162{
163 int result, i;
164 unsigned long nr_segments = image->nr_segments;
167 165
168 /* 166 /*
169 * Verify we have good destination addresses. The caller is 167 * Verify we have good destination addresses. The caller is
@@ -185,9 +183,9 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
185 mstart = image->segment[i].mem; 183 mstart = image->segment[i].mem;
186 mend = mstart + image->segment[i].memsz; 184 mend = mstart + image->segment[i].memsz;
187 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) 185 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
188 goto out; 186 return result;
189 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) 187 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
190 goto out; 188 return result;
191 } 189 }
192 190
193 /* Verify our destination addresses do not overlap. 191 /* Verify our destination addresses do not overlap.
@@ -208,7 +206,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
208 pend = pstart + image->segment[j].memsz; 206 pend = pstart + image->segment[j].memsz;
209 /* Do the segments overlap ? */ 207 /* Do the segments overlap ? */
210 if ((mend > pstart) && (mstart < pend)) 208 if ((mend > pstart) && (mstart < pend))
211 goto out; 209 return result;
212 } 210 }
213 } 211 }
214 212
@@ -220,131 +218,406 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
220 result = -EINVAL; 218 result = -EINVAL;
221 for (i = 0; i < nr_segments; i++) { 219 for (i = 0; i < nr_segments; i++) {
222 if (image->segment[i].bufsz > image->segment[i].memsz) 220 if (image->segment[i].bufsz > image->segment[i].memsz)
223 goto out; 221 return result;
224 } 222 }
225 223
226 result = 0; 224 /*
227out: 225 * Verify we have good destination addresses. Normally
228 if (result == 0) 226 * the caller is responsible for making certain we don't
229 *rimage = image; 227 * attempt to load the new image into invalid or reserved
230 else 228 * areas of RAM. But crash kernels are preloaded into a
231 kfree(image); 229 * reserved area of ram. We must ensure the addresses
230 * are in the reserved area otherwise preloading the
231 * kernel could corrupt things.
232 */
232 233
233 return result; 234 if (image->type == KEXEC_TYPE_CRASH) {
235 result = -EADDRNOTAVAIL;
236 for (i = 0; i < nr_segments; i++) {
237 unsigned long mstart, mend;
238
239 mstart = image->segment[i].mem;
240 mend = mstart + image->segment[i].memsz - 1;
241 /* Ensure we are within the crash kernel limits */
242 if ((mstart < crashk_res.start) ||
243 (mend > crashk_res.end))
244 return result;
245 }
246 }
247
248 return 0;
249}
250
251static struct kimage *do_kimage_alloc_init(void)
252{
253 struct kimage *image;
234 254
255 /* Allocate a controlling structure */
256 image = kzalloc(sizeof(*image), GFP_KERNEL);
257 if (!image)
258 return NULL;
259
260 image->head = 0;
261 image->entry = &image->head;
262 image->last_entry = &image->head;
263 image->control_page = ~0; /* By default this does not apply */
264 image->type = KEXEC_TYPE_DEFAULT;
265
266 /* Initialize the list of control pages */
267 INIT_LIST_HEAD(&image->control_pages);
268
269 /* Initialize the list of destination pages */
270 INIT_LIST_HEAD(&image->dest_pages);
271
272 /* Initialize the list of unusable pages */
273 INIT_LIST_HEAD(&image->unusable_pages);
274
275 return image;
235} 276}
236 277
237static void kimage_free_page_list(struct list_head *list); 278static void kimage_free_page_list(struct list_head *list);
238 279
239static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, 280static int kimage_alloc_init(struct kimage **rimage, unsigned long entry,
240 unsigned long nr_segments, 281 unsigned long nr_segments,
241 struct kexec_segment __user *segments) 282 struct kexec_segment __user *segments,
283 unsigned long flags)
242{ 284{
243 int result; 285 int ret;
244 struct kimage *image; 286 struct kimage *image;
287 bool kexec_on_panic = flags & KEXEC_ON_CRASH;
288
289 if (kexec_on_panic) {
290 /* Verify we have a valid entry point */
291 if ((entry < crashk_res.start) || (entry > crashk_res.end))
292 return -EADDRNOTAVAIL;
293 }
245 294
246 /* Allocate and initialize a controlling structure */ 295 /* Allocate and initialize a controlling structure */
247 image = NULL; 296 image = do_kimage_alloc_init();
248 result = do_kimage_alloc(&image, entry, nr_segments, segments); 297 if (!image)
249 if (result) 298 return -ENOMEM;
250 goto out; 299
300 image->start = entry;
301
302 ret = copy_user_segment_list(image, nr_segments, segments);
303 if (ret)
304 goto out_free_image;
305
306 ret = sanity_check_segment_list(image);
307 if (ret)
308 goto out_free_image;
309
310 /* Enable the special crash kernel control page allocation policy. */
311 if (kexec_on_panic) {
312 image->control_page = crashk_res.start;
313 image->type = KEXEC_TYPE_CRASH;
314 }
251 315
252 /* 316 /*
253 * Find a location for the control code buffer, and add it 317 * Find a location for the control code buffer, and add it
254 * the vector of segments so that it's pages will also be 318 * the vector of segments so that it's pages will also be
255 * counted as destination pages. 319 * counted as destination pages.
256 */ 320 */
257 result = -ENOMEM; 321 ret = -ENOMEM;
258 image->control_code_page = kimage_alloc_control_pages(image, 322 image->control_code_page = kimage_alloc_control_pages(image,
259 get_order(KEXEC_CONTROL_PAGE_SIZE)); 323 get_order(KEXEC_CONTROL_PAGE_SIZE));
260 if (!image->control_code_page) { 324 if (!image->control_code_page) {
261 pr_err("Could not allocate control_code_buffer\n"); 325 pr_err("Could not allocate control_code_buffer\n");
262 goto out_free; 326 goto out_free_image;
263 } 327 }
264 328
265 image->swap_page = kimage_alloc_control_pages(image, 0); 329 if (!kexec_on_panic) {
266 if (!image->swap_page) { 330 image->swap_page = kimage_alloc_control_pages(image, 0);
267 pr_err("Could not allocate swap buffer\n"); 331 if (!image->swap_page) {
268 goto out_free; 332 pr_err("Could not allocate swap buffer\n");
333 goto out_free_control_pages;
334 }
269 } 335 }
270 336
271 *rimage = image; 337 *rimage = image;
272 return 0; 338 return 0;
273 339out_free_control_pages:
274out_free:
275 kimage_free_page_list(&image->control_pages); 340 kimage_free_page_list(&image->control_pages);
341out_free_image:
276 kfree(image); 342 kfree(image);
277out: 343 return ret;
278 return result;
279} 344}
280 345
281static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 346#ifdef CONFIG_KEXEC_FILE
282 unsigned long nr_segments, 347static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len)
283 struct kexec_segment __user *segments)
284{ 348{
285 int result; 349 struct fd f = fdget(fd);
286 struct kimage *image; 350 int ret;
287 unsigned long i; 351 struct kstat stat;
352 loff_t pos;
353 ssize_t bytes = 0;
288 354
289 image = NULL; 355 if (!f.file)
290 /* Verify we have a valid entry point */ 356 return -EBADF;
291 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { 357
292 result = -EADDRNOTAVAIL; 358 ret = vfs_getattr(&f.file->f_path, &stat);
359 if (ret)
360 goto out;
361
362 if (stat.size > INT_MAX) {
363 ret = -EFBIG;
293 goto out; 364 goto out;
294 } 365 }
295 366
296 /* Allocate and initialize a controlling structure */ 367 /* Don't hand 0 to vmalloc, it whines. */
297 result = do_kimage_alloc(&image, entry, nr_segments, segments); 368 if (stat.size == 0) {
298 if (result) 369 ret = -EINVAL;
299 goto out; 370 goto out;
371 }
300 372
301 /* Enable the special crash kernel control page 373 *buf = vmalloc(stat.size);
302 * allocation policy. 374 if (!*buf) {
303 */ 375 ret = -ENOMEM;
304 image->control_page = crashk_res.start; 376 goto out;
305 image->type = KEXEC_TYPE_CRASH; 377 }
306 378
307 /* 379 pos = 0;
308 * Verify we have good destination addresses. Normally 380 while (pos < stat.size) {
309 * the caller is responsible for making certain we don't 381 bytes = kernel_read(f.file, pos, (char *)(*buf) + pos,
310 * attempt to load the new image into invalid or reserved 382 stat.size - pos);
311 * areas of RAM. But crash kernels are preloaded into a 383 if (bytes < 0) {
312 * reserved area of ram. We must ensure the addresses 384 vfree(*buf);
313 * are in the reserved area otherwise preloading the 385 ret = bytes;
314 * kernel could corrupt things. 386 goto out;
315 */ 387 }
316 result = -EADDRNOTAVAIL;
317 for (i = 0; i < nr_segments; i++) {
318 unsigned long mstart, mend;
319 388
320 mstart = image->segment[i].mem; 389 if (bytes == 0)
321 mend = mstart + image->segment[i].memsz - 1; 390 break;
322 /* Ensure we are within the crash kernel limits */ 391 pos += bytes;
323 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
324 goto out_free;
325 } 392 }
326 393
394 if (pos != stat.size) {
395 ret = -EBADF;
396 vfree(*buf);
397 goto out;
398 }
399
400 *buf_len = pos;
401out:
402 fdput(f);
403 return ret;
404}
405
406/* Architectures can provide this probe function */
407int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf,
408 unsigned long buf_len)
409{
410 return -ENOEXEC;
411}
412
413void * __weak arch_kexec_kernel_image_load(struct kimage *image)
414{
415 return ERR_PTR(-ENOEXEC);
416}
417
418void __weak arch_kimage_file_post_load_cleanup(struct kimage *image)
419{
420}
421
422int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf,
423 unsigned long buf_len)
424{
425 return -EKEYREJECTED;
426}
427
428/* Apply relocations of type RELA */
429int __weak
430arch_kexec_apply_relocations_add(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
431 unsigned int relsec)
432{
433 pr_err("RELA relocation unsupported.\n");
434 return -ENOEXEC;
435}
436
437/* Apply relocations of type REL */
438int __weak
439arch_kexec_apply_relocations(const Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
440 unsigned int relsec)
441{
442 pr_err("REL relocation unsupported.\n");
443 return -ENOEXEC;
444}
445
446/*
 447 * Free up memory used by the kernel, initrd, and command line. This is temporary
448 * memory allocation which is not needed any more after these buffers have
449 * been loaded into separate segments and have been copied elsewhere.
450 */
451static void kimage_file_post_load_cleanup(struct kimage *image)
452{
453 struct purgatory_info *pi = &image->purgatory_info;
454
455 vfree(image->kernel_buf);
456 image->kernel_buf = NULL;
457
458 vfree(image->initrd_buf);
459 image->initrd_buf = NULL;
460
461 kfree(image->cmdline_buf);
462 image->cmdline_buf = NULL;
463
464 vfree(pi->purgatory_buf);
465 pi->purgatory_buf = NULL;
466
467 vfree(pi->sechdrs);
468 pi->sechdrs = NULL;
469
470 /* See if architecture has anything to cleanup post load */
471 arch_kimage_file_post_load_cleanup(image);
472
327 /* 473 /*
328 * Find a location for the control code buffer, and add 474 * Above call should have called into bootloader to free up
329 * the vector of segments so that it's pages will also be 475 * any data stored in kimage->image_loader_data. It should
330 * counted as destination pages. 476 * be ok now to free it up.
331 */ 477 */
332 result = -ENOMEM; 478 kfree(image->image_loader_data);
479 image->image_loader_data = NULL;
480}
481
482/*
483 * In file mode list of segments is prepared by kernel. Copy relevant
484 * data from user space, do error checking, prepare segment list
485 */
486static int
487kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
488 const char __user *cmdline_ptr,
489 unsigned long cmdline_len, unsigned flags)
490{
491 int ret = 0;
492 void *ldata;
493
494 ret = copy_file_from_fd(kernel_fd, &image->kernel_buf,
495 &image->kernel_buf_len);
496 if (ret)
497 return ret;
498
499 /* Call arch image probe handlers */
500 ret = arch_kexec_kernel_image_probe(image, image->kernel_buf,
501 image->kernel_buf_len);
502
503 if (ret)
504 goto out;
505
506#ifdef CONFIG_KEXEC_VERIFY_SIG
507 ret = arch_kexec_kernel_verify_sig(image, image->kernel_buf,
508 image->kernel_buf_len);
509 if (ret) {
510 pr_debug("kernel signature verification failed.\n");
511 goto out;
512 }
513 pr_debug("kernel signature verification successful.\n");
514#endif
 515 /* It is possible that no initramfs is being loaded */
516 if (!(flags & KEXEC_FILE_NO_INITRAMFS)) {
517 ret = copy_file_from_fd(initrd_fd, &image->initrd_buf,
518 &image->initrd_buf_len);
519 if (ret)
520 goto out;
521 }
522
523 if (cmdline_len) {
524 image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
525 if (!image->cmdline_buf) {
526 ret = -ENOMEM;
527 goto out;
528 }
529
530 ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
531 cmdline_len);
532 if (ret) {
533 ret = -EFAULT;
534 goto out;
535 }
536
537 image->cmdline_buf_len = cmdline_len;
538
539 /* command line should be a string with last byte null */
540 if (image->cmdline_buf[cmdline_len - 1] != '\0') {
541 ret = -EINVAL;
542 goto out;
543 }
544 }
545
546 /* Call arch image load handlers */
547 ldata = arch_kexec_kernel_image_load(image);
548
549 if (IS_ERR(ldata)) {
550 ret = PTR_ERR(ldata);
551 goto out;
552 }
553
554 image->image_loader_data = ldata;
555out:
556 /* In case of error, free up all allocated memory in this function */
557 if (ret)
558 kimage_file_post_load_cleanup(image);
559 return ret;
560}
561
562static int
563kimage_file_alloc_init(struct kimage **rimage, int kernel_fd,
564 int initrd_fd, const char __user *cmdline_ptr,
565 unsigned long cmdline_len, unsigned long flags)
566{
567 int ret;
568 struct kimage *image;
569 bool kexec_on_panic = flags & KEXEC_FILE_ON_CRASH;
570
571 image = do_kimage_alloc_init();
572 if (!image)
573 return -ENOMEM;
574
575 image->file_mode = 1;
576
577 if (kexec_on_panic) {
578 /* Enable special crash kernel control page alloc policy. */
579 image->control_page = crashk_res.start;
580 image->type = KEXEC_TYPE_CRASH;
581 }
582
583 ret = kimage_file_prepare_segments(image, kernel_fd, initrd_fd,
584 cmdline_ptr, cmdline_len, flags);
585 if (ret)
586 goto out_free_image;
587
588 ret = sanity_check_segment_list(image);
589 if (ret)
590 goto out_free_post_load_bufs;
591
592 ret = -ENOMEM;
333 image->control_code_page = kimage_alloc_control_pages(image, 593 image->control_code_page = kimage_alloc_control_pages(image,
334 get_order(KEXEC_CONTROL_PAGE_SIZE)); 594 get_order(KEXEC_CONTROL_PAGE_SIZE));
335 if (!image->control_code_page) { 595 if (!image->control_code_page) {
336 pr_err("Could not allocate control_code_buffer\n"); 596 pr_err("Could not allocate control_code_buffer\n");
337 goto out_free; 597 goto out_free_post_load_bufs;
598 }
599
600 if (!kexec_on_panic) {
601 image->swap_page = kimage_alloc_control_pages(image, 0);
602 if (!image->swap_page) {
 603 			pr_err("Could not allocate swap buffer\n");
604 goto out_free_control_pages;
605 }
338 } 606 }
339 607
340 *rimage = image; 608 *rimage = image;
341 return 0; 609 return 0;
342 610out_free_control_pages:
343out_free: 611 kimage_free_page_list(&image->control_pages);
612out_free_post_load_bufs:
613 kimage_file_post_load_cleanup(image);
614out_free_image:
344 kfree(image); 615 kfree(image);
345out: 616 return ret;
346 return result;
347} 617}
618#else /* CONFIG_KEXEC_FILE */
619static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
620#endif /* CONFIG_KEXEC_FILE */
348 621
349static int kimage_is_destination_range(struct kimage *image, 622static int kimage_is_destination_range(struct kimage *image,
350 unsigned long start, 623 unsigned long start,
@@ -609,7 +882,7 @@ static void kimage_free_extra_pages(struct kimage *image)
609 kimage_free_page_list(&image->dest_pages); 882 kimage_free_page_list(&image->dest_pages);
610 883
611 /* Walk through and free any unusable pages I have cached */ 884 /* Walk through and free any unusable pages I have cached */
612 kimage_free_page_list(&image->unuseable_pages); 885 kimage_free_page_list(&image->unusable_pages);
613 886
614} 887}
615static void kimage_terminate(struct kimage *image) 888static void kimage_terminate(struct kimage *image)
@@ -663,6 +936,14 @@ static void kimage_free(struct kimage *image)
663 936
664 /* Free the kexec control pages... */ 937 /* Free the kexec control pages... */
665 kimage_free_page_list(&image->control_pages); 938 kimage_free_page_list(&image->control_pages);
939
940 /*
941 * Free up any temporary buffers allocated. This might hit if
942 * error occurred much later after buffer allocation.
943 */
944 if (image->file_mode)
945 kimage_file_post_load_cleanup(image);
946
666 kfree(image); 947 kfree(image);
667} 948}
668 949
@@ -732,7 +1013,7 @@ static struct page *kimage_alloc_page(struct kimage *image,
732 /* If the page cannot be used file it away */ 1013 /* If the page cannot be used file it away */
733 if (page_to_pfn(page) > 1014 if (page_to_pfn(page) >
734 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { 1015 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
735 list_add(&page->lru, &image->unuseable_pages); 1016 list_add(&page->lru, &image->unusable_pages);
736 continue; 1017 continue;
737 } 1018 }
738 addr = page_to_pfn(page) << PAGE_SHIFT; 1019 addr = page_to_pfn(page) << PAGE_SHIFT;
@@ -791,10 +1072,14 @@ static int kimage_load_normal_segment(struct kimage *image,
791 unsigned long maddr; 1072 unsigned long maddr;
792 size_t ubytes, mbytes; 1073 size_t ubytes, mbytes;
793 int result; 1074 int result;
794 unsigned char __user *buf; 1075 unsigned char __user *buf = NULL;
1076 unsigned char *kbuf = NULL;
795 1077
796 result = 0; 1078 result = 0;
797 buf = segment->buf; 1079 if (image->file_mode)
1080 kbuf = segment->kbuf;
1081 else
1082 buf = segment->buf;
798 ubytes = segment->bufsz; 1083 ubytes = segment->bufsz;
799 mbytes = segment->memsz; 1084 mbytes = segment->memsz;
800 maddr = segment->mem; 1085 maddr = segment->mem;
@@ -826,7 +1111,11 @@ static int kimage_load_normal_segment(struct kimage *image,
826 PAGE_SIZE - (maddr & ~PAGE_MASK)); 1111 PAGE_SIZE - (maddr & ~PAGE_MASK));
827 uchunk = min(ubytes, mchunk); 1112 uchunk = min(ubytes, mchunk);
828 1113
829 result = copy_from_user(ptr, buf, uchunk); 1114 /* For file based kexec, source pages are in kernel memory */
1115 if (image->file_mode)
1116 memcpy(ptr, kbuf, uchunk);
1117 else
1118 result = copy_from_user(ptr, buf, uchunk);
830 kunmap(page); 1119 kunmap(page);
831 if (result) { 1120 if (result) {
832 result = -EFAULT; 1121 result = -EFAULT;
@@ -834,7 +1123,10 @@ static int kimage_load_normal_segment(struct kimage *image,
834 } 1123 }
835 ubytes -= uchunk; 1124 ubytes -= uchunk;
836 maddr += mchunk; 1125 maddr += mchunk;
837 buf += mchunk; 1126 if (image->file_mode)
1127 kbuf += mchunk;
1128 else
1129 buf += mchunk;
838 mbytes -= mchunk; 1130 mbytes -= mchunk;
839 } 1131 }
840out: 1132out:
@@ -851,10 +1143,14 @@ static int kimage_load_crash_segment(struct kimage *image,
851 unsigned long maddr; 1143 unsigned long maddr;
852 size_t ubytes, mbytes; 1144 size_t ubytes, mbytes;
853 int result; 1145 int result;
854 unsigned char __user *buf; 1146 unsigned char __user *buf = NULL;
1147 unsigned char *kbuf = NULL;
855 1148
856 result = 0; 1149 result = 0;
857 buf = segment->buf; 1150 if (image->file_mode)
1151 kbuf = segment->kbuf;
1152 else
1153 buf = segment->buf;
858 ubytes = segment->bufsz; 1154 ubytes = segment->bufsz;
859 mbytes = segment->memsz; 1155 mbytes = segment->memsz;
860 maddr = segment->mem; 1156 maddr = segment->mem;
@@ -877,7 +1173,12 @@ static int kimage_load_crash_segment(struct kimage *image,
877 /* Zero the trailing part of the page */ 1173 /* Zero the trailing part of the page */
878 memset(ptr + uchunk, 0, mchunk - uchunk); 1174 memset(ptr + uchunk, 0, mchunk - uchunk);
879 } 1175 }
880 result = copy_from_user(ptr, buf, uchunk); 1176
1177 /* For file based kexec, source pages are in kernel memory */
1178 if (image->file_mode)
1179 memcpy(ptr, kbuf, uchunk);
1180 else
1181 result = copy_from_user(ptr, buf, uchunk);
881 kexec_flush_icache_page(page); 1182 kexec_flush_icache_page(page);
882 kunmap(page); 1183 kunmap(page);
883 if (result) { 1184 if (result) {
@@ -886,7 +1187,10 @@ static int kimage_load_crash_segment(struct kimage *image,
886 } 1187 }
887 ubytes -= uchunk; 1188 ubytes -= uchunk;
888 maddr += mchunk; 1189 maddr += mchunk;
889 buf += mchunk; 1190 if (image->file_mode)
1191 kbuf += mchunk;
1192 else
1193 buf += mchunk;
890 mbytes -= mchunk; 1194 mbytes -= mchunk;
891 } 1195 }
892out: 1196out:
@@ -986,16 +1290,16 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
986 1290
987 /* Loading another kernel to reboot into */ 1291 /* Loading another kernel to reboot into */
988 if ((flags & KEXEC_ON_CRASH) == 0) 1292 if ((flags & KEXEC_ON_CRASH) == 0)
989 result = kimage_normal_alloc(&image, entry, 1293 result = kimage_alloc_init(&image, entry, nr_segments,
990 nr_segments, segments); 1294 segments, flags);
991 /* Loading another kernel to switch to if this one crashes */ 1295 /* Loading another kernel to switch to if this one crashes */
992 else if (flags & KEXEC_ON_CRASH) { 1296 else if (flags & KEXEC_ON_CRASH) {
993 /* Free any current crash dump kernel before 1297 /* Free any current crash dump kernel before
994 * we corrupt it. 1298 * we corrupt it.
995 */ 1299 */
996 kimage_free(xchg(&kexec_crash_image, NULL)); 1300 kimage_free(xchg(&kexec_crash_image, NULL));
997 result = kimage_crash_alloc(&image, entry, 1301 result = kimage_alloc_init(&image, entry, nr_segments,
998 nr_segments, segments); 1302 segments, flags);
999 crash_map_reserved_pages(); 1303 crash_map_reserved_pages();
1000 } 1304 }
1001 if (result) 1305 if (result)
@@ -1077,6 +1381,85 @@ COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulong_t, entry,
1077} 1381}
1078#endif 1382#endif
1079 1383
1384#ifdef CONFIG_KEXEC_FILE
1385SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
1386 unsigned long, cmdline_len, const char __user *, cmdline_ptr,
1387 unsigned long, flags)
1388{
1389 int ret = 0, i;
1390 struct kimage **dest_image, *image;
1391
1392 /* We only trust the superuser with rebooting the system. */
1393 if (!capable(CAP_SYS_BOOT) || kexec_load_disabled)
1394 return -EPERM;
1395
1396 /* Make sure we have a legal set of flags */
1397 if (flags != (flags & KEXEC_FILE_FLAGS))
1398 return -EINVAL;
1399
1400 image = NULL;
1401
1402 if (!mutex_trylock(&kexec_mutex))
1403 return -EBUSY;
1404
1405 dest_image = &kexec_image;
1406 if (flags & KEXEC_FILE_ON_CRASH)
1407 dest_image = &kexec_crash_image;
1408
1409 if (flags & KEXEC_FILE_UNLOAD)
1410 goto exchange;
1411
1412 /*
 1413 * In case of a crash, the new kernel gets loaded into the reserved
 1414 * region, the same memory where the old crash kernel might be loaded.
 1415 * Free any current crash dump kernel before we corrupt it.
1416 */
1417 if (flags & KEXEC_FILE_ON_CRASH)
1418 kimage_free(xchg(&kexec_crash_image, NULL));
1419
1420 ret = kimage_file_alloc_init(&image, kernel_fd, initrd_fd, cmdline_ptr,
1421 cmdline_len, flags);
1422 if (ret)
1423 goto out;
1424
1425 ret = machine_kexec_prepare(image);
1426 if (ret)
1427 goto out;
1428
1429 ret = kexec_calculate_store_digests(image);
1430 if (ret)
1431 goto out;
1432
1433 for (i = 0; i < image->nr_segments; i++) {
1434 struct kexec_segment *ksegment;
1435
1436 ksegment = &image->segment[i];
1437 pr_debug("Loading segment %d: buf=0x%p bufsz=0x%zx mem=0x%lx memsz=0x%zx\n",
1438 i, ksegment->buf, ksegment->bufsz, ksegment->mem,
1439 ksegment->memsz);
1440
1441 ret = kimage_load_segment(image, &image->segment[i]);
1442 if (ret)
1443 goto out;
1444 }
1445
1446 kimage_terminate(image);
1447
1448 /*
1449 * Free up any temporary buffers allocated which are not needed
1450 * after image has been loaded
1451 */
1452 kimage_file_post_load_cleanup(image);
1453exchange:
1454 image = xchg(dest_image, image);
1455out:
1456 mutex_unlock(&kexec_mutex);
1457 kimage_free(image);
1458 return ret;
1459}
1460
1461#endif /* CONFIG_KEXEC_FILE */
1462
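As a rough illustration of how the new syscall above might be driven from userspace, consider the following sketch. It is not part of this patch: it assumes a libc without a dedicated wrapper (so the raw __NR_kexec_file_load number is used), and the file paths and command line are invented for the example; cmdline_len is passed including the trailing NUL.

#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	/* Illustrative paths and command line only. */
	const char cmdline[] = "console=ttyS0 root=/dev/sda1";
	int kernel_fd = open("/boot/vmlinuz", O_RDONLY);
	int initrd_fd = open("/boot/initrd.img", O_RDONLY);

	if (kernel_fd < 0 || initrd_fd < 0)
		return 1;

	/* flags == 0: ordinary (non-crash) load; length counts the NUL. */
	if (syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		    sizeof(cmdline), cmdline, 0UL))
		return 1;

	return 0;
}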
1080void crash_kexec(struct pt_regs *regs) 1463void crash_kexec(struct pt_regs *regs)
1081{ 1464{
1082 /* Take the kexec_mutex here to prevent sys_kexec_load 1465 /* Take the kexec_mutex here to prevent sys_kexec_load
@@ -1376,7 +1759,6 @@ static __initdata char *suffix_tbl[] = {
1376 */ 1759 */
1377static int __init parse_crashkernel_suffix(char *cmdline, 1760static int __init parse_crashkernel_suffix(char *cmdline,
1378 unsigned long long *crash_size, 1761 unsigned long long *crash_size,
1379 unsigned long long *crash_base,
1380 const char *suffix) 1762 const char *suffix)
1381{ 1763{
1382 char *cur = cmdline; 1764 char *cur = cmdline;
@@ -1465,7 +1847,7 @@ static int __init __parse_crashkernel(char *cmdline,
1465 1847
1466 if (suffix) 1848 if (suffix)
1467 return parse_crashkernel_suffix(ck_cmdline, crash_size, 1849 return parse_crashkernel_suffix(ck_cmdline, crash_size,
1468 crash_base, suffix); 1850 suffix);
1469 /* 1851 /*
1470 * if the commandline contains a ':', then that's the extended 1852 * if the commandline contains a ':', then that's the extended
1471 * syntax -- if not, it must be the classic syntax 1853 * syntax -- if not, it must be the classic syntax
@@ -1632,6 +2014,672 @@ static int __init crash_save_vmcoreinfo_init(void)
1632 2014
1633subsys_initcall(crash_save_vmcoreinfo_init); 2015subsys_initcall(crash_save_vmcoreinfo_init);
1634 2016
2017#ifdef CONFIG_KEXEC_FILE
2018static int locate_mem_hole_top_down(unsigned long start, unsigned long end,
2019 struct kexec_buf *kbuf)
2020{
2021 struct kimage *image = kbuf->image;
2022 unsigned long temp_start, temp_end;
2023
2024 temp_end = min(end, kbuf->buf_max);
2025 temp_start = temp_end - kbuf->memsz;
2026
2027 do {
2028 /* align down start */
2029 temp_start = temp_start & (~(kbuf->buf_align - 1));
2030
2031 if (temp_start < start || temp_start < kbuf->buf_min)
2032 return 0;
2033
2034 temp_end = temp_start + kbuf->memsz - 1;
2035
2036 /*
2037 * Make sure this does not conflict with any of existing
2038 * segments
2039 */
2040 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2041 temp_start = temp_start - PAGE_SIZE;
2042 continue;
2043 }
2044
2045 /* We found a suitable memory range */
2046 break;
2047 } while (1);
2048
2049 /* If we are here, we found a suitable memory range */
2050 kbuf->mem = temp_start;
2051
2052 /* Success, stop navigating through remaining System RAM ranges */
2053 return 1;
2054}
2055
2056static int locate_mem_hole_bottom_up(unsigned long start, unsigned long end,
2057 struct kexec_buf *kbuf)
2058{
2059 struct kimage *image = kbuf->image;
2060 unsigned long temp_start, temp_end;
2061
2062 temp_start = max(start, kbuf->buf_min);
2063
2064 do {
2065 temp_start = ALIGN(temp_start, kbuf->buf_align);
2066 temp_end = temp_start + kbuf->memsz - 1;
2067
2068 if (temp_end > end || temp_end > kbuf->buf_max)
2069 return 0;
2070 /*
2071 * Make sure this does not conflict with any of existing
2072 * segments
2073 */
2074 if (kimage_is_destination_range(image, temp_start, temp_end)) {
2075 temp_start = temp_start + PAGE_SIZE;
2076 continue;
2077 }
2078
2079 /* We found a suitable memory range */
2080 break;
2081 } while (1);
2082
2083 /* If we are here, we found a suitable memory range */
2084 kbuf->mem = temp_start;
2085
2086 /* Success, stop navigating through remaining System RAM ranges */
2087 return 1;
2088}
2089
2090static int locate_mem_hole_callback(u64 start, u64 end, void *arg)
2091{
2092 struct kexec_buf *kbuf = (struct kexec_buf *)arg;
2093 unsigned long sz = end - start + 1;
2094
 2095 /* Returning 0 makes the caller move on to the next memory range */
2096 if (sz < kbuf->memsz)
2097 return 0;
2098
2099 if (end < kbuf->buf_min || start > kbuf->buf_max)
2100 return 0;
2101
2102 /*
 2103 * Allocate memory top down within the RAM range; otherwise, use
 2104 * bottom-up allocation.
2105 */
2106 if (kbuf->top_down)
2107 return locate_mem_hole_top_down(start, end, kbuf);
2108 return locate_mem_hole_bottom_up(start, end, kbuf);
2109}
2110
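Both hole locators above rely on the same power-of-two alignment arithmetic: the top-down walk rounds a candidate start down to the alignment, while the bottom-up walk rounds it up via ALIGN(). A throwaway user-space snippet (not from this patch) makes the difference concrete:

#include <stdio.h>

/* Hypothetical stand-ins for the kernel's align-down and ALIGN() round-up;
 * 'a' is assumed to be a power of two, as kbuf->buf_align is above. */
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))
#define ALIGN_UP(x, a)   (((x) + ((a) - 1)) & ~((a) - 1))

int main(void)
{
	unsigned long start = 0x12345, align = 0x1000;

	printf("top-down rounds down: %#lx\n", ALIGN_DOWN(start, align)); /* 0x12000 */
	printf("bottom-up rounds up:  %#lx\n", ALIGN_UP(start, align));   /* 0x13000 */
	return 0;
}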
2111/*
2112 * Helper function for placing a buffer in a kexec segment. This assumes
2113 * that kexec_mutex is held.
2114 */
2115int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
2116 unsigned long memsz, unsigned long buf_align,
2117 unsigned long buf_min, unsigned long buf_max,
2118 bool top_down, unsigned long *load_addr)
2119{
2120
2121 struct kexec_segment *ksegment;
2122 struct kexec_buf buf, *kbuf;
2123 int ret;
2124
 2125 /* Currently, adding a segment this way is allowed only in file mode */
2126 if (!image->file_mode)
2127 return -EINVAL;
2128
2129 if (image->nr_segments >= KEXEC_SEGMENT_MAX)
2130 return -EINVAL;
2131
2132 /*
 2133 * Make sure we are not trying to add a buffer after allocating
 2134 * control pages. All segments need to be placed before any
 2135 * control pages are allocated, because the control page allocation
 2136 * logic goes through the list of segments to make sure there are
 2137 * no destination overlaps.
2138 */
2139 if (!list_empty(&image->control_pages)) {
2140 WARN_ON(1);
2141 return -EINVAL;
2142 }
2143
2144 memset(&buf, 0, sizeof(struct kexec_buf));
2145 kbuf = &buf;
2146 kbuf->image = image;
2147 kbuf->buffer = buffer;
2148 kbuf->bufsz = bufsz;
2149
2150 kbuf->memsz = ALIGN(memsz, PAGE_SIZE);
2151 kbuf->buf_align = max(buf_align, PAGE_SIZE);
2152 kbuf->buf_min = buf_min;
2153 kbuf->buf_max = buf_max;
2154 kbuf->top_down = top_down;
2155
2156 /* Walk the RAM ranges and allocate a suitable range for the buffer */
2157 if (image->type == KEXEC_TYPE_CRASH)
2158 ret = walk_iomem_res("Crash kernel",
2159 IORESOURCE_MEM | IORESOURCE_BUSY,
2160 crashk_res.start, crashk_res.end, kbuf,
2161 locate_mem_hole_callback);
2162 else
2163 ret = walk_system_ram_res(0, -1, kbuf,
2164 locate_mem_hole_callback);
2165 if (ret != 1) {
2166 /* A suitable memory range could not be found for buffer */
2167 return -EADDRNOTAVAIL;
2168 }
2169
2170 /* Found a suitable memory range */
2171 ksegment = &image->segment[image->nr_segments];
2172 ksegment->kbuf = kbuf->buffer;
2173 ksegment->bufsz = kbuf->bufsz;
2174 ksegment->mem = kbuf->mem;
2175 ksegment->memsz = kbuf->memsz;
2176 image->nr_segments++;
2177 *load_addr = ksegment->mem;
2178 return 0;
2179}
2180
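To show how an architecture's file-mode loader is expected to use this helper, here is a hedged sketch (not from this patch; the function name, the address bounds and the blob being placed are all illustrative). As the comment above requires, kexec_mutex is assumed to be held by the caller.

/* Hypothetical caller: place an initrd blob somewhere above 1 MiB. */
static int example_place_initrd(struct kimage *image, char *initrd,
				unsigned long initrd_len)
{
	unsigned long load_addr;
	int ret;

	ret = kexec_add_buffer(image, initrd, initrd_len, initrd_len,
			       PAGE_SIZE,	/* buf_align */
			       0x100000,	/* buf_min: illustrative lower bound */
			       ULONG_MAX,	/* buf_max: no upper bound */
			       true,		/* prefer a top-down hole */
			       &load_addr);
	if (ret)
		return ret;

	pr_debug("initrd placed at 0x%lx\n", load_addr);
	return 0;
}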
2181/* Calculate and store the digest of segments */
2182static int kexec_calculate_store_digests(struct kimage *image)
2183{
2184 struct crypto_shash *tfm;
2185 struct shash_desc *desc;
2186 int ret = 0, i, j, zero_buf_sz, sha_region_sz;
2187 size_t desc_size, nullsz;
2188 char *digest;
2189 void *zero_buf;
2190 struct kexec_sha_region *sha_regions;
2191 struct purgatory_info *pi = &image->purgatory_info;
2192
2193 zero_buf = __va(page_to_pfn(ZERO_PAGE(0)) << PAGE_SHIFT);
2194 zero_buf_sz = PAGE_SIZE;
2195
2196 tfm = crypto_alloc_shash("sha256", 0, 0);
2197 if (IS_ERR(tfm)) {
2198 ret = PTR_ERR(tfm);
2199 goto out;
2200 }
2201
2202 desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
2203 desc = kzalloc(desc_size, GFP_KERNEL);
2204 if (!desc) {
2205 ret = -ENOMEM;
2206 goto out_free_tfm;
2207 }
2208
2209 sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
2210 sha_regions = vzalloc(sha_region_sz);
2211 if (!sha_regions)
2212 goto out_free_desc;
2213
2214 desc->tfm = tfm;
2215 desc->flags = 0;
2216
2217 ret = crypto_shash_init(desc);
2218 if (ret < 0)
2219 goto out_free_sha_regions;
2220
2221 digest = kzalloc(SHA256_DIGEST_SIZE, GFP_KERNEL);
2222 if (!digest) {
2223 ret = -ENOMEM;
2224 goto out_free_sha_regions;
2225 }
2226
2227 for (j = i = 0; i < image->nr_segments; i++) {
2228 struct kexec_segment *ksegment;
2229
2230 ksegment = &image->segment[i];
2231 /*
2232 * Skip purgatory as it will be modified once we put digest
2233 * info in purgatory.
2234 */
2235 if (ksegment->kbuf == pi->purgatory_buf)
2236 continue;
2237
2238 ret = crypto_shash_update(desc, ksegment->kbuf,
2239 ksegment->bufsz);
2240 if (ret)
2241 break;
2242
2243 /*
2244 * Assume rest of the buffer is filled with zero and
2245 * update digest accordingly.
2246 */
2247 nullsz = ksegment->memsz - ksegment->bufsz;
2248 while (nullsz) {
2249 unsigned long bytes = nullsz;
2250
2251 if (bytes > zero_buf_sz)
2252 bytes = zero_buf_sz;
2253 ret = crypto_shash_update(desc, zero_buf, bytes);
2254 if (ret)
2255 break;
2256 nullsz -= bytes;
2257 }
2258
2259 if (ret)
2260 break;
2261
2262 sha_regions[j].start = ksegment->mem;
2263 sha_regions[j].len = ksegment->memsz;
2264 j++;
2265 }
2266
2267 if (!ret) {
2268 ret = crypto_shash_final(desc, digest);
2269 if (ret)
2270 goto out_free_digest;
2271 ret = kexec_purgatory_get_set_symbol(image, "sha_regions",
2272 sha_regions, sha_region_sz, 0);
2273 if (ret)
2274 goto out_free_digest;
2275
2276 ret = kexec_purgatory_get_set_symbol(image, "sha256_digest",
2277 digest, SHA256_DIGEST_SIZE, 0);
2278 if (ret)
2279 goto out_free_digest;
2280 }
2281
2282out_free_digest:
2283 kfree(digest);
2284out_free_sha_regions:
2285 vfree(sha_regions);
2286out_free_desc:
2287 kfree(desc);
2288out_free_tfm:
2289 kfree(tfm);
2290out:
2291 return ret;
2292}
2293
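The digest loop above hashes each segment's bufsz bytes and then feeds (memsz - bufsz) zero bytes through the hash, so the padding that will be created at load time is covered as well. The standalone C sketch below (not from this patch) restates that pattern with a generic update callback instead of the kernel crypto API:

#include <stddef.h>

typedef int (*update_fn)(void *ctx, const void *data, size_t len);

/* Hash 'bufsz' real bytes, then account for the zero padding up to 'memsz'. */
static int hash_segment(update_fn update, void *ctx,
			const void *buf, size_t bufsz, size_t memsz)
{
	static const unsigned char zeros[4096];
	size_t remaining = memsz - bufsz;
	int ret = update(ctx, buf, bufsz);

	while (!ret && remaining) {
		size_t chunk = remaining < sizeof(zeros) ? remaining : sizeof(zeros);

		ret = update(ctx, zeros, chunk);
		remaining -= chunk;
	}
	return ret;
}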
2294/* Actually load purgatory. Lot of code taken from kexec-tools */
2295static int __kexec_load_purgatory(struct kimage *image, unsigned long min,
2296 unsigned long max, int top_down)
2297{
2298 struct purgatory_info *pi = &image->purgatory_info;
2299 unsigned long align, buf_align, bss_align, buf_sz, bss_sz, bss_pad;
2300 unsigned long memsz, entry, load_addr, curr_load_addr, bss_addr, offset;
2301 unsigned char *buf_addr, *src;
2302 int i, ret = 0, entry_sidx = -1;
2303 const Elf_Shdr *sechdrs_c;
2304 Elf_Shdr *sechdrs = NULL;
2305 void *purgatory_buf = NULL;
2306
2307 /*
 2308 * sechdrs_c points to the section headers in purgatory, which are
 2309 * read only. No modifications are allowed.
2310 */
2311 sechdrs_c = (void *)pi->ehdr + pi->ehdr->e_shoff;
2312
2313 /*
 2314 * We cannot modify sechdrs_c[] or its fields; they are read only.
2315 * Copy it over to a local copy where one can store some temporary
2316 * data and free it at the end. We need to modify ->sh_addr and
2317 * ->sh_offset fields to keep track of permanent and temporary
2318 * locations of sections.
2319 */
2320 sechdrs = vzalloc(pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2321 if (!sechdrs)
2322 return -ENOMEM;
2323
2324 memcpy(sechdrs, sechdrs_c, pi->ehdr->e_shnum * sizeof(Elf_Shdr));
2325
2326 /*
 2327 * There end up being multiple copies of the sections. The first copy
 2328 * is the one embedded in the kernel's read-only section. Some of these
 2329 * sections will be copied to a temporary buffer and relocated, and
 2330 * those sections will finally be copied to their final destination at
 2331 * segment load time.
2332 *
2333 * Use ->sh_offset to reflect section address in memory. It will
2334 * point to original read only copy if section is not allocatable.
2335 * Otherwise it will point to temporary copy which will be relocated.
2336 *
2337 * Use ->sh_addr to contain final address of the section where it
2338 * will go during execution time.
2339 */
2340 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2341 if (sechdrs[i].sh_type == SHT_NOBITS)
2342 continue;
2343
2344 sechdrs[i].sh_offset = (unsigned long)pi->ehdr +
2345 sechdrs[i].sh_offset;
2346 }
2347
2348 /*
2349 * Identify entry point section and make entry relative to section
2350 * start.
2351 */
2352 entry = pi->ehdr->e_entry;
2353 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2354 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2355 continue;
2356
2357 if (!(sechdrs[i].sh_flags & SHF_EXECINSTR))
2358 continue;
2359
2360 /* Make entry section relative */
2361 if (sechdrs[i].sh_addr <= pi->ehdr->e_entry &&
2362 ((sechdrs[i].sh_addr + sechdrs[i].sh_size) >
2363 pi->ehdr->e_entry)) {
2364 entry_sidx = i;
2365 entry -= sechdrs[i].sh_addr;
2366 break;
2367 }
2368 }
2369
2370 /* Determine how much memory is needed to load relocatable object. */
2371 buf_align = 1;
2372 bss_align = 1;
2373 buf_sz = 0;
2374 bss_sz = 0;
2375
2376 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2377 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2378 continue;
2379
2380 align = sechdrs[i].sh_addralign;
2381 if (sechdrs[i].sh_type != SHT_NOBITS) {
2382 if (buf_align < align)
2383 buf_align = align;
2384 buf_sz = ALIGN(buf_sz, align);
2385 buf_sz += sechdrs[i].sh_size;
2386 } else {
2387 /* bss section */
2388 if (bss_align < align)
2389 bss_align = align;
2390 bss_sz = ALIGN(bss_sz, align);
2391 bss_sz += sechdrs[i].sh_size;
2392 }
2393 }
2394
2395 /* Determine the bss padding required to align bss properly */
2396 bss_pad = 0;
2397 if (buf_sz & (bss_align - 1))
2398 bss_pad = bss_align - (buf_sz & (bss_align - 1));
2399
2400 memsz = buf_sz + bss_pad + bss_sz;
2401
2402 /* Allocate buffer for purgatory */
2403 purgatory_buf = vzalloc(buf_sz);
2404 if (!purgatory_buf) {
2405 ret = -ENOMEM;
2406 goto out;
2407 }
2408
2409 if (buf_align < bss_align)
2410 buf_align = bss_align;
2411
2412 /* Add buffer to segment list */
2413 ret = kexec_add_buffer(image, purgatory_buf, buf_sz, memsz,
2414 buf_align, min, max, top_down,
2415 &pi->purgatory_load_addr);
2416 if (ret)
2417 goto out;
2418
2419 /* Load SHF_ALLOC sections */
2420 buf_addr = purgatory_buf;
2421 load_addr = curr_load_addr = pi->purgatory_load_addr;
2422 bss_addr = load_addr + buf_sz + bss_pad;
2423
2424 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2425 if (!(sechdrs[i].sh_flags & SHF_ALLOC))
2426 continue;
2427
2428 align = sechdrs[i].sh_addralign;
2429 if (sechdrs[i].sh_type != SHT_NOBITS) {
2430 curr_load_addr = ALIGN(curr_load_addr, align);
2431 offset = curr_load_addr - load_addr;
 2432 /* We already modified ->sh_offset to keep the src addr */
2433 src = (char *) sechdrs[i].sh_offset;
2434 memcpy(buf_addr + offset, src, sechdrs[i].sh_size);
2435
2436 /* Store load address and source address of section */
2437 sechdrs[i].sh_addr = curr_load_addr;
2438
2439 /*
2440 * This section got copied to temporary buffer. Update
2441 * ->sh_offset accordingly.
2442 */
2443 sechdrs[i].sh_offset = (unsigned long)(buf_addr + offset);
2444
2445 /* Advance to the next address */
2446 curr_load_addr += sechdrs[i].sh_size;
2447 } else {
2448 bss_addr = ALIGN(bss_addr, align);
2449 sechdrs[i].sh_addr = bss_addr;
2450 bss_addr += sechdrs[i].sh_size;
2451 }
2452 }
2453
2454 /* Update entry point based on load address of text section */
2455 if (entry_sidx >= 0)
2456 entry += sechdrs[entry_sidx].sh_addr;
2457
2458 /* Make kernel jump to purgatory after shutdown */
2459 image->start = entry;
2460
2461 /* Used later to get/set symbol values */
2462 pi->sechdrs = sechdrs;
2463
2464 /*
2465 * Used later to identify which section is purgatory and skip it
2466 * from checksumming.
2467 */
2468 pi->purgatory_buf = purgatory_buf;
2469 return ret;
2470out:
2471 vfree(sechdrs);
2472 vfree(purgatory_buf);
2473 return ret;
2474}
2475
2476static int kexec_apply_relocations(struct kimage *image)
2477{
2478 int i, ret;
2479 struct purgatory_info *pi = &image->purgatory_info;
2480 Elf_Shdr *sechdrs = pi->sechdrs;
2481
2482 /* Apply relocations */
2483 for (i = 0; i < pi->ehdr->e_shnum; i++) {
2484 Elf_Shdr *section, *symtab;
2485
2486 if (sechdrs[i].sh_type != SHT_RELA &&
2487 sechdrs[i].sh_type != SHT_REL)
2488 continue;
2489
2490 /*
 2491 * For a section of type SHT_RELA/SHT_REL,
 2492 * ->sh_link contains the section header index of the associated
 2493 * symbol table, and ->sh_info contains the section header
 2494 * index of the section to which the relocations apply.
2495 */
2496 if (sechdrs[i].sh_info >= pi->ehdr->e_shnum ||
2497 sechdrs[i].sh_link >= pi->ehdr->e_shnum)
2498 return -ENOEXEC;
2499
2500 section = &sechdrs[sechdrs[i].sh_info];
2501 symtab = &sechdrs[sechdrs[i].sh_link];
2502
2503 if (!(section->sh_flags & SHF_ALLOC))
2504 continue;
2505
2506 /*
 2507 * symtab->sh_link contains the section header index of the
 2508 * associated string table.
2509 */
2510 if (symtab->sh_link >= pi->ehdr->e_shnum)
2511 /* Invalid section number? */
2512 continue;
2513
2514 /*
 2515 * The respective architecture needs to provide support for applying
2516 * relocations of type SHT_RELA/SHT_REL.
2517 */
2518 if (sechdrs[i].sh_type == SHT_RELA)
2519 ret = arch_kexec_apply_relocations_add(pi->ehdr,
2520 sechdrs, i);
2521 else if (sechdrs[i].sh_type == SHT_REL)
2522 ret = arch_kexec_apply_relocations(pi->ehdr,
2523 sechdrs, i);
2524 if (ret)
2525 return ret;
2526 }
2527
2528 return 0;
2529}
2530
2531/* Load relocatable purgatory object and relocate it appropriately */
2532int kexec_load_purgatory(struct kimage *image, unsigned long min,
2533 unsigned long max, int top_down,
2534 unsigned long *load_addr)
2535{
2536 struct purgatory_info *pi = &image->purgatory_info;
2537 int ret;
2538
2539 if (kexec_purgatory_size <= 0)
2540 return -EINVAL;
2541
2542 if (kexec_purgatory_size < sizeof(Elf_Ehdr))
2543 return -ENOEXEC;
2544
2545 pi->ehdr = (Elf_Ehdr *)kexec_purgatory;
2546
2547 if (memcmp(pi->ehdr->e_ident, ELFMAG, SELFMAG) != 0
2548 || pi->ehdr->e_type != ET_REL
2549 || !elf_check_arch(pi->ehdr)
2550 || pi->ehdr->e_shentsize != sizeof(Elf_Shdr))
2551 return -ENOEXEC;
2552
2553 if (pi->ehdr->e_shoff >= kexec_purgatory_size
2554 || (pi->ehdr->e_shnum * sizeof(Elf_Shdr) >
2555 kexec_purgatory_size - pi->ehdr->e_shoff))
2556 return -ENOEXEC;
2557
2558 ret = __kexec_load_purgatory(image, min, max, top_down);
2559 if (ret)
2560 return ret;
2561
2562 ret = kexec_apply_relocations(image);
2563 if (ret)
2564 goto out;
2565
2566 *load_addr = pi->purgatory_load_addr;
2567 return 0;
2568out:
2569 vfree(pi->sechdrs);
2570 vfree(pi->purgatory_buf);
2571 return ret;
2572}
2573
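A hedged sketch of the expected call sequence from an architecture loader follows (not from this patch): load purgatory into the image, then patch its symbols with kexec_purgatory_get_set_symbol(), defined further down in this file. The "stack_end" symbol and the address bounds are purely illustrative.

static int example_setup_purgatory(struct kimage *image,
				   unsigned long min, unsigned long max)
{
	unsigned long purgatory_load_addr;
	unsigned long stack_top = 0;	/* hypothetical value to patch in */
	int ret;

	ret = kexec_load_purgatory(image, min, max, 1 /* top_down */,
				   &purgatory_load_addr);
	if (ret)
		return ret;

	/* get_value == 0: write the buffer into the purgatory symbol */
	return kexec_purgatory_get_set_symbol(image, "stack_end", &stack_top,
					      sizeof(stack_top), 0);
}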
2574static Elf_Sym *kexec_purgatory_find_symbol(struct purgatory_info *pi,
2575 const char *name)
2576{
2577 Elf_Sym *syms;
2578 Elf_Shdr *sechdrs;
2579 Elf_Ehdr *ehdr;
2580 int i, k;
2581 const char *strtab;
2582
2583 if (!pi->sechdrs || !pi->ehdr)
2584 return NULL;
2585
2586 sechdrs = pi->sechdrs;
2587 ehdr = pi->ehdr;
2588
2589 for (i = 0; i < ehdr->e_shnum; i++) {
2590 if (sechdrs[i].sh_type != SHT_SYMTAB)
2591 continue;
2592
2593 if (sechdrs[i].sh_link >= ehdr->e_shnum)
2594 /* Invalid strtab section number */
2595 continue;
2596 strtab = (char *)sechdrs[sechdrs[i].sh_link].sh_offset;
2597 syms = (Elf_Sym *)sechdrs[i].sh_offset;
2598
2599 /* Go through symbols for a match */
2600 for (k = 0; k < sechdrs[i].sh_size/sizeof(Elf_Sym); k++) {
2601 if (ELF_ST_BIND(syms[k].st_info) != STB_GLOBAL)
2602 continue;
2603
2604 if (strcmp(strtab + syms[k].st_name, name) != 0)
2605 continue;
2606
2607 if (syms[k].st_shndx == SHN_UNDEF ||
2608 syms[k].st_shndx >= ehdr->e_shnum) {
2609 pr_debug("Symbol: %s has bad section index %d.\n",
2610 name, syms[k].st_shndx);
2611 return NULL;
2612 }
2613
2614 /* Found the symbol we are looking for */
2615 return &syms[k];
2616 }
2617 }
2618
2619 return NULL;
2620}
2621
2622void *kexec_purgatory_get_symbol_addr(struct kimage *image, const char *name)
2623{
2624 struct purgatory_info *pi = &image->purgatory_info;
2625 Elf_Sym *sym;
2626 Elf_Shdr *sechdr;
2627
2628 sym = kexec_purgatory_find_symbol(pi, name);
2629 if (!sym)
2630 return ERR_PTR(-EINVAL);
2631
2632 sechdr = &pi->sechdrs[sym->st_shndx];
2633
2634 /*
2635 * Returns the address where symbol will finally be loaded after
2636 * kexec_load_segment()
2637 */
2638 return (void *)(sechdr->sh_addr + sym->st_value);
2639}
2640
2641/*
2642 * Get or set value of a symbol. If "get_value" is true, symbol value is
2643 * returned in buf otherwise symbol value is set based on value in buf.
2644 */
2645int kexec_purgatory_get_set_symbol(struct kimage *image, const char *name,
2646 void *buf, unsigned int size, bool get_value)
2647{
2648 Elf_Sym *sym;
2649 Elf_Shdr *sechdrs;
2650 struct purgatory_info *pi = &image->purgatory_info;
2651 char *sym_buf;
2652
2653 sym = kexec_purgatory_find_symbol(pi, name);
2654 if (!sym)
2655 return -EINVAL;
2656
2657 if (sym->st_size != size) {
2658 pr_err("symbol %s size mismatch: expected %lu actual %u\n",
2659 name, (unsigned long)sym->st_size, size);
2660 return -EINVAL;
2661 }
2662
2663 sechdrs = pi->sechdrs;
2664
2665 if (sechdrs[sym->st_shndx].sh_type == SHT_NOBITS) {
2666 pr_err("symbol %s is in a bss section. Cannot %s\n", name,
2667 get_value ? "get" : "set");
2668 return -EINVAL;
2669 }
2670
2671 sym_buf = (unsigned char *)sechdrs[sym->st_shndx].sh_offset +
2672 sym->st_value;
2673
2674 if (get_value)
2675 memcpy((void *)buf, sym_buf, size);
2676 else
2677 memcpy((void *)sym_buf, buf, size);
2678
2679 return 0;
2680}
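For the "get" direction of the helper above, a small hedged example (not from this patch) reads back the digest that kexec_calculate_store_digests() stored, purely for debugging; it assumes it runs before the temporary purgatory buffers are freed.

static void example_dump_digest(struct kimage *image)
{
	u8 digest[SHA256_DIGEST_SIZE];

	if (!kexec_purgatory_get_set_symbol(image, "sha256_digest", digest,
					    sizeof(digest), 1 /* get */))
		print_hex_dump_bytes("kexec digest: ", DUMP_PREFIX_OFFSET,
				     digest, sizeof(digest));
}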
2681#endif /* CONFIG_KEXEC_FILE */
2682
1635/* 2683/*
1636 * Move into place and start executing a preloaded standalone 2684 * Move into place and start executing a preloaded standalone
1637 * executable. If nothing was preloaded return an error. 2685 * executable. If nothing was preloaded return an error.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8637e041a247..80f7a6d00519 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -196,12 +196,34 @@ int __request_module(bool wait, const char *fmt, ...)
196EXPORT_SYMBOL(__request_module); 196EXPORT_SYMBOL(__request_module);
197#endif /* CONFIG_MODULES */ 197#endif /* CONFIG_MODULES */
198 198
199static void call_usermodehelper_freeinfo(struct subprocess_info *info)
200{
201 if (info->cleanup)
202 (*info->cleanup)(info);
203 kfree(info);
204}
205
206static void umh_complete(struct subprocess_info *sub_info)
207{
208 struct completion *comp = xchg(&sub_info->complete, NULL);
209 /*
 210 * See call_usermodehelper_exec(). If xchg() returns NULL,
 211 * we own sub_info: either the UMH_KILLABLE caller has gone away
 212 * or the caller used UMH_NO_WAIT.
213 */
214 if (comp)
215 complete(comp);
216 else
217 call_usermodehelper_freeinfo(sub_info);
218}
219
199/* 220/*
200 * This is the task which runs the usermode application 221 * This is the task which runs the usermode application
201 */ 222 */
202static int ____call_usermodehelper(void *data) 223static int ____call_usermodehelper(void *data)
203{ 224{
204 struct subprocess_info *sub_info = data; 225 struct subprocess_info *sub_info = data;
226 int wait = sub_info->wait & ~UMH_KILLABLE;
205 struct cred *new; 227 struct cred *new;
206 int retval; 228 int retval;
207 229
@@ -221,7 +243,7 @@ static int ____call_usermodehelper(void *data)
221 retval = -ENOMEM; 243 retval = -ENOMEM;
222 new = prepare_kernel_cred(current); 244 new = prepare_kernel_cred(current);
223 if (!new) 245 if (!new)
224 goto fail; 246 goto out;
225 247
226 spin_lock(&umh_sysctl_lock); 248 spin_lock(&umh_sysctl_lock);
227 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset); 249 new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
@@ -233,7 +255,7 @@ static int ____call_usermodehelper(void *data)
233 retval = sub_info->init(sub_info, new); 255 retval = sub_info->init(sub_info, new);
234 if (retval) { 256 if (retval) {
235 abort_creds(new); 257 abort_creds(new);
236 goto fail; 258 goto out;
237 } 259 }
238 } 260 }
239 261
@@ -242,12 +264,13 @@ static int ____call_usermodehelper(void *data)
242 retval = do_execve(getname_kernel(sub_info->path), 264 retval = do_execve(getname_kernel(sub_info->path),
243 (const char __user *const __user *)sub_info->argv, 265 (const char __user *const __user *)sub_info->argv,
244 (const char __user *const __user *)sub_info->envp); 266 (const char __user *const __user *)sub_info->envp);
267out:
268 sub_info->retval = retval;
 269 /* wait_for_helper() will call umh_complete if UMH_WAIT_PROC. */
270 if (wait != UMH_WAIT_PROC)
271 umh_complete(sub_info);
245 if (!retval) 272 if (!retval)
246 return 0; 273 return 0;
247
248 /* Exec failed? */
249fail:
250 sub_info->retval = retval;
251 do_exit(0); 274 do_exit(0);
252} 275}
253 276
@@ -258,26 +281,6 @@ static int call_helper(void *data)
258 return ____call_usermodehelper(data); 281 return ____call_usermodehelper(data);
259} 282}
260 283
261static void call_usermodehelper_freeinfo(struct subprocess_info *info)
262{
263 if (info->cleanup)
264 (*info->cleanup)(info);
265 kfree(info);
266}
267
268static void umh_complete(struct subprocess_info *sub_info)
269{
270 struct completion *comp = xchg(&sub_info->complete, NULL);
271 /*
272 * See call_usermodehelper_exec(). If xchg() returns NULL
273 * we own sub_info, the UMH_KILLABLE caller has gone away.
274 */
275 if (comp)
276 complete(comp);
277 else
278 call_usermodehelper_freeinfo(sub_info);
279}
280
281/* Keventd can't block, but this (a child) can. */ 284/* Keventd can't block, but this (a child) can. */
282static int wait_for_helper(void *data) 285static int wait_for_helper(void *data)
283{ 286{
@@ -336,18 +339,8 @@ static void __call_usermodehelper(struct work_struct *work)
336 kmod_thread_locker = NULL; 339 kmod_thread_locker = NULL;
337 } 340 }
338 341
339 switch (wait) { 342 if (pid < 0) {
340 case UMH_NO_WAIT: 343 sub_info->retval = pid;
341 call_usermodehelper_freeinfo(sub_info);
342 break;
343
344 case UMH_WAIT_PROC:
345 if (pid > 0)
346 break;
347 /* FALLTHROUGH */
348 case UMH_WAIT_EXEC:
349 if (pid < 0)
350 sub_info->retval = pid;
351 umh_complete(sub_info); 344 umh_complete(sub_info);
352 } 345 }
353} 346}
@@ -588,7 +581,12 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
588 goto out; 581 goto out;
589 } 582 }
590 583
591 sub_info->complete = &done; 584 /*
585 * Set the completion pointer only if there is a waiter.
586 * This makes it possible to use umh_complete to free
587 * the data structure in case of UMH_NO_WAIT.
588 */
589 sub_info->complete = (wait == UMH_NO_WAIT) ? NULL : &done;
592 sub_info->wait = wait; 590 sub_info->wait = wait;
593 591
594 queue_work(khelper_wq, &sub_info->work); 592 queue_work(khelper_wq, &sub_info->work);
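To see why the NULL completion pointer matters, consider a hedged fire-and-forget caller (not from this patch; the helper path and arguments are invented). With UMH_NO_WAIT, sub_info->complete stays NULL, so umh_complete() frees the subprocess_info instead of signalling a waiter that no longer exists.

static void example_fire_and_forget(void)
{
	char *argv[] = { "/sbin/example-agent", "--notify", NULL };
	char *envp[] = { "HOME=/", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL };

	/* No completion is set up for UMH_NO_WAIT; cleanup happens in umh_complete(). */
	call_usermodehelper(argv[0], argv, envp, UMH_NO_WAIT);
}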
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 734e9a7d280b..3995f546d0f3 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1778,7 +1778,18 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
1778 unsigned long hash, flags = 0; 1778 unsigned long hash, flags = 0;
1779 struct kretprobe_instance *ri; 1779 struct kretprobe_instance *ri;
1780 1780
1781 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1781 /*
1782 * To avoid deadlocks, prohibit return probing in NMI contexts,
1783 * just skip the probe and increase the (inexact) 'nmissed'
1784 * statistical counter, so that the user is informed that
1785 * something happened:
1786 */
1787 if (unlikely(in_nmi())) {
1788 rp->nmissed++;
1789 return 0;
1790 }
1791
1792 /* TODO: consider to only swap the RA after the last pre_handler fired */
1782 hash = hash_ptr(current, KPROBE_HASH_BITS); 1793 hash = hash_ptr(current, KPROBE_HASH_BITS);
1783 raw_spin_lock_irqsave(&rp->lock, flags); 1794 raw_spin_lock_irqsave(&rp->lock, flags);
1784 if (!hlist_empty(&rp->free_instances)) { 1795 if (!hlist_empty(&rp->free_instances)) {
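The new nmissed bump is visible to any kretprobe user; a minimal hedged sketch (not from this patch, and the probed symbol is only illustrative) shows where the counter surfaces:

static int example_ret_handler(struct kretprobe_instance *ri,
			       struct pt_regs *regs)
{
	return 0;
}

static struct kretprobe example_rp = {
	.handler	= example_ret_handler,
	.kp.symbol_name	= "do_fork",
	.maxactive	= 16,
};

/* After register_kretprobe(&example_rp) has run for a while:
 *	pr_info("missed %d returns\n", example_rp.nmissed);
 * and unregister_kretprobe(&example_rp) when done. */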
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
369{ 369{
370 struct task_struct *p; 370 struct task_struct *p;
371 371
372 p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt, 372 p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
373 cpu); 373 cpu);
374 if (IS_ERR(p)) 374 if (IS_ERR(p))
375 return p; 375 return p;
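The change above only alters which NUMA node is passed to kthread_create_on_node(); callers use the function exactly as before. A hedged usage sketch (not from this patch; the thread function and name format are illustrative):

static int example_cpu_worker(void *data)
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);
	return 0;
}

/* struct task_struct *t =
 *	kthread_create_on_cpu(example_cpu_worker, NULL, cpu, "example/%u");
 * if (!IS_ERR(t))
 *	wake_up_process(t);	the thread starts stopped until woken */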
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index 0955b885d0dc..ec8cce259779 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -20,30 +20,20 @@
20 * Author: Paul E. McKenney <paulmck@us.ibm.com> 20 * Author: Paul E. McKenney <paulmck@us.ibm.com>
21 * Based on kernel/rcu/torture.c. 21 * Based on kernel/rcu/torture.c.
22 */ 22 */
23#include <linux/types.h>
24#include <linux/kernel.h> 23#include <linux/kernel.h>
25#include <linux/init.h>
26#include <linux/module.h> 24#include <linux/module.h>
27#include <linux/kthread.h> 25#include <linux/kthread.h>
28#include <linux/err.h>
29#include <linux/spinlock.h> 26#include <linux/spinlock.h>
27#include <linux/rwlock.h>
28#include <linux/mutex.h>
29#include <linux/rwsem.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/interrupt.h> 31#include <linux/interrupt.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/atomic.h> 33#include <linux/atomic.h>
34#include <linux/bitops.h>
35#include <linux/completion.h>
36#include <linux/moduleparam.h> 34#include <linux/moduleparam.h>
37#include <linux/percpu.h>
38#include <linux/notifier.h>
39#include <linux/reboot.h>
40#include <linux/freezer.h>
41#include <linux/cpu.h>
42#include <linux/delay.h> 35#include <linux/delay.h>
43#include <linux/stat.h>
44#include <linux/slab.h> 36#include <linux/slab.h>
45#include <linux/trace_clock.h>
46#include <asm/byteorder.h>
47#include <linux/torture.h> 37#include <linux/torture.h>
48 38
49MODULE_LICENSE("GPL"); 39MODULE_LICENSE("GPL");
@@ -51,6 +41,8 @@ MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>");
51 41
52torture_param(int, nwriters_stress, -1, 42torture_param(int, nwriters_stress, -1,
53 "Number of write-locking stress-test threads"); 43 "Number of write-locking stress-test threads");
44torture_param(int, nreaders_stress, -1,
45 "Number of read-locking stress-test threads");
54torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); 46torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)");
55torture_param(int, onoff_interval, 0, 47torture_param(int, onoff_interval, 0,
56 "Time between CPU hotplugs (s), 0=disable"); 48 "Time between CPU hotplugs (s), 0=disable");
@@ -66,30 +58,28 @@ torture_param(bool, verbose, true,
66static char *torture_type = "spin_lock"; 58static char *torture_type = "spin_lock";
67module_param(torture_type, charp, 0444); 59module_param(torture_type, charp, 0444);
68MODULE_PARM_DESC(torture_type, 60MODULE_PARM_DESC(torture_type,
69 "Type of lock to torture (spin_lock, spin_lock_irq, ...)"); 61 "Type of lock to torture (spin_lock, spin_lock_irq, mutex_lock, ...)");
70
71static atomic_t n_lock_torture_errors;
72 62
73static struct task_struct *stats_task; 63static struct task_struct *stats_task;
74static struct task_struct **writer_tasks; 64static struct task_struct **writer_tasks;
65static struct task_struct **reader_tasks;
75 66
76static int nrealwriters_stress;
77static bool lock_is_write_held; 67static bool lock_is_write_held;
68static bool lock_is_read_held;
78 69
79struct lock_writer_stress_stats { 70struct lock_stress_stats {
80 long n_write_lock_fail; 71 long n_lock_fail;
81 long n_write_lock_acquired; 72 long n_lock_acquired;
82}; 73};
83static struct lock_writer_stress_stats *lwsa;
84 74
85#if defined(MODULE) 75#if defined(MODULE)
86#define LOCKTORTURE_RUNNABLE_INIT 1 76#define LOCKTORTURE_RUNNABLE_INIT 1
87#else 77#else
88#define LOCKTORTURE_RUNNABLE_INIT 0 78#define LOCKTORTURE_RUNNABLE_INIT 0
89#endif 79#endif
90int locktorture_runnable = LOCKTORTURE_RUNNABLE_INIT; 80int torture_runnable = LOCKTORTURE_RUNNABLE_INIT;
91module_param(locktorture_runnable, int, 0444); 81module_param(torture_runnable, int, 0444);
92MODULE_PARM_DESC(locktorture_runnable, "Start locktorture at module init"); 82MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init");
93 83
94/* Forward reference. */ 84/* Forward reference. */
95static void lock_torture_cleanup(void); 85static void lock_torture_cleanup(void);
@@ -102,12 +92,25 @@ struct lock_torture_ops {
102 int (*writelock)(void); 92 int (*writelock)(void);
103 void (*write_delay)(struct torture_random_state *trsp); 93 void (*write_delay)(struct torture_random_state *trsp);
104 void (*writeunlock)(void); 94 void (*writeunlock)(void);
95 int (*readlock)(void);
96 void (*read_delay)(struct torture_random_state *trsp);
97 void (*readunlock)(void);
105 unsigned long flags; 98 unsigned long flags;
106 const char *name; 99 const char *name;
107}; 100};
108 101
109static struct lock_torture_ops *cur_ops; 102struct lock_torture_cxt {
110 103 int nrealwriters_stress;
104 int nrealreaders_stress;
105 bool debug_lock;
106 atomic_t n_lock_torture_errors;
107 struct lock_torture_ops *cur_ops;
108 struct lock_stress_stats *lwsa; /* writer statistics */
109 struct lock_stress_stats *lrsa; /* reader statistics */
110};
111static struct lock_torture_cxt cxt = { 0, 0, false,
112 ATOMIC_INIT(0),
113 NULL, NULL};
111/* 114/*
112 * Definitions for lock torture testing. 115 * Definitions for lock torture testing.
113 */ 116 */
@@ -123,10 +126,10 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp)
123 126
124 /* We want a long delay occasionally to force massive contention. */ 127 /* We want a long delay occasionally to force massive contention. */
125 if (!(torture_random(trsp) % 128 if (!(torture_random(trsp) %
126 (nrealwriters_stress * 2000 * longdelay_us))) 129 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
127 mdelay(longdelay_us); 130 mdelay(longdelay_us);
128#ifdef CONFIG_PREEMPT 131#ifdef CONFIG_PREEMPT
129 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 132 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
130 preempt_schedule(); /* Allow test to be preempted. */ 133 preempt_schedule(); /* Allow test to be preempted. */
131#endif 134#endif
132} 135}
@@ -140,6 +143,9 @@ static struct lock_torture_ops lock_busted_ops = {
140 .writelock = torture_lock_busted_write_lock, 143 .writelock = torture_lock_busted_write_lock,
141 .write_delay = torture_lock_busted_write_delay, 144 .write_delay = torture_lock_busted_write_delay,
142 .writeunlock = torture_lock_busted_write_unlock, 145 .writeunlock = torture_lock_busted_write_unlock,
146 .readlock = NULL,
147 .read_delay = NULL,
148 .readunlock = NULL,
143 .name = "lock_busted" 149 .name = "lock_busted"
144}; 150};
145 151
@@ -160,13 +166,13 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp)
160 * we want a long delay occasionally to force massive contention. 166 * we want a long delay occasionally to force massive contention.
161 */ 167 */
162 if (!(torture_random(trsp) % 168 if (!(torture_random(trsp) %
163 (nrealwriters_stress * 2000 * longdelay_us))) 169 (cxt.nrealwriters_stress * 2000 * longdelay_us)))
164 mdelay(longdelay_us); 170 mdelay(longdelay_us);
165 if (!(torture_random(trsp) % 171 if (!(torture_random(trsp) %
166 (nrealwriters_stress * 2 * shortdelay_us))) 172 (cxt.nrealwriters_stress * 2 * shortdelay_us)))
167 udelay(shortdelay_us); 173 udelay(shortdelay_us);
168#ifdef CONFIG_PREEMPT 174#ifdef CONFIG_PREEMPT
169 if (!(torture_random(trsp) % (nrealwriters_stress * 20000))) 175 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
170 preempt_schedule(); /* Allow test to be preempted. */ 176 preempt_schedule(); /* Allow test to be preempted. */
171#endif 177#endif
172} 178}
@@ -180,39 +186,253 @@ static struct lock_torture_ops spin_lock_ops = {
180 .writelock = torture_spin_lock_write_lock, 186 .writelock = torture_spin_lock_write_lock,
181 .write_delay = torture_spin_lock_write_delay, 187 .write_delay = torture_spin_lock_write_delay,
182 .writeunlock = torture_spin_lock_write_unlock, 188 .writeunlock = torture_spin_lock_write_unlock,
189 .readlock = NULL,
190 .read_delay = NULL,
191 .readunlock = NULL,
183 .name = "spin_lock" 192 .name = "spin_lock"
184}; 193};
185 194
186static int torture_spin_lock_write_lock_irq(void) 195static int torture_spin_lock_write_lock_irq(void)
187__acquires(torture_spinlock_irq) 196__acquires(torture_spinlock)
188{ 197{
189 unsigned long flags; 198 unsigned long flags;
190 199
191 spin_lock_irqsave(&torture_spinlock, flags); 200 spin_lock_irqsave(&torture_spinlock, flags);
192 cur_ops->flags = flags; 201 cxt.cur_ops->flags = flags;
193 return 0; 202 return 0;
194} 203}
195 204
196static void torture_lock_spin_write_unlock_irq(void) 205static void torture_lock_spin_write_unlock_irq(void)
197__releases(torture_spinlock) 206__releases(torture_spinlock)
198{ 207{
199 spin_unlock_irqrestore(&torture_spinlock, cur_ops->flags); 208 spin_unlock_irqrestore(&torture_spinlock, cxt.cur_ops->flags);
200} 209}
201 210
202static struct lock_torture_ops spin_lock_irq_ops = { 211static struct lock_torture_ops spin_lock_irq_ops = {
203 .writelock = torture_spin_lock_write_lock_irq, 212 .writelock = torture_spin_lock_write_lock_irq,
204 .write_delay = torture_spin_lock_write_delay, 213 .write_delay = torture_spin_lock_write_delay,
205 .writeunlock = torture_lock_spin_write_unlock_irq, 214 .writeunlock = torture_lock_spin_write_unlock_irq,
215 .readlock = NULL,
216 .read_delay = NULL,
217 .readunlock = NULL,
206 .name = "spin_lock_irq" 218 .name = "spin_lock_irq"
207}; 219};
208 220
221static DEFINE_RWLOCK(torture_rwlock);
222
223static int torture_rwlock_write_lock(void) __acquires(torture_rwlock)
224{
225 write_lock(&torture_rwlock);
226 return 0;
227}
228
229static void torture_rwlock_write_delay(struct torture_random_state *trsp)
230{
231 const unsigned long shortdelay_us = 2;
232 const unsigned long longdelay_ms = 100;
233
234 /* We want a short delay mostly to emulate likely code, and
235 * we want a long delay occasionally to force massive contention.
236 */
237 if (!(torture_random(trsp) %
238 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
239 mdelay(longdelay_ms);
240 else
241 udelay(shortdelay_us);
242}
243
244static void torture_rwlock_write_unlock(void) __releases(torture_rwlock)
245{
246 write_unlock(&torture_rwlock);
247}
248
249static int torture_rwlock_read_lock(void) __acquires(torture_rwlock)
250{
251 read_lock(&torture_rwlock);
252 return 0;
253}
254
255static void torture_rwlock_read_delay(struct torture_random_state *trsp)
256{
257 const unsigned long shortdelay_us = 10;
258 const unsigned long longdelay_ms = 100;
259
260 /* We want a short delay mostly to emulate likely code, and
261 * we want a long delay occasionally to force massive contention.
262 */
263 if (!(torture_random(trsp) %
264 (cxt.nrealreaders_stress * 2000 * longdelay_ms)))
265 mdelay(longdelay_ms);
266 else
267 udelay(shortdelay_us);
268}
269
270static void torture_rwlock_read_unlock(void) __releases(torture_rwlock)
271{
272 read_unlock(&torture_rwlock);
273}
274
275static struct lock_torture_ops rw_lock_ops = {
276 .writelock = torture_rwlock_write_lock,
277 .write_delay = torture_rwlock_write_delay,
278 .writeunlock = torture_rwlock_write_unlock,
279 .readlock = torture_rwlock_read_lock,
280 .read_delay = torture_rwlock_read_delay,
281 .readunlock = torture_rwlock_read_unlock,
282 .name = "rw_lock"
283};
284
285static int torture_rwlock_write_lock_irq(void) __acquires(torture_rwlock)
286{
287 unsigned long flags;
288
289 write_lock_irqsave(&torture_rwlock, flags);
290 cxt.cur_ops->flags = flags;
291 return 0;
292}
293
294static void torture_rwlock_write_unlock_irq(void)
295__releases(torture_rwlock)
296{
297 write_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
298}
299
300static int torture_rwlock_read_lock_irq(void) __acquires(torture_rwlock)
301{
302 unsigned long flags;
303
304 read_lock_irqsave(&torture_rwlock, flags);
305 cxt.cur_ops->flags = flags;
306 return 0;
307}
308
309static void torture_rwlock_read_unlock_irq(void)
310__releases(torture_rwlock)
311{
 312 read_unlock_irqrestore(&torture_rwlock, cxt.cur_ops->flags);
313}
314
315static struct lock_torture_ops rw_lock_irq_ops = {
316 .writelock = torture_rwlock_write_lock_irq,
317 .write_delay = torture_rwlock_write_delay,
318 .writeunlock = torture_rwlock_write_unlock_irq,
319 .readlock = torture_rwlock_read_lock_irq,
320 .read_delay = torture_rwlock_read_delay,
321 .readunlock = torture_rwlock_read_unlock_irq,
322 .name = "rw_lock_irq"
323};
324
325static DEFINE_MUTEX(torture_mutex);
326
327static int torture_mutex_lock(void) __acquires(torture_mutex)
328{
329 mutex_lock(&torture_mutex);
330 return 0;
331}
332
333static void torture_mutex_delay(struct torture_random_state *trsp)
334{
335 const unsigned long longdelay_ms = 100;
336
337 /* We want a long delay occasionally to force massive contention. */
338 if (!(torture_random(trsp) %
339 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
340 mdelay(longdelay_ms * 5);
341 else
342 mdelay(longdelay_ms / 5);
343#ifdef CONFIG_PREEMPT
344 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
345 preempt_schedule(); /* Allow test to be preempted. */
346#endif
347}
348
349static void torture_mutex_unlock(void) __releases(torture_mutex)
350{
351 mutex_unlock(&torture_mutex);
352}
353
354static struct lock_torture_ops mutex_lock_ops = {
355 .writelock = torture_mutex_lock,
356 .write_delay = torture_mutex_delay,
357 .writeunlock = torture_mutex_unlock,
358 .readlock = NULL,
359 .read_delay = NULL,
360 .readunlock = NULL,
361 .name = "mutex_lock"
362};
363
364static DECLARE_RWSEM(torture_rwsem);
365static int torture_rwsem_down_write(void) __acquires(torture_rwsem)
366{
367 down_write(&torture_rwsem);
368 return 0;
369}
370
371static void torture_rwsem_write_delay(struct torture_random_state *trsp)
372{
373 const unsigned long longdelay_ms = 100;
374
375 /* We want a long delay occasionally to force massive contention. */
376 if (!(torture_random(trsp) %
377 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
378 mdelay(longdelay_ms * 10);
379 else
380 mdelay(longdelay_ms / 10);
381#ifdef CONFIG_PREEMPT
382 if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000)))
383 preempt_schedule(); /* Allow test to be preempted. */
384#endif
385}
386
387static void torture_rwsem_up_write(void) __releases(torture_rwsem)
388{
389 up_write(&torture_rwsem);
390}
391
392static int torture_rwsem_down_read(void) __acquires(torture_rwsem)
393{
394 down_read(&torture_rwsem);
395 return 0;
396}
397
398static void torture_rwsem_read_delay(struct torture_random_state *trsp)
399{
400 const unsigned long longdelay_ms = 100;
401
402 /* We want a long delay occasionally to force massive contention. */
403 if (!(torture_random(trsp) %
404 (cxt.nrealwriters_stress * 2000 * longdelay_ms)))
405 mdelay(longdelay_ms * 2);
406 else
407 mdelay(longdelay_ms / 2);
408#ifdef CONFIG_PREEMPT
409 if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000)))
410 preempt_schedule(); /* Allow test to be preempted. */
411#endif
412}
413
414static void torture_rwsem_up_read(void) __releases(torture_rwsem)
415{
416 up_read(&torture_rwsem);
417}
418
419static struct lock_torture_ops rwsem_lock_ops = {
420 .writelock = torture_rwsem_down_write,
421 .write_delay = torture_rwsem_write_delay,
422 .writeunlock = torture_rwsem_up_write,
423 .readlock = torture_rwsem_down_read,
424 .read_delay = torture_rwsem_read_delay,
425 .readunlock = torture_rwsem_up_read,
426 .name = "rwsem_lock"
427};
428
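The ops table is the extension point for new lock types; everything else in the module is generic. As a hedged illustration (not part of this patch), a hypothetical trylock-based variant would only need the three write hooks, leaving the read hooks NULL like the other writer-only types above:

static DEFINE_SPINLOCK(example_trylock);

static int example_try_write_lock(void) __acquires(example_trylock)
{
	while (!spin_trylock(&example_trylock))
		cpu_relax();	/* spin until the lock is ours */
	return 0;
}

static void example_try_write_unlock(void) __releases(example_trylock)
{
	spin_unlock(&example_trylock);
}

static struct lock_torture_ops example_trylock_ops = {
	.writelock	= example_try_write_lock,
	.write_delay	= torture_spin_lock_write_delay,
	.writeunlock	= example_try_write_unlock,
	.readlock	= NULL,
	.read_delay	= NULL,
	.readunlock	= NULL,
	.name		= "example_trylock"
};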
209/* 429/*
210 * Lock torture writer kthread. Repeatedly acquires and releases 430 * Lock torture writer kthread. Repeatedly acquires and releases
211 * the lock, checking for duplicate acquisitions. 431 * the lock, checking for duplicate acquisitions.
212 */ 432 */
213static int lock_torture_writer(void *arg) 433static int lock_torture_writer(void *arg)
214{ 434{
215 struct lock_writer_stress_stats *lwsp = arg; 435 struct lock_stress_stats *lwsp = arg;
216 static DEFINE_TORTURE_RANDOM(rand); 436 static DEFINE_TORTURE_RANDOM(rand);
217 437
218 VERBOSE_TOROUT_STRING("lock_torture_writer task started"); 438 VERBOSE_TOROUT_STRING("lock_torture_writer task started");
@@ -221,14 +441,19 @@ static int lock_torture_writer(void *arg)
221 do { 441 do {
222 if ((torture_random(&rand) & 0xfffff) == 0) 442 if ((torture_random(&rand) & 0xfffff) == 0)
223 schedule_timeout_uninterruptible(1); 443 schedule_timeout_uninterruptible(1);
224 cur_ops->writelock(); 444
445 cxt.cur_ops->writelock();
225 if (WARN_ON_ONCE(lock_is_write_held)) 446 if (WARN_ON_ONCE(lock_is_write_held))
226 lwsp->n_write_lock_fail++; 447 lwsp->n_lock_fail++;
227 lock_is_write_held = 1; 448 lock_is_write_held = 1;
228 lwsp->n_write_lock_acquired++; 449 if (WARN_ON_ONCE(lock_is_read_held))
229 cur_ops->write_delay(&rand); 450 lwsp->n_lock_fail++; /* rare, but... */
451
452 lwsp->n_lock_acquired++;
453 cxt.cur_ops->write_delay(&rand);
230 lock_is_write_held = 0; 454 lock_is_write_held = 0;
231 cur_ops->writeunlock(); 455 cxt.cur_ops->writeunlock();
456
232 stutter_wait("lock_torture_writer"); 457 stutter_wait("lock_torture_writer");
233 } while (!torture_must_stop()); 458 } while (!torture_must_stop());
234 torture_kthread_stopping("lock_torture_writer"); 459 torture_kthread_stopping("lock_torture_writer");
@@ -236,32 +461,66 @@ static int lock_torture_writer(void *arg)
236} 461}
237 462
238/* 463/*
464 * Lock torture reader kthread. Repeatedly acquires and releases
465 * the reader lock.
466 */
467static int lock_torture_reader(void *arg)
468{
469 struct lock_stress_stats *lrsp = arg;
470 static DEFINE_TORTURE_RANDOM(rand);
471
472 VERBOSE_TOROUT_STRING("lock_torture_reader task started");
473 set_user_nice(current, MAX_NICE);
474
475 do {
476 if ((torture_random(&rand) & 0xfffff) == 0)
477 schedule_timeout_uninterruptible(1);
478
479 cxt.cur_ops->readlock();
480 lock_is_read_held = 1;
481 if (WARN_ON_ONCE(lock_is_write_held))
482 lrsp->n_lock_fail++; /* rare, but... */
483
484 lrsp->n_lock_acquired++;
485 cxt.cur_ops->read_delay(&rand);
486 lock_is_read_held = 0;
487 cxt.cur_ops->readunlock();
488
489 stutter_wait("lock_torture_reader");
490 } while (!torture_must_stop());
491 torture_kthread_stopping("lock_torture_reader");
492 return 0;
493}
494
495/*
 239 * Create a lock-torture-statistics message in the specified buffer. 496
240 */ 497 */
241static void lock_torture_printk(char *page) 498static void __torture_print_stats(char *page,
499 struct lock_stress_stats *statp, bool write)
242{ 500{
243 bool fail = 0; 501 bool fail = 0;
244 int i; 502 int i, n_stress;
245 long max = 0; 503 long max = 0;
246 long min = lwsa[0].n_write_lock_acquired; 504 long min = statp[0].n_lock_acquired;
247 long long sum = 0; 505 long long sum = 0;
248 506
249 for (i = 0; i < nrealwriters_stress; i++) { 507 n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
250 if (lwsa[i].n_write_lock_fail) 508 for (i = 0; i < n_stress; i++) {
509 if (statp[i].n_lock_fail)
251 fail = true; 510 fail = true;
252 sum += lwsa[i].n_write_lock_acquired; 511 sum += statp[i].n_lock_acquired;
253 if (max < lwsa[i].n_write_lock_fail) 512 if (max < statp[i].n_lock_fail)
254 max = lwsa[i].n_write_lock_fail; 513 max = statp[i].n_lock_fail;
255 if (min > lwsa[i].n_write_lock_fail) 514 if (min > statp[i].n_lock_fail)
256 min = lwsa[i].n_write_lock_fail; 515 min = statp[i].n_lock_fail;
257 } 516 }
258 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG);
259 page += sprintf(page, 517 page += sprintf(page,
260 "Writes: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n", 518 "%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
519 write ? "Writes" : "Reads ",
261 sum, max, min, max / 2 > min ? "???" : "", 520 sum, max, min, max / 2 > min ? "???" : "",
262 fail, fail ? "!!!" : ""); 521 fail, fail ? "!!!" : "");
263 if (fail) 522 if (fail)
264 atomic_inc(&n_lock_torture_errors); 523 atomic_inc(&cxt.n_lock_torture_errors);
265} 524}
266 525
267/* 526/*
@@ -274,18 +533,35 @@ static void lock_torture_printk(char *page)
274 */ 533 */
275static void lock_torture_stats_print(void) 534static void lock_torture_stats_print(void)
276{ 535{
277 int size = nrealwriters_stress * 200 + 8192; 536 int size = cxt.nrealwriters_stress * 200 + 8192;
278 char *buf; 537 char *buf;
279 538
539 if (cxt.cur_ops->readlock)
540 size += cxt.nrealreaders_stress * 200 + 8192;
541
280 buf = kmalloc(size, GFP_KERNEL); 542 buf = kmalloc(size, GFP_KERNEL);
281 if (!buf) { 543 if (!buf) {
282 pr_err("lock_torture_stats_print: Out of memory, need: %d", 544 pr_err("lock_torture_stats_print: Out of memory, need: %d",
283 size); 545 size);
284 return; 546 return;
285 } 547 }
286 lock_torture_printk(buf); 548
549 __torture_print_stats(buf, cxt.lwsa, true);
287 pr_alert("%s", buf); 550 pr_alert("%s", buf);
288 kfree(buf); 551 kfree(buf);
552
553 if (cxt.cur_ops->readlock) {
554 buf = kmalloc(size, GFP_KERNEL);
555 if (!buf) {
556 pr_err("lock_torture_stats_print: Out of memory, need: %d",
557 size);
558 return;
559 }
560
561 __torture_print_stats(buf, cxt.lrsa, false);
562 pr_alert("%s", buf);
563 kfree(buf);
564 }
289} 565}
290 566
291/* 567/*
@@ -312,9 +588,10 @@ lock_torture_print_module_parms(struct lock_torture_ops *cur_ops,
312 const char *tag) 588 const char *tag)
313{ 589{
314 pr_alert("%s" TORTURE_FLAG 590 pr_alert("%s" TORTURE_FLAG
315 "--- %s: nwriters_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n", 591 "--- %s%s: nwriters_stress=%d nreaders_stress=%d stat_interval=%d verbose=%d shuffle_interval=%d stutter=%d shutdown_secs=%d onoff_interval=%d onoff_holdoff=%d\n",
316 torture_type, tag, nrealwriters_stress, stat_interval, verbose, 592 torture_type, tag, cxt.debug_lock ? " [debug]": "",
317 shuffle_interval, stutter, shutdown_secs, 593 cxt.nrealwriters_stress, cxt.nrealreaders_stress, stat_interval,
594 verbose, shuffle_interval, stutter, shutdown_secs,
318 onoff_interval, onoff_holdoff); 595 onoff_interval, onoff_holdoff);
319} 596}
320 597
@@ -322,46 +599,59 @@ static void lock_torture_cleanup(void)
322{ 599{
323 int i; 600 int i;
324 601
325 if (torture_cleanup()) 602 if (torture_cleanup_begin())
326 return; 603 return;
327 604
328 if (writer_tasks) { 605 if (writer_tasks) {
329 for (i = 0; i < nrealwriters_stress; i++) 606 for (i = 0; i < cxt.nrealwriters_stress; i++)
330 torture_stop_kthread(lock_torture_writer, 607 torture_stop_kthread(lock_torture_writer,
331 writer_tasks[i]); 608 writer_tasks[i]);
332 kfree(writer_tasks); 609 kfree(writer_tasks);
333 writer_tasks = NULL; 610 writer_tasks = NULL;
334 } 611 }
335 612
613 if (reader_tasks) {
614 for (i = 0; i < cxt.nrealreaders_stress; i++)
615 torture_stop_kthread(lock_torture_reader,
616 reader_tasks[i]);
617 kfree(reader_tasks);
618 reader_tasks = NULL;
619 }
620
336 torture_stop_kthread(lock_torture_stats, stats_task); 621 torture_stop_kthread(lock_torture_stats, stats_task);
337 lock_torture_stats_print(); /* -After- the stats thread is stopped! */ 622 lock_torture_stats_print(); /* -After- the stats thread is stopped! */
338 623
339 if (atomic_read(&n_lock_torture_errors)) 624 if (atomic_read(&cxt.n_lock_torture_errors))
340 lock_torture_print_module_parms(cur_ops, 625 lock_torture_print_module_parms(cxt.cur_ops,
341 "End of test: FAILURE"); 626 "End of test: FAILURE");
342 else if (torture_onoff_failures()) 627 else if (torture_onoff_failures())
343 lock_torture_print_module_parms(cur_ops, 628 lock_torture_print_module_parms(cxt.cur_ops,
344 "End of test: LOCK_HOTPLUG"); 629 "End of test: LOCK_HOTPLUG");
345 else 630 else
346 lock_torture_print_module_parms(cur_ops, 631 lock_torture_print_module_parms(cxt.cur_ops,
347 "End of test: SUCCESS"); 632 "End of test: SUCCESS");
633 torture_cleanup_end();
348} 634}
349 635
350static int __init lock_torture_init(void) 636static int __init lock_torture_init(void)
351{ 637{
352 int i; 638 int i, j;
353 int firsterr = 0; 639 int firsterr = 0;
354 static struct lock_torture_ops *torture_ops[] = { 640 static struct lock_torture_ops *torture_ops[] = {
355 &lock_busted_ops, &spin_lock_ops, &spin_lock_irq_ops, 641 &lock_busted_ops,
642 &spin_lock_ops, &spin_lock_irq_ops,
643 &rw_lock_ops, &rw_lock_irq_ops,
644 &mutex_lock_ops,
645 &rwsem_lock_ops,
356 }; 646 };
357 647
358 if (!torture_init_begin(torture_type, verbose, &locktorture_runnable)) 648 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
359 return -EBUSY; 649 return -EBUSY;
360 650
361 /* Process args and tell the world that the torturer is on the job. */ 651 /* Process args and tell the world that the torturer is on the job. */
362 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 652 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
363 cur_ops = torture_ops[i]; 653 cxt.cur_ops = torture_ops[i];
364 if (strcmp(torture_type, cur_ops->name) == 0) 654 if (strcmp(torture_type, cxt.cur_ops->name) == 0)
365 break; 655 break;
366 } 656 }
367 if (i == ARRAY_SIZE(torture_ops)) { 657 if (i == ARRAY_SIZE(torture_ops)) {
@@ -374,31 +664,69 @@ static int __init lock_torture_init(void)
374 torture_init_end(); 664 torture_init_end();
375 return -EINVAL; 665 return -EINVAL;
376 } 666 }
377 if (cur_ops->init) 667 if (cxt.cur_ops->init)
378 cur_ops->init(); /* no "goto unwind" prior to this point!!! */ 668 cxt.cur_ops->init(); /* no "goto unwind" prior to this point!!! */
379 669
380 if (nwriters_stress >= 0) 670 if (nwriters_stress >= 0)
381 nrealwriters_stress = nwriters_stress; 671 cxt.nrealwriters_stress = nwriters_stress;
382 else 672 else
383 nrealwriters_stress = 2 * num_online_cpus(); 673 cxt.nrealwriters_stress = 2 * num_online_cpus();
384 lock_torture_print_module_parms(cur_ops, "Start of test"); 674
675#ifdef CONFIG_DEBUG_MUTEXES
676 if (strncmp(torture_type, "mutex", 5) == 0)
677 cxt.debug_lock = true;
678#endif
679#ifdef CONFIG_DEBUG_SPINLOCK
680 if ((strncmp(torture_type, "spin", 4) == 0) ||
681 (strncmp(torture_type, "rw_lock", 7) == 0))
682 cxt.debug_lock = true;
683#endif
385 684
386 /* Initialize the statistics so that each run gets its own numbers. */ 685 /* Initialize the statistics so that each run gets its own numbers. */
387 686
388 lock_is_write_held = 0; 687 lock_is_write_held = 0;
389 lwsa = kmalloc(sizeof(*lwsa) * nrealwriters_stress, GFP_KERNEL); 688 cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL);
390 if (lwsa == NULL) { 689 if (cxt.lwsa == NULL) {
391 VERBOSE_TOROUT_STRING("lwsa: Out of memory"); 690 VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory");
392 firsterr = -ENOMEM; 691 firsterr = -ENOMEM;
393 goto unwind; 692 goto unwind;
394 } 693 }
395 for (i = 0; i < nrealwriters_stress; i++) { 694 for (i = 0; i < cxt.nrealwriters_stress; i++) {
396 lwsa[i].n_write_lock_fail = 0; 695 cxt.lwsa[i].n_lock_fail = 0;
397 lwsa[i].n_write_lock_acquired = 0; 696 cxt.lwsa[i].n_lock_acquired = 0;
398 } 697 }
399 698
400 /* Start up the kthreads. */ 699 if (cxt.cur_ops->readlock) {
700 if (nreaders_stress >= 0)
701 cxt.nrealreaders_stress = nreaders_stress;
702 else {
703 /*
704 * By default distribute evenly the number of
705 * readers and writers. We still run the same number
706 * of threads as the writer-only locks default.
707 */
708 if (nwriters_stress < 0) /* user doesn't care */
709 cxt.nrealwriters_stress = num_online_cpus();
710 cxt.nrealreaders_stress = cxt.nrealwriters_stress;
711 }
712
713 lock_is_read_held = 0;
714 cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL);
715 if (cxt.lrsa == NULL) {
716 VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory");
717 firsterr = -ENOMEM;
718 kfree(cxt.lwsa);
719 goto unwind;
720 }
721
722 for (i = 0; i < cxt.nrealreaders_stress; i++) {
723 cxt.lrsa[i].n_lock_fail = 0;
724 cxt.lrsa[i].n_lock_acquired = 0;
725 }
726 }
727 lock_torture_print_module_parms(cxt.cur_ops, "Start of test");
401 728
729 /* Prepare torture context. */
402 if (onoff_interval > 0) { 730 if (onoff_interval > 0) {
403 firsterr = torture_onoff_init(onoff_holdoff * HZ, 731 firsterr = torture_onoff_init(onoff_holdoff * HZ,
404 onoff_interval * HZ); 732 onoff_interval * HZ);
@@ -422,18 +750,51 @@ static int __init lock_torture_init(void)
422 goto unwind; 750 goto unwind;
423 } 751 }
424 752
425 writer_tasks = kzalloc(nrealwriters_stress * sizeof(writer_tasks[0]), 753 writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]),
426 GFP_KERNEL); 754 GFP_KERNEL);
427 if (writer_tasks == NULL) { 755 if (writer_tasks == NULL) {
428 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); 756 VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
429 firsterr = -ENOMEM; 757 firsterr = -ENOMEM;
430 goto unwind; 758 goto unwind;
431 } 759 }
432 for (i = 0; i < nrealwriters_stress; i++) { 760
433 firsterr = torture_create_kthread(lock_torture_writer, &lwsa[i], 761 if (cxt.cur_ops->readlock) {
762 reader_tasks = kzalloc(cxt.nrealreaders_stress * sizeof(reader_tasks[0]),
763 GFP_KERNEL);
764 if (reader_tasks == NULL) {
765 VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
766 firsterr = -ENOMEM;
767 goto unwind;
768 }
769 }
770
771 /*
772 * Create the kthreads and start torturing (oh, those poor little locks).
773 *
774 * TODO: Note that we interleave writers with readers, giving writers a
775 * slight advantage, by creating their kthreads first. This can be modified
776 * for very specific needs, or even let the user choose the policy, if
777 * ever wanted.
778 */
779 for (i = 0, j = 0; i < cxt.nrealwriters_stress ||
780 j < cxt.nrealreaders_stress; i++, j++) {
781 if (i >= cxt.nrealwriters_stress)
782 goto create_reader;
783
784 /* Create writer. */
785 firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
434 writer_tasks[i]); 786 writer_tasks[i]);
435 if (firsterr) 787 if (firsterr)
436 goto unwind; 788 goto unwind;
789
790 create_reader:
791 if (cxt.cur_ops->readlock == NULL || (j >= cxt.nrealreaders_stress))
792 continue;
793 /* Create reader. */
794 firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
795 reader_tasks[j]);
796 if (firsterr)
797 goto unwind;
437 } 798 }
438 if (stat_interval > 0) { 799 if (stat_interval > 0) {
439 firsterr = torture_create_kthread(lock_torture_stats, NULL, 800 firsterr = torture_create_kthread(lock_torture_stats, NULL,
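The loop above interleaves writer and reader kthread creation, giving writers a slight head start by creating each writer just before the matching reader. A rough userspace analogue of that round-robin policy, using POSIX threads instead of the kernel's torture_create_kthread() (writer_fn/reader_fn are placeholder thread bodies):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static void *writer_fn(void *arg) { printf("writer %ld\n", (long)arg); return NULL; }
static void *reader_fn(void *arg) { printf("reader %ld\n", (long)arg); return NULL; }

int main(void)
{
	long nwriters = 2, nreaders = 4, i, j;
	pthread_t *w = calloc(nwriters, sizeof(*w));
	pthread_t *r = calloc(nreaders, sizeof(*r));

	if (!w || !r)
		return 1;
	/* Interleave creation, writer first, mirroring the kernel loop. */
	for (i = 0, j = 0; i < nwriters || j < nreaders; i++, j++) {
		if (i < nwriters)
			pthread_create(&w[i], NULL, writer_fn, (void *)i);
		if (j < nreaders)
			pthread_create(&r[j], NULL, reader_fn, (void *)j);
	}
	for (i = 0; i < nwriters; i++)
		pthread_join(w[i], NULL);
	for (j = 0; j < nreaders; j++)
		pthread_join(r[j], NULL);
	free(w);
	free(r);
	return 0;
}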
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 23e89c5930e9..4d60986fcbee 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -56,9 +56,6 @@ do { \
56 * If the lock has already been acquired, then this will proceed to spin 56 * If the lock has already been acquired, then this will proceed to spin
57 * on this node->locked until the previous lock holder sets the node->locked 57 * on this node->locked until the previous lock holder sets the node->locked
58 * in mcs_spin_unlock(). 58 * in mcs_spin_unlock().
59 *
60 * We don't inline mcs_spin_lock() so that perf can correctly account for the
61 * time spent in this lock function.
62 */ 59 */
63static inline 60static inline
64void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) 61void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index ae712b25e492..dadbf88c22c4 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -15,7 +15,7 @@
15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale 15 * by Steven Rostedt, based on work by Gregory Haskins, Peter Morreale
16 * and Sven Dietrich. 16 * and Sven Dietrich.
17 * 17 *
18 * Also see Documentation/mutex-design.txt. 18 * Also see Documentation/locking/mutex-design.txt.
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/ww_mutex.h> 21#include <linux/ww_mutex.h>
@@ -106,6 +106,92 @@ void __sched mutex_lock(struct mutex *lock)
106EXPORT_SYMBOL(mutex_lock); 106EXPORT_SYMBOL(mutex_lock);
107#endif 107#endif
108 108
109static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
110 struct ww_acquire_ctx *ww_ctx)
111{
112#ifdef CONFIG_DEBUG_MUTEXES
113 /*
114 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
115 * but released with a normal mutex_unlock in this call.
116 *
117 * This should never happen, always use ww_mutex_unlock.
118 */
119 DEBUG_LOCKS_WARN_ON(ww->ctx);
120
121 /*
122 * Not quite done after calling ww_acquire_done() ?
123 */
124 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
125
126 if (ww_ctx->contending_lock) {
127 /*
128 * After -EDEADLK you tried to
129 * acquire a different ww_mutex? Bad!
130 */
131 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
132
133 /*
134 * You called ww_mutex_lock after receiving -EDEADLK,
135 * but 'forgot' to unlock everything else first?
136 */
137 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
138 ww_ctx->contending_lock = NULL;
139 }
140
141 /*
142 * Naughty, using a different class will lead to undefined behavior!
143 */
144 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
145#endif
146 ww_ctx->acquired++;
147}
148
149/*
150 * after acquiring lock with fastpath or when we lost out in contested
151 * slowpath, set ctx and wake up any waiters so they can recheck.
152 *
153 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
154 * as the fastpath and opportunistic spinning are disabled in that case.
155 */
156static __always_inline void
157ww_mutex_set_context_fastpath(struct ww_mutex *lock,
158 struct ww_acquire_ctx *ctx)
159{
160 unsigned long flags;
161 struct mutex_waiter *cur;
162
163 ww_mutex_lock_acquired(lock, ctx);
164
165 lock->ctx = ctx;
166
167 /*
168 * The lock->ctx update should be visible on all cores before
169 * the atomic read is done, otherwise contended waiters might be
170 * missed. The contended waiters will either see ww_ctx == NULL
171 * and keep spinning, or it will acquire wait_lock, add itself
172 * to waiter list and sleep.
173 */
174 smp_mb(); /* ^^^ */
175
176 /*
177 * Check if lock is contended, if not there is nobody to wake up
178 */
179 if (likely(atomic_read(&lock->base.count) == 0))
180 return;
181
182 /*
183 * Uh oh, we raced in fastpath, wake up everyone in this case,
184 * so they can see the new lock->ctx.
185 */
186 spin_lock_mutex(&lock->base.wait_lock, flags);
187 list_for_each_entry(cur, &lock->base.wait_list, list) {
188 debug_mutex_wake_waiter(&lock->base, cur);
189 wake_up_process(cur->task);
190 }
191 spin_unlock_mutex(&lock->base.wait_lock, flags);
192}
193
194
109#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 195#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
110/* 196/*
111 * In order to avoid a stampede of mutex spinners from acquiring the mutex 197 * In order to avoid a stampede of mutex spinners from acquiring the mutex
@@ -180,6 +266,129 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
180 */ 266 */
181 return retval; 267 return retval;
182} 268}
269
270/*
271 * Atomically try to take the lock when it is available
272 */
273static inline bool mutex_try_to_acquire(struct mutex *lock)
274{
275 return !mutex_is_locked(lock) &&
276 (atomic_cmpxchg(&lock->count, 1, 0) == 1);
277}
278
279/*
280 * Optimistic spinning.
281 *
282 * We try to spin for acquisition when we find that the lock owner
283 * is currently running on a (different) CPU and while we don't
284 * need to reschedule. The rationale is that if the lock owner is
285 * running, it is likely to release the lock soon.
286 *
287 * Since this needs the lock owner, and this mutex implementation
288 * doesn't track the owner atomically in the lock field, we need to
289 * track it non-atomically.
290 *
291 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
292 * to serialize everything.
293 *
294 * The mutex spinners are queued up using MCS lock so that only one
295 * spinner can compete for the mutex. However, if mutex spinning isn't
296 * going to happen, there is no point in going through the lock/unlock
297 * overhead.
298 *
299 * Returns true when the lock was taken, otherwise false, indicating
300 * that we need to jump to the slowpath and sleep.
301 */
302static bool mutex_optimistic_spin(struct mutex *lock,
303 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
304{
305 struct task_struct *task = current;
306
307 if (!mutex_can_spin_on_owner(lock))
308 goto done;
309
310 if (!osq_lock(&lock->osq))
311 goto done;
312
313 while (true) {
314 struct task_struct *owner;
315
316 if (use_ww_ctx && ww_ctx->acquired > 0) {
317 struct ww_mutex *ww;
318
319 ww = container_of(lock, struct ww_mutex, base);
320 /*
321 * If ww->ctx is set the contents are undefined, only
322 * by acquiring wait_lock there is a guarantee that
323 * they are not invalid when reading.
324 *
325 * As such, when deadlock detection needs to be
326 * performed the optimistic spinning cannot be done.
327 */
328 if (ACCESS_ONCE(ww->ctx))
329 break;
330 }
331
332 /*
333 * If there's an owner, wait for it to either
334 * release the lock or go to sleep.
335 */
336 owner = ACCESS_ONCE(lock->owner);
337 if (owner && !mutex_spin_on_owner(lock, owner))
338 break;
339
340 /* Try to acquire the mutex if it is unlocked. */
341 if (mutex_try_to_acquire(lock)) {
342 lock_acquired(&lock->dep_map, ip);
343
344 if (use_ww_ctx) {
345 struct ww_mutex *ww;
346 ww = container_of(lock, struct ww_mutex, base);
347
348 ww_mutex_set_context_fastpath(ww, ww_ctx);
349 }
350
351 mutex_set_owner(lock);
352 osq_unlock(&lock->osq);
353 return true;
354 }
355
356 /*
357 * When there's no owner, we might have preempted between the
358 * owner acquiring the lock and setting the owner field. If
359 * we're an RT task that will live-lock because we won't let
360 * the owner complete.
361 */
362 if (!owner && (need_resched() || rt_task(task)))
363 break;
364
365 /*
366 * The cpu_relax() call is a compiler barrier which forces
367 * everything in this loop to be re-loaded. We don't need
368 * memory barriers as we'll eventually observe the right
369 * values at the cost of a few extra spins.
370 */
371 cpu_relax_lowlatency();
372 }
373
374 osq_unlock(&lock->osq);
375done:
376 /*
377 * If we fell out of the spin path because of need_resched(),
378 * reschedule now, before we try-lock the mutex. This avoids getting
379 * scheduled out right after we obtained the mutex.
380 */
381 if (need_resched())
382 schedule_preempt_disabled();
383
384 return false;
385}
386#else
387static bool mutex_optimistic_spin(struct mutex *lock,
388 struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx)
389{
390 return false;
391}
183#endif 392#endif
184 393
185__visible __used noinline 394__visible __used noinline
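mutex_optimistic_spin() above keeps polling while the lock owner appears to be running, attempts an atomic 1->0 transition only when the count reads unlocked, and otherwise falls back to the sleeping slowpath. A stripped-down userspace sketch of that acquire attempt in C11 atomics, with a plain spin budget standing in for the owner-is-running and need_resched() checks (and no MCS queueing of spinners):

#include <stdatomic.h>
#include <stdbool.h>

/* 1 == unlocked, 0 == locked, matching the mutex count convention above. */
static bool try_to_acquire(atomic_int *count)
{
	int expected = 1;

	return atomic_load(count) == 1 &&
	       atomic_compare_exchange_strong(count, &expected, 0);
}

bool optimistic_spin(atomic_int *count, int max_spins)
{
	for (int i = 0; i < max_spins; i++) {
		if (try_to_acquire(count))
			return true;	/* got the lock; skip the sleep path */
	}
	return false;			/* give up and take the slowpath */
}

Reading the value before the compare-and-swap keeps the contended case on a cheap load rather than an expensive atomic, which is the same reasoning mutex_try_to_acquire() applies.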
@@ -277,91 +486,6 @@ __mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
277 return 0; 486 return 0;
278} 487}
279 488
280static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww,
281 struct ww_acquire_ctx *ww_ctx)
282{
283#ifdef CONFIG_DEBUG_MUTEXES
284 /*
285 * If this WARN_ON triggers, you used ww_mutex_lock to acquire,
286 * but released with a normal mutex_unlock in this call.
287 *
288 * This should never happen, always use ww_mutex_unlock.
289 */
290 DEBUG_LOCKS_WARN_ON(ww->ctx);
291
292 /*
293 * Not quite done after calling ww_acquire_done() ?
294 */
295 DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire);
296
297 if (ww_ctx->contending_lock) {
298 /*
299 * After -EDEADLK you tried to
300 * acquire a different ww_mutex? Bad!
301 */
302 DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww);
303
304 /*
305 * You called ww_mutex_lock after receiving -EDEADLK,
306 * but 'forgot' to unlock everything else first?
307 */
308 DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0);
309 ww_ctx->contending_lock = NULL;
310 }
311
312 /*
313 * Naughty, using a different class will lead to undefined behavior!
314 */
315 DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class);
316#endif
317 ww_ctx->acquired++;
318}
319
320/*
321 * after acquiring lock with fastpath or when we lost out in contested
322 * slowpath, set ctx and wake up any waiters so they can recheck.
323 *
324 * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set,
325 * as the fastpath and opportunistic spinning are disabled in that case.
326 */
327static __always_inline void
328ww_mutex_set_context_fastpath(struct ww_mutex *lock,
329 struct ww_acquire_ctx *ctx)
330{
331 unsigned long flags;
332 struct mutex_waiter *cur;
333
334 ww_mutex_lock_acquired(lock, ctx);
335
336 lock->ctx = ctx;
337
338 /*
339 * The lock->ctx update should be visible on all cores before
340 * the atomic read is done, otherwise contended waiters might be
341 * missed. The contended waiters will either see ww_ctx == NULL
342 * and keep spinning, or it will acquire wait_lock, add itself
343 * to waiter list and sleep.
344 */
345 smp_mb(); /* ^^^ */
346
347 /*
348 * Check if lock is contended, if not there is nobody to wake up
349 */
350 if (likely(atomic_read(&lock->base.count) == 0))
351 return;
352
353 /*
354 * Uh oh, we raced in fastpath, wake up everyone in this case,
355 * so they can see the new lock->ctx.
356 */
357 spin_lock_mutex(&lock->base.wait_lock, flags);
358 list_for_each_entry(cur, &lock->base.wait_list, list) {
359 debug_mutex_wake_waiter(&lock->base, cur);
360 wake_up_process(cur->task);
361 }
362 spin_unlock_mutex(&lock->base.wait_lock, flags);
363}
364
365/* 489/*
366 * Lock a mutex (possibly interruptible), slowpath: 490 * Lock a mutex (possibly interruptible), slowpath:
367 */ 491 */
@@ -378,104 +502,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
378 preempt_disable(); 502 preempt_disable();
379 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); 503 mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
380 504
381#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 505 if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) {
382 /* 506 /* got the lock, yay! */
383 * Optimistic spinning. 507 preempt_enable();
384 * 508 return 0;
385 * We try to spin for acquisition when we find that the lock owner
386 * is currently running on a (different) CPU and while we don't
387 * need to reschedule. The rationale is that if the lock owner is
388 * running, it is likely to release the lock soon.
389 *
390 * Since this needs the lock owner, and this mutex implementation
391 * doesn't track the owner atomically in the lock field, we need to
392 * track it non-atomically.
393 *
394 * We can't do this for DEBUG_MUTEXES because that relies on wait_lock
395 * to serialize everything.
396 *
397 * The mutex spinners are queued up using MCS lock so that only one
398 * spinner can compete for the mutex. However, if mutex spinning isn't
399 * going to happen, there is no point in going through the lock/unlock
400 * overhead.
401 */
402 if (!mutex_can_spin_on_owner(lock))
403 goto slowpath;
404
405 if (!osq_lock(&lock->osq))
406 goto slowpath;
407
408 for (;;) {
409 struct task_struct *owner;
410
411 if (use_ww_ctx && ww_ctx->acquired > 0) {
412 struct ww_mutex *ww;
413
414 ww = container_of(lock, struct ww_mutex, base);
415 /*
416 * If ww->ctx is set the contents are undefined, only
417 * by acquiring wait_lock there is a guarantee that
418 * they are not invalid when reading.
419 *
420 * As such, when deadlock detection needs to be
421 * performed the optimistic spinning cannot be done.
422 */
423 if (ACCESS_ONCE(ww->ctx))
424 break;
425 }
426
427 /*
428 * If there's an owner, wait for it to either
429 * release the lock or go to sleep.
430 */
431 owner = ACCESS_ONCE(lock->owner);
432 if (owner && !mutex_spin_on_owner(lock, owner))
433 break;
434
435 /* Try to acquire the mutex if it is unlocked. */
436 if (!mutex_is_locked(lock) &&
437 (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
438 lock_acquired(&lock->dep_map, ip);
439 if (use_ww_ctx) {
440 struct ww_mutex *ww;
441 ww = container_of(lock, struct ww_mutex, base);
442
443 ww_mutex_set_context_fastpath(ww, ww_ctx);
444 }
445
446 mutex_set_owner(lock);
447 osq_unlock(&lock->osq);
448 preempt_enable();
449 return 0;
450 }
451
452 /*
453 * When there's no owner, we might have preempted between the
454 * owner acquiring the lock and setting the owner field. If
455 * we're an RT task that will live-lock because we won't let
456 * the owner complete.
457 */
458 if (!owner && (need_resched() || rt_task(task)))
459 break;
460
461 /*
462 * The cpu_relax() call is a compiler barrier which forces
463 * everything in this loop to be re-loaded. We don't need
464 * memory barriers as we'll eventually observe the right
465 * values at the cost of a few extra spins.
466 */
467 cpu_relax_lowlatency();
468 } 509 }
469 osq_unlock(&lock->osq); 510
470slowpath:
471 /*
472 * If we fell out of the spin path because of need_resched(),
473 * reschedule now, before we try-lock the mutex. This avoids getting
474 * scheduled out right after we obtained the mutex.
475 */
476 if (need_resched())
477 schedule_preempt_disabled();
478#endif
479 spin_lock_mutex(&lock->wait_lock, flags); 511 spin_lock_mutex(&lock->wait_lock, flags);
480 512
481 /* 513 /*
@@ -679,15 +711,21 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible);
679 * Release the lock, slowpath: 711 * Release the lock, slowpath:
680 */ 712 */
681static inline void 713static inline void
682__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested) 714__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
683{ 715{
684 struct mutex *lock = container_of(lock_count, struct mutex, count);
685 unsigned long flags; 716 unsigned long flags;
686 717
687 /* 718 /*
688 * some architectures leave the lock unlocked in the fastpath failure 719 * As a performance measurement, release the lock before doing other
720 * wakeup related duties to follow. This allows other tasks to acquire
721 * the lock sooner, while still handling cleanups in past unlock calls.
722 * This can be done as we do not enforce strict equivalence between the
723 * mutex counter and wait_list.
724 *
725 *
726 * Some architectures leave the lock unlocked in the fastpath failure
689 * case, others need to leave it locked. In the latter case we have to 727 * case, others need to leave it locked. In the latter case we have to
690 * unlock it here 728 * unlock it here - as the lock counter is currently 0 or negative.
691 */ 729 */
692 if (__mutex_slowpath_needs_to_unlock()) 730 if (__mutex_slowpath_needs_to_unlock())
693 atomic_set(&lock->count, 1); 731 atomic_set(&lock->count, 1);
@@ -716,7 +754,9 @@ __mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
716__visible void 754__visible void
717__mutex_unlock_slowpath(atomic_t *lock_count) 755__mutex_unlock_slowpath(atomic_t *lock_count)
718{ 756{
719 __mutex_unlock_common_slowpath(lock_count, 1); 757 struct mutex *lock = container_of(lock_count, struct mutex, count);
758
759 __mutex_unlock_common_slowpath(lock, 1);
720} 760}
721 761
722#ifndef CONFIG_DEBUG_LOCK_ALLOC 762#ifndef CONFIG_DEBUG_LOCK_ALLOC
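The reworked unlock slowpath above publishes the lock as free (count back to 1) before it starts the wakeup housekeeping, so another task can take the mutex while waiters are still being woken. A minimal userspace sketch of that ordering only, with a pthread mutex/condvar pair standing in for wait_lock and the waiter list (struct simple_mutex is a made-up type, not the kernel's):

#include <pthread.h>
#include <stdatomic.h>

struct simple_mutex {
	atomic_int count;		/* 1 unlocked, 0 locked, <0 contended */
	pthread_mutex_t wait_lock;	/* protects the waiter bookkeeping */
	pthread_cond_t waiters;
};

void unlock_slowpath(struct simple_mutex *m)
{
	/* Make the lock available first, like atomic_set(&lock->count, 1). */
	atomic_store(&m->count, 1);

	/* Only then do the wakeup work under the wait lock. */
	pthread_mutex_lock(&m->wait_lock);
	pthread_cond_signal(&m->waiters);	/* wake one waiter to retry */
	pthread_mutex_unlock(&m->wait_lock);
}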
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
index 4115fbf83b12..5cda397607f2 100644
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -16,7 +16,7 @@
16#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
17 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
18 18
19#ifdef CONFIG_SMP 19#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
20static inline void mutex_set_owner(struct mutex *lock) 20static inline void mutex_set_owner(struct mutex *lock)
21{ 21{
22 lock->owner = current; 22 lock->owner = current;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a0ea2a141b3b..7c98873a3077 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 * 10 *
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/locking/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/export.h> 14#include <linux/export.h>
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index d6203faf2eb1..7628c3fc37ca 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -246,19 +246,22 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
246 246
247 return sem; 247 return sem;
248} 248}
249EXPORT_SYMBOL(rwsem_down_read_failed);
249 250
250static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) 251static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
251{ 252{
252 if (!(count & RWSEM_ACTIVE_MASK)) { 253 /*
253 /* try acquiring the write lock */ 254 * Try acquiring the write lock. Check count first in order
254 if (sem->count == RWSEM_WAITING_BIAS && 255 * to reduce unnecessary expensive cmpxchg() operations.
255 cmpxchg(&sem->count, RWSEM_WAITING_BIAS, 256 */
256 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { 257 if (count == RWSEM_WAITING_BIAS &&
257 if (!list_is_singular(&sem->wait_list)) 258 cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
258 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); 259 RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
259 return true; 260 if (!list_is_singular(&sem->wait_list))
260 } 261 rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
262 return true;
261 } 263 }
264
262 return false; 265 return false;
263} 266}
264 267
@@ -465,6 +468,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
465 468
466 return sem; 469 return sem;
467} 470}
471EXPORT_SYMBOL(rwsem_down_write_failed);
468 472
469/* 473/*
470 * handle waking up a waiter on the semaphore 474 * handle waking up a waiter on the semaphore
@@ -485,6 +489,7 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
485 489
486 return sem; 490 return sem;
487} 491}
492EXPORT_SYMBOL(rwsem_wake);
488 493
489/* 494/*
490 * downgrade a write lock into a read lock 495 * downgrade a write lock into a read lock
@@ -506,8 +511,4 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
506 511
507 return sem; 512 return sem;
508} 513}
509
510EXPORT_SYMBOL(rwsem_down_read_failed);
511EXPORT_SYMBOL(rwsem_down_write_failed);
512EXPORT_SYMBOL(rwsem_wake);
513EXPORT_SYMBOL(rwsem_downgrade_wake); 514EXPORT_SYMBOL(rwsem_downgrade_wake);
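rwsem_try_write_lock() above now checks the already-sampled count before issuing the cmpxchg, the usual test-and-test-and-set trick: skip the expensive atomic when it cannot possibly succeed. A hedged C11 sketch of that shape, with the bias values passed in rather than taken from the kernel headers:

#include <stdatomic.h>
#include <stdbool.h>

bool try_write_lock(atomic_long *sem_count, long count,
		    long waiting_bias, long active_write_bias)
{
	long expected = waiting_bias;

	/* Cheap check on the value we already read; bail before the cmpxchg. */
	if (count != waiting_bias)
		return false;

	return atomic_compare_exchange_strong(sem_count, &expected,
					      active_write_bias);
}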
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
index 6815171a4fff..b8120abe594b 100644
--- a/kernel/locking/semaphore.c
+++ b/kernel/locking/semaphore.c
@@ -36,7 +36,7 @@
36static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
37static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
38static noinline int __down_killable(struct semaphore *sem); 38static noinline int __down_killable(struct semaphore *sem);
39static noinline int __down_timeout(struct semaphore *sem, long jiffies); 39static noinline int __down_timeout(struct semaphore *sem, long timeout);
40static noinline void __up(struct semaphore *sem); 40static noinline void __up(struct semaphore *sem);
41 41
42/** 42/**
@@ -145,14 +145,14 @@ EXPORT_SYMBOL(down_trylock);
145/** 145/**
146 * down_timeout - acquire the semaphore within a specified time 146 * down_timeout - acquire the semaphore within a specified time
147 * @sem: the semaphore to be acquired 147 * @sem: the semaphore to be acquired
148 * @jiffies: how long to wait before failing 148 * @timeout: how long to wait before failing
149 * 149 *
150 * Attempts to acquire the semaphore. If no more tasks are allowed to 150 * Attempts to acquire the semaphore. If no more tasks are allowed to
151 * acquire the semaphore, calling this function will put the task to sleep. 151 * acquire the semaphore, calling this function will put the task to sleep.
152 * If the semaphore is not released within the specified number of jiffies, 152 * If the semaphore is not released within the specified number of jiffies,
153 * this function returns -ETIME. It returns 0 if the semaphore was acquired. 153 * this function returns -ETIME. It returns 0 if the semaphore was acquired.
154 */ 154 */
155int down_timeout(struct semaphore *sem, long jiffies) 155int down_timeout(struct semaphore *sem, long timeout)
156{ 156{
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
@@ -161,7 +161,7 @@ int down_timeout(struct semaphore *sem, long jiffies)
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, timeout);
165 raw_spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
@@ -248,9 +248,9 @@ static noinline int __sched __down_killable(struct semaphore *sem)
248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT); 248 return __down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
249} 249}
250 250
251static noinline int __sched __down_timeout(struct semaphore *sem, long jiffies) 251static noinline int __sched __down_timeout(struct semaphore *sem, long timeout)
252{ 252{
253 return __down_common(sem, TASK_UNINTERRUPTIBLE, jiffies); 253 return __down_common(sem, TASK_UNINTERRUPTIBLE, timeout);
254} 254}
255 255
256static noinline void __sched __up(struct semaphore *sem) 256static noinline void __sched __up(struct semaphore *sem)
diff --git a/kernel/module.c b/kernel/module.c
index ae79ce615cb9..88cec1ddb1e3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val,
135} 135}
136 136
137static const struct kernel_param_ops param_ops_bool_enable_only = { 137static const struct kernel_param_ops param_ops_bool_enable_only = {
138 .flags = KERNEL_PARAM_FL_NOARG, 138 .flags = KERNEL_PARAM_OPS_FL_NOARG,
139 .set = param_set_bool_enable_only, 139 .set = param_set_bool_enable_only,
140 .get = param_get_bool, 140 .get = param_get_bool,
141}; 141};
@@ -1842,7 +1842,9 @@ static void free_module(struct module *mod)
1842 1842
1843 /* We leave it in list to prevent duplicate loads, but make sure 1843 /* We leave it in list to prevent duplicate loads, but make sure
1844 * that no one uses it while it's being deconstructed. */ 1844 * that no one uses it while it's being deconstructed. */
1845 mutex_lock(&module_mutex);
1845 mod->state = MODULE_STATE_UNFORMED; 1846 mod->state = MODULE_STATE_UNFORMED;
1847 mutex_unlock(&module_mutex);
1846 1848
1847 /* Remove dynamic debug info */ 1849 /* Remove dynamic debug info */
1848 ddebug_remove_module(mod->name); 1850 ddebug_remove_module(mod->name);
@@ -3304,6 +3306,11 @@ static int load_module(struct load_info *info, const char __user *uargs,
3304 mutex_lock(&module_mutex); 3306 mutex_lock(&module_mutex);
3305 module_bug_cleanup(mod); 3307 module_bug_cleanup(mod);
3306 mutex_unlock(&module_mutex); 3308 mutex_unlock(&module_mutex);
3309
3310 /* we can't deallocate the module until we clear memory protection */
3311 unset_module_init_ro_nx(mod);
3312 unset_module_core_ro_nx(mod);
3313
3307 ddebug_cleanup: 3314 ddebug_cleanup:
3308 dynamic_debug_remove(info->debug); 3315 dynamic_debug_remove(info->debug);
3309 synchronize_sched(); 3316 synchronize_sched();
@@ -3381,7 +3388,9 @@ static inline int within(unsigned long addr, void *start, unsigned long size)
3381 */ 3388 */
3382static inline int is_arm_mapping_symbol(const char *str) 3389static inline int is_arm_mapping_symbol(const char *str)
3383{ 3390{
3384 return str[0] == '$' && strchr("atd", str[1]) 3391 if (str[0] == '.' && str[1] == 'L')
3392 return true;
3393 return str[0] == '$' && strchr("axtd", str[1])
3385 && (str[2] == '\0' || str[2] == '.'); 3394 && (str[2] == '\0' || str[2] == '.');
3386} 3395}
3387 3396
@@ -3444,8 +3453,7 @@ const char *module_address_lookup(unsigned long addr,
3444 list_for_each_entry_rcu(mod, &modules, list) { 3453 list_for_each_entry_rcu(mod, &modules, list) {
3445 if (mod->state == MODULE_STATE_UNFORMED) 3454 if (mod->state == MODULE_STATE_UNFORMED)
3446 continue; 3455 continue;
3447 if (within_module_init(addr, mod) || 3456 if (within_module(addr, mod)) {
3448 within_module_core(addr, mod)) {
3449 if (modname) 3457 if (modname)
3450 *modname = mod->name; 3458 *modname = mod->name;
3451 ret = get_ksymbol(mod, addr, size, offset); 3459 ret = get_ksymbol(mod, addr, size, offset);
@@ -3469,8 +3477,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
3469 list_for_each_entry_rcu(mod, &modules, list) { 3477 list_for_each_entry_rcu(mod, &modules, list) {
3470 if (mod->state == MODULE_STATE_UNFORMED) 3478 if (mod->state == MODULE_STATE_UNFORMED)
3471 continue; 3479 continue;
3472 if (within_module_init(addr, mod) || 3480 if (within_module(addr, mod)) {
3473 within_module_core(addr, mod)) {
3474 const char *sym; 3481 const char *sym;
3475 3482
3476 sym = get_ksymbol(mod, addr, NULL, NULL); 3483 sym = get_ksymbol(mod, addr, NULL, NULL);
@@ -3495,8 +3502,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
3495 list_for_each_entry_rcu(mod, &modules, list) { 3502 list_for_each_entry_rcu(mod, &modules, list) {
3496 if (mod->state == MODULE_STATE_UNFORMED) 3503 if (mod->state == MODULE_STATE_UNFORMED)
3497 continue; 3504 continue;
3498 if (within_module_init(addr, mod) || 3505 if (within_module(addr, mod)) {
3499 within_module_core(addr, mod)) {
3500 const char *sym; 3506 const char *sym;
3501 3507
3502 sym = get_ksymbol(mod, addr, size, offset); 3508 sym = get_ksymbol(mod, addr, size, offset);
@@ -3760,8 +3766,7 @@ struct module *__module_address(unsigned long addr)
3760 list_for_each_entry_rcu(mod, &modules, list) { 3766 list_for_each_entry_rcu(mod, &modules, list) {
3761 if (mod->state == MODULE_STATE_UNFORMED) 3767 if (mod->state == MODULE_STATE_UNFORMED)
3762 continue; 3768 continue;
3763 if (within_module_core(addr, mod) 3769 if (within_module(addr, mod))
3764 || within_module_init(addr, mod))
3765 return mod; 3770 return mod;
3766 } 3771 }
3767 return NULL; 3772 return NULL;
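Several lookups above collapse the within_module_init(addr, mod) || within_module_core(addr, mod) pair into a single within_module(addr, mod) call; presumably the helper is just that disjunction over the two address ranges. A standalone sketch with made-up region fields (not the kernel's struct module layout):

#include <stdbool.h>

struct region {
	unsigned long base;
	unsigned long size;
};

struct fake_module {
	struct region init;	/* stand-in for the module init mapping */
	struct region core;	/* stand-in for the module core mapping */
};

static bool in_region(unsigned long addr, const struct region *r)
{
	return addr >= r->base && addr - r->base < r->size;
}

bool within_module_sketch(unsigned long addr, const struct fake_module *mod)
{
	return in_region(addr, &mod->init) || in_region(addr, &mod->core);
}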
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 8e7811086b82..ef42d0ab3115 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -204,20 +204,13 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
204 204
205 might_sleep(); 205 might_sleep();
206 206
207 task_lock(p);
207 ns = p->nsproxy; 208 ns = p->nsproxy;
209 p->nsproxy = new;
210 task_unlock(p);
208 211
209 rcu_assign_pointer(p->nsproxy, new); 212 if (ns && atomic_dec_and_test(&ns->count))
210
211 if (ns && atomic_dec_and_test(&ns->count)) {
212 /*
213 * wait for others to get what they want from this nsproxy.
214 *
215 * cannot release this nsproxy via the call_rcu() since
216 * put_mnt_ns() will want to sleep
217 */
218 synchronize_rcu();
219 free_nsproxy(ns); 213 free_nsproxy(ns);
220 }
221} 214}
222 215
223void exit_task_namespaces(struct task_struct *p) 216void exit_task_namespaces(struct task_struct *p)
diff --git a/kernel/panic.c b/kernel/panic.c
index 62e16cef9cc2..cf80672b7924 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -224,6 +224,7 @@ static const struct tnt tnts[] = {
224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 224 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
225 { TAINT_OOT_MODULE, 'O', ' ' }, 225 { TAINT_OOT_MODULE, 'O', ' ' },
226 { TAINT_UNSIGNED_MODULE, 'E', ' ' }, 226 { TAINT_UNSIGNED_MODULE, 'E', ' ' },
227 { TAINT_SOFTLOCKUP, 'L', ' ' },
227}; 228};
228 229
229/** 230/**
@@ -243,6 +244,7 @@ static const struct tnt tnts[] = {
243 * 'I' - Working around severe firmware bug. 244 * 'I' - Working around severe firmware bug.
244 * 'O' - Out-of-tree module has been loaded. 245 * 'O' - Out-of-tree module has been loaded.
245 * 'E' - Unsigned module has been loaded. 246 * 'E' - Unsigned module has been loaded.
247 * 'L' - A soft lockup has previously occurred.
246 * 248 *
247 * The string is overwritten by the next call to print_tainted(). 249 * The string is overwritten by the next call to print_tainted().
248 */ 250 */
diff --git a/kernel/params.c b/kernel/params.c
index 34f527023794..db97b791390f 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -19,6 +19,7 @@
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/errno.h> 20#include <linux/errno.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/moduleparam.h>
22#include <linux/device.h> 23#include <linux/device.h>
23#include <linux/err.h> 24#include <linux/err.h>
24#include <linux/slab.h> 25#include <linux/slab.h>
@@ -83,6 +84,15 @@ bool parameq(const char *a, const char *b)
83 return parameqn(a, b, strlen(a)+1); 84 return parameqn(a, b, strlen(a)+1);
84} 85}
85 86
87static void param_check_unsafe(const struct kernel_param *kp)
88{
89 if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
90 pr_warn("Setting dangerous option %s - tainting kernel\n",
91 kp->name);
92 add_taint(TAINT_USER, LOCKDEP_STILL_OK);
93 }
94}
95
86static int parse_one(char *param, 96static int parse_one(char *param,
87 char *val, 97 char *val,
88 const char *doing, 98 const char *doing,
@@ -104,11 +114,12 @@ static int parse_one(char *param,
104 return 0; 114 return 0;
105 /* No one handled NULL, so do it here. */ 115 /* No one handled NULL, so do it here. */
106 if (!val && 116 if (!val &&
107 !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) 117 !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
108 return -EINVAL; 118 return -EINVAL;
109 pr_debug("handling %s with %p\n", param, 119 pr_debug("handling %s with %p\n", param,
110 params[i].ops->set); 120 params[i].ops->set);
111 mutex_lock(&param_lock); 121 mutex_lock(&param_lock);
122 param_check_unsafe(&params[i]);
112 err = params[i].ops->set(val, &params[i]); 123 err = params[i].ops->set(val, &params[i]);
113 mutex_unlock(&param_lock); 124 mutex_unlock(&param_lock);
114 return err; 125 return err;
@@ -318,7 +329,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
318EXPORT_SYMBOL(param_get_bool); 329EXPORT_SYMBOL(param_get_bool);
319 330
320struct kernel_param_ops param_ops_bool = { 331struct kernel_param_ops param_ops_bool = {
321 .flags = KERNEL_PARAM_FL_NOARG, 332 .flags = KERNEL_PARAM_OPS_FL_NOARG,
322 .set = param_set_bool, 333 .set = param_set_bool,
323 .get = param_get_bool, 334 .get = param_get_bool,
324}; 335};
@@ -369,7 +380,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
369EXPORT_SYMBOL(param_set_bint); 380EXPORT_SYMBOL(param_set_bint);
370 381
371struct kernel_param_ops param_ops_bint = { 382struct kernel_param_ops param_ops_bint = {
372 .flags = KERNEL_PARAM_FL_NOARG, 383 .flags = KERNEL_PARAM_OPS_FL_NOARG,
373 .set = param_set_bint, 384 .set = param_set_bint,
374 .get = param_get_int, 385 .get = param_get_int,
375}; 386};
@@ -503,8 +514,6 @@ EXPORT_SYMBOL(param_ops_string);
503#define to_module_attr(n) container_of(n, struct module_attribute, attr) 514#define to_module_attr(n) container_of(n, struct module_attribute, attr)
504#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) 515#define to_module_kobject(n) container_of(n, struct module_kobject, kobj)
505 516
506extern struct kernel_param __start___param[], __stop___param[];
507
508struct param_attribute 517struct param_attribute
509{ 518{
510 struct module_attribute mattr; 519 struct module_attribute mattr;
@@ -552,6 +561,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
552 return -EPERM; 561 return -EPERM;
553 562
554 mutex_lock(&param_lock); 563 mutex_lock(&param_lock);
564 param_check_unsafe(attribute->param);
555 err = attribute->param->ops->set(buf, attribute->param); 565 err = attribute->param->ops->set(buf, attribute->param);
556 mutex_unlock(&param_lock); 566 mutex_unlock(&param_lock);
557 if (!err) 567 if (!err)
@@ -763,7 +773,7 @@ static struct module_kobject * __init locate_module_kobject(const char *name)
763} 773}
764 774
765static void __init kernel_add_sysfs_param(const char *name, 775static void __init kernel_add_sysfs_param(const char *name,
766 struct kernel_param *kparam, 776 const struct kernel_param *kparam,
767 unsigned int name_skip) 777 unsigned int name_skip)
768{ 778{
769 struct module_kobject *mk; 779 struct module_kobject *mk;
@@ -798,7 +808,7 @@ static void __init kernel_add_sysfs_param(const char *name,
798 */ 808 */
799static void __init param_sysfs_builtin(void) 809static void __init param_sysfs_builtin(void)
800{ 810{
801 struct kernel_param *kp; 811 const struct kernel_param *kp;
802 unsigned int name_len; 812 unsigned int name_len;
803 char modname[MODULE_NAME_LEN]; 813 char modname[MODULE_NAME_LEN];
804 814
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..bbef57f5bdfd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME
302 def_bool y 302 def_bool y
303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS 303 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
304 304
305config PM_GENERIC_DOMAINS_OF
306 def_bool y
307 depends on PM_GENERIC_DOMAINS && OF
308
305config CPU_PM 309config CPU_PM
306 bool 310 bool
307 depends on SUSPEND || CPU_IDLE 311 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a9dfa79b6bab..1f35a3478f3c 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -502,8 +502,14 @@ int hibernation_restore(int platform_mode)
502 error = dpm_suspend_start(PMSG_QUIESCE); 502 error = dpm_suspend_start(PMSG_QUIESCE);
503 if (!error) { 503 if (!error) {
504 error = resume_target_kernel(platform_mode); 504 error = resume_target_kernel(platform_mode);
505 dpm_resume_end(PMSG_RECOVER); 505 /*
506 * The above should either succeed and jump to the new kernel,
507 * or return with an error. Otherwise things are just
508 * undefined, so let's be paranoid.
509 */
510 BUG_ON(!error);
506 } 511 }
512 dpm_resume_end(PMSG_RECOVER);
507 pm_restore_gfp_mask(); 513 pm_restore_gfp_mask();
508 resume_console(); 514 resume_console();
509 pm_restore_console(); 515 pm_restore_console();
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d49dcac2537..2df883a9d3cb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -179,6 +179,7 @@ extern void swsusp_show_speed(struct timeval *, struct timeval *,
179 179
180#ifdef CONFIG_SUSPEND 180#ifdef CONFIG_SUSPEND
181/* kernel/power/suspend.c */ 181/* kernel/power/suspend.c */
182extern const char *pm_labels[];
182extern const char *pm_states[]; 183extern const char *pm_states[];
183 184
184extern int suspend_devices_and_enter(suspend_state_t state); 185extern int suspend_devices_and_enter(suspend_state_t state);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..5a6ec8678b9a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -46,13 +46,13 @@ static int try_to_freeze_tasks(bool user_only)
46 while (true) { 46 while (true) {
47 todo = 0; 47 todo = 0;
48 read_lock(&tasklist_lock); 48 read_lock(&tasklist_lock);
49 do_each_thread(g, p) { 49 for_each_process_thread(g, p) {
50 if (p == current || !freeze_task(p)) 50 if (p == current || !freeze_task(p))
51 continue; 51 continue;
52 52
53 if (!freezer_should_skip(p)) 53 if (!freezer_should_skip(p))
54 todo++; 54 todo++;
55 } while_each_thread(g, p); 55 }
56 read_unlock(&tasklist_lock); 56 read_unlock(&tasklist_lock);
57 57
58 if (!user_only) { 58 if (!user_only) {
@@ -93,11 +93,11 @@ static int try_to_freeze_tasks(bool user_only)
93 93
94 if (!wakeup) { 94 if (!wakeup) {
95 read_lock(&tasklist_lock); 95 read_lock(&tasklist_lock);
96 do_each_thread(g, p) { 96 for_each_process_thread(g, p) {
97 if (p != current && !freezer_should_skip(p) 97 if (p != current && !freezer_should_skip(p)
98 && freezing(p) && !frozen(p)) 98 && freezing(p) && !frozen(p))
99 sched_show_task(p); 99 sched_show_task(p);
100 } while_each_thread(g, p); 100 }
101 read_unlock(&tasklist_lock); 101 read_unlock(&tasklist_lock);
102 } 102 }
103 } else { 103 } else {
@@ -108,6 +108,30 @@ static int try_to_freeze_tasks(bool user_only)
108 return todo ? -EBUSY : 0; 108 return todo ? -EBUSY : 0;
109} 109}
110 110
111static bool __check_frozen_processes(void)
112{
113 struct task_struct *g, *p;
114
115 for_each_process_thread(g, p)
116 if (p != current && !freezer_should_skip(p) && !frozen(p))
117 return false;
118
119 return true;
120}
121
122/*
123 * Returns true if all freezable tasks (except for current) are frozen already
124 */
125static bool check_frozen_processes(void)
126{
127 bool ret;
128
129 read_lock(&tasklist_lock);
130 ret = __check_frozen_processes();
131 read_unlock(&tasklist_lock);
132 return ret;
133}
134
111/** 135/**
112 * freeze_processes - Signal user space processes to enter the refrigerator. 136 * freeze_processes - Signal user space processes to enter the refrigerator.
113 * The current thread will not be frozen. The same process that calls 137 * The current thread will not be frozen. The same process that calls
@@ -118,6 +142,7 @@ static int try_to_freeze_tasks(bool user_only)
118int freeze_processes(void) 142int freeze_processes(void)
119{ 143{
120 int error; 144 int error;
145 int oom_kills_saved;
121 146
122 error = __usermodehelper_disable(UMH_FREEZING); 147 error = __usermodehelper_disable(UMH_FREEZING);
123 if (error) 148 if (error)
@@ -129,13 +154,28 @@ int freeze_processes(void)
129 if (!pm_freezing) 154 if (!pm_freezing)
130 atomic_inc(&system_freezing_cnt); 155 atomic_inc(&system_freezing_cnt);
131 156
157 pm_wakeup_clear();
132 printk("Freezing user space processes ... "); 158 printk("Freezing user space processes ... ");
133 pm_freezing = true; 159 pm_freezing = true;
160 oom_kills_saved = oom_kills_count();
134 error = try_to_freeze_tasks(true); 161 error = try_to_freeze_tasks(true);
135 if (!error) { 162 if (!error) {
136 printk("done.");
137 __usermodehelper_set_disable_depth(UMH_DISABLED); 163 __usermodehelper_set_disable_depth(UMH_DISABLED);
138 oom_killer_disable(); 164 oom_killer_disable();
165
166 /*
167 * There might have been an OOM kill while we were
168 * freezing tasks and the killed task might be still
169 * on the way out so we have to double check for race.
170 */
171 if (oom_kills_count() != oom_kills_saved &&
172 !check_frozen_processes()) {
173 __usermodehelper_set_disable_depth(UMH_ENABLED);
174 printk("OOM in progress.");
175 error = -EBUSY;
176 } else {
177 printk("done.");
178 }
139 } 179 }
140 printk("\n"); 180 printk("\n");
141 BUG_ON(in_atomic()); 181 BUG_ON(in_atomic());
@@ -190,11 +230,11 @@ void thaw_processes(void)
190 thaw_workqueues(); 230 thaw_workqueues();
191 231
192 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
193 do_each_thread(g, p) { 233 for_each_process_thread(g, p) {
194 /* No other threads should have PF_SUSPEND_TASK set */ 234 /* No other threads should have PF_SUSPEND_TASK set */
195 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK)); 235 WARN_ON((p != curr) && (p->flags & PF_SUSPEND_TASK));
196 __thaw_task(p); 236 __thaw_task(p);
197 } while_each_thread(g, p); 237 }
198 read_unlock(&tasklist_lock); 238 read_unlock(&tasklist_lock);
199 239
200 WARN_ON(!(curr->flags & PF_SUSPEND_TASK)); 240 WARN_ON(!(curr->flags & PF_SUSPEND_TASK));
@@ -217,10 +257,10 @@ void thaw_kernel_threads(void)
217 thaw_workqueues(); 257 thaw_workqueues();
218 258
219 read_lock(&tasklist_lock); 259 read_lock(&tasklist_lock);
220 do_each_thread(g, p) { 260 for_each_process_thread(g, p) {
221 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) 261 if (p->flags & (PF_KTHREAD | PF_WQ_WORKER))
222 __thaw_task(p); 262 __thaw_task(p);
223 } while_each_thread(g, p); 263 }
224 read_unlock(&tasklist_lock); 264 read_unlock(&tasklist_lock);
225 265
226 schedule(); 266 schedule();
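freeze_processes() above records oom_kills_count() before freezing and, if the counter moved while tasks were being frozen, re-checks that everything freezable is actually frozen, returning -EBUSY when the race is detected. The distilled snapshot-then-revalidate pattern, with stub helpers (freeze_all() and all_frozen() are hypothetical stand-ins, not kernel functions):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int oom_kills;			/* hypothetical event counter */

static bool freeze_all(void) { return true; }	/* stub: freeze every task */
static bool all_frozen(void) { return true; }	/* stub: re-walk and verify */

int freeze_with_oom_check(void)
{
	int saved = atomic_load(&oom_kills);

	if (!freeze_all())
		return -1;

	/* The counter moved: something was killed mid-freeze, so re-validate. */
	if (atomic_load(&oom_kills) != saved && !all_frozen())
		return -16;			/* like -EBUSY: back out, retry */

	return 0;
}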
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 884b77058864..5f4c006c4b1e 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -105,11 +105,27 @@ static struct pm_qos_object network_throughput_pm_qos = {
105}; 105};
106 106
107 107
108static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
109static struct pm_qos_constraints memory_bw_constraints = {
110 .list = PLIST_HEAD_INIT(memory_bw_constraints.list),
111 .target_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
112 .default_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
113 .no_constraint_value = PM_QOS_MEMORY_BANDWIDTH_DEFAULT_VALUE,
114 .type = PM_QOS_SUM,
115 .notifiers = &memory_bandwidth_notifier,
116};
117static struct pm_qos_object memory_bandwidth_pm_qos = {
118 .constraints = &memory_bw_constraints,
119 .name = "memory_bandwidth",
120};
121
122
108static struct pm_qos_object *pm_qos_array[] = { 123static struct pm_qos_object *pm_qos_array[] = {
109 &null_pm_qos, 124 &null_pm_qos,
110 &cpu_dma_pm_qos, 125 &cpu_dma_pm_qos,
111 &network_lat_pm_qos, 126 &network_lat_pm_qos,
112 &network_throughput_pm_qos 127 &network_throughput_pm_qos,
128 &memory_bandwidth_pm_qos,
113}; 129};
114 130
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 131static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -130,6 +146,9 @@ static const struct file_operations pm_qos_power_fops = {
130/* unlocked internal variant */ 146/* unlocked internal variant */
131static inline int pm_qos_get_value(struct pm_qos_constraints *c) 147static inline int pm_qos_get_value(struct pm_qos_constraints *c)
132{ 148{
149 struct plist_node *node;
150 int total_value = 0;
151
133 if (plist_head_empty(&c->list)) 152 if (plist_head_empty(&c->list))
134 return c->no_constraint_value; 153 return c->no_constraint_value;
135 154
@@ -140,6 +159,12 @@ static inline int pm_qos_get_value(struct pm_qos_constraints *c)
140 case PM_QOS_MAX: 159 case PM_QOS_MAX:
141 return plist_last(&c->list)->prio; 160 return plist_last(&c->list)->prio;
142 161
162 case PM_QOS_SUM:
163 plist_for_each(node, &c->list)
164 total_value += node->prio;
165
166 return total_value;
167
143 default: 168 default:
144 /* runtime check for not using enum */ 169 /* runtime check for not using enum */
145 BUG(); 170 BUG();
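The new PM_QOS_SUM class aggregates the constraint list by adding every request, where the existing classes take the minimum or maximum element. Reduced to a plain array, the sum rule is just:

/* PM_QOS_SUM-style aggregation: the effective value is the total of all requests. */
int pm_qos_sum_sketch(const int *requests, int n, int no_constraint_value)
{
	int i, total = 0;

	if (n == 0)
		return no_constraint_value;	/* empty list: report the default */

	for (i = 0; i < n; i++)
		total += requests[i];

	return total;
}

That fits the memory-bandwidth use it is introduced for: concurrent bandwidth requests add up rather than compete for a single extreme value.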
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 4fc5c32422b3..791a61892bb5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -954,6 +954,25 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
954 } 954 }
955} 955}
956 956
957static bool is_nosave_page(unsigned long pfn)
958{
959 struct nosave_region *region;
960
961 list_for_each_entry(region, &nosave_regions, list) {
962 if (pfn >= region->start_pfn && pfn < region->end_pfn) {
963 pr_err("PM: %#010llx in e820 nosave region: "
964 "[mem %#010llx-%#010llx]\n",
965 (unsigned long long) pfn << PAGE_SHIFT,
966 (unsigned long long) region->start_pfn << PAGE_SHIFT,
967 ((unsigned long long) region->end_pfn << PAGE_SHIFT)
968 - 1);
969 return true;
970 }
971 }
972
973 return false;
974}
975
957/** 976/**
958 * create_basic_memory_bitmaps - create bitmaps needed for marking page 977 * create_basic_memory_bitmaps - create bitmaps needed for marking page
959 * frames that should not be saved and free page frames. The pointers 978 * frames that should not be saved and free page frames. The pointers
@@ -1324,6 +1343,9 @@ void swsusp_free(void)
1324{ 1343{
1325 unsigned long fb_pfn, fr_pfn; 1344 unsigned long fb_pfn, fr_pfn;
1326 1345
1346 if (!forbidden_pages_map || !free_pages_map)
1347 goto out;
1348
1327 memory_bm_position_reset(forbidden_pages_map); 1349 memory_bm_position_reset(forbidden_pages_map);
1328 memory_bm_position_reset(free_pages_map); 1350 memory_bm_position_reset(free_pages_map);
1329 1351
@@ -1351,6 +1373,7 @@ loop:
1351 goto loop; 1373 goto loop;
1352 } 1374 }
1353 1375
1376out:
1354 nr_copy_pages = 0; 1377 nr_copy_pages = 0;
1355 nr_meta_pages = 0; 1378 nr_meta_pages = 0;
1356 restore_pblist = NULL; 1379 restore_pblist = NULL;
@@ -2015,7 +2038,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm)
2015 do { 2038 do {
2016 pfn = memory_bm_next_pfn(bm); 2039 pfn = memory_bm_next_pfn(bm);
2017 if (likely(pfn != BM_END_OF_MAP)) { 2040 if (likely(pfn != BM_END_OF_MAP)) {
2018 if (likely(pfn_valid(pfn))) 2041 if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn))
2019 swsusp_set_page_free(pfn_to_page(pfn)); 2042 swsusp_set_page_free(pfn_to_page(pfn));
2020 else 2043 else
2021 return -EFAULT; 2044 return -EFAULT;
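is_nosave_page() above walks the registered nosave regions and reports whether a pfn falls inside any half-open [start_pfn, end_pfn) range. The same membership test over a plain table (a made-up struct, without the kernel's list_head plumbing):

#include <stdbool.h>
#include <stddef.h>

struct nosave_range {
	unsigned long start_pfn;	/* inclusive */
	unsigned long end_pfn;		/* exclusive */
};

bool pfn_is_nosave(unsigned long pfn, const struct nosave_range *r, size_t nr)
{
	size_t i;

	for (i = 0; i < nr; i++)
		if (pfn >= r[i].start_pfn && pfn < r[i].end_pfn)
			return true;

	return false;
}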
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 6dadb25cb0d8..c347e3ce3a55 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -31,7 +31,7 @@
31 31
32#include "power.h" 32#include "power.h"
33 33
34static const char *pm_labels[] = { "mem", "standby", "freeze", }; 34const char *pm_labels[] = { "mem", "standby", "freeze", NULL };
35const char *pm_states[PM_SUSPEND_MAX]; 35const char *pm_states[PM_SUSPEND_MAX];
36 36
37static const struct platform_suspend_ops *suspend_ops; 37static const struct platform_suspend_ops *suspend_ops;
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
146 146
147static int platform_suspend_prepare_late(suspend_state_t state) 147static int platform_suspend_prepare_late(suspend_state_t state)
148{ 148{
149 return state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->prepare ?
150 freeze_ops->prepare() : 0;
151}
152
153static int platform_suspend_prepare_noirq(suspend_state_t state)
154{
149 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ? 155 return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
150 suspend_ops->prepare_late() : 0; 156 suspend_ops->prepare_late() : 0;
151} 157}
152 158
153static void platform_suspend_wake(suspend_state_t state) 159static void platform_resume_noirq(suspend_state_t state)
154{ 160{
155 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake) 161 if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
156 suspend_ops->wake(); 162 suspend_ops->wake();
157} 163}
158 164
159static void platform_suspend_finish(suspend_state_t state) 165static void platform_resume_early(suspend_state_t state)
166{
167 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->restore)
168 freeze_ops->restore();
169}
170
171static void platform_resume_finish(suspend_state_t state)
160{ 172{
161 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish) 173 if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
162 suspend_ops->finish(); 174 suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
172 return 0; 184 return 0;
173} 185}
174 186
175static void platform_suspend_end(suspend_state_t state) 187static void platform_resume_end(suspend_state_t state)
176{ 188{
177 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end) 189 if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
178 freeze_ops->end(); 190 freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
180 suspend_ops->end(); 192 suspend_ops->end();
181} 193}
182 194
183static void platform_suspend_recover(suspend_state_t state) 195static void platform_recover(suspend_state_t state)
184{ 196{
185 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover) 197 if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
186 suspend_ops->recover(); 198 suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
265 if (error) 277 if (error)
266 goto Platform_finish; 278 goto Platform_finish;
267 279
268 error = dpm_suspend_end(PMSG_SUSPEND); 280 error = dpm_suspend_late(PMSG_SUSPEND);
269 if (error) { 281 if (error) {
270 printk(KERN_ERR "PM: Some devices failed to power down\n"); 282 printk(KERN_ERR "PM: late suspend of devices failed\n");
271 goto Platform_finish; 283 goto Platform_finish;
272 } 284 }
273 error = platform_suspend_prepare_late(state); 285 error = platform_suspend_prepare_late(state);
274 if (error) 286 if (error)
287 goto Devices_early_resume;
288
289 error = dpm_suspend_noirq(PMSG_SUSPEND);
290 if (error) {
291 printk(KERN_ERR "PM: noirq suspend of devices failed\n");
292 goto Platform_early_resume;
293 }
294 error = platform_suspend_prepare_noirq(state);
295 if (error)
275 goto Platform_wake; 296 goto Platform_wake;
276 297
277 if (suspend_test(TEST_PLATFORM)) 298 if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
318 enable_nonboot_cpus(); 339 enable_nonboot_cpus();
319 340
320 Platform_wake: 341 Platform_wake:
321 platform_suspend_wake(state); 342 platform_resume_noirq(state);
322 dpm_resume_start(PMSG_RESUME); 343 dpm_resume_noirq(PMSG_RESUME);
344
345 Platform_early_resume:
346 platform_resume_early(state);
347
348 Devices_early_resume:
349 dpm_resume_early(PMSG_RESUME);
323 350
324 Platform_finish: 351 Platform_finish:
325 platform_suspend_finish(state); 352 platform_resume_finish(state);
326 return error; 353 return error;
327} 354}
328 355
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
361 suspend_test_start(); 388 suspend_test_start();
362 dpm_resume_end(PMSG_RESUME); 389 dpm_resume_end(PMSG_RESUME);
363 suspend_test_finish("resume devices"); 390 suspend_test_finish("resume devices");
391 trace_suspend_resume(TPS("resume_console"), state, true);
364 resume_console(); 392 resume_console();
393 trace_suspend_resume(TPS("resume_console"), state, false);
365 394
366 Close: 395 Close:
367 platform_suspend_end(state); 396 platform_resume_end(state);
368 return error; 397 return error;
369 398
370 Recover_platform: 399 Recover_platform:
371 platform_suspend_recover(state); 400 platform_recover(state);
372 goto Resume_devices; 401 goto Resume_devices;
373} 402}
374 403
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 2f524928b6aa..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
22#define TEST_SUSPEND_SECONDS 10 22#define TEST_SUSPEND_SECONDS 10
23 23
24static unsigned long suspend_test_start_time; 24static unsigned long suspend_test_start_time;
25static u32 test_repeat_count_max = 1;
26static u32 test_repeat_count_current;
25 27
26void suspend_test_start(void) 28void suspend_test_start(void)
27{ 29{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
74 int status; 76 int status;
75 77
76 /* this may fail if the RTC hasn't been initialized */ 78 /* this may fail if the RTC hasn't been initialized */
79repeat:
77 status = rtc_read_time(rtc, &alm.time); 80 status = rtc_read_time(rtc, &alm.time);
78 if (status < 0) { 81 if (status < 0) {
79 printk(err_readtime, dev_name(&rtc->dev), status); 82 printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
100 if (state == PM_SUSPEND_STANDBY) { 103 if (state == PM_SUSPEND_STANDBY) {
101 printk(info_test, pm_states[state]); 104 printk(info_test, pm_states[state]);
102 status = pm_suspend(state); 105 status = pm_suspend(state);
106 if (status < 0)
107 state = PM_SUSPEND_FREEZE;
103 } 108 }
109 if (state == PM_SUSPEND_FREEZE) {
110 printk(info_test, pm_states[state]);
111 status = pm_suspend(state);
112 }
113
104 if (status < 0) 114 if (status < 0)
105 printk(err_suspend, status); 115 printk(err_suspend, status);
106 116
117 test_repeat_count_current++;
118 if (test_repeat_count_current < test_repeat_count_max)
119 goto repeat;
120
107 /* Some platforms can't detect that the alarm triggered the 121 /* Some platforms can't detect that the alarm triggered the
108 * wakeup, or (accordingly) disable it afterwards. 122 * wakeup, or (accordingly) disable it afterwards.
109 * It's supposed to give oneshot behavior; cope. 123 * It's supposed to give oneshot behavior; cope.
@@ -129,24 +143,36 @@ static int __init has_wakealarm(struct device *dev, const void *data)
129 * at startup time. They're normally disabled, for faster boot and because 143 * at startup time. They're normally disabled, for faster boot and because
130 * we can't know which states really work on this particular system. 144 * we can't know which states really work on this particular system.
131 */ 145 */
132static suspend_state_t test_state __initdata = PM_SUSPEND_ON; 146static const char *test_state_label __initdata;
133 147
134static char warn_bad_state[] __initdata = 148static char warn_bad_state[] __initdata =
135 KERN_WARNING "PM: can't test '%s' suspend state\n"; 149 KERN_WARNING "PM: can't test '%s' suspend state\n";
136 150
137static int __init setup_test_suspend(char *value) 151static int __init setup_test_suspend(char *value)
138{ 152{
139 suspend_state_t i; 153 int i;
154 char *repeat;
155 char *suspend_type;
140 156
141 /* "=mem" ==> "mem" */ 157 /* example : "=mem[,N]" ==> "mem[,N]" */
142 value++; 158 value++;
143 for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++) 159 suspend_type = strsep(&value, ",");
144 if (!strcmp(pm_states[i], value)) { 160 if (!suspend_type)
145 test_state = i; 161 return 0;
162
163 repeat = strsep(&value, ",");
164 if (repeat) {
165 if (kstrtou32(repeat, 0, &test_repeat_count_max))
166 return 0;
167 }
168
169 for (i = 0; pm_labels[i]; i++)
170 if (!strcmp(pm_labels[i], suspend_type)) {
171 test_state_label = pm_labels[i];
146 return 0; 172 return 0;
147 } 173 }
148 174
149 printk(warn_bad_state, value); 175 printk(warn_bad_state, suspend_type);
150 return 0; 176 return 0;
151} 177}
152__setup("test_suspend", setup_test_suspend); 178__setup("test_suspend", setup_test_suspend);
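
setup_test_suspend() now parses "test_suspend=mem[,N]": the value is split at the first comma into a state label and an optional repeat count (strsep() plus kstrtou32() in the kernel), and the label is looked up in pm_labels[]. A rough userspace equivalent, with a hard-coded value and a made-up labels[] table standing in for pm_labels[]:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Labels this sketch accepts; the kernel walks its pm_labels[] table instead. */
static const char *labels[] = { "freeze", "standby", "mem", NULL };

int main(void)
{
	char value[] = "mem,5";		/* what follows "test_suspend=" */
	char *comma, *type = value;
	unsigned long repeat = 1;	/* analogue of test_repeat_count_max */
	int i;

	/* split "mem,5" into the state label and the optional repeat count */
	comma = strchr(value, ',');
	if (comma) {
		*comma = '\0';
		repeat = strtoul(comma + 1, NULL, 0);	/* kstrtou32() stand-in */
	}

	for (i = 0; labels[i]; i++)
		if (!strcmp(labels[i], type)) {
			printf("testing '%s' %lu time(s)\n", labels[i], repeat);
			return 0;
		}

	fprintf(stderr, "can't test '%s' suspend state\n", type);
	return 1;
}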
@@ -158,13 +184,21 @@ static int __init test_suspend(void)
158 184
159 struct rtc_device *rtc = NULL; 185 struct rtc_device *rtc = NULL;
160 struct device *dev; 186 struct device *dev;
187 suspend_state_t test_state;
161 188
162 /* PM is initialized by now; is that state testable? */ 189 /* PM is initialized by now; is that state testable? */
163 if (test_state == PM_SUSPEND_ON) 190 if (!test_state_label)
164 goto done; 191 return 0;
165 if (!pm_states[test_state]) { 192
166 printk(warn_bad_state, pm_states[test_state]); 193 for (test_state = PM_SUSPEND_MIN; test_state < PM_SUSPEND_MAX; test_state++) {
167 goto done; 194 const char *state_label = pm_states[test_state];
195
196 if (state_label && !strcmp(test_state_label, state_label))
197 break;
198 }
199 if (test_state == PM_SUSPEND_MAX) {
200 printk(warn_bad_state, test_state_label);
201 return 0;
168 } 202 }
169 203
170 /* RTCs have initialized by now too ... can we use one? */ 204 /* RTCs have initialized by now too ... can we use one? */
@@ -173,13 +207,12 @@ static int __init test_suspend(void)
173 rtc = rtc_class_open(dev_name(dev)); 207 rtc = rtc_class_open(dev_name(dev));
174 if (!rtc) { 208 if (!rtc) {
175 printk(warn_no_rtc); 209 printk(warn_no_rtc);
176 goto done; 210 return 0;
177 } 211 }
178 212
179 /* go for it */ 213 /* go for it */
180 test_wakealarm(rtc, test_state); 214 test_wakealarm(rtc, test_state);
181 rtc_class_close(rtc); 215 rtc_class_close(rtc);
182done:
183 return 0; 216 return 0;
184} 217}
185late_initcall(test_suspend); 218late_initcall(test_suspend);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 13e839dbca07..ced2b84b1cb7 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -45,6 +45,7 @@
45#include <linux/poll.h> 45#include <linux/poll.h>
46#include <linux/irq_work.h> 46#include <linux/irq_work.h>
47#include <linux/utsname.h> 47#include <linux/utsname.h>
48#include <linux/ctype.h>
48 49
49#include <asm/uaccess.h> 50#include <asm/uaccess.h>
50 51
@@ -56,7 +57,7 @@
56 57
57int console_printk[4] = { 58int console_printk[4] = {
58 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ 59 CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
59 DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ 60 MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
60 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ 61 CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */
61 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ 62 CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */
62}; 63};
@@ -113,9 +114,9 @@ static int __down_trylock_console_sem(unsigned long ip)
113 * This is used for debugging the mess that is the VT code by 114 * This is used for debugging the mess that is the VT code by
114 * keeping track if we have the console semaphore held. It's 115 * keeping track if we have the console semaphore held. It's
115 * definitely not the perfect debug tool (we don't know if _WE_ 116 * definitely not the perfect debug tool (we don't know if _WE_
116 * hold it are racing, but it helps tracking those weird code 117 * hold it and are racing, but it helps tracking those weird code
117 * path in the console code where we end up in places I want 118 * paths in the console code where we end up in places I want
118 * locked without the console sempahore held 119 * locked without the console sempahore held).
119 */ 120 */
120static int console_locked, console_suspended; 121static int console_locked, console_suspended;
121 122
@@ -146,8 +147,8 @@ static int console_may_schedule;
146 * the overall length of the record. 147 * the overall length of the record.
147 * 148 *
148 * The heads to the first and last entry in the buffer, as well as the 149 * The heads to the first and last entry in the buffer, as well as the
149 * sequence numbers of these both entries are maintained when messages 150 * sequence numbers of these entries are maintained when messages are
150 * are stored.. 151 * stored.
151 * 152 *
152 * If the heads indicate available messages, the length in the header 153 * If the heads indicate available messages, the length in the header
153 * tells the start next message. A length == 0 for the next message 154 * tells the start next message. A length == 0 for the next message
@@ -257,7 +258,7 @@ static u64 clear_seq;
257static u32 clear_idx; 258static u32 clear_idx;
258 259
259#define PREFIX_MAX 32 260#define PREFIX_MAX 32
260#define LOG_LINE_MAX 1024 - PREFIX_MAX 261#define LOG_LINE_MAX (1024 - PREFIX_MAX)
261 262
262/* record buffer */ 263/* record buffer */
263#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) 264#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
@@ -270,6 +271,18 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
270static char *log_buf = __log_buf; 271static char *log_buf = __log_buf;
271static u32 log_buf_len = __LOG_BUF_LEN; 272static u32 log_buf_len = __LOG_BUF_LEN;
272 273
274/* Return log buffer address */
275char *log_buf_addr_get(void)
276{
277 return log_buf;
278}
279
280/* Return log buffer size */
281u32 log_buf_len_get(void)
282{
283 return log_buf_len;
284}
285
273/* human readable text of the record */ 286/* human readable text of the record */
274static char *log_text(const struct printk_log *msg) 287static char *log_text(const struct printk_log *msg)
275{ 288{
@@ -344,7 +357,7 @@ static int log_make_free_space(u32 msg_size)
344 while (log_first_seq < log_next_seq) { 357 while (log_first_seq < log_next_seq) {
345 if (logbuf_has_space(msg_size, false)) 358 if (logbuf_has_space(msg_size, false))
346 return 0; 359 return 0;
347 /* drop old messages until we have enough continuous space */ 360 /* drop old messages until we have enough contiguous space */
348 log_first_idx = log_next(log_first_idx); 361 log_first_idx = log_next(log_first_idx);
349 log_first_seq++; 362 log_first_seq++;
350 } 363 }
@@ -453,11 +466,7 @@ static int log_store(int facility, int level,
453 return msg->text_len; 466 return msg->text_len;
454} 467}
455 468
456#ifdef CONFIG_SECURITY_DMESG_RESTRICT 469int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);
457int dmesg_restrict = 1;
458#else
459int dmesg_restrict;
460#endif
461 470
462static int syslog_action_restricted(int type) 471static int syslog_action_restricted(int type)
463{ 472{
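
Replacing the #ifdef block with IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT) works because the macro evaluates to a plain 1 or 0 at preprocessing time, so it can sit directly in an initializer. The sketch below is a simplified re-derivation of the trick for symbols that are either undefined or defined to 1; the real macro in include/linux/kconfig.h also covers the =m (module) case.

#include <stdio.h>

/*
 * If the option is defined to 1, __ARG_PLACEHOLDER_1 expands to "0," and the
 * variadic helper sees (0, 1, 0), picking 1; if the option is undefined, the
 * placeholder stays a single junk token and the helper picks the trailing 0.
 */
#define __ARG_PLACEHOLDER_1			0,
#define __take_second_arg(__ignored, val, ...)	val

#define ____is_defined(arg1_or_junk)	__take_second_arg(arg1_or_junk 1, 0)
#define ___is_defined(val)		____is_defined(__ARG_PLACEHOLDER_##val)
#define __is_defined(x)			___is_defined(x)
#define IS_ENABLED(option)		__is_defined(option)

/* Pretend the build sets this option; remove the define to get 0 instead. */
#define CONFIG_SECURITY_DMESG_RESTRICT 1

int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT);

int main(void)
{
	printf("dmesg_restrict = %d\n", dmesg_restrict);
	return 0;
}

Because the result is an ordinary integer constant expression, the same macro also works inside C conditionals, which is how many #ifdef blocks get turned into regular if () statements that the compiler can eliminate.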
@@ -509,14 +518,13 @@ struct devkmsg_user {
509 char buf[8192]; 518 char buf[8192];
510}; 519};
511 520
512static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, 521static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
513 unsigned long count, loff_t pos)
514{ 522{
515 char *buf, *line; 523 char *buf, *line;
516 int i; 524 int i;
517 int level = default_message_loglevel; 525 int level = default_message_loglevel;
518 int facility = 1; /* LOG_USER */ 526 int facility = 1; /* LOG_USER */
519 size_t len = iov_length(iv, count); 527 size_t len = iocb->ki_nbytes;
520 ssize_t ret = len; 528 ssize_t ret = len;
521 529
522 if (len > LOG_LINE_MAX) 530 if (len > LOG_LINE_MAX)
@@ -525,13 +533,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
525 if (buf == NULL) 533 if (buf == NULL)
526 return -ENOMEM; 534 return -ENOMEM;
527 535
528 line = buf; 536 buf[len] = '\0';
529 for (i = 0; i < count; i++) { 537 if (copy_from_iter(buf, len, from) != len) {
530 if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) { 538 kfree(buf);
531 ret = -EFAULT; 539 return -EFAULT;
532 goto out;
533 }
534 line += iv[i].iov_len;
535 } 540 }
536 541
537 /* 542 /*
@@ -557,10 +562,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
557 line = endp; 562 line = endp;
558 } 563 }
559 } 564 }
560 line[len] = '\0';
561 565
562 printk_emit(facility, level, NULL, 0, "%s", line); 566 printk_emit(facility, level, NULL, 0, "%s", line);
563out:
564 kfree(buf); 567 kfree(buf);
565 return ret; 568 return ret;
566} 569}
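
The devkmsg_write() conversion replaces the per-iovec copy_from_user() loop with a single copy_from_iter() call and terminates the buffer before parsing rather than after. The sketch below only illustrates the data movement, gathering scattered iovec segments into one NUL-terminated line in userspace; it is not the kernel iov_iter API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Gather all segments into one freshly allocated, NUL-terminated buffer. */
static char *flatten_iov(const struct iovec *iov, int iovcnt)
{
	size_t len = 0, off = 0;
	char *buf;
	int i;

	for (i = 0; i < iovcnt; i++)
		len += iov[i].iov_len;

	buf = malloc(len + 1);
	if (!buf)
		return NULL;

	buf[len] = '\0';	/* terminate up front, as the new code does */
	for (i = 0; i < iovcnt; i++) {
		memcpy(buf + off, iov[i].iov_base, iov[i].iov_len);
		off += iov[i].iov_len;
	}
	return buf;
}

int main(void)
{
	char a[] = "<5>hello ", b[] = "from a scattered write";
	struct iovec iov[] = {
		{ .iov_base = a, .iov_len = strlen(a) },
		{ .iov_base = b, .iov_len = strlen(b) },
	};
	char *line = flatten_iov(iov, 2);

	if (!line)
		return 1;
	printf("%s\n", line);
	free(line);
	return 0;
}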
@@ -792,7 +795,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
792const struct file_operations kmsg_fops = { 795const struct file_operations kmsg_fops = {
793 .open = devkmsg_open, 796 .open = devkmsg_open,
794 .read = devkmsg_read, 797 .read = devkmsg_read,
795 .aio_write = devkmsg_writev, 798 .write_iter = devkmsg_write,
796 .llseek = devkmsg_llseek, 799 .llseek = devkmsg_llseek,
797 .poll = devkmsg_poll, 800 .poll = devkmsg_poll,
798 .release = devkmsg_release, 801 .release = devkmsg_release,
@@ -828,34 +831,80 @@ void log_buf_kexec_setup(void)
828/* requested log_buf_len from kernel cmdline */ 831/* requested log_buf_len from kernel cmdline */
829static unsigned long __initdata new_log_buf_len; 832static unsigned long __initdata new_log_buf_len;
830 833
831/* save requested log_buf_len since it's too early to process it */ 834/* we practice scaling the ring buffer by powers of 2 */
832static int __init log_buf_len_setup(char *str) 835static void __init log_buf_len_update(unsigned size)
833{ 836{
834 unsigned size = memparse(str, &str);
835
836 if (size) 837 if (size)
837 size = roundup_pow_of_two(size); 838 size = roundup_pow_of_two(size);
838 if (size > log_buf_len) 839 if (size > log_buf_len)
839 new_log_buf_len = size; 840 new_log_buf_len = size;
841}
842
843/* save requested log_buf_len since it's too early to process it */
844static int __init log_buf_len_setup(char *str)
845{
846 unsigned size = memparse(str, &str);
847
848 log_buf_len_update(size);
840 849
841 return 0; 850 return 0;
842} 851}
843early_param("log_buf_len", log_buf_len_setup); 852early_param("log_buf_len", log_buf_len_setup);
844 853
854#ifdef CONFIG_SMP
855#define __LOG_CPU_MAX_BUF_LEN (1 << CONFIG_LOG_CPU_MAX_BUF_SHIFT)
856
857static void __init log_buf_add_cpu(void)
858{
859 unsigned int cpu_extra;
860
861 /*
862 * archs should set up cpu_possible_bits properly with
863 * set_cpu_possible() after setup_arch() but just in
864 * case lets ensure this is valid.
865 */
866 if (num_possible_cpus() == 1)
867 return;
868
869 cpu_extra = (num_possible_cpus() - 1) * __LOG_CPU_MAX_BUF_LEN;
870
871 /* by default this will only continue through for large > 64 CPUs */
872 if (cpu_extra <= __LOG_BUF_LEN / 2)
873 return;
874
875 pr_info("log_buf_len individual max cpu contribution: %d bytes\n",
876 __LOG_CPU_MAX_BUF_LEN);
877 pr_info("log_buf_len total cpu_extra contributions: %d bytes\n",
878 cpu_extra);
879 pr_info("log_buf_len min size: %d bytes\n", __LOG_BUF_LEN);
880
881 log_buf_len_update(cpu_extra + __LOG_BUF_LEN);
882}
883#else /* !CONFIG_SMP */
884static inline void log_buf_add_cpu(void) {}
885#endif /* CONFIG_SMP */
886
845void __init setup_log_buf(int early) 887void __init setup_log_buf(int early)
846{ 888{
847 unsigned long flags; 889 unsigned long flags;
848 char *new_log_buf; 890 char *new_log_buf;
849 int free; 891 int free;
850 892
893 if (log_buf != __log_buf)
894 return;
895
896 if (!early && !new_log_buf_len)
897 log_buf_add_cpu();
898
851 if (!new_log_buf_len) 899 if (!new_log_buf_len)
852 return; 900 return;
853 901
854 if (early) { 902 if (early) {
855 new_log_buf = 903 new_log_buf =
856 memblock_virt_alloc(new_log_buf_len, PAGE_SIZE); 904 memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
857 } else { 905 } else {
858 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len, 0); 906 new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
907 LOG_ALIGN);
859 } 908 }
860 909
861 if (unlikely(!new_log_buf)) { 910 if (unlikely(!new_log_buf)) {
@@ -872,7 +921,7 @@ void __init setup_log_buf(int early)
872 memcpy(log_buf, __log_buf, __LOG_BUF_LEN); 921 memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
873 raw_spin_unlock_irqrestore(&logbuf_lock, flags); 922 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
874 923
875 pr_info("log_buf_len: %d\n", log_buf_len); 924 pr_info("log_buf_len: %d bytes\n", log_buf_len);
876 pr_info("early log buf free: %d(%d%%)\n", 925 pr_info("early log buf free: %d(%d%%)\n",
877 free, (free * 100) / __LOG_BUF_LEN); 926 free, (free * 100) / __LOG_BUF_LEN);
878} 927}
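
The new log_buf_len_update()/log_buf_add_cpu() path grows the ring buffer only when the summed per-CPU contribution dwarfs the static buffer, and always rounds the requested size up to a power of two. A standalone sketch of that arithmetic; the buffer sizes and the CPU count are made-up stand-ins for __LOG_BUF_LEN, __LOG_CPU_MAX_BUF_LEN and num_possible_cpus().

#include <stdio.h>

#define LOG_BUF_LEN		(1u << 17)	/* assume a 128 KiB static buffer */
#define LOG_CPU_MAX_BUF_LEN	(1u << 12)	/* assume a 4 KiB per-CPU cap */

/* Round a 32-bit value up to the next power of two, like roundup_pow_of_two(). */
static unsigned int roundup_pow2(unsigned int v)
{
	v--;
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	return v + 1;
}

int main(void)
{
	unsigned int possible_cpus = 128;	/* pretend num_possible_cpus() */
	unsigned int cpu_extra = (possible_cpus - 1) * LOG_CPU_MAX_BUF_LEN;
	unsigned int new_len = LOG_BUF_LEN;

	/* only grow when the per-CPU share exceeds half the static buffer */
	if (cpu_extra > LOG_BUF_LEN / 2)
		new_len = roundup_pow2(cpu_extra + LOG_BUF_LEN);

	printf("cpu_extra=%u bytes, log_buf_len=%u bytes\n", cpu_extra, new_len);
	return 0;
}

With these example numbers, the 520192-byte per-CPU share pushes the buffer from 128 KiB up to 1 MiB, the next power of two above 651264 bytes.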
@@ -881,7 +930,7 @@ static bool __read_mostly ignore_loglevel;
881 930
882static int __init ignore_loglevel_setup(char *str) 931static int __init ignore_loglevel_setup(char *str)
883{ 932{
884 ignore_loglevel = 1; 933 ignore_loglevel = true;
885 pr_info("debug: ignoring loglevel setting.\n"); 934 pr_info("debug: ignoring loglevel setting.\n");
886 935
887 return 0; 936 return 0;
@@ -947,11 +996,7 @@ static inline void boot_delay_msec(int level)
947} 996}
948#endif 997#endif
949 998
950#if defined(CONFIG_PRINTK_TIME) 999static bool printk_time = IS_ENABLED(CONFIG_PRINTK_TIME);
951static bool printk_time = 1;
952#else
953static bool printk_time;
954#endif
955module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR); 1000module_param_named(time, printk_time, bool, S_IRUGO | S_IWUSR);
956 1001
957static size_t print_time(u64 ts, char *buf) 1002static size_t print_time(u64 ts, char *buf)
@@ -1310,7 +1355,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
1310 * for pending data, not the size; return the count of 1355 * for pending data, not the size; return the count of
1311 * records, not the length. 1356 * records, not the length.
1312 */ 1357 */
1313 error = log_next_idx - syslog_idx; 1358 error = log_next_seq - syslog_seq;
1314 } else { 1359 } else {
1315 u64 seq = syslog_seq; 1360 u64 seq = syslog_seq;
1316 u32 idx = syslog_idx; 1361 u32 idx = syslog_idx;
@@ -1416,10 +1461,9 @@ static int have_callable_console(void)
1416/* 1461/*
1417 * Can we actually use the console at this time on this cpu? 1462 * Can we actually use the console at this time on this cpu?
1418 * 1463 *
1419 * Console drivers may assume that per-cpu resources have 1464 * Console drivers may assume that per-cpu resources have been allocated. So
1420 * been allocated. So unless they're explicitly marked as 1465 * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
1421 * being able to cope (CON_ANYTIME) don't call them until 1466 * call them until this CPU is officially up.
1422 * this CPU is officially up.
1423 */ 1467 */
1424static inline int can_use_console(unsigned int cpu) 1468static inline int can_use_console(unsigned int cpu)
1425{ 1469{
@@ -1432,8 +1476,10 @@ static inline int can_use_console(unsigned int cpu)
1432 * console_lock held, and 'console_locked' set) if it 1476 * console_lock held, and 'console_locked' set) if it
1433 * is successful, false otherwise. 1477 * is successful, false otherwise.
1434 */ 1478 */
1435static int console_trylock_for_printk(unsigned int cpu) 1479static int console_trylock_for_printk(void)
1436{ 1480{
1481 unsigned int cpu = smp_processor_id();
1482
1437 if (!console_trylock()) 1483 if (!console_trylock())
1438 return 0; 1484 return 0;
1439 /* 1485 /*
@@ -1476,7 +1522,7 @@ static struct cont {
1476 struct task_struct *owner; /* task of first print*/ 1522 struct task_struct *owner; /* task of first print*/
1477 u64 ts_nsec; /* time of first print */ 1523 u64 ts_nsec; /* time of first print */
1478 u8 level; /* log level of first message */ 1524 u8 level; /* log level of first message */
1479 u8 facility; /* log level of first message */ 1525 u8 facility; /* log facility of first message */
1480 enum log_flags flags; /* prefix, newline flags */ 1526 enum log_flags flags; /* prefix, newline flags */
1481 bool flushed:1; /* buffer sealed and committed */ 1527 bool flushed:1; /* buffer sealed and committed */
1482} cont; 1528} cont;
@@ -1608,7 +1654,8 @@ asmlinkage int vprintk_emit(int facility, int level,
1608 */ 1654 */
1609 if (!oops_in_progress && !lockdep_recursing(current)) { 1655 if (!oops_in_progress && !lockdep_recursing(current)) {
1610 recursion_bug = 1; 1656 recursion_bug = 1;
1611 goto out_restore_irqs; 1657 local_irq_restore(flags);
1658 return 0;
1612 } 1659 }
1613 zap_locks(); 1660 zap_locks();
1614 } 1661 }
@@ -1617,27 +1664,22 @@ asmlinkage int vprintk_emit(int facility, int level,
1617 raw_spin_lock(&logbuf_lock); 1664 raw_spin_lock(&logbuf_lock);
1618 logbuf_cpu = this_cpu; 1665 logbuf_cpu = this_cpu;
1619 1666
1620 if (recursion_bug) { 1667 if (unlikely(recursion_bug)) {
1621 static const char recursion_msg[] = 1668 static const char recursion_msg[] =
1622 "BUG: recent printk recursion!"; 1669 "BUG: recent printk recursion!";
1623 1670
1624 recursion_bug = 0; 1671 recursion_bug = 0;
1625 text_len = strlen(recursion_msg);
1626 /* emit KERN_CRIT message */ 1672 /* emit KERN_CRIT message */
1627 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, 1673 printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
1628 NULL, 0, recursion_msg, text_len); 1674 NULL, 0, recursion_msg,
1675 strlen(recursion_msg));
1629 } 1676 }
1630 1677
1631 /* 1678 /*
1632 * The printf needs to come first; we need the syslog 1679 * The printf needs to come first; we need the syslog
1633 * prefix which might be passed-in as a parameter. 1680 * prefix which might be passed-in as a parameter.
1634 */ 1681 */
1635 if (in_sched) 1682 text_len = vscnprintf(text, sizeof(textbuf), fmt, args);
1636 text_len = scnprintf(text, sizeof(textbuf),
1637 KERN_WARNING "[sched_delayed] ");
1638
1639 text_len += vscnprintf(text + text_len,
1640 sizeof(textbuf) - text_len, fmt, args);
1641 1683
1642 /* mark and strip a trailing newline */ 1684 /* mark and strip a trailing newline */
1643 if (text_len && text[text_len-1] == '\n') { 1685 if (text_len && text[text_len-1] == '\n') {
@@ -1716,21 +1758,30 @@ asmlinkage int vprintk_emit(int facility, int level,
1716 1758
1717 logbuf_cpu = UINT_MAX; 1759 logbuf_cpu = UINT_MAX;
1718 raw_spin_unlock(&logbuf_lock); 1760 raw_spin_unlock(&logbuf_lock);
1761 lockdep_on();
1762 local_irq_restore(flags);
1719 1763
1720 /* If called from the scheduler, we can not call up(). */ 1764 /* If called from the scheduler, we can not call up(). */
1721 if (!in_sched) { 1765 if (!in_sched) {
1766 lockdep_off();
1767 /*
1768 * Disable preemption to avoid being preempted while holding
1769 * console_sem which would prevent anyone from printing to
1770 * console
1771 */
1772 preempt_disable();
1773
1722 /* 1774 /*
1723 * Try to acquire and then immediately release the console 1775 * Try to acquire and then immediately release the console
1724 * semaphore. The release will print out buffers and wake up 1776 * semaphore. The release will print out buffers and wake up
1725 * /dev/kmsg and syslog() users. 1777 * /dev/kmsg and syslog() users.
1726 */ 1778 */
1727 if (console_trylock_for_printk(this_cpu)) 1779 if (console_trylock_for_printk())
1728 console_unlock(); 1780 console_unlock();
1781 preempt_enable();
1782 lockdep_on();
1729 } 1783 }
1730 1784
1731 lockdep_on();
1732out_restore_irqs:
1733 local_irq_restore(flags);
1734 return printed_len; 1785 return printed_len;
1735} 1786}
1736EXPORT_SYMBOL(vprintk_emit); 1787EXPORT_SYMBOL(vprintk_emit);
@@ -1802,7 +1853,7 @@ EXPORT_SYMBOL(printk);
1802 1853
1803#define LOG_LINE_MAX 0 1854#define LOG_LINE_MAX 0
1804#define PREFIX_MAX 0 1855#define PREFIX_MAX 0
1805#define LOG_LINE_MAX 0 1856
1806static u64 syslog_seq; 1857static u64 syslog_seq;
1807static u32 syslog_idx; 1858static u32 syslog_idx;
1808static u64 console_seq; 1859static u64 console_seq;
@@ -1881,11 +1932,12 @@ static int __add_preferred_console(char *name, int idx, char *options,
1881 return 0; 1932 return 0;
1882} 1933}
1883/* 1934/*
1884 * Set up a list of consoles. Called from init/main.c 1935 * Set up a console. Called via do_early_param() in init/main.c
1936 * for each "console=" parameter in the boot command line.
1885 */ 1937 */
1886static int __init console_setup(char *str) 1938static int __init console_setup(char *str)
1887{ 1939{
1888 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for index */ 1940 char buf[sizeof(console_cmdline[0].name) + 4]; /* 4 for "ttyS" */
1889 char *s, *options, *brl_options = NULL; 1941 char *s, *options, *brl_options = NULL;
1890 int idx; 1942 int idx;
1891 1943
@@ -1902,7 +1954,8 @@ static int __init console_setup(char *str)
1902 strncpy(buf, str, sizeof(buf) - 1); 1954 strncpy(buf, str, sizeof(buf) - 1);
1903 } 1955 }
1904 buf[sizeof(buf) - 1] = 0; 1956 buf[sizeof(buf) - 1] = 0;
1905 if ((options = strchr(str, ',')) != NULL) 1957 options = strchr(str, ',');
1958 if (options)
1906 *(options++) = 0; 1959 *(options++) = 0;
1907#ifdef __sparc__ 1960#ifdef __sparc__
1908 if (!strcmp(str, "ttya")) 1961 if (!strcmp(str, "ttya"))
@@ -1911,7 +1964,7 @@ static int __init console_setup(char *str)
1911 strcpy(buf, "ttyS1"); 1964 strcpy(buf, "ttyS1");
1912#endif 1965#endif
1913 for (s = buf; *s; s++) 1966 for (s = buf; *s; s++)
1914 if ((*s >= '0' && *s <= '9') || *s == ',') 1967 if (isdigit(*s) || *s == ',')
1915 break; 1968 break;
1916 idx = simple_strtoul(s, NULL, 10); 1969 idx = simple_strtoul(s, NULL, 10);
1917 *s = 0; 1970 *s = 0;
@@ -1950,7 +2003,6 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1950 i++, c++) 2003 i++, c++)
1951 if (strcmp(c->name, name) == 0 && c->index == idx) { 2004 if (strcmp(c->name, name) == 0 && c->index == idx) {
1952 strlcpy(c->name, name_new, sizeof(c->name)); 2005 strlcpy(c->name, name_new, sizeof(c->name));
1953 c->name[sizeof(c->name) - 1] = 0;
1954 c->options = options; 2006 c->options = options;
1955 c->index = idx_new; 2007 c->index = idx_new;
1956 return i; 2008 return i;
@@ -1959,12 +2011,12 @@ int update_console_cmdline(char *name, int idx, char *name_new, int idx_new, cha
1959 return -1; 2011 return -1;
1960} 2012}
1961 2013
1962bool console_suspend_enabled = 1; 2014bool console_suspend_enabled = true;
1963EXPORT_SYMBOL(console_suspend_enabled); 2015EXPORT_SYMBOL(console_suspend_enabled);
1964 2016
1965static int __init console_suspend_disable(char *str) 2017static int __init console_suspend_disable(char *str)
1966{ 2018{
1967 console_suspend_enabled = 0; 2019 console_suspend_enabled = false;
1968 return 1; 2020 return 1;
1969} 2021}
1970__setup("no_console_suspend", console_suspend_disable); 2022__setup("no_console_suspend", console_suspend_disable);
@@ -2045,8 +2097,8 @@ EXPORT_SYMBOL(console_lock);
2045/** 2097/**
2046 * console_trylock - try to lock the console system for exclusive use. 2098 * console_trylock - try to lock the console system for exclusive use.
2047 * 2099 *
2048 * Tried to acquire a lock which guarantees that the caller has 2100 * Try to acquire a lock which guarantees that the caller has exclusive
2049 * exclusive access to the console system and the console_drivers list. 2101 * access to the console system and the console_drivers list.
2050 * 2102 *
2051 * returns 1 on success, and 0 on failure to acquire the lock. 2103 * returns 1 on success, and 0 on failure to acquire the lock.
2052 */ 2104 */
@@ -2570,7 +2622,7 @@ void wake_up_klogd(void)
2570 preempt_disable(); 2622 preempt_disable();
2571 if (waitqueue_active(&log_wait)) { 2623 if (waitqueue_active(&log_wait)) {
2572 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); 2624 this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
2573 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2625 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2574 } 2626 }
2575 preempt_enable(); 2627 preempt_enable();
2576} 2628}
@@ -2586,7 +2638,7 @@ int printk_deferred(const char *fmt, ...)
2586 va_end(args); 2638 va_end(args);
2587 2639
2588 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); 2640 __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
2589 irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); 2641 irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
2590 preempt_enable(); 2642 preempt_enable();
2591 2643
2592 return r; 2644 return r;
@@ -2618,14 +2670,13 @@ EXPORT_SYMBOL(__printk_ratelimit);
2618bool printk_timed_ratelimit(unsigned long *caller_jiffies, 2670bool printk_timed_ratelimit(unsigned long *caller_jiffies,
2619 unsigned int interval_msecs) 2671 unsigned int interval_msecs)
2620{ 2672{
2621 if (*caller_jiffies == 0 2673 unsigned long elapsed = jiffies - *caller_jiffies;
2622 || !time_in_range(jiffies, *caller_jiffies, 2674
2623 *caller_jiffies 2675 if (*caller_jiffies && elapsed <= msecs_to_jiffies(interval_msecs))
2624 + msecs_to_jiffies(interval_msecs))) { 2676 return false;
2625 *caller_jiffies = jiffies; 2677
2626 return true; 2678 *caller_jiffies = jiffies;
2627 } 2679 return true;
2628 return false;
2629} 2680}
2630EXPORT_SYMBOL(printk_timed_ratelimit); 2681EXPORT_SYMBOL(printk_timed_ratelimit);
2631 2682
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 948a7693748e..240fa9094f83 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -49,11 +49,19 @@
49#include <linux/trace_clock.h> 49#include <linux/trace_clock.h>
50#include <asm/byteorder.h> 50#include <asm/byteorder.h>
51#include <linux/torture.h> 51#include <linux/torture.h>
52#include <linux/vmalloc.h>
52 53
53MODULE_LICENSE("GPL"); 54MODULE_LICENSE("GPL");
54MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>"); 55MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@joshtriplett.org>");
55 56
56 57
58torture_param(int, cbflood_inter_holdoff, HZ,
59 "Holdoff between floods (jiffies)");
60torture_param(int, cbflood_intra_holdoff, 1,
61 "Holdoff between bursts (jiffies)");
62torture_param(int, cbflood_n_burst, 3, "# bursts in flood, zero to disable");
63torture_param(int, cbflood_n_per_burst, 20000,
64 "# callbacks per burst in flood");
57torture_param(int, fqs_duration, 0, 65torture_param(int, fqs_duration, 0,
58 "Duration of fqs bursts (us), 0 to disable"); 66 "Duration of fqs bursts (us), 0 to disable");
59torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)"); 67torture_param(int, fqs_holdoff, 0, "Holdoff time within fqs bursts (us)");
@@ -96,10 +104,12 @@ module_param(torture_type, charp, 0444);
96MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); 104MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)");
97 105
98static int nrealreaders; 106static int nrealreaders;
107static int ncbflooders;
99static struct task_struct *writer_task; 108static struct task_struct *writer_task;
100static struct task_struct **fakewriter_tasks; 109static struct task_struct **fakewriter_tasks;
101static struct task_struct **reader_tasks; 110static struct task_struct **reader_tasks;
102static struct task_struct *stats_task; 111static struct task_struct *stats_task;
112static struct task_struct **cbflood_task;
103static struct task_struct *fqs_task; 113static struct task_struct *fqs_task;
104static struct task_struct *boost_tasks[NR_CPUS]; 114static struct task_struct *boost_tasks[NR_CPUS];
105static struct task_struct *stall_task; 115static struct task_struct *stall_task;
@@ -138,6 +148,7 @@ static long n_rcu_torture_boosts;
138static long n_rcu_torture_timers; 148static long n_rcu_torture_timers;
139static long n_barrier_attempts; 149static long n_barrier_attempts;
140static long n_barrier_successes; 150static long n_barrier_successes;
151static atomic_long_t n_cbfloods;
141static struct list_head rcu_torture_removed; 152static struct list_head rcu_torture_removed;
142 153
143static int rcu_torture_writer_state; 154static int rcu_torture_writer_state;
@@ -157,9 +168,9 @@ static int rcu_torture_writer_state;
157#else 168#else
158#define RCUTORTURE_RUNNABLE_INIT 0 169#define RCUTORTURE_RUNNABLE_INIT 0
159#endif 170#endif
160int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 171static int torture_runnable = RCUTORTURE_RUNNABLE_INIT;
161module_param(rcutorture_runnable, int, 0444); 172module_param(torture_runnable, int, 0444);
162MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); 173MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
163 174
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 175#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 176#define rcu_can_boost() 1
@@ -182,7 +193,7 @@ static u64 notrace rcu_trace_clock_local(void)
182#endif /* #else #ifdef CONFIG_RCU_TRACE */ 193#endif /* #else #ifdef CONFIG_RCU_TRACE */
183 194
184static unsigned long boost_starttime; /* jiffies of next boost test start. */ 195static unsigned long boost_starttime; /* jiffies of next boost test start. */
185DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 196static DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
186 /* and boost task create/destroy. */ 197 /* and boost task create/destroy. */
187static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ 198static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */
188static bool barrier_phase; /* Test phase. */ 199static bool barrier_phase; /* Test phase. */
@@ -242,7 +253,7 @@ struct rcu_torture_ops {
242 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 253 void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
243 void (*cb_barrier)(void); 254 void (*cb_barrier)(void);
244 void (*fqs)(void); 255 void (*fqs)(void);
245 void (*stats)(char *page); 256 void (*stats)(void);
246 int irq_capable; 257 int irq_capable;
247 int can_boost; 258 int can_boost;
248 const char *name; 259 const char *name;
@@ -525,21 +536,21 @@ static void srcu_torture_barrier(void)
525 srcu_barrier(&srcu_ctl); 536 srcu_barrier(&srcu_ctl);
526} 537}
527 538
528static void srcu_torture_stats(char *page) 539static void srcu_torture_stats(void)
529{ 540{
530 int cpu; 541 int cpu;
531 int idx = srcu_ctl.completed & 0x1; 542 int idx = srcu_ctl.completed & 0x1;
532 543
533 page += sprintf(page, "%s%s per-CPU(idx=%d):", 544 pr_alert("%s%s per-CPU(idx=%d):",
534 torture_type, TORTURE_FLAG, idx); 545 torture_type, TORTURE_FLAG, idx);
535 for_each_possible_cpu(cpu) { 546 for_each_possible_cpu(cpu) {
536 long c0, c1; 547 long c0, c1;
537 548
538 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx]; 549 c0 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx];
539 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]; 550 c1 = (long)per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx];
540 page += sprintf(page, " %d(%ld,%ld)", cpu, c0, c1); 551 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
541 } 552 }
542 sprintf(page, "\n"); 553 pr_cont("\n");
543} 554}
544 555
545static void srcu_torture_synchronize_expedited(void) 556static void srcu_torture_synchronize_expedited(void)
@@ -601,6 +612,52 @@ static struct rcu_torture_ops sched_ops = {
601 .name = "sched" 612 .name = "sched"
602}; 613};
603 614
615#ifdef CONFIG_TASKS_RCU
616
617/*
618 * Definitions for RCU-tasks torture testing.
619 */
620
621static int tasks_torture_read_lock(void)
622{
623 return 0;
624}
625
626static void tasks_torture_read_unlock(int idx)
627{
628}
629
630static void rcu_tasks_torture_deferred_free(struct rcu_torture *p)
631{
632 call_rcu_tasks(&p->rtort_rcu, rcu_torture_cb);
633}
634
635static struct rcu_torture_ops tasks_ops = {
636 .ttype = RCU_TASKS_FLAVOR,
637 .init = rcu_sync_torture_init,
638 .readlock = tasks_torture_read_lock,
639 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
640 .readunlock = tasks_torture_read_unlock,
641 .completed = rcu_no_completed,
642 .deferred_free = rcu_tasks_torture_deferred_free,
643 .sync = synchronize_rcu_tasks,
644 .exp_sync = synchronize_rcu_tasks,
645 .call = call_rcu_tasks,
646 .cb_barrier = rcu_barrier_tasks,
647 .fqs = NULL,
648 .stats = NULL,
649 .irq_capable = 1,
650 .name = "tasks"
651};
652
653#define RCUTORTURE_TASKS_OPS &tasks_ops,
654
655#else /* #ifdef CONFIG_TASKS_RCU */
656
657#define RCUTORTURE_TASKS_OPS
658
659#endif /* #else #ifdef CONFIG_TASKS_RCU */
660
604/* 661/*
605 * RCU torture priority-boost testing. Runs one real-time thread per 662 * RCU torture priority-boost testing. Runs one real-time thread per
606 * CPU for moderate bursts, repeatedly registering RCU callbacks and 663 * CPU for moderate bursts, repeatedly registering RCU callbacks and
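
The new tasks_ops table follows the rcu_torture_ops convention: each RCU flavor is described by a struct of function pointers, optional hooks such as .stats and .fqs may be left NULL, and callers check for NULL before invoking them (as the cb_barrier hunk further down also starts doing). A compact userspace sketch of that ops-table pattern with invented hook names:

#include <stdio.h>

struct torture_ops {
	int	(*readlock)(void);
	void	(*readunlock)(int idx);
	void	(*stats)(void);		/* optional hook */
	const char *name;
};

static int dummy_read_lock(void)       { return 0; }
static void dummy_read_unlock(int idx) { (void)idx; }

static const struct torture_ops tasks_like_ops = {
	.readlock   = dummy_read_lock,
	.readunlock = dummy_read_unlock,
	.stats      = NULL,		/* this flavor has no stats hook */
	.name       = "tasks-like",
};

static void run_one(const struct torture_ops *ops)
{
	int idx = ops->readlock();

	printf("torturing %s\n", ops->name);
	if (ops->stats)			/* optional hooks are NULL-checked */
		ops->stats();
	ops->readunlock(idx);
}

int main(void)
{
	run_one(&tasks_like_ops);
	return 0;
}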
@@ -667,7 +724,7 @@ static int rcu_torture_boost(void *arg)
667 } 724 }
668 call_rcu_time = jiffies; 725 call_rcu_time = jiffies;
669 } 726 }
670 cond_resched(); 727 cond_resched_rcu_qs();
671 stutter_wait("rcu_torture_boost"); 728 stutter_wait("rcu_torture_boost");
672 if (torture_must_stop()) 729 if (torture_must_stop())
673 goto checkwait; 730 goto checkwait;
@@ -707,6 +764,58 @@ checkwait: stutter_wait("rcu_torture_boost");
707 return 0; 764 return 0;
708} 765}
709 766
767static void rcu_torture_cbflood_cb(struct rcu_head *rhp)
768{
769}
770
771/*
772 * RCU torture callback-flood kthread. Repeatedly induces bursts of calls
773 * to call_rcu() or analogous, increasing the probability of occurrence
774 * of callback-overflow corner cases.
775 */
776static int
777rcu_torture_cbflood(void *arg)
778{
779 int err = 1;
780 int i;
781 int j;
782 struct rcu_head *rhp;
783
784 if (cbflood_n_per_burst > 0 &&
785 cbflood_inter_holdoff > 0 &&
786 cbflood_intra_holdoff > 0 &&
787 cur_ops->call &&
788 cur_ops->cb_barrier) {
789 rhp = vmalloc(sizeof(*rhp) *
790 cbflood_n_burst * cbflood_n_per_burst);
791 err = !rhp;
792 }
793 if (err) {
794 VERBOSE_TOROUT_STRING("rcu_torture_cbflood disabled: Bad args or OOM");
795 while (!torture_must_stop())
796 schedule_timeout_interruptible(HZ);
797 return 0;
798 }
799 VERBOSE_TOROUT_STRING("rcu_torture_cbflood task started");
800 do {
801 schedule_timeout_interruptible(cbflood_inter_holdoff);
802 atomic_long_inc(&n_cbfloods);
803 WARN_ON(signal_pending(current));
804 for (i = 0; i < cbflood_n_burst; i++) {
805 for (j = 0; j < cbflood_n_per_burst; j++) {
806 cur_ops->call(&rhp[i * cbflood_n_per_burst + j],
807 rcu_torture_cbflood_cb);
808 }
809 schedule_timeout_interruptible(cbflood_intra_holdoff);
810 WARN_ON(signal_pending(current));
811 }
812 cur_ops->cb_barrier();
813 stutter_wait("rcu_torture_cbflood");
814 } while (!torture_must_stop());
815 torture_kthread_stopping("rcu_torture_cbflood");
816 return 0;
817}
818
710/* 819/*
711 * RCU torture force-quiescent-state kthread. Repeatedly induces 820 * RCU torture force-quiescent-state kthread. Repeatedly induces
712 * bursts of calls to force_quiescent_state(), increasing the probability 821 * bursts of calls to force_quiescent_state(), increasing the probability
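
rcu_torture_cbflood() preallocates one rcu_head per callback, posts them in cbflood_n_burst bursts of cbflood_n_per_burst with holdoffs in between, and waits for cur_ops->cb_barrier() before reusing the array. The sketch below reproduces only that pacing structure in userspace; post_cb() and the millisecond holdoff are stand-ins for cur_ops->call() and the jiffies-based holdoff parameters.

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N_BURST		3
#define N_PER_BURST	20000

struct cb { int posted; };

static void post_cb(struct cb *cbp)	/* stands in for cur_ops->call() */
{
	cbp->posted = 1;
}

static void holdoff_ms(long ms)
{
	struct timespec ts = { .tv_sec = ms / 1000,
			       .tv_nsec = (ms % 1000) * 1000000L };
	nanosleep(&ts, NULL);
}

int main(void)
{
	struct cb *cbs = calloc((size_t)N_BURST * N_PER_BURST, sizeof(*cbs));
	long total = 0;
	int i, j;

	if (!cbs)
		return 1;

	for (i = 0; i < N_BURST; i++) {
		for (j = 0; j < N_PER_BURST; j++)
			post_cb(&cbs[i * N_PER_BURST + j]);
		total += N_PER_BURST;
		holdoff_ms(1);		/* intra-burst holdoff analogue */
	}
	/* "barrier": nothing posted above is still outstanding at this point */
	printf("flooded %ld callbacks in %d bursts\n", total, N_BURST);
	free(cbs);
	return 0;
}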
@@ -1019,7 +1128,7 @@ rcu_torture_reader(void *arg)
1019 __this_cpu_inc(rcu_torture_batch[completed]); 1128 __this_cpu_inc(rcu_torture_batch[completed]);
1020 preempt_enable(); 1129 preempt_enable();
1021 cur_ops->readunlock(idx); 1130 cur_ops->readunlock(idx);
1022 cond_resched(); 1131 cond_resched_rcu_qs();
1023 stutter_wait("rcu_torture_reader"); 1132 stutter_wait("rcu_torture_reader");
1024 } while (!torture_must_stop()); 1133 } while (!torture_must_stop());
1025 if (irqreader && cur_ops->irq_capable) { 1134 if (irqreader && cur_ops->irq_capable) {
@@ -1031,10 +1140,15 @@ rcu_torture_reader(void *arg)
1031} 1140}
1032 1141
1033/* 1142/*
1034 * Create an RCU-torture statistics message in the specified buffer. 1143 * Print torture statistics. Caller must ensure that there is only
1144 * one call to this function at a given time!!! This is normally
1145 * accomplished by relying on the module system to only have one copy
1146 * of the module loaded, and then by giving the rcu_torture_stats
1147 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1148 * thread is not running).
1035 */ 1149 */
1036static void 1150static void
1037rcu_torture_printk(char *page) 1151rcu_torture_stats_print(void)
1038{ 1152{
1039 int cpu; 1153 int cpu;
1040 int i; 1154 int i;
@@ -1052,55 +1166,61 @@ rcu_torture_printk(char *page)
1052 if (pipesummary[i] != 0) 1166 if (pipesummary[i] != 0)
1053 break; 1167 break;
1054 } 1168 }
1055 page += sprintf(page, "%s%s ", torture_type, TORTURE_FLAG); 1169
1056 page += sprintf(page, 1170 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", 1171 pr_cont("rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ",
1058 rcu_torture_current, 1172 rcu_torture_current,
1059 rcu_torture_current_version, 1173 rcu_torture_current_version,
1060 list_empty(&rcu_torture_freelist), 1174 list_empty(&rcu_torture_freelist),
1061 atomic_read(&n_rcu_torture_alloc), 1175 atomic_read(&n_rcu_torture_alloc),
1062 atomic_read(&n_rcu_torture_alloc_fail), 1176 atomic_read(&n_rcu_torture_alloc_fail),
1063 atomic_read(&n_rcu_torture_free)); 1177 atomic_read(&n_rcu_torture_free));
1064 page += sprintf(page, "rtmbe: %d rtbke: %ld rtbre: %ld ", 1178 pr_cont("rtmbe: %d rtbke: %ld rtbre: %ld ",
1065 atomic_read(&n_rcu_torture_mberror), 1179 atomic_read(&n_rcu_torture_mberror),
1066 n_rcu_torture_boost_ktrerror, 1180 n_rcu_torture_boost_ktrerror,
1067 n_rcu_torture_boost_rterror); 1181 n_rcu_torture_boost_rterror);
1068 page += sprintf(page, "rtbf: %ld rtb: %ld nt: %ld ", 1182 pr_cont("rtbf: %ld rtb: %ld nt: %ld ",
1069 n_rcu_torture_boost_failure, 1183 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1184 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1185 n_rcu_torture_timers);
1072 page = torture_onoff_stats(page); 1186 torture_onoff_stats();
1073 page += sprintf(page, "barrier: %ld/%ld:%ld", 1187 pr_cont("barrier: %ld/%ld:%ld ",
1074 n_barrier_successes, 1188 n_barrier_successes,
1075 n_barrier_attempts, 1189 n_barrier_attempts,
1076 n_rcu_torture_barrier_error); 1190 n_rcu_torture_barrier_error);
1077 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1191 pr_cont("cbflood: %ld\n", atomic_long_read(&n_cbfloods));
1192
1193 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1078 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1194 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1079 n_rcu_torture_barrier_error != 0 || 1195 n_rcu_torture_barrier_error != 0 ||
1080 n_rcu_torture_boost_ktrerror != 0 || 1196 n_rcu_torture_boost_ktrerror != 0 ||
1081 n_rcu_torture_boost_rterror != 0 || 1197 n_rcu_torture_boost_rterror != 0 ||
1082 n_rcu_torture_boost_failure != 0 || 1198 n_rcu_torture_boost_failure != 0 ||
1083 i > 1) { 1199 i > 1) {
1084 page += sprintf(page, "!!! "); 1200 pr_cont("%s", "!!! ");
1085 atomic_inc(&n_rcu_torture_error); 1201 atomic_inc(&n_rcu_torture_error);
1086 WARN_ON_ONCE(1); 1202 WARN_ON_ONCE(1);
1087 } 1203 }
1088 page += sprintf(page, "Reader Pipe: "); 1204 pr_cont("Reader Pipe: ");
1089 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1205 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1090 page += sprintf(page, " %ld", pipesummary[i]); 1206 pr_cont(" %ld", pipesummary[i]);
1091 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1207 pr_cont("\n");
1092 page += sprintf(page, "Reader Batch: "); 1208
1209 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1210 pr_cont("Reader Batch: ");
1093 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 1211 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
1094 page += sprintf(page, " %ld", batchsummary[i]); 1212 pr_cont(" %ld", batchsummary[i]);
1095 page += sprintf(page, "\n%s%s ", torture_type, TORTURE_FLAG); 1213 pr_cont("\n");
1096 page += sprintf(page, "Free-Block Circulation: "); 1214
1215 pr_alert("%s%s ", torture_type, TORTURE_FLAG);
1216 pr_cont("Free-Block Circulation: ");
1097 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 1217 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
1098 page += sprintf(page, " %d", 1218 pr_cont(" %d", atomic_read(&rcu_torture_wcount[i]));
1099 atomic_read(&rcu_torture_wcount[i]));
1100 } 1219 }
1101 page += sprintf(page, "\n"); 1220 pr_cont("\n");
1221
1102 if (cur_ops->stats) 1222 if (cur_ops->stats)
1103 cur_ops->stats(page); 1223 cur_ops->stats();
1104 if (rtcv_snap == rcu_torture_current_version && 1224 if (rtcv_snap == rcu_torture_current_version &&
1105 rcu_torture_current != NULL) { 1225 rcu_torture_current != NULL) {
1106 int __maybe_unused flags; 1226 int __maybe_unused flags;
@@ -1109,10 +1229,9 @@ rcu_torture_printk(char *page)
1109 1229
1110 rcutorture_get_gp_data(cur_ops->ttype, 1230 rcutorture_get_gp_data(cur_ops->ttype,
1111 &flags, &gpnum, &completed); 1231 &flags, &gpnum, &completed);
1112 page += sprintf(page, 1232 pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
1113 "??? Writer stall state %d g%lu c%lu f%#x\n", 1233 rcu_torture_writer_state,
1114 rcu_torture_writer_state, 1234 gpnum, completed, flags);
1115 gpnum, completed, flags);
1116 show_rcu_gp_kthreads(); 1235 show_rcu_gp_kthreads();
1117 rcutorture_trace_dump(); 1236 rcutorture_trace_dump();
1118 } 1237 }
@@ -1120,30 +1239,6 @@ rcu_torture_printk(char *page)
1120} 1239}
1121 1240
1122/* 1241/*
1123 * Print torture statistics. Caller must ensure that there is only
1124 * one call to this function at a given time!!! This is normally
1125 * accomplished by relying on the module system to only have one copy
1126 * of the module loaded, and then by giving the rcu_torture_stats
1127 * kthread full control (or the init/cleanup functions when rcu_torture_stats
1128 * thread is not running).
1129 */
1130static void
1131rcu_torture_stats_print(void)
1132{
1133 int size = nr_cpu_ids * 200 + 8192;
1134 char *buf;
1135
1136 buf = kmalloc(size, GFP_KERNEL);
1137 if (!buf) {
1138 pr_err("rcu-torture: Out of memory, need: %d", size);
1139 return;
1140 }
1141 rcu_torture_printk(buf);
1142 pr_alert("%s", buf);
1143 kfree(buf);
1144}
1145
1146/*
1147 * Periodically prints torture statistics, if periodic statistics printing 1242 * Periodically prints torture statistics, if periodic statistics printing
1148 * was specified via the stat_interval module parameter. 1243 * was specified via the stat_interval module parameter.
1149 */ 1244 */
@@ -1295,7 +1390,8 @@ static int rcu_torture_barrier_cbs(void *arg)
1295 if (atomic_dec_and_test(&barrier_cbs_count)) 1390 if (atomic_dec_and_test(&barrier_cbs_count))
1296 wake_up(&barrier_wq); 1391 wake_up(&barrier_wq);
1297 } while (!torture_must_stop()); 1392 } while (!torture_must_stop());
1298 cur_ops->cb_barrier(); 1393 if (cur_ops->cb_barrier != NULL)
1394 cur_ops->cb_barrier();
1299 destroy_rcu_head_on_stack(&rcu); 1395 destroy_rcu_head_on_stack(&rcu);
1300 torture_kthread_stopping("rcu_torture_barrier_cbs"); 1396 torture_kthread_stopping("rcu_torture_barrier_cbs");
1301 return 0; 1397 return 0;
@@ -1418,7 +1514,7 @@ rcu_torture_cleanup(void)
1418 int i; 1514 int i;
1419 1515
1420 rcutorture_record_test_transition(); 1516 rcutorture_record_test_transition();
1421 if (torture_cleanup()) { 1517 if (torture_cleanup_begin()) {
1422 if (cur_ops->cb_barrier != NULL) 1518 if (cur_ops->cb_barrier != NULL)
1423 cur_ops->cb_barrier(); 1519 cur_ops->cb_barrier();
1424 return; 1520 return;
@@ -1447,6 +1543,8 @@ rcu_torture_cleanup(void)
1447 1543
1448 torture_stop_kthread(rcu_torture_stats, stats_task); 1544 torture_stop_kthread(rcu_torture_stats, stats_task);
1449 torture_stop_kthread(rcu_torture_fqs, fqs_task); 1545 torture_stop_kthread(rcu_torture_fqs, fqs_task);
1546 for (i = 0; i < ncbflooders; i++)
1547 torture_stop_kthread(rcu_torture_cbflood, cbflood_task[i]);
1450 if ((test_boost == 1 && cur_ops->can_boost) || 1548 if ((test_boost == 1 && cur_ops->can_boost) ||
1451 test_boost == 2) { 1549 test_boost == 2) {
1452 unregister_cpu_notifier(&rcutorture_cpu_nb); 1550 unregister_cpu_notifier(&rcutorture_cpu_nb);
@@ -1468,6 +1566,7 @@ rcu_torture_cleanup(void)
1468 "End of test: RCU_HOTPLUG"); 1566 "End of test: RCU_HOTPLUG");
1469 else 1567 else
1470 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); 1568 rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS");
1569 torture_cleanup_end();
1471} 1570}
1472 1571
1473#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD 1572#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
@@ -1534,9 +1633,10 @@ rcu_torture_init(void)
1534 int firsterr = 0; 1633 int firsterr = 0;
1535 static struct rcu_torture_ops *torture_ops[] = { 1634 static struct rcu_torture_ops *torture_ops[] = {
1536 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops, 1635 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &sched_ops,
1636 RCUTORTURE_TASKS_OPS
1537 }; 1637 };
1538 1638
1539 if (!torture_init_begin(torture_type, verbose, &rcutorture_runnable)) 1639 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
1540 return -EBUSY; 1640 return -EBUSY;
1541 1641
1542 /* Process args and tell the world that the torturer is on the job. */ 1642 /* Process args and tell the world that the torturer is on the job. */
@@ -1693,6 +1793,24 @@ rcu_torture_init(void)
1693 goto unwind; 1793 goto unwind;
1694 if (object_debug) 1794 if (object_debug)
1695 rcu_test_debug_objects(); 1795 rcu_test_debug_objects();
1796 if (cbflood_n_burst > 0) {
1797 /* Create the cbflood threads */
1798 ncbflooders = (num_online_cpus() + 3) / 4;
1799 cbflood_task = kcalloc(ncbflooders, sizeof(*cbflood_task),
1800 GFP_KERNEL);
1801 if (!cbflood_task) {
1802 VERBOSE_TOROUT_ERRSTRING("out of memory");
1803 firsterr = -ENOMEM;
1804 goto unwind;
1805 }
1806 for (i = 0; i < ncbflooders; i++) {
1807 firsterr = torture_create_kthread(rcu_torture_cbflood,
1808 NULL,
1809 cbflood_task[i]);
1810 if (firsterr)
1811 goto unwind;
1812 }
1813 }
1696 rcutorture_record_test_transition(); 1814 rcutorture_record_test_transition();
1697 torture_init_end(); 1815 torture_init_end();
1698 return 0; 1816 return 0;
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..c0623fc47125 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -51,7 +51,7 @@ static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
51 51
52#include "tiny_plugin.h" 52#include "tiny_plugin.h"
53 53
54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */ 54/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */
55static void rcu_idle_enter_common(long long newval) 55static void rcu_idle_enter_common(long long newval)
56{ 56{
57 if (newval) { 57 if (newval) {
@@ -62,7 +62,7 @@ static void rcu_idle_enter_common(long long newval)
62 } 62 }
63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"), 63 RCU_TRACE(trace_rcu_dyntick(TPS("Start"),
64 rcu_dynticks_nesting, newval)); 64 rcu_dynticks_nesting, newval));
65 if (!is_idle_task(current)) { 65 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 66 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
67 67
68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), 68 RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"),
@@ -72,7 +72,7 @@ static void rcu_idle_enter_common(long long newval)
72 current->pid, current->comm, 72 current->pid, current->comm,
73 idle->pid, idle->comm); /* must be idle task! */ 73 idle->pid, idle->comm); /* must be idle task! */
74 } 74 }
75 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 75 rcu_sched_qs(); /* implies rcu_bh_inc() */
76 barrier(); 76 barrier();
77 rcu_dynticks_nesting = newval; 77 rcu_dynticks_nesting = newval;
78} 78}
@@ -114,7 +114,7 @@ void rcu_irq_exit(void)
114} 114}
115EXPORT_SYMBOL_GPL(rcu_irq_exit); 115EXPORT_SYMBOL_GPL(rcu_irq_exit);
116 116
117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */ 117/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */
118static void rcu_idle_exit_common(long long oldval) 118static void rcu_idle_exit_common(long long oldval)
119{ 119{
120 if (oldval) { 120 if (oldval) {
@@ -123,7 +123,7 @@ static void rcu_idle_exit_common(long long oldval)
123 return; 123 return;
124 } 124 }
125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); 125 RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting));
126 if (!is_idle_task(current)) { 126 if (IS_ENABLED(CONFIG_RCU_TRACE) && !is_idle_task(current)) {
127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); 127 struct task_struct *idle __maybe_unused = idle_task(smp_processor_id());
128 128
129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), 129 RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"),
@@ -217,7 +217,7 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
217 * are at it, given that any rcu quiescent state is also an rcu_bh 217 * are at it, given that any rcu quiescent state is also an rcu_bh
218 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 218 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
219 */ 219 */
220void rcu_sched_qs(int cpu) 220void rcu_sched_qs(void)
221{ 221{
222 unsigned long flags; 222 unsigned long flags;
223 223
@@ -231,7 +231,7 @@ void rcu_sched_qs(int cpu)
231/* 231/*
232 * Record an rcu_bh quiescent state. 232 * Record an rcu_bh quiescent state.
233 */ 233 */
234void rcu_bh_qs(int cpu) 234void rcu_bh_qs(void)
235{ 235{
236 unsigned long flags; 236 unsigned long flags;
237 237
@@ -251,9 +251,11 @@ void rcu_check_callbacks(int cpu, int user)
251{ 251{
252 RCU_TRACE(check_cpu_stalls()); 252 RCU_TRACE(check_cpu_stalls());
253 if (user || rcu_is_cpu_rrupt_from_idle()) 253 if (user || rcu_is_cpu_rrupt_from_idle())
254 rcu_sched_qs(cpu); 254 rcu_sched_qs();
255 else if (!in_softirq()) 255 else if (!in_softirq())
256 rcu_bh_qs(cpu); 256 rcu_bh_qs();
257 if (user)
258 rcu_note_voluntary_context_switch(current);
257} 259}
258 260
259/* 261/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..9815447d22e0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -79,9 +79,18 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
79 * the tracing userspace tools to be able to decipher the string 79 * the tracing userspace tools to be able to decipher the string
80 * address to the matching string. 80 * address to the matching string.
81 */ 81 */
82#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 82#ifdef CONFIG_TRACING
83# define DEFINE_RCU_TPS(sname) \
83static char sname##_varname[] = #sname; \ 84static char sname##_varname[] = #sname; \
84static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ 85static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname;
86# define RCU_STATE_NAME(sname) sname##_varname
87#else
88# define DEFINE_RCU_TPS(sname)
89# define RCU_STATE_NAME(sname) __stringify(sname)
90#endif
91
92#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
93DEFINE_RCU_TPS(sname) \
85struct rcu_state sname##_state = { \ 94struct rcu_state sname##_state = { \
86 .level = { &sname##_state.node[0] }, \ 95 .level = { &sname##_state.node[0] }, \
87 .call = cr, \ 96 .call = cr, \
@@ -93,7 +102,7 @@ struct rcu_state sname##_state = { \
93 .orphan_donetail = &sname##_state.orphan_donelist, \ 102 .orphan_donetail = &sname##_state.orphan_donelist, \
94 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
95 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ 104 .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
96 .name = sname##_varname, \ 105 .name = RCU_STATE_NAME(sname), \
97 .abbr = sabbr, \ 106 .abbr = sabbr, \
98}; \ 107}; \
99DEFINE_PER_CPU(struct rcu_data, sname##_data) 108DEFINE_PER_CPU(struct rcu_data, sname##_data)
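
The RCU_STATE_INITIALIZER() rework emits the sname##_varname string variable only when tracing is built in (so the __tracepoint_string machinery has a real address to resolve, per the comment in the surrounding context) and otherwise falls back to a plain stringified name, dropping the variable entirely. A self-contained sketch of that conditional-macro technique; TRACING_ENABLED, DEFINE_STATE_TPS() and STATE_NAME() are invented names mirroring DEFINE_RCU_TPS() and RCU_STATE_NAME().

#include <stdio.h>

#define TRACING_ENABLED 1	/* flip to 0 to drop the name variable */

#define __stringify_1(x)	#x
#define __stringify(x)		__stringify_1(x)

#if TRACING_ENABLED
# define DEFINE_STATE_TPS(sname) \
	static char sname##_varname[] = #sname;
# define STATE_NAME(sname)	sname##_varname
#else
# define DEFINE_STATE_TPS(sname)
# define STATE_NAME(sname)	__stringify(sname)
#endif

struct state {
	const char *name;
};

/* One macro both defines the optional name variable and wires it in. */
#define STATE_INITIALIZER(sname)		\
	DEFINE_STATE_TPS(sname)			\
	static struct state sname##_state = {	\
		.name = STATE_NAME(sname),	\
	}

STATE_INITIALIZER(rcu_sched);

int main(void)
{
	printf("state name: %s (at %p)\n", rcu_sched_state.name,
	       (void *)rcu_sched_state.name);
	return 0;
}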
@@ -188,22 +197,24 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
188 * one since the start of the grace period, this just sets a flag. 197 * one since the start of the grace period, this just sets a flag.
189 * The caller must have disabled preemption. 198 * The caller must have disabled preemption.
190 */ 199 */
191void rcu_sched_qs(int cpu) 200void rcu_sched_qs(void)
192{ 201{
193 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 202 if (!__this_cpu_read(rcu_sched_data.passed_quiesce)) {
194 203 trace_rcu_grace_period(TPS("rcu_sched"),
195 if (rdp->passed_quiesce == 0) 204 __this_cpu_read(rcu_sched_data.gpnum),
196 trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); 205 TPS("cpuqs"));
197 rdp->passed_quiesce = 1; 206 __this_cpu_write(rcu_sched_data.passed_quiesce, 1);
207 }
198} 208}
199 209
200void rcu_bh_qs(int cpu) 210void rcu_bh_qs(void)
201{ 211{
202 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 212 if (!__this_cpu_read(rcu_bh_data.passed_quiesce)) {
203 213 trace_rcu_grace_period(TPS("rcu_bh"),
204 if (rdp->passed_quiesce == 0) 214 __this_cpu_read(rcu_bh_data.gpnum),
205 trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); 215 TPS("cpuqs"));
206 rdp->passed_quiesce = 1; 216 __this_cpu_write(rcu_bh_data.passed_quiesce, 1);
217 }
207} 218}
208 219
209static DEFINE_PER_CPU(int, rcu_sched_qs_mask); 220static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
@@ -278,7 +289,7 @@ static void rcu_momentary_dyntick_idle(void)
278void rcu_note_context_switch(int cpu) 289void rcu_note_context_switch(int cpu)
279{ 290{
280 trace_rcu_utilization(TPS("Start context switch")); 291 trace_rcu_utilization(TPS("Start context switch"));
281 rcu_sched_qs(cpu); 292 rcu_sched_qs();
282 rcu_preempt_note_context_switch(cpu); 293 rcu_preempt_note_context_switch(cpu);
283 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 294 if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
284 rcu_momentary_dyntick_idle(); 295 rcu_momentary_dyntick_idle();
@@ -526,6 +537,7 @@ static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval,
526 atomic_inc(&rdtp->dynticks); 537 atomic_inc(&rdtp->dynticks);
527 smp_mb__after_atomic(); /* Force ordering with next sojourn. */ 538 smp_mb__after_atomic(); /* Force ordering with next sojourn. */
528 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 539 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
540 rcu_dynticks_task_enter();
529 541
530 /* 542 /*
531 * It is illegal to enter an extended quiescent state while 543 * It is illegal to enter an extended quiescent state while
@@ -642,6 +654,7 @@ void rcu_irq_exit(void)
642static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, 654static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval,
643 int user) 655 int user)
644{ 656{
657 rcu_dynticks_task_exit();
645 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */ 658 smp_mb__before_atomic(); /* Force ordering w/previous sojourn. */
646 atomic_inc(&rdtp->dynticks); 659 atomic_inc(&rdtp->dynticks);
647 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 660 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
@@ -819,7 +832,7 @@ bool notrace __rcu_is_watching(void)
819 */ 832 */
820bool notrace rcu_is_watching(void) 833bool notrace rcu_is_watching(void)
821{ 834{
822 int ret; 835 bool ret;
823 836
824 preempt_disable(); 837 preempt_disable();
825 ret = __rcu_is_watching(); 838 ret = __rcu_is_watching();
@@ -1647,7 +1660,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
1647 rnp->level, rnp->grplo, 1660 rnp->level, rnp->grplo,
1648 rnp->grphi, rnp->qsmask); 1661 rnp->grphi, rnp->qsmask);
1649 raw_spin_unlock_irq(&rnp->lock); 1662 raw_spin_unlock_irq(&rnp->lock);
1650 cond_resched(); 1663 cond_resched_rcu_qs();
1651 } 1664 }
1652 1665
1653 mutex_unlock(&rsp->onoff_mutex); 1666 mutex_unlock(&rsp->onoff_mutex);
@@ -1668,7 +1681,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1668 if (fqs_state == RCU_SAVE_DYNTICK) { 1681 if (fqs_state == RCU_SAVE_DYNTICK) {
1669 /* Collect dyntick-idle snapshots. */ 1682 /* Collect dyntick-idle snapshots. */
1670 if (is_sysidle_rcu_state(rsp)) { 1683 if (is_sysidle_rcu_state(rsp)) {
1671 isidle = 1; 1684 isidle = true;
1672 maxj = jiffies - ULONG_MAX / 4; 1685 maxj = jiffies - ULONG_MAX / 4;
1673 } 1686 }
1674 force_qs_rnp(rsp, dyntick_save_progress_counter, 1687 force_qs_rnp(rsp, dyntick_save_progress_counter,
@@ -1677,14 +1690,15 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
1677 fqs_state = RCU_FORCE_QS; 1690 fqs_state = RCU_FORCE_QS;
1678 } else { 1691 } else {
1679 /* Handle dyntick-idle and offline CPUs. */ 1692 /* Handle dyntick-idle and offline CPUs. */
1680 isidle = 0; 1693 isidle = false;
1681 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 1694 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
1682 } 1695 }
1683 /* Clear flag to prevent immediate re-entry. */ 1696 /* Clear flag to prevent immediate re-entry. */
1684 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 1697 if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
1685 raw_spin_lock_irq(&rnp->lock); 1698 raw_spin_lock_irq(&rnp->lock);
1686 smp_mb__after_unlock_lock(); 1699 smp_mb__after_unlock_lock();
1687 ACCESS_ONCE(rsp->gp_flags) &= ~RCU_GP_FLAG_FQS; 1700 ACCESS_ONCE(rsp->gp_flags) =
1701 ACCESS_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS;
1688 raw_spin_unlock_irq(&rnp->lock); 1702 raw_spin_unlock_irq(&rnp->lock);
1689 } 1703 }
1690 return fqs_state; 1704 return fqs_state;
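
The gp_flags update is rewritten from a compound ACCESS_ONCE(x) &= ~y into an explicit marked load followed by a marked store, avoiding a read-modify-write through a volatile-qualified lvalue. A small userspace illustration of the two forms; ACCESS_ONCE is re-defined locally here (using the GCC typeof extension) purely for demonstration.

/* Why the compound volatile RMW is split into a load and a store. */
#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))

#define GP_FLAG_FQS 0x2

static unsigned long gp_flags = GP_FLAG_FQS | 0x1;

int main(void)
{
        /* Discouraged: a compound RMW on a volatile lvalue. */
        /* ACCESS_ONCE(gp_flags) &= ~GP_FLAG_FQS; */

        /* Preferred: one marked load, one marked store. */
        ACCESS_ONCE(gp_flags) = ACCESS_ONCE(gp_flags) & ~GP_FLAG_FQS;

        printf("gp_flags = %#lx\n", gp_flags);
        return 0;
}

Only the second form presents the read and the write to the compiler as two individually marked accesses, which is what the patch standardizes on.
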
@@ -1736,7 +1750,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
1736 /* smp_mb() provided by prior unlock-lock pair. */ 1750 /* smp_mb() provided by prior unlock-lock pair. */
1737 nocb += rcu_future_gp_cleanup(rsp, rnp); 1751 nocb += rcu_future_gp_cleanup(rsp, rnp);
1738 raw_spin_unlock_irq(&rnp->lock); 1752 raw_spin_unlock_irq(&rnp->lock);
1739 cond_resched(); 1753 cond_resched_rcu_qs();
1740 } 1754 }
1741 rnp = rcu_get_root(rsp); 1755 rnp = rcu_get_root(rsp);
1742 raw_spin_lock_irq(&rnp->lock); 1756 raw_spin_lock_irq(&rnp->lock);
@@ -1785,8 +1799,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
1785 /* Locking provides needed memory barrier. */ 1799 /* Locking provides needed memory barrier. */
1786 if (rcu_gp_init(rsp)) 1800 if (rcu_gp_init(rsp))
1787 break; 1801 break;
1788 cond_resched(); 1802 cond_resched_rcu_qs();
1789 flush_signals(current); 1803 WARN_ON(signal_pending(current));
1790 trace_rcu_grace_period(rsp->name, 1804 trace_rcu_grace_period(rsp->name,
1791 ACCESS_ONCE(rsp->gpnum), 1805 ACCESS_ONCE(rsp->gpnum),
1792 TPS("reqwaitsig")); 1806 TPS("reqwaitsig"));
@@ -1828,11 +1842,11 @@ static int __noreturn rcu_gp_kthread(void *arg)
1828 trace_rcu_grace_period(rsp->name, 1842 trace_rcu_grace_period(rsp->name,
1829 ACCESS_ONCE(rsp->gpnum), 1843 ACCESS_ONCE(rsp->gpnum),
1830 TPS("fqsend")); 1844 TPS("fqsend"));
1831 cond_resched(); 1845 cond_resched_rcu_qs();
1832 } else { 1846 } else {
1833 /* Deal with stray signal. */ 1847 /* Deal with stray signal. */
1834 cond_resched(); 1848 cond_resched_rcu_qs();
1835 flush_signals(current); 1849 WARN_ON(signal_pending(current));
1836 trace_rcu_grace_period(rsp->name, 1850 trace_rcu_grace_period(rsp->name,
1837 ACCESS_ONCE(rsp->gpnum), 1851 ACCESS_ONCE(rsp->gpnum),
1838 TPS("fqswaitsig")); 1852 TPS("fqswaitsig"));
@@ -1928,7 +1942,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
1928{ 1942{
1929 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 1943 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
1930 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); 1944 raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
1931 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 1945 rcu_gp_kthread_wake(rsp);
1932} 1946}
1933 1947
1934/* 1948/*
@@ -2210,8 +2224,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
2210 /* Adjust any no-longer-needed kthreads. */ 2224 /* Adjust any no-longer-needed kthreads. */
2211 rcu_boost_kthread_setaffinity(rnp, -1); 2225 rcu_boost_kthread_setaffinity(rnp, -1);
2212 2226
2213 /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */
2214
2215 /* Exclude any attempts to start a new grace period. */ 2227 /* Exclude any attempts to start a new grace period. */
2216 mutex_lock(&rsp->onoff_mutex); 2228 mutex_lock(&rsp->onoff_mutex);
2217 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2229 raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
@@ -2393,8 +2405,8 @@ void rcu_check_callbacks(int cpu, int user)
2393 * at least not while the corresponding CPU is online. 2405 * at least not while the corresponding CPU is online.
2394 */ 2406 */
2395 2407
2396 rcu_sched_qs(cpu); 2408 rcu_sched_qs();
2397 rcu_bh_qs(cpu); 2409 rcu_bh_qs();
2398 2410
2399 } else if (!in_softirq()) { 2411 } else if (!in_softirq()) {
2400 2412
@@ -2405,11 +2417,13 @@ void rcu_check_callbacks(int cpu, int user)
2405 * critical section, so note it. 2417 * critical section, so note it.
2406 */ 2418 */
2407 2419
2408 rcu_bh_qs(cpu); 2420 rcu_bh_qs();
2409 } 2421 }
2410 rcu_preempt_check_callbacks(cpu); 2422 rcu_preempt_check_callbacks(cpu);
2411 if (rcu_pending(cpu)) 2423 if (rcu_pending(cpu))
2412 invoke_rcu_core(); 2424 invoke_rcu_core();
2425 if (user)
2426 rcu_note_voluntary_context_switch(current);
2413 trace_rcu_utilization(TPS("End scheduler-tick")); 2427 trace_rcu_utilization(TPS("End scheduler-tick"));
2414} 2428}
2415 2429
@@ -2432,7 +2446,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2432 struct rcu_node *rnp; 2446 struct rcu_node *rnp;
2433 2447
2434 rcu_for_each_leaf_node(rsp, rnp) { 2448 rcu_for_each_leaf_node(rsp, rnp) {
2435 cond_resched(); 2449 cond_resched_rcu_qs();
2436 mask = 0; 2450 mask = 0;
2437 raw_spin_lock_irqsave(&rnp->lock, flags); 2451 raw_spin_lock_irqsave(&rnp->lock, flags);
2438 smp_mb__after_unlock_lock(); 2452 smp_mb__after_unlock_lock();
@@ -2449,7 +2463,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
2449 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { 2463 for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
2450 if ((rnp->qsmask & bit) != 0) { 2464 if ((rnp->qsmask & bit) != 0) {
2451 if ((rnp->qsmaskinit & bit) != 0) 2465 if ((rnp->qsmaskinit & bit) != 0)
2452 *isidle = 0; 2466 *isidle = false;
2453 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 2467 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
2454 mask |= bit; 2468 mask |= bit;
2455 } 2469 }
@@ -2505,9 +2519,10 @@ static void force_quiescent_state(struct rcu_state *rsp)
2505 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2519 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2506 return; /* Someone beat us to it. */ 2520 return; /* Someone beat us to it. */
2507 } 2521 }
2508 ACCESS_ONCE(rsp->gp_flags) |= RCU_GP_FLAG_FQS; 2522 ACCESS_ONCE(rsp->gp_flags) =
2523 ACCESS_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS;
2509 raw_spin_unlock_irqrestore(&rnp_old->lock, flags); 2524 raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
2510 wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ 2525 rcu_gp_kthread_wake(rsp);
2511} 2526}
2512 2527
2513/* 2528/*
@@ -2925,11 +2940,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
2925 * restructure your code to batch your updates, and then use a single 2940 * restructure your code to batch your updates, and then use a single
2926 * synchronize_sched() instead. 2941 * synchronize_sched() instead.
2927 * 2942 *
2928 * Note that it is illegal to call this function while holding any lock
2929 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
2930 * to call this function from a CPU-hotplug notifier. Failing to observe
2931 * these restriction will result in deadlock.
2932 *
2933 * This implementation can be thought of as an application of ticket 2943 * This implementation can be thought of as an application of ticket
2934 * locking to RCU, with sync_sched_expedited_started and 2944 * locking to RCU, with sync_sched_expedited_started and
2935 * sync_sched_expedited_done taking on the roles of the halves 2945 * sync_sched_expedited_done taking on the roles of the halves
@@ -2979,7 +2989,12 @@ void synchronize_sched_expedited(void)
2979 */ 2989 */
2980 snap = atomic_long_inc_return(&rsp->expedited_start); 2990 snap = atomic_long_inc_return(&rsp->expedited_start);
2981 firstsnap = snap; 2991 firstsnap = snap;
2982 get_online_cpus(); 2992 if (!try_get_online_cpus()) {
2993 /* CPU hotplug operation in flight, fall back to normal GP. */
2994 wait_rcu_gp(call_rcu_sched);
2995 atomic_long_inc(&rsp->expedited_normal);
2996 return;
2997 }
2983 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); 2998 WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id()));
2984 2999
2985 /* 3000 /*
@@ -3026,7 +3041,12 @@ void synchronize_sched_expedited(void)
3026 * and they started after our first try, so their grace 3041 * and they started after our first try, so their grace
3027 * period works for us. 3042 * period works for us.
3028 */ 3043 */
3029 get_online_cpus(); 3044 if (!try_get_online_cpus()) {
3045 /* CPU hotplug operation in flight, use normal GP. */
3046 wait_rcu_gp(call_rcu_sched);
3047 atomic_long_inc(&rsp->expedited_normal);
3048 return;
3049 }
3030 snap = atomic_long_read(&rsp->expedited_start); 3050 snap = atomic_long_read(&rsp->expedited_start);
3031 smp_mb(); /* ensure read is before try_stop_cpus(). */ 3051 smp_mb(); /* ensure read is before try_stop_cpus(). */
3032 } 3052 }
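
Both expedited paths now use try_get_online_cpus() and fall back to an ordinary grace period when a CPU-hotplug operation holds the lock, which is why the deadlock warning removed from the header comment above is no longer needed. A userspace sketch of the trylock-with-fallback pattern, using a pthread mutex as a stand-in for the hotplug lock:

/* Trylock the "hotplug" lock; if it is busy, take the slow path. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;

static bool try_get_online_cpus_sketch(void)
{
        return pthread_mutex_trylock(&hotplug_lock) == 0;
}

static void put_online_cpus_sketch(void)
{
        pthread_mutex_unlock(&hotplug_lock);
}

static void wait_normal_gp_sketch(void)
{
        printf("fall back to a normal grace period\n");
}

static void synchronize_expedited_sketch(void)
{
        if (!try_get_online_cpus_sketch()) {
                /* Hotplug operation in flight: do not block on it. */
                wait_normal_gp_sketch();
                return;
        }
        printf("run the expedited machinery\n");
        put_online_cpus_sketch();
}

int main(void)
{
        synchronize_expedited_sketch();
        return 0;
}
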
@@ -3279,11 +3299,16 @@ static void _rcu_barrier(struct rcu_state *rsp)
3279 continue; 3299 continue;
3280 rdp = per_cpu_ptr(rsp->rda, cpu); 3300 rdp = per_cpu_ptr(rsp->rda, cpu);
3281 if (rcu_is_nocb_cpu(cpu)) { 3301 if (rcu_is_nocb_cpu(cpu)) {
3282 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3302 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3283 rsp->n_barrier_done); 3303 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
3284 atomic_inc(&rsp->barrier_cpu_count); 3304 rsp->n_barrier_done);
3285 __call_rcu(&rdp->barrier_head, rcu_barrier_callback, 3305 } else {
3286 rsp, cpu, 0); 3306 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
3307 rsp->n_barrier_done);
3308 atomic_inc(&rsp->barrier_cpu_count);
3309 __call_rcu(&rdp->barrier_head,
3310 rcu_barrier_callback, rsp, cpu, 0);
3311 }
3287 } else if (ACCESS_ONCE(rdp->qlen)) { 3312 } else if (ACCESS_ONCE(rdp->qlen)) {
3288 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3313 _rcu_barrier_trace(rsp, "OnlineQ", cpu,
3289 rsp->n_barrier_done); 3314 rsp->n_barrier_done);
@@ -3442,6 +3467,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
3442 case CPU_UP_PREPARE_FROZEN: 3467 case CPU_UP_PREPARE_FROZEN:
3443 rcu_prepare_cpu(cpu); 3468 rcu_prepare_cpu(cpu);
3444 rcu_prepare_kthreads(cpu); 3469 rcu_prepare_kthreads(cpu);
3470 rcu_spawn_all_nocb_kthreads(cpu);
3445 break; 3471 break;
3446 case CPU_ONLINE: 3472 case CPU_ONLINE:
3447 case CPU_DOWN_FAILED: 3473 case CPU_DOWN_FAILED:
@@ -3489,7 +3515,7 @@ static int rcu_pm_notify(struct notifier_block *self,
3489} 3515}
3490 3516
3491/* 3517/*
3492 * Spawn the kthread that handles this RCU flavor's grace periods. 3518 * Spawn the kthreads that handle each RCU flavor's grace periods.
3493 */ 3519 */
3494static int __init rcu_spawn_gp_kthread(void) 3520static int __init rcu_spawn_gp_kthread(void)
3495{ 3521{
@@ -3498,6 +3524,7 @@ static int __init rcu_spawn_gp_kthread(void)
3498 struct rcu_state *rsp; 3524 struct rcu_state *rsp;
3499 struct task_struct *t; 3525 struct task_struct *t;
3500 3526
3527 rcu_scheduler_fully_active = 1;
3501 for_each_rcu_flavor(rsp) { 3528 for_each_rcu_flavor(rsp) {
3502 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); 3529 t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name);
3503 BUG_ON(IS_ERR(t)); 3530 BUG_ON(IS_ERR(t));
@@ -3505,8 +3532,9 @@ static int __init rcu_spawn_gp_kthread(void)
3505 raw_spin_lock_irqsave(&rnp->lock, flags); 3532 raw_spin_lock_irqsave(&rnp->lock, flags);
3506 rsp->gp_kthread = t; 3533 rsp->gp_kthread = t;
3507 raw_spin_unlock_irqrestore(&rnp->lock, flags); 3534 raw_spin_unlock_irqrestore(&rnp->lock, flags);
3508 rcu_spawn_nocb_kthreads(rsp);
3509 } 3535 }
3536 rcu_spawn_nocb_kthreads();
3537 rcu_spawn_boost_kthreads();
3510 return 0; 3538 return 0;
3511} 3539}
3512early_initcall(rcu_spawn_gp_kthread); 3540early_initcall(rcu_spawn_gp_kthread);
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 71e64c718f75..bbdc45d8d74f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -350,7 +350,7 @@ struct rcu_data {
350 int nocb_p_count_lazy; /* (approximate). */ 350 int nocb_p_count_lazy; /* (approximate). */
351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ 351 wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
352 struct task_struct *nocb_kthread; 352 struct task_struct *nocb_kthread;
353 bool nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 353 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
354 354
355 /* The following fields are used by the leader, hence own cacheline. */ 355 /* The following fields are used by the leader, hence own cacheline. */
356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 356 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -358,7 +358,7 @@ struct rcu_data {
358 struct rcu_head **nocb_gp_tail; 358 struct rcu_head **nocb_gp_tail;
359 long nocb_gp_count; 359 long nocb_gp_count;
360 long nocb_gp_count_lazy; 360 long nocb_gp_count_lazy;
361 bool nocb_leader_wake; /* Is the nocb leader thread awake? */ 361 bool nocb_leader_sleep; /* Is the nocb leader thread asleep? */
362 struct rcu_data *nocb_next_follower; 362 struct rcu_data *nocb_next_follower;
363 /* Next follower in wakeup chain. */ 363 /* Next follower in wakeup chain. */
364 364
@@ -383,6 +383,11 @@ struct rcu_data {
383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 383#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 384#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
385 385
386/* Values for nocb_defer_wakeup field in struct rcu_data. */
387#define RCU_NOGP_WAKE_NOT 0
388#define RCU_NOGP_WAKE 1
389#define RCU_NOGP_WAKE_FORCE 2
390
386#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) 391#define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500))
387 /* For jiffies_till_first_fqs and */ 392 /* For jiffies_till_first_fqs and */
388 /* and jiffies_till_next_fqs. */ 393 /* and jiffies_till_next_fqs. */
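
nocb_defer_wakeup grows from a bool into a three-level state so a deferred wakeup can record whether it should be forced. A sketch of how such a tri-state is consumed, mirroring the do_nocb_deferred_wakeup() change later in this diff; the *_sketch names and the plain static variable are illustrative stand-ins for the per-CPU rcu_data field.

/* Consume a three-level deferred-wakeup request. */
#include <stdio.h>
#include <stdbool.h>

#define RCU_NOGP_WAKE_NOT       0       /* No deferred wakeup pending. */
#define RCU_NOGP_WAKE           1       /* Ordinary deferred wakeup. */
#define RCU_NOGP_WAKE_FORCE     2       /* Deferred wakeup, forced. */

static int nocb_defer_wakeup = RCU_NOGP_WAKE_NOT;

static void wake_leader_sketch(bool force)
{
        printf("wake nocb leader%s\n", force ? " (forced)" : "");
}

static void do_deferred_wakeup_sketch(void)
{
        int ndw = nocb_defer_wakeup;

        if (ndw == RCU_NOGP_WAKE_NOT)
                return;
        nocb_defer_wakeup = RCU_NOGP_WAKE_NOT;
        wake_leader_sketch(ndw == RCU_NOGP_WAKE_FORCE);
}

int main(void)
{
        nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; /* e.g. many callbacks queued */
        do_deferred_wakeup_sketch();
        return 0;
}
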
@@ -572,6 +577,7 @@ static void rcu_preempt_do_callbacks(void);
572static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, 577static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
573 struct rcu_node *rnp); 578 struct rcu_node *rnp);
574#endif /* #ifdef CONFIG_RCU_BOOST */ 579#endif /* #ifdef CONFIG_RCU_BOOST */
580static void __init rcu_spawn_boost_kthreads(void);
575static void rcu_prepare_kthreads(int cpu); 581static void rcu_prepare_kthreads(int cpu);
576static void rcu_cleanup_after_idle(int cpu); 582static void rcu_cleanup_after_idle(int cpu);
577static void rcu_prepare_for_idle(int cpu); 583static void rcu_prepare_for_idle(int cpu);
@@ -581,6 +587,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
581static void print_cpu_stall_info_end(void); 587static void print_cpu_stall_info_end(void);
582static void zero_cpu_stall_ticks(struct rcu_data *rdp); 588static void zero_cpu_stall_ticks(struct rcu_data *rdp);
583static void increment_cpu_stall_ticks(void); 589static void increment_cpu_stall_ticks(void);
590static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
584static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); 591static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
585static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); 592static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
586static void rcu_init_one_nocb(struct rcu_node *rnp); 593static void rcu_init_one_nocb(struct rcu_node *rnp);
@@ -589,10 +596,14 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
589static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 596static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
590 struct rcu_data *rdp, 597 struct rcu_data *rdp,
591 unsigned long flags); 598 unsigned long flags);
592static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 599static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
593static void do_nocb_deferred_wakeup(struct rcu_data *rdp); 600static void do_nocb_deferred_wakeup(struct rcu_data *rdp);
594static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); 601static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
595static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); 602static void rcu_spawn_all_nocb_kthreads(int cpu);
603static void __init rcu_spawn_nocb_kthreads(void);
604#ifdef CONFIG_RCU_NOCB_CPU
605static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp);
606#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
596static void __maybe_unused rcu_kick_nohz_cpu(int cpu); 607static void __maybe_unused rcu_kick_nohz_cpu(int cpu);
597static bool init_nocb_callback_list(struct rcu_data *rdp); 608static bool init_nocb_callback_list(struct rcu_data *rdp);
598static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); 609static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq);
@@ -605,6 +616,8 @@ static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
605static void rcu_bind_gp_kthread(void); 616static void rcu_bind_gp_kthread(void);
606static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); 617static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp);
607static bool rcu_nohz_full_cpu(struct rcu_state *rsp); 618static bool rcu_nohz_full_cpu(struct rcu_state *rsp);
619static void rcu_dynticks_task_enter(void);
620static void rcu_dynticks_task_exit(void);
608 621
609#endif /* #ifndef RCU_TREE_NONCORE */ 622#endif /* #ifndef RCU_TREE_NONCORE */
610 623
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 00dc411e9676..c1d7f27bd38f 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -85,33 +85,6 @@ static void __init rcu_bootup_announce_oddness(void)
85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); 85 pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
86 if (nr_cpu_ids != NR_CPUS) 86 if (nr_cpu_ids != NR_CPUS)
87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); 87 pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
88#ifdef CONFIG_RCU_NOCB_CPU
89#ifndef CONFIG_RCU_NOCB_CPU_NONE
90 if (!have_rcu_nocb_mask) {
91 zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL);
92 have_rcu_nocb_mask = true;
93 }
94#ifdef CONFIG_RCU_NOCB_CPU_ZERO
95 pr_info("\tOffload RCU callbacks from CPU 0\n");
96 cpumask_set_cpu(0, rcu_nocb_mask);
97#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
98#ifdef CONFIG_RCU_NOCB_CPU_ALL
99 pr_info("\tOffload RCU callbacks from all CPUs\n");
100 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
101#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
102#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
103 if (have_rcu_nocb_mask) {
104 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
105 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
106 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
107 rcu_nocb_mask);
108 }
109 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
110 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
111 if (rcu_nocb_poll)
112 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
113 }
114#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
115} 88}
116 89
117#ifdef CONFIG_TREE_PREEMPT_RCU 90#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -134,7 +107,7 @@ static void __init rcu_bootup_announce(void)
134 * Return the number of RCU-preempt batches processed thus far 107 * Return the number of RCU-preempt batches processed thus far
135 * for debug and statistics. 108 * for debug and statistics.
136 */ 109 */
137long rcu_batches_completed_preempt(void) 110static long rcu_batches_completed_preempt(void)
138{ 111{
139 return rcu_preempt_state.completed; 112 return rcu_preempt_state.completed;
140} 113}
@@ -155,18 +128,19 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed);
155 * not in a quiescent state. There might be any number of tasks blocked 128 * not in a quiescent state. There might be any number of tasks blocked
156 * while in an RCU read-side critical section. 129 * while in an RCU read-side critical section.
157 * 130 *
158 * Unlike the other rcu_*_qs() functions, callers to this function 131 * As with the other rcu_*_qs() functions, callers to this function
159 * must disable irqs in order to protect the assignment to 132 * must disable preemption.
160 * ->rcu_read_unlock_special. 133 */
161 */ 134static void rcu_preempt_qs(void)
162static void rcu_preempt_qs(int cpu) 135{
163{ 136 if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
164 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 137 trace_rcu_grace_period(TPS("rcu_preempt"),
165 138 __this_cpu_read(rcu_preempt_data.gpnum),
166 if (rdp->passed_quiesce == 0) 139 TPS("cpuqs"));
167 trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); 140 __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
168 rdp->passed_quiesce = 1; 141 barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
169 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 142 current->rcu_read_unlock_special.b.need_qs = false;
143 }
170} 144}
171 145
172/* 146/*
@@ -190,14 +164,14 @@ static void rcu_preempt_note_context_switch(int cpu)
190 struct rcu_node *rnp; 164 struct rcu_node *rnp;
191 165
192 if (t->rcu_read_lock_nesting > 0 && 166 if (t->rcu_read_lock_nesting > 0 &&
193 (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { 167 !t->rcu_read_unlock_special.b.blocked) {
194 168
195 /* Possibly blocking in an RCU read-side critical section. */ 169 /* Possibly blocking in an RCU read-side critical section. */
196 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); 170 rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu);
197 rnp = rdp->mynode; 171 rnp = rdp->mynode;
198 raw_spin_lock_irqsave(&rnp->lock, flags); 172 raw_spin_lock_irqsave(&rnp->lock, flags);
199 smp_mb__after_unlock_lock(); 173 smp_mb__after_unlock_lock();
200 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; 174 t->rcu_read_unlock_special.b.blocked = true;
201 t->rcu_blocked_node = rnp; 175 t->rcu_blocked_node = rnp;
202 176
203 /* 177 /*
@@ -239,7 +213,7 @@ static void rcu_preempt_note_context_switch(int cpu)
239 : rnp->gpnum + 1); 213 : rnp->gpnum + 1);
240 raw_spin_unlock_irqrestore(&rnp->lock, flags); 214 raw_spin_unlock_irqrestore(&rnp->lock, flags);
241 } else if (t->rcu_read_lock_nesting < 0 && 215 } else if (t->rcu_read_lock_nesting < 0 &&
242 t->rcu_read_unlock_special) { 216 t->rcu_read_unlock_special.s) {
243 217
244 /* 218 /*
245 * Complete exit from RCU read-side critical section on 219 * Complete exit from RCU read-side critical section on
@@ -257,9 +231,7 @@ static void rcu_preempt_note_context_switch(int cpu)
257 * grace period, then the fact that the task has been enqueued 231 * grace period, then the fact that the task has been enqueued
258 * means that we continue to block the current grace period. 232 * means that we continue to block the current grace period.
259 */ 233 */
260 local_irq_save(flags); 234 rcu_preempt_qs();
261 rcu_preempt_qs(cpu);
262 local_irq_restore(flags);
263} 235}
264 236
265/* 237/*
@@ -340,7 +312,7 @@ void rcu_read_unlock_special(struct task_struct *t)
340 bool drop_boost_mutex = false; 312 bool drop_boost_mutex = false;
341#endif /* #ifdef CONFIG_RCU_BOOST */ 313#endif /* #ifdef CONFIG_RCU_BOOST */
342 struct rcu_node *rnp; 314 struct rcu_node *rnp;
343 int special; 315 union rcu_special special;
344 316
345 /* NMI handlers cannot block and cannot safely manipulate state. */ 317 /* NMI handlers cannot block and cannot safely manipulate state. */
346 if (in_nmi()) 318 if (in_nmi())
@@ -350,12 +322,13 @@ void rcu_read_unlock_special(struct task_struct *t)
350 322
351 /* 323 /*
352 * If RCU core is waiting for this CPU to exit critical section, 324 * If RCU core is waiting for this CPU to exit critical section,
353 * let it know that we have done so. 325 * let it know that we have done so. Because irqs are disabled,
326 * t->rcu_read_unlock_special cannot change.
354 */ 327 */
355 special = t->rcu_read_unlock_special; 328 special = t->rcu_read_unlock_special;
356 if (special & RCU_READ_UNLOCK_NEED_QS) { 329 if (special.b.need_qs) {
357 rcu_preempt_qs(smp_processor_id()); 330 rcu_preempt_qs();
358 if (!t->rcu_read_unlock_special) { 331 if (!t->rcu_read_unlock_special.s) {
359 local_irq_restore(flags); 332 local_irq_restore(flags);
360 return; 333 return;
361 } 334 }
@@ -368,8 +341,8 @@ void rcu_read_unlock_special(struct task_struct *t)
368 } 341 }
369 342
370 /* Clean up if blocked during RCU read-side critical section. */ 343 /* Clean up if blocked during RCU read-side critical section. */
371 if (special & RCU_READ_UNLOCK_BLOCKED) { 344 if (special.b.blocked) {
372 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; 345 t->rcu_read_unlock_special.b.blocked = false;
373 346
374 /* 347 /*
375 * Remove this task from the list it blocked on. The 348 * Remove this task from the list it blocked on. The
@@ -653,12 +626,13 @@ static void rcu_preempt_check_callbacks(int cpu)
653 struct task_struct *t = current; 626 struct task_struct *t = current;
654 627
655 if (t->rcu_read_lock_nesting == 0) { 628 if (t->rcu_read_lock_nesting == 0) {
656 rcu_preempt_qs(cpu); 629 rcu_preempt_qs();
657 return; 630 return;
658 } 631 }
659 if (t->rcu_read_lock_nesting > 0 && 632 if (t->rcu_read_lock_nesting > 0 &&
660 per_cpu(rcu_preempt_data, cpu).qs_pending) 633 per_cpu(rcu_preempt_data, cpu).qs_pending &&
661 t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; 634 !per_cpu(rcu_preempt_data, cpu).passed_quiesce)
635 t->rcu_read_unlock_special.b.need_qs = true;
662} 636}
663 637
664#ifdef CONFIG_RCU_BOOST 638#ifdef CONFIG_RCU_BOOST
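
In these hunks the rcu_read_unlock_special flag word becomes a union: individual conditions are set and cleared through the .b bool fields while .s lets a caller test or clear everything at once. The sketch below shows that access pattern; the exact field layout is an assumption for illustration, not the kernel's definition.

/* Illustrative union: per-condition bools overlaid with one word. */
#include <stdbool.h>
#include <stdio.h>

union rcu_special_sketch {
        struct {
                bool blocked;
                bool need_qs;
        } b;                    /* Individual flags. */
        short s;                /* All flags at once. */
};

int main(void)
{
        union rcu_special_sketch special = { .s = 0 };

        special.b.need_qs = true;               /* set one condition */
        if (special.s)                          /* test "any condition set" */
                printf("special work pending\n");
        special.b.need_qs = false;
        printf("any left? %s\n", special.s ? "yes" : "no");
        return 0;
}
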
@@ -819,11 +793,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
819 * In fact, if you are using synchronize_rcu_expedited() in a loop, 793 * In fact, if you are using synchronize_rcu_expedited() in a loop,
820 * please restructure your code to batch your updates, and then Use a 794 * please restructure your code to batch your updates, and then Use a
821 * single synchronize_rcu() instead. 795 * single synchronize_rcu() instead.
822 *
823 * Note that it is illegal to call this function while holding any lock
824 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal
825 * to call this function from a CPU-hotplug notifier. Failing to observe
826 * these restriction will result in deadlock.
827 */ 796 */
828void synchronize_rcu_expedited(void) 797void synchronize_rcu_expedited(void)
829{ 798{
@@ -845,7 +814,11 @@ void synchronize_rcu_expedited(void)
845 * being boosted. This simplifies the process of moving tasks 814 * being boosted. This simplifies the process of moving tasks
846 * from leaf to root rcu_node structures. 815 * from leaf to root rcu_node structures.
847 */ 816 */
848 get_online_cpus(); 817 if (!try_get_online_cpus()) {
818 /* CPU-hotplug operation in flight, fall back to normal GP. */
819 wait_rcu_gp(call_rcu);
820 return;
821 }
849 822
850 /* 823 /*
851 * Acquire lock, falling back to synchronize_rcu() if too many 824 * Acquire lock, falling back to synchronize_rcu() if too many
@@ -897,7 +870,8 @@ void synchronize_rcu_expedited(void)
897 870
898 /* Clean up and exit. */ 871 /* Clean up and exit. */
899 smp_mb(); /* ensure expedited GP seen before counter increment. */ 872 smp_mb(); /* ensure expedited GP seen before counter increment. */
900 ACCESS_ONCE(sync_rcu_preempt_exp_count)++; 873 ACCESS_ONCE(sync_rcu_preempt_exp_count) =
874 sync_rcu_preempt_exp_count + 1;
901unlock_mb_ret: 875unlock_mb_ret:
902 mutex_unlock(&sync_rcu_preempt_exp_mutex); 876 mutex_unlock(&sync_rcu_preempt_exp_mutex);
903mb_ret: 877mb_ret:
@@ -941,7 +915,7 @@ void exit_rcu(void)
941 return; 915 return;
942 t->rcu_read_lock_nesting = 1; 916 t->rcu_read_lock_nesting = 1;
943 barrier(); 917 barrier();
944 t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; 918 t->rcu_read_unlock_special.b.blocked = true;
945 __rcu_read_unlock(); 919 __rcu_read_unlock();
946} 920}
947 921
@@ -1462,14 +1436,13 @@ static struct smp_hotplug_thread rcu_cpu_thread_spec = {
1462}; 1436};
1463 1437
1464/* 1438/*
1465 * Spawn all kthreads -- called as soon as the scheduler is running. 1439 * Spawn boost kthreads -- called as soon as the scheduler is running.
1466 */ 1440 */
1467static int __init rcu_spawn_kthreads(void) 1441static void __init rcu_spawn_boost_kthreads(void)
1468{ 1442{
1469 struct rcu_node *rnp; 1443 struct rcu_node *rnp;
1470 int cpu; 1444 int cpu;
1471 1445
1472 rcu_scheduler_fully_active = 1;
1473 for_each_possible_cpu(cpu) 1446 for_each_possible_cpu(cpu)
1474 per_cpu(rcu_cpu_has_work, cpu) = 0; 1447 per_cpu(rcu_cpu_has_work, cpu) = 0;
1475 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); 1448 BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
@@ -1479,9 +1452,7 @@ static int __init rcu_spawn_kthreads(void)
1479 rcu_for_each_leaf_node(rcu_state_p, rnp) 1452 rcu_for_each_leaf_node(rcu_state_p, rnp)
1480 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp); 1453 (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
1481 } 1454 }
1482 return 0;
1483} 1455}
1484early_initcall(rcu_spawn_kthreads);
1485 1456
1486static void rcu_prepare_kthreads(int cpu) 1457static void rcu_prepare_kthreads(int cpu)
1487{ 1458{
@@ -1519,12 +1490,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
1519{ 1490{
1520} 1491}
1521 1492
1522static int __init rcu_scheduler_really_started(void) 1493static void __init rcu_spawn_boost_kthreads(void)
1523{ 1494{
1524 rcu_scheduler_fully_active = 1;
1525 return 0;
1526} 1495}
1527early_initcall(rcu_scheduler_really_started);
1528 1496
1529static void rcu_prepare_kthreads(int cpu) 1497static void rcu_prepare_kthreads(int cpu)
1530{ 1498{
@@ -1625,7 +1593,7 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
1625 1593
1626 /* Exit early if we advanced recently. */ 1594 /* Exit early if we advanced recently. */
1627 if (jiffies == rdtp->last_advance_all) 1595 if (jiffies == rdtp->last_advance_all)
1628 return 0; 1596 return false;
1629 rdtp->last_advance_all = jiffies; 1597 rdtp->last_advance_all = jiffies;
1630 1598
1631 for_each_rcu_flavor(rsp) { 1599 for_each_rcu_flavor(rsp) {
@@ -1848,7 +1816,7 @@ static int rcu_oom_notify(struct notifier_block *self,
1848 get_online_cpus(); 1816 get_online_cpus();
1849 for_each_online_cpu(cpu) { 1817 for_each_online_cpu(cpu) {
1850 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); 1818 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1851 cond_resched(); 1819 cond_resched_rcu_qs();
1852 } 1820 }
1853 put_online_cpus(); 1821 put_online_cpus();
1854 1822
@@ -2074,14 +2042,41 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
2074 2042
2075 if (!ACCESS_ONCE(rdp_leader->nocb_kthread)) 2043 if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
2076 return; 2044 return;
2077 if (!ACCESS_ONCE(rdp_leader->nocb_leader_wake) || force) { 2045 if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
2078 /* Prior xchg orders against prior callback enqueue. */ 2046 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
2079 ACCESS_ONCE(rdp_leader->nocb_leader_wake) = true; 2047 ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
2080 wake_up(&rdp_leader->nocb_wq); 2048 wake_up(&rdp_leader->nocb_wq);
2081 } 2049 }
2082} 2050}
2083 2051
2084/* 2052/*
2053 * Does the specified CPU need an RCU callback for the specified flavor
2054 * of rcu_barrier()?
2055 */
2056static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2057{
2058 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2059 struct rcu_head *rhp;
2060
2061 /* No-CBs CPUs might have callbacks on any of three lists. */
2062 rhp = ACCESS_ONCE(rdp->nocb_head);
2063 if (!rhp)
2064 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
2065 if (!rhp)
2066 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
2067
2068 /* Having no rcuo kthread but CBs after scheduler starts is bad! */
2069 if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
2070 /* RCU callback enqueued before CPU first came online??? */
2071 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
2072 cpu, rhp->func);
2073 WARN_ON_ONCE(1);
2074 }
2075
2076 return !!rhp;
2077}
2078
2079/*
2085 * Enqueue the specified string of rcu_head structures onto the specified 2080 * Enqueue the specified string of rcu_head structures onto the specified
2086 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the 2081 * CPU's no-CBs lists. The CPU is specified by rdp, the head of the
2087 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy 2082 * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy
@@ -2104,6 +2099,7 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2104 ACCESS_ONCE(*old_rhpp) = rhp; 2099 ACCESS_ONCE(*old_rhpp) = rhp;
2105 atomic_long_add(rhcount, &rdp->nocb_q_count); 2100 atomic_long_add(rhcount, &rdp->nocb_q_count);
2106 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); 2101 atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
2102 smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
2107 2103
2108 /* If we are not being polled and there is a kthread, awaken it ... */ 2104 /* If we are not being polled and there is a kthread, awaken it ... */
2109 t = ACCESS_ONCE(rdp->nocb_kthread); 2105 t = ACCESS_ONCE(rdp->nocb_kthread);
@@ -2120,16 +2116,23 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2116 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2121 TPS("WakeEmpty")); 2117 TPS("WakeEmpty"));
2122 } else { 2118 } else {
2123 rdp->nocb_defer_wakeup = true; 2119 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
2124 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2120 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2125 TPS("WakeEmptyIsDeferred")); 2121 TPS("WakeEmptyIsDeferred"));
2126 } 2122 }
2127 rdp->qlen_last_fqs_check = 0; 2123 rdp->qlen_last_fqs_check = 0;
2128 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 2124 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2129 /* ... or if many callbacks queued. */ 2125 /* ... or if many callbacks queued. */
2130 wake_nocb_leader(rdp, true); 2126 if (!irqs_disabled_flags(flags)) {
2127 wake_nocb_leader(rdp, true);
2128 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2129 TPS("WakeOvf"));
2130 } else {
2131 rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2133 TPS("WakeOvfIsDeferred"));
2134 }
2131 rdp->qlen_last_fqs_check = LONG_MAX / 2; 2135 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2132 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf"));
2133 } else { 2136 } else {
2134 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); 2137 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2135 } 2138 }
@@ -2150,7 +2153,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2150{ 2153{
2151 2154
2152 if (!rcu_is_nocb_cpu(rdp->cpu)) 2155 if (!rcu_is_nocb_cpu(rdp->cpu))
2153 return 0; 2156 return false;
2154 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags); 2157 __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2155 if (__is_kfree_rcu_offset((unsigned long)rhp->func)) 2158 if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2156 trace_rcu_kfree_callback(rdp->rsp->name, rhp, 2159 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
@@ -2161,7 +2164,18 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2161 trace_rcu_callback(rdp->rsp->name, rhp, 2164 trace_rcu_callback(rdp->rsp->name, rhp,
2162 -atomic_long_read(&rdp->nocb_q_count_lazy), 2165 -atomic_long_read(&rdp->nocb_q_count_lazy),
2163 -atomic_long_read(&rdp->nocb_q_count)); 2166 -atomic_long_read(&rdp->nocb_q_count));
2164 return 1; 2167
2168 /*
2169 * If called from an extended quiescent state with interrupts
2170 * disabled, invoke the RCU core in order to allow the idle-entry
2171 * deferred-wakeup check to function.
2172 */
2173 if (irqs_disabled_flags(flags) &&
2174 !rcu_is_watching() &&
2175 cpu_online(smp_processor_id()))
2176 invoke_rcu_core();
2177
2178 return true;
2165} 2179}
2166 2180
2167/* 2181/*
@@ -2177,7 +2191,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2177 2191
2178 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ 2192 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2179 if (!rcu_is_nocb_cpu(smp_processor_id())) 2193 if (!rcu_is_nocb_cpu(smp_processor_id()))
2180 return 0; 2194 return false;
2181 rsp->qlen = 0; 2195 rsp->qlen = 0;
2182 rsp->qlen_lazy = 0; 2196 rsp->qlen_lazy = 0;
2183 2197
@@ -2196,7 +2210,7 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2196 rsp->orphan_nxtlist = NULL; 2210 rsp->orphan_nxtlist = NULL;
2197 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2211 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2198 } 2212 }
2199 return 1; 2213 return true;
2200} 2214}
2201 2215
2202/* 2216/*
@@ -2229,7 +2243,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2229 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); 2243 (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2230 if (likely(d)) 2244 if (likely(d))
2231 break; 2245 break;
2232 flush_signals(current); 2246 WARN_ON(signal_pending(current));
2233 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); 2247 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2234 } 2248 }
2235 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); 2249 trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
@@ -2253,7 +2267,7 @@ wait_again:
2253 if (!rcu_nocb_poll) { 2267 if (!rcu_nocb_poll) {
2254 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2268 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2255 wait_event_interruptible(my_rdp->nocb_wq, 2269 wait_event_interruptible(my_rdp->nocb_wq,
2256 ACCESS_ONCE(my_rdp->nocb_leader_wake)); 2270 !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
2257 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2271 /* Memory barrier handled by smp_mb() calls below and repoll. */
2258 } else if (firsttime) { 2272 } else if (firsttime) {
2259 firsttime = false; /* Don't drown trace log with "Poll"! */ 2273 firsttime = false; /* Don't drown trace log with "Poll"! */
@@ -2288,16 +2302,16 @@ wait_again:
2288 if (!rcu_nocb_poll) 2302 if (!rcu_nocb_poll)
2289 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, 2303 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2290 "WokeEmpty"); 2304 "WokeEmpty");
2291 flush_signals(current); 2305 WARN_ON(signal_pending(current));
2292 schedule_timeout_interruptible(1); 2306 schedule_timeout_interruptible(1);
2293 2307
2294 /* Rescan in case we were a victim of memory ordering. */ 2308 /* Rescan in case we were a victim of memory ordering. */
2295 my_rdp->nocb_leader_wake = false; 2309 my_rdp->nocb_leader_sleep = true;
2296 smp_mb(); /* Ensure _wake false before scan. */ 2310 smp_mb(); /* Ensure _sleep true before scan. */
2297 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) 2311 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2298 if (ACCESS_ONCE(rdp->nocb_head)) { 2312 if (ACCESS_ONCE(rdp->nocb_head)) {
2299 /* Found CB, so short-circuit next wait. */ 2313 /* Found CB, so short-circuit next wait. */
2300 my_rdp->nocb_leader_wake = true; 2314 my_rdp->nocb_leader_sleep = false;
2301 break; 2315 break;
2302 } 2316 }
2303 goto wait_again; 2317 goto wait_again;
@@ -2307,17 +2321,17 @@ wait_again:
2307 rcu_nocb_wait_gp(my_rdp); 2321 rcu_nocb_wait_gp(my_rdp);
2308 2322
2309 /* 2323 /*
2310 * We left ->nocb_leader_wake set to reduce cache thrashing. 2324 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
2311 * We clear it now, but recheck for new callbacks while 2325 * We set it now, but recheck for new callbacks while
2312 * traversing our follower list. 2326 * traversing our follower list.
2313 */ 2327 */
2314 my_rdp->nocb_leader_wake = false; 2328 my_rdp->nocb_leader_sleep = true;
2315 smp_mb(); /* Ensure _wake false before scan of ->nocb_head. */ 2329 smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
2316 2330
2317 /* Each pass through the following loop wakes a follower, if needed. */ 2331 /* Each pass through the following loop wakes a follower, if needed. */
2318 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2332 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2319 if (ACCESS_ONCE(rdp->nocb_head)) 2333 if (ACCESS_ONCE(rdp->nocb_head))
2320 my_rdp->nocb_leader_wake = true; /* No need to wait. */ 2334 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
2321 if (!rdp->nocb_gp_head) 2335 if (!rdp->nocb_gp_head)
2322 continue; /* No CBs, so no need to wake follower. */ 2336 continue; /* No CBs, so no need to wake follower. */
2323 2337
@@ -2327,6 +2341,7 @@ wait_again:
2327 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count); 2341 atomic_long_add(rdp->nocb_gp_count, &rdp->nocb_follower_count);
2328 atomic_long_add(rdp->nocb_gp_count_lazy, 2342 atomic_long_add(rdp->nocb_gp_count_lazy,
2329 &rdp->nocb_follower_count_lazy); 2343 &rdp->nocb_follower_count_lazy);
2344 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2330 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2345 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2331 /* 2346 /*
2332 * List was empty, wake up the follower. 2347 * List was empty, wake up the follower.
@@ -2367,7 +2382,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2367 if (!rcu_nocb_poll) 2382 if (!rcu_nocb_poll)
2368 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2383 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2369 "WokeEmpty"); 2384 "WokeEmpty");
2370 flush_signals(current); 2385 WARN_ON(signal_pending(current));
2371 schedule_timeout_interruptible(1); 2386 schedule_timeout_interruptible(1);
2372 } 2387 }
2373} 2388}
@@ -2428,15 +2443,16 @@ static int rcu_nocb_kthread(void *arg)
2428 list = next; 2443 list = next;
2429 } 2444 }
2430 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); 2445 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2431 ACCESS_ONCE(rdp->nocb_p_count) -= c; 2446 ACCESS_ONCE(rdp->nocb_p_count) = rdp->nocb_p_count - c;
2432 ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; 2447 ACCESS_ONCE(rdp->nocb_p_count_lazy) =
2448 rdp->nocb_p_count_lazy - cl;
2433 rdp->n_nocbs_invoked += c; 2449 rdp->n_nocbs_invoked += c;
2434 } 2450 }
2435 return 0; 2451 return 0;
2436} 2452}
2437 2453
2438/* Is a deferred wakeup of rcu_nocb_kthread() required? */ 2454/* Is a deferred wakeup of rcu_nocb_kthread() required? */
2439static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2455static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2440{ 2456{
2441 return ACCESS_ONCE(rdp->nocb_defer_wakeup); 2457 return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2442} 2458}
@@ -2444,11 +2460,79 @@ static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2444/* Do a deferred wakeup of rcu_nocb_kthread(). */ 2460/* Do a deferred wakeup of rcu_nocb_kthread(). */
2445static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2461static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2446{ 2462{
2463 int ndw;
2464
2447 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2465 if (!rcu_nocb_need_deferred_wakeup(rdp))
2448 return; 2466 return;
2449 ACCESS_ONCE(rdp->nocb_defer_wakeup) = false; 2467 ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
2450 wake_nocb_leader(rdp, false); 2468 ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
2451 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWakeEmpty")); 2469 wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
2470 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2471}
2472
2473void __init rcu_init_nohz(void)
2474{
2475 int cpu;
2476 bool need_rcu_nocb_mask = true;
2477 struct rcu_state *rsp;
2478
2479#ifdef CONFIG_RCU_NOCB_CPU_NONE
2480 need_rcu_nocb_mask = false;
2481#endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */
2482
2483#if defined(CONFIG_NO_HZ_FULL)
2484 if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2485 need_rcu_nocb_mask = true;
2486#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2487
2488 if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
2489 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2490 pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2491 return;
2492 }
2493 have_rcu_nocb_mask = true;
2494 }
2495 if (!have_rcu_nocb_mask)
2496 return;
2497
2498#ifdef CONFIG_RCU_NOCB_CPU_ZERO
2499 pr_info("\tOffload RCU callbacks from CPU 0\n");
2500 cpumask_set_cpu(0, rcu_nocb_mask);
2501#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
2502#ifdef CONFIG_RCU_NOCB_CPU_ALL
2503 pr_info("\tOffload RCU callbacks from all CPUs\n");
2504 cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
2505#endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
2506#if defined(CONFIG_NO_HZ_FULL)
2507 if (tick_nohz_full_running)
2508 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2509#endif /* #if defined(CONFIG_NO_HZ_FULL) */
2510
2511 if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2512 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
2513 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2514 rcu_nocb_mask);
2515 }
2516 cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask);
2517 pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf);
2518 if (rcu_nocb_poll)
2519 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2520
2521 for_each_rcu_flavor(rsp) {
2522 for_each_cpu(cpu, rcu_nocb_mask) {
2523 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2524
2525 /*
2526 * If there are early callbacks, they will need
2527 * to be moved to the nocb lists.
2528 */
2529 WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2530 &rdp->nxtlist &&
2531 rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2532 init_nocb_callback_list(rdp);
2533 }
2534 rcu_organize_nocb_kthreads(rsp);
2535 }
2452} 2536}
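
rcu_init_nohz() above builds the no-CBs set once at boot: start from the Kconfig choice, fold in any nohz_full= CPUs, and trim CPUs that do not exist before printing the result. A userspace sketch of that composition using plain bitmasks in place of cpumasks; the example values are made up.

/* Compose a no-CBs CPU mask from config, nohz_full, and possible CPUs. */
#include <stdio.h>

int main(void)
{
        unsigned long cpu_possible = 0x0f;      /* CPUs 0-3 exist */
        unsigned long nocb_mask    = 0x01;      /* e.g. offload CPU 0 only */
        unsigned long nohz_full    = 0x0c;      /* CPUs 2-3 run tickless */

        nocb_mask |= nohz_full;                 /* nohz_full CPUs are offloaded too */
        if (nocb_mask & ~cpu_possible) {
                printf("note: mask contains nonexistent CPUs, trimming\n");
                nocb_mask &= cpu_possible;
        }
        printf("offload RCU callbacks from CPUs: %#lx\n", nocb_mask);
        return 0;
}
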
2453 2537
2454/* Initialize per-rcu_data variables for no-CBs CPUs. */ 2538/* Initialize per-rcu_data variables for no-CBs CPUs. */
@@ -2459,15 +2543,85 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2459 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2543 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2460} 2544}
2461 2545
2546/*
2547 * If the specified CPU is a no-CBs CPU that does not already have its
2548 * rcuo kthread for the specified RCU flavor, spawn it. If the CPUs are
2549 * brought online out of order, this can require re-organizing the
2550 * leader-follower relationships.
2551 */
2552static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2553{
2554 struct rcu_data *rdp;
2555 struct rcu_data *rdp_last;
2556 struct rcu_data *rdp_old_leader;
2557 struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
2558 struct task_struct *t;
2559
2560 /*
2561 * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2562 * then nothing to do.
2563 */
2564 if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
2565 return;
2566
2567 /* If we didn't spawn the leader first, reorganize! */
2568 rdp_old_leader = rdp_spawn->nocb_leader;
2569 if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
2570 rdp_last = NULL;
2571 rdp = rdp_old_leader;
2572 do {
2573 rdp->nocb_leader = rdp_spawn;
2574 if (rdp_last && rdp != rdp_spawn)
2575 rdp_last->nocb_next_follower = rdp;
2576 rdp_last = rdp;
2577 rdp = rdp->nocb_next_follower;
2578 rdp_last->nocb_next_follower = NULL;
2579 } while (rdp);
2580 rdp_spawn->nocb_next_follower = rdp_old_leader;
2581 }
2582
2583 /* Spawn the kthread for this CPU and RCU flavor. */
2584 t = kthread_run(rcu_nocb_kthread, rdp_spawn,
2585 "rcuo%c/%d", rsp->abbr, cpu);
2586 BUG_ON(IS_ERR(t));
2587 ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
2588}
2589
2590/*
2591 * If the specified CPU is a no-CBs CPU that does not already have its
2592 * rcuo kthreads, spawn them.
2593 */
2594static void rcu_spawn_all_nocb_kthreads(int cpu)
2595{
2596 struct rcu_state *rsp;
2597
2598 if (rcu_scheduler_fully_active)
2599 for_each_rcu_flavor(rsp)
2600 rcu_spawn_one_nocb_kthread(rsp, cpu);
2601}
2602
2603/*
2604 * Once the scheduler is running, spawn rcuo kthreads for all online
2605 * no-CBs CPUs. This assumes that the early_initcall()s happen before
2606 * non-boot CPUs come online -- if this changes, we will need to add
2607 * some mutual exclusion.
2608 */
2609static void __init rcu_spawn_nocb_kthreads(void)
2610{
2611 int cpu;
2612
2613 for_each_online_cpu(cpu)
2614 rcu_spawn_all_nocb_kthreads(cpu);
2615}
2616
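
Because rcuo kthreads are now spawned per-CPU as CPUs come online, rcu_spawn_one_nocb_kthread() above must cope with a follower being brought up before its designated leader: the newly spawned CPU takes over leadership and inherits the old leader's chain. A self-contained sketch of that re-linking step, with the rcu_data structure reduced to the fields involved:

/* Re-link a leader-follower chain under a newly spawned leader. */
#include <stdio.h>
#include <stddef.h>

struct rdp_sketch {
        int cpu;
        struct rdp_sketch *nocb_leader;
        struct rdp_sketch *nocb_next_follower;
};

static void take_over_leadership(struct rdp_sketch *rdp_spawn,
                                 struct rdp_sketch *rdp_old_leader)
{
        struct rdp_sketch *rdp = rdp_old_leader;
        struct rdp_sketch *rdp_last = NULL;

        do {
                rdp->nocb_leader = rdp_spawn;   /* everyone follows the new leader */
                if (rdp_last && rdp != rdp_spawn)
                        rdp_last->nocb_next_follower = rdp;
                rdp_last = rdp;
                rdp = rdp->nocb_next_follower;
                rdp_last->nocb_next_follower = NULL;
        } while (rdp);
        rdp_spawn->nocb_next_follower = rdp_old_leader;
}

int main(void)
{
        struct rdp_sketch a = { .cpu = 0 }, b = { .cpu = 1 }, c = { .cpu = 2 };

        /* Old chain: a leads, b and c follow; c is being spawned first. */
        a.nocb_leader = &a; a.nocb_next_follower = &b;
        b.nocb_leader = &a; b.nocb_next_follower = &c;
        c.nocb_leader = &a;

        take_over_leadership(&c, &a);
        for (struct rdp_sketch *p = &c; p; p = p->nocb_next_follower)
                printf("cpu %d leader %d\n", p->cpu, p->nocb_leader->cpu);
        return 0;
}
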
2462/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */ 2617/* How many follower CPU IDs per leader? Default of -1 for sqrt(nr_cpu_ids). */
2463static int rcu_nocb_leader_stride = -1; 2618static int rcu_nocb_leader_stride = -1;
2464module_param(rcu_nocb_leader_stride, int, 0444); 2619module_param(rcu_nocb_leader_stride, int, 0444);
2465 2620
2466/* 2621/*
2467 * Create a kthread for each RCU flavor for each no-CBs CPU. 2622 * Initialize leader-follower relationships for all no-CBs CPU.
2468 * Also initialize leader-follower relationships.
2469 */ 2623 */
2470static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2624static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
2471{ 2625{
2472 int cpu; 2626 int cpu;
2473 int ls = rcu_nocb_leader_stride; 2627 int ls = rcu_nocb_leader_stride;
@@ -2475,14 +2629,9 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2475 struct rcu_data *rdp; 2629 struct rcu_data *rdp;
2476 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ 2630 struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */
2477 struct rcu_data *rdp_prev = NULL; 2631 struct rcu_data *rdp_prev = NULL;
2478 struct task_struct *t;
2479 2632
2480 if (rcu_nocb_mask == NULL) 2633 if (!have_rcu_nocb_mask)
2481 return; 2634 return;
2482#if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL)
2483 if (tick_nohz_full_running)
2484 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2485#endif /* #if defined(CONFIG_NO_HZ_FULL) && !defined(CONFIG_NO_HZ_FULL_ALL) */
2486 if (ls == -1) { 2635 if (ls == -1) {
2487 ls = int_sqrt(nr_cpu_ids); 2636 ls = int_sqrt(nr_cpu_ids);
2488 rcu_nocb_leader_stride = ls; 2637 rcu_nocb_leader_stride = ls;
@@ -2505,27 +2654,27 @@ static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp)
2505 rdp_prev->nocb_next_follower = rdp; 2654 rdp_prev->nocb_next_follower = rdp;
2506 } 2655 }
2507 rdp_prev = rdp; 2656 rdp_prev = rdp;
2508
2509 /* Spawn the kthread for this CPU. */
2510 t = kthread_run(rcu_nocb_kthread, rdp,
2511 "rcuo%c/%d", rsp->abbr, cpu);
2512 BUG_ON(IS_ERR(t));
2513 ACCESS_ONCE(rdp->nocb_kthread) = t;
2514 } 2657 }
2515} 2658}
2516 2659
2517/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ 2660/* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2518static bool init_nocb_callback_list(struct rcu_data *rdp) 2661static bool init_nocb_callback_list(struct rcu_data *rdp)
2519{ 2662{
2520 if (rcu_nocb_mask == NULL || 2663 if (!rcu_is_nocb_cpu(rdp->cpu))
2521 !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask))
2522 return false; 2664 return false;
2665
2523 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2666 rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2524 return true; 2667 return true;
2525} 2668}
2526 2669
2527#else /* #ifdef CONFIG_RCU_NOCB_CPU */ 2670#else /* #ifdef CONFIG_RCU_NOCB_CPU */
2528 2671
2672static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2673{
2674 WARN_ON_ONCE(1); /* Should be dead code. */
2675 return false;
2676}
2677
2529static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 2678static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2530{ 2679{
2531} 2680}
@@ -2541,21 +2690,21 @@ static void rcu_init_one_nocb(struct rcu_node *rnp)
2541static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 2690static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2542 bool lazy, unsigned long flags) 2691 bool lazy, unsigned long flags)
2543{ 2692{
2544 return 0; 2693 return false;
2545} 2694}
2546 2695
2547static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2696static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2548 struct rcu_data *rdp, 2697 struct rcu_data *rdp,
2549 unsigned long flags) 2698 unsigned long flags)
2550{ 2699{
2551 return 0; 2700 return false;
2552} 2701}
2553 2702
2554static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) 2703static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2555{ 2704{
2556} 2705}
2557 2706
2558static bool rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) 2707static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2559{ 2708{
2560 return false; 2709 return false;
2561} 2710}
@@ -2564,7 +2713,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2564{ 2713{
2565} 2714}
2566 2715
2567static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) 2716static void rcu_spawn_all_nocb_kthreads(int cpu)
2717{
2718}
2719
2720static void __init rcu_spawn_nocb_kthreads(void)
2568{ 2721{
2569} 2722}
2570 2723
@@ -2595,16 +2748,6 @@ static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2595 2748
2596#ifdef CONFIG_NO_HZ_FULL_SYSIDLE 2749#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2597 2750
2598/*
2599 * Define RCU flavor that holds sysidle state. This needs to be the
2600 * most active flavor of RCU.
2601 */
2602#ifdef CONFIG_PREEMPT_RCU
2603static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state;
2604#else /* #ifdef CONFIG_PREEMPT_RCU */
2605static struct rcu_state *rcu_sysidle_state = &rcu_sched_state;
2606#endif /* #else #ifdef CONFIG_PREEMPT_RCU */
2607
2608static int full_sysidle_state; /* Current system-idle state. */ 2751static int full_sysidle_state; /* Current system-idle state. */
2609#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ 2752#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */
2610#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ 2753#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */
@@ -2622,6 +2765,10 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq)
2622{ 2765{
2623 unsigned long j; 2766 unsigned long j;
2624 2767
2768 /* If there are no nohz_full= CPUs, no need to track this. */
2769 if (!tick_nohz_full_enabled())
2770 return;
2771
2625 /* Adjust nesting, check for fully idle. */ 2772 /* Adjust nesting, check for fully idle. */
2626 if (irq) { 2773 if (irq) {
2627 rdtp->dynticks_idle_nesting--; 2774 rdtp->dynticks_idle_nesting--;
@@ -2687,6 +2834,10 @@ void rcu_sysidle_force_exit(void)
2687 */ 2834 */
2688static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) 2835static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq)
2689{ 2836{
2837 /* If there are no nohz_full= CPUs, no need to track this. */
2838 if (!tick_nohz_full_enabled())
2839 return;
2840
2690 /* Adjust nesting, check for already non-idle. */ 2841 /* Adjust nesting, check for already non-idle. */
2691 if (irq) { 2842 if (irq) {
2692 rdtp->dynticks_idle_nesting++; 2843 rdtp->dynticks_idle_nesting++;
@@ -2741,12 +2892,16 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2741 unsigned long j; 2892 unsigned long j;
2742 struct rcu_dynticks *rdtp = rdp->dynticks; 2893 struct rcu_dynticks *rdtp = rdp->dynticks;
2743 2894
2895 /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
2896 if (!tick_nohz_full_enabled())
2897 return;
2898
2744 /* 2899 /*
2745 * If some other CPU has already reported non-idle, if this is 2900 * If some other CPU has already reported non-idle, if this is
2746 * not the flavor of RCU that tracks sysidle state, or if this 2901 * not the flavor of RCU that tracks sysidle state, or if this
2747 * is an offline or the timekeeping CPU, nothing to do. 2902 * is an offline or the timekeeping CPU, nothing to do.
2748 */ 2903 */
2749 if (!*isidle || rdp->rsp != rcu_sysidle_state || 2904 if (!*isidle || rdp->rsp != rcu_state_p ||
2750 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) 2905 cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2751 return; 2906 return;
2752 if (rcu_gp_in_progress(rdp->rsp)) 2907 if (rcu_gp_in_progress(rdp->rsp))
@@ -2772,7 +2927,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2772 */ 2927 */
2773static bool is_sysidle_rcu_state(struct rcu_state *rsp) 2928static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2774{ 2929{
2775 return rsp == rcu_sysidle_state; 2930 return rsp == rcu_state_p;
2776} 2931}
2777 2932
2778/* 2933/*
@@ -2850,7 +3005,7 @@ static void rcu_sysidle_cancel(void)
2850static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, 3005static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2851 unsigned long maxj, bool gpkt) 3006 unsigned long maxj, bool gpkt)
2852{ 3007{
2853 if (rsp != rcu_sysidle_state) 3008 if (rsp != rcu_state_p)
2854 return; /* Wrong flavor, ignore. */ 3009 return; /* Wrong flavor, ignore. */
2855 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) 3010 if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2856 return; /* Running state machine from timekeeping CPU. */ 3011 return; /* Running state machine from timekeeping CPU. */
@@ -2867,6 +3022,10 @@ static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
2867static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, 3022static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
2868 unsigned long maxj) 3023 unsigned long maxj)
2869{ 3024{
3025 /* If there are no nohz_full= CPUs, no need to track this. */
3026 if (!tick_nohz_full_enabled())
3027 return;
3028
2870 rcu_sysidle_report(rsp, isidle, maxj, true); 3029 rcu_sysidle_report(rsp, isidle, maxj, true);
2871} 3030}
2872 3031
@@ -2893,7 +3052,8 @@ static void rcu_sysidle_cb(struct rcu_head *rhp)
2893 3052
2894/* 3053/*
2895 * Check to see if the system is fully idle, other than the timekeeping CPU. 3054 * Check to see if the system is fully idle, other than the timekeeping CPU.
2896 * The caller must have disabled interrupts. 3055 * The caller must have disabled interrupts. This is not intended to be
3056 * called unless tick_nohz_full_enabled().
2897 */ 3057 */
2898bool rcu_sys_is_idle(void) 3058bool rcu_sys_is_idle(void)
2899{ 3059{
@@ -2919,13 +3079,12 @@ bool rcu_sys_is_idle(void)
2919 3079
2920 /* Scan all the CPUs looking for nonidle CPUs. */ 3080 /* Scan all the CPUs looking for nonidle CPUs. */
2921 for_each_possible_cpu(cpu) { 3081 for_each_possible_cpu(cpu) {
2922 rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); 3082 rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
2923 rcu_sysidle_check_cpu(rdp, &isidle, &maxj); 3083 rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
2924 if (!isidle) 3084 if (!isidle)
2925 break; 3085 break;
2926 } 3086 }
2927 rcu_sysidle_report(rcu_sysidle_state, 3087 rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
2928 isidle, maxj, false);
2929 oldrss = rss; 3088 oldrss = rss;
2930 rss = ACCESS_ONCE(full_sysidle_state); 3089 rss = ACCESS_ONCE(full_sysidle_state);
2931 } 3090 }
@@ -2952,7 +3111,7 @@ bool rcu_sys_is_idle(void)
2952 * provided by the memory allocator. 3111 * provided by the memory allocator.
2953 */ 3112 */
2954 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && 3113 if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
2955 !rcu_gp_in_progress(rcu_sysidle_state) && 3114 !rcu_gp_in_progress(rcu_state_p) &&
2956 !rsh.inuse && xchg(&rsh.inuse, 1) == 0) 3115 !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
2957 call_rcu(&rsh.rh, rcu_sysidle_cb); 3116 call_rcu(&rsh.rh, rcu_sysidle_cb);
2958 return false; 3117 return false;
@@ -3036,3 +3195,19 @@ static void rcu_bind_gp_kthread(void)
3036 housekeeping_affine(current); 3195 housekeeping_affine(current);
3037#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ 3196#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
3038} 3197}
3198
3199/* Record the current task on dyntick-idle entry. */
3200static void rcu_dynticks_task_enter(void)
3201{
3202#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3203 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
3204#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3205}
3206
3207/* Record no current task on dyntick-idle exit. */
3208static void rcu_dynticks_task_exit(void)
3209{
3210#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
3211 ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
3212#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
3213}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..3ef8ba58694e 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,8 @@
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48#include <linux/delay.h> 48#include <linux/delay.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kthread.h>
51#include <linux/tick.h>
50 52
51#define CREATE_TRACE_POINTS 53#define CREATE_TRACE_POINTS
52 54
@@ -91,7 +93,7 @@ void __rcu_read_unlock(void)
91 barrier(); /* critical section before exit code. */ 93 barrier(); /* critical section before exit code. */
92 t->rcu_read_lock_nesting = INT_MIN; 94 t->rcu_read_lock_nesting = INT_MIN;
93 barrier(); /* assign before ->rcu_read_unlock_special load */ 95 barrier(); /* assign before ->rcu_read_unlock_special load */
94 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 96 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special.s)))
95 rcu_read_unlock_special(t); 97 rcu_read_unlock_special(t);
96 barrier(); /* ->rcu_read_unlock_special load before assign */ 98 barrier(); /* ->rcu_read_unlock_special load before assign */
97 t->rcu_read_lock_nesting = 0; 99 t->rcu_read_lock_nesting = 0;
@@ -137,6 +139,38 @@ int notrace debug_lockdep_rcu_enabled(void)
137EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); 139EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
138 140
139/** 141/**
142 * rcu_read_lock_held() - might we be in RCU read-side critical section?
143 *
144 * If CONFIG_DEBUG_LOCK_ALLOC is selected, returns nonzero iff in an RCU
145 * read-side critical section. In absence of CONFIG_DEBUG_LOCK_ALLOC,
146 * this assumes we are in an RCU read-side critical section unless it can
147 * prove otherwise. This is useful for debug checks in functions that
148 * require that they be called within an RCU read-side critical section.
149 *
150 * Checks debug_lockdep_rcu_enabled() to prevent false positives during boot
151 * and while lockdep is disabled.
152 *
153 * Note that rcu_read_lock() and the matching rcu_read_unlock() must
154 * occur in the same context, for example, it is illegal to invoke
155 * rcu_read_unlock() in process context if the matching rcu_read_lock()
156 * was invoked from within an irq handler.
157 *
158 * Note that rcu_read_lock() is disallowed if the CPU is either idle or
159 * offline from an RCU perspective, so check for those as well.
160 */
161int rcu_read_lock_held(void)
162{
163 if (!debug_lockdep_rcu_enabled())
164 return 1;
165 if (!rcu_is_watching())
166 return 0;
167 if (!rcu_lockdep_current_cpu_online())
168 return 0;
169 return lock_is_held(&rcu_lock_map);
170}
171EXPORT_SYMBOL_GPL(rcu_read_lock_held);
172
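
A minimal sketch of how the newly exported rcu_read_lock_held() is typically consumed in a debug check; my_data, my_global_ptr and my_read_val() are hypothetical names, not part of this patch.

#include <linux/bug.h>
#include <linux/rcupdate.h>

struct my_data {
        int val;
};

static struct my_data __rcu *my_global_ptr;     /* hypothetical shared pointer */

/* Caller is expected to hold rcu_read_lock(); warn under lockdep if it does not. */
static int my_read_val(void)
{
        struct my_data *p;

        WARN_ON_ONCE(!rcu_read_lock_held());
        p = rcu_dereference(my_global_ptr);
        return p ? p->val : -1;
}

Callers would wrap my_read_val() in rcu_read_lock()/rcu_read_unlock(); with CONFIG_PROVE_RCU the check fires when they forget, and without lockdep it conservatively reports being inside a read-side critical section.
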
173/**
140 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? 174 * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section?
141 * 175 *
142 * Check for bottom half being disabled, which covers both the 176 * Check for bottom half being disabled, which covers both the
@@ -347,3 +381,312 @@ static int __init check_cpu_stall_init(void)
347early_initcall(check_cpu_stall_init); 381early_initcall(check_cpu_stall_init);
348 382
349#endif /* #ifdef CONFIG_RCU_STALL_COMMON */ 383#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
384
385#ifdef CONFIG_TASKS_RCU
386
387/*
388 * Simple variant of RCU whose quiescent states are voluntary context switch,
389 * user-space execution, and idle. As such, grace periods can take one good
390 * long time. There are no read-side primitives similar to rcu_read_lock()
391 * and rcu_read_unlock() because this implementation is intended to get
392 * the system into a safe state for some of the manipulations involved in
393 * tracing and the like. Finally, this implementation does not support
394 * high call_rcu_tasks() rates from multiple CPUs. If this is required,
395 * per-CPU callback lists will be needed.
396 */
397
398/* Global list of callbacks and associated lock. */
399static struct rcu_head *rcu_tasks_cbs_head;
400static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
401static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
402static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
403
404/* Track exiting tasks in order to allow them to be waited for. */
405DEFINE_SRCU(tasks_rcu_exit_srcu);
406
407/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
408static int rcu_task_stall_timeout __read_mostly = HZ * 60 * 10;
409module_param(rcu_task_stall_timeout, int, 0644);
410
411static void rcu_spawn_tasks_kthread(void);
412
413/*
414 * Post an RCU-tasks callback. First call must be from process context
415 * after the scheduler is fully operational.
416 */
417void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
418{
419 unsigned long flags;
420 bool needwake;
421
422 rhp->next = NULL;
423 rhp->func = func;
424 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
425 needwake = !rcu_tasks_cbs_head;
426 *rcu_tasks_cbs_tail = rhp;
427 rcu_tasks_cbs_tail = &rhp->next;
428 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
429 if (needwake) {
430 rcu_spawn_tasks_kthread();
431 wake_up(&rcu_tasks_cbs_wq);
432 }
433}
434EXPORT_SYMBOL_GPL(call_rcu_tasks);
435
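
As a usage sketch (not part of this patch), a tracing-style client might retire a code trampoline through call_rcu_tasks() like this; struct my_trampoline and the unhooking step are invented for illustration.

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_trampoline {
        void *stub;                     /* hypothetical generated code */
        struct rcu_head rh;             /* embedded head for call_rcu_tasks() */
};

static void my_tramp_free_cb(struct rcu_head *rhp)
{
        struct my_trampoline *tr = container_of(rhp, struct my_trampoline, rh);

        kfree(tr);      /* no task can still be preempted inside tr->stub here */
}

static void my_tramp_retire(struct my_trampoline *tr)
{
        /* ...unhook tr from all call sites first (hypothetical step)... */
        call_rcu_tasks(&tr->rh, my_tramp_free_cb);
}
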
436/**
437 * synchronize_rcu_tasks - wait until an rcu-tasks grace period has elapsed.
438 *
439 * Control will return to the caller some time after a full rcu-tasks
440 * grace period has elapsed, in other words after all currently
441 * executing rcu-tasks read-side critical sections have elapsed. These
442 * read-side critical sections are delimited by calls to schedule(),
443 * cond_resched_rcu_qs(), idle execution, userspace execution, calls
444 * to synchronize_rcu_tasks(), and (in theory, anyway) cond_resched().
445 *
446 * This is a very specialized primitive, intended only for a few uses in
447 * tracing and other situations requiring manipulation of function
448 * preambles and profiling hooks. The synchronize_rcu_tasks() function
449 * is not (yet) intended for heavy use from multiple CPUs.
450 *
451 * Note that this guarantee implies further memory-ordering guarantees.
452 * On systems with more than one CPU, when synchronize_rcu_tasks() returns,
453 * each CPU is guaranteed to have executed a full memory barrier since the
454 * end of its last RCU-tasks read-side critical section whose beginning
455 * preceded the call to synchronize_rcu_tasks(). In addition, each CPU
456 * having an RCU-tasks read-side critical section that extends beyond
457 * the return from synchronize_rcu_tasks() is guaranteed to have executed
458 * a full memory barrier after the beginning of synchronize_rcu_tasks()
459 * and before the beginning of that RCU-tasks read-side critical section.
460 * Note that these guarantees include CPUs that are offline, idle, or
461 * executing in user mode, as well as CPUs that are executing in the kernel.
462 *
463 * Furthermore, if CPU A invoked synchronize_rcu_tasks(), which returned
464 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
465 * to have executed a full memory barrier during the execution of
466 * synchronize_rcu_tasks() -- even if CPU A and CPU B are the same CPU
467 * (but again only if the system has more than one CPU).
468 */
469void synchronize_rcu_tasks(void)
470{
471 /* Complain if the scheduler has not started. */
472 rcu_lockdep_assert(!rcu_scheduler_active,
473 "synchronize_rcu_tasks called too soon");
474
475 /* Wait for the grace period. */
476 wait_rcu_gp(call_rcu_tasks);
477}
478EXPORT_SYMBOL_GPL(synchronize_rcu_tasks);
479
480/**
481 * rcu_barrier_tasks - Wait for in-flight call_rcu_tasks() callbacks.
482 *
483 * Although the current implementation is guaranteed to wait, it is not
484 * obligated to, for example, if there are no pending callbacks.
485 */
486void rcu_barrier_tasks(void)
487{
488 /* There is only one callback queue, so this is easy. ;-) */
489 synchronize_rcu_tasks();
490}
491EXPORT_SYMBOL_GPL(rcu_barrier_tasks);
492
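
A hedged sketch of how a module teardown path might combine the two primitives above; my_tracer_shutdown() and my_unhook_all_trampolines() are invented names, only the RCU-tasks calls are from this patch.

static void my_tracer_shutdown(void)
{
        my_unhook_all_trampolines();    /* hypothetical: stop publishing stubs */

        /*
         * Wait for every task that might have been preempted inside an old
         * stub to pass through a voluntary context switch, userspace, or idle.
         */
        synchronize_rcu_tasks();

        /* Also wait for previously queued call_rcu_tasks() callbacks to run. */
        rcu_barrier_tasks();
}
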
493/* See if tasks are still holding out, complain if so. */
494static void check_holdout_task(struct task_struct *t,
495 bool needreport, bool *firstreport)
496{
497 int cpu;
498
499 if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
500 t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
501 !ACCESS_ONCE(t->on_rq) ||
502 (IS_ENABLED(CONFIG_NO_HZ_FULL) &&
503 !is_idle_task(t) && t->rcu_tasks_idle_cpu >= 0)) {
504 ACCESS_ONCE(t->rcu_tasks_holdout) = false;
505 list_del_init(&t->rcu_tasks_holdout_list);
506 put_task_struct(t);
507 return;
508 }
509 if (!needreport)
510 return;
511 if (*firstreport) {
512 pr_err("INFO: rcu_tasks detected stalls on tasks:\n");
513 *firstreport = false;
514 }
515 cpu = task_cpu(t);
516 pr_alert("%p: %c%c nvcsw: %lu/%lu holdout: %d idle_cpu: %d/%d\n",
517 t, ".I"[is_idle_task(t)],
518 "N."[cpu < 0 || !tick_nohz_full_cpu(cpu)],
519 t->rcu_tasks_nvcsw, t->nvcsw, t->rcu_tasks_holdout,
520 t->rcu_tasks_idle_cpu, cpu);
521 sched_show_task(t);
522}
523
524/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
525static int __noreturn rcu_tasks_kthread(void *arg)
526{
527 unsigned long flags;
528 struct task_struct *g, *t;
529 unsigned long lastreport;
530 struct rcu_head *list;
531 struct rcu_head *next;
532 LIST_HEAD(rcu_tasks_holdouts);
533
534 /* FIXME: Add housekeeping affinity. */
535
536 /*
537 * Each pass through the following loop makes one check for
538 * newly arrived callbacks, and, if there are some, waits for
539 * one RCU-tasks grace period and then invokes the callbacks.
540 * This loop is terminated by the system going down. ;-)
541 */
542 for (;;) {
543
544 /* Pick up any new callbacks. */
545 raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
546 list = rcu_tasks_cbs_head;
547 rcu_tasks_cbs_head = NULL;
548 rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
549 raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
550
551 /* If there were none, wait a bit and start over. */
552 if (!list) {
553 wait_event_interruptible(rcu_tasks_cbs_wq,
554 rcu_tasks_cbs_head);
555 if (!rcu_tasks_cbs_head) {
556 WARN_ON(signal_pending(current));
557 schedule_timeout_interruptible(HZ/10);
558 }
559 continue;
560 }
561
562 /*
563 * Wait for all pre-existing t->on_rq and t->nvcsw
564 * transitions to complete. Invoking synchronize_sched()
565 * suffices because all these transitions occur with
566 * interrupts disabled. Without this synchronize_sched(),
567 * a read-side critical section that started before the
568 * grace period might be incorrectly seen as having started
569 * after the grace period.
570 *
571 * This synchronize_sched() also dispenses with the
572 * need for a memory barrier on the first store to
573 * ->rcu_tasks_holdout, as it forces the store to happen
574 * after the beginning of the grace period.
575 */
576 synchronize_sched();
577
578 /*
579 * There were callbacks, so we need to wait for an
580 * RCU-tasks grace period. Start off by scanning
581 * the task list for tasks that are not already
582 * voluntarily blocked. Mark these tasks and make
583 * a list of them in rcu_tasks_holdouts.
584 */
585 rcu_read_lock();
586 for_each_process_thread(g, t) {
587 if (t != current && ACCESS_ONCE(t->on_rq) &&
588 !is_idle_task(t)) {
589 get_task_struct(t);
590 t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
591 ACCESS_ONCE(t->rcu_tasks_holdout) = true;
592 list_add(&t->rcu_tasks_holdout_list,
593 &rcu_tasks_holdouts);
594 }
595 }
596 rcu_read_unlock();
597
598 /*
599 * Wait for tasks that are in the process of exiting.
600 * This does only part of the job, ensuring that all
601 * tasks that were previously exiting reach the point
602 * where they have disabled preemption, allowing the
603 * later synchronize_sched() to finish the job.
604 */
605 synchronize_srcu(&tasks_rcu_exit_srcu);
606
607 /*
608 * Each pass through the following loop scans the list
609 * of holdout tasks, removing any that are no longer
610 * holdouts. When the list is empty, we are done.
611 */
612 lastreport = jiffies;
613 while (!list_empty(&rcu_tasks_holdouts)) {
614 bool firstreport;
615 bool needreport;
616 int rtst;
617 struct task_struct *t1;
618
619 schedule_timeout_interruptible(HZ);
620 rtst = ACCESS_ONCE(rcu_task_stall_timeout);
621 needreport = rtst > 0 &&
622 time_after(jiffies, lastreport + rtst);
623 if (needreport)
624 lastreport = jiffies;
625 firstreport = true;
626 WARN_ON(signal_pending(current));
627 list_for_each_entry_safe(t, t1, &rcu_tasks_holdouts,
628 rcu_tasks_holdout_list) {
629 check_holdout_task(t, needreport, &firstreport);
630 cond_resched();
631 }
632 }
633
634 /*
635 * Because ->on_rq and ->nvcsw are not guaranteed
 636 * to have full memory barriers prior to them in the
637 * schedule() path, memory reordering on other CPUs could
638 * cause their RCU-tasks read-side critical sections to
639 * extend past the end of the grace period. However,
640 * because these ->nvcsw updates are carried out with
641 * interrupts disabled, we can use synchronize_sched()
642 * to force the needed ordering on all such CPUs.
643 *
644 * This synchronize_sched() also confines all
645 * ->rcu_tasks_holdout accesses to be within the grace
646 * period, avoiding the need for memory barriers for
647 * ->rcu_tasks_holdout accesses.
648 *
649 * In addition, this synchronize_sched() waits for exiting
650 * tasks to complete their final preempt_disable() region
651 * of execution, cleaning up after the synchronize_srcu()
652 * above.
653 */
654 synchronize_sched();
655
656 /* Invoke the callbacks. */
657 while (list) {
658 next = list->next;
659 local_bh_disable();
660 list->func(list);
661 local_bh_enable();
662 list = next;
663 cond_resched();
664 }
665 schedule_timeout_uninterruptible(HZ/10);
666 }
667}
668
669/* Spawn rcu_tasks_kthread() at first call to call_rcu_tasks(). */
670static void rcu_spawn_tasks_kthread(void)
671{
672 static DEFINE_MUTEX(rcu_tasks_kthread_mutex);
673 static struct task_struct *rcu_tasks_kthread_ptr;
674 struct task_struct *t;
675
676 if (ACCESS_ONCE(rcu_tasks_kthread_ptr)) {
677 smp_mb(); /* Ensure caller sees full kthread. */
678 return;
679 }
680 mutex_lock(&rcu_tasks_kthread_mutex);
681 if (rcu_tasks_kthread_ptr) {
682 mutex_unlock(&rcu_tasks_kthread_mutex);
683 return;
684 }
685 t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
686 BUG_ON(IS_ERR(t));
687 smp_mb(); /* Ensure others see full kthread. */
688 ACCESS_ONCE(rcu_tasks_kthread_ptr) = t;
689 mutex_unlock(&rcu_tasks_kthread_mutex);
690}
691
692#endif /* #ifdef CONFIG_TASKS_RCU */
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
104} 104}
105EXPORT_SYMBOL(unregister_reboot_notifier); 105EXPORT_SYMBOL(unregister_reboot_notifier);
106 106
107/*
108 * Notifier list for kernel code which wants to be called
109 * to restart the system.
110 */
111static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
112
113/**
114 * register_restart_handler - Register function to be called to reset
115 * the system
116 * @nb: Info about handler function to be called
117 * @nb->priority: Handler priority. Handlers should follow the
118 * following guidelines for setting priorities.
119 * 0: Restart handler of last resort,
120 * with limited restart capabilities
121 * 128: Default restart handler; use if no other
122 * restart handler is expected to be available,
123 * and/or if restart functionality is
124 * sufficient to restart the entire system
125 * 255: Highest priority restart handler, will
126 * preempt all other restart handlers
127 *
128 * Registers a function with code to be called to restart the
129 * system.
130 *
131 * Registered functions will be called from machine_restart as last
132 * step of the restart sequence (if the architecture specific
133 * machine_restart function calls do_kernel_restart - see below
134 * for details).
135 * Registered functions are expected to restart the system immediately.
136 * If more than one function is registered, the restart handler priority
137 * selects which function will be called first.
138 *
139 * Restart handlers are expected to be registered from non-architecture
140 * code, typically from drivers. A typical use case would be a system
141 * where restart functionality is provided through a watchdog. Multiple
142 * restart handlers may exist; for example, one restart handler might
143 * restart the entire system, while another only restarts the CPU.
144 * In such cases, the restart handler which only restarts part of the
145 * hardware is expected to register with low priority to ensure that
146 * it only runs if no other means to restart the system is available.
147 *
148 * Currently always returns zero, as atomic_notifier_chain_register()
149 * always returns zero.
150 */
151int register_restart_handler(struct notifier_block *nb)
152{
153 return atomic_notifier_chain_register(&restart_handler_list, nb);
154}
155EXPORT_SYMBOL(register_restart_handler);
156
157/**
158 * unregister_restart_handler - Unregister previously registered
159 * restart handler
160 * @nb: Hook to be unregistered
161 *
162 * Unregisters a previously registered restart handler function.
163 *
164 * Returns zero on success, or %-ENOENT on failure.
165 */
166int unregister_restart_handler(struct notifier_block *nb)
167{
168 return atomic_notifier_chain_unregister(&restart_handler_list, nb);
169}
170EXPORT_SYMBOL(unregister_restart_handler);
171
172/**
173 * do_kernel_restart - Execute kernel restart handler call chain
174 *
175 * Calls functions registered with register_restart_handler.
176 *
177 * Expected to be called from machine_restart as last step of the restart
178 * sequence.
179 *
180 * Restarts the system immediately if a restart handler function has been
181 * registered. Otherwise does nothing.
182 */
183void do_kernel_restart(char *cmd)
184{
185 atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
186}
187
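
To show how the new chain is meant to be consumed, here is a rough sketch of a watchdog driver registering itself at the default priority; my_wdt_force_reset() is a made-up hardware kick, while the notifier calls are the ones added above.

#include <linux/notifier.h>
#include <linux/reboot.h>

static int my_wdt_restart_handler(struct notifier_block *nb,
                                  unsigned long mode, void *cmd)
{
        my_wdt_force_reset();           /* hypothetical device register poke */
        return NOTIFY_DONE;
}

static struct notifier_block my_wdt_restart_nb = {
        .notifier_call = my_wdt_restart_handler,
        .priority = 128,                /* default: can restart the whole system */
};

/* probe():  register_restart_handler(&my_wdt_restart_nb);   */
/* remove(): unregister_restart_handler(&my_wdt_restart_nb); */
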
107void migrate_to_reboot_cpu(void) 188void migrate_to_reboot_cpu(void)
108{ 189{
109 /* The boot cpu is always logical cpu 0 */ 190 /* The boot cpu is always logical cpu 0 */
diff --git a/kernel/resource.c b/kernel/resource.c
index 3c2237ac32db..0bcebffc4e77 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -59,10 +59,12 @@ static DEFINE_RWLOCK(resource_lock);
59static struct resource *bootmem_resource_free; 59static struct resource *bootmem_resource_free;
60static DEFINE_SPINLOCK(bootmem_resource_lock); 60static DEFINE_SPINLOCK(bootmem_resource_lock);
61 61
62static void *r_next(struct seq_file *m, void *v, loff_t *pos) 62static struct resource *next_resource(struct resource *p, bool sibling_only)
63{ 63{
64 struct resource *p = v; 64 /* Caller wants to traverse through siblings only */
65 (*pos)++; 65 if (sibling_only)
66 return p->sibling;
67
66 if (p->child) 68 if (p->child)
67 return p->child; 69 return p->child;
68 while (!p->sibling && p->parent) 70 while (!p->sibling && p->parent)
@@ -70,6 +72,13 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
70 return p->sibling; 72 return p->sibling;
71} 73}
72 74
75static void *r_next(struct seq_file *m, void *v, loff_t *pos)
76{
77 struct resource *p = v;
78 (*pos)++;
79 return (void *)next_resource(p, false);
80}
81
73#ifdef CONFIG_PROC_FS 82#ifdef CONFIG_PROC_FS
74 83
75enum { MAX_IORES_LEVEL = 5 }; 84enum { MAX_IORES_LEVEL = 5 };
@@ -322,16 +331,19 @@ int release_resource(struct resource *old)
322 331
323EXPORT_SYMBOL(release_resource); 332EXPORT_SYMBOL(release_resource);
324 333
325#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
326/* 334/*
327 * Finds the lowest memory resource that exists within [res->start..res->end); 335 * Finds the lowest iomem resource that exists within [res->start..res->end);
328 * the caller must specify res->start, res->end, res->flags and "name". 336 * the caller must specify res->start, res->end, res->flags and "name".
329 * If found, returns 0, res is overwritten, if not found, returns -1. 337 * If found, returns 0, res is overwritten, if not found, returns -1.
338 * This walks through the whole tree, not just the first-level children,
339 * unless first_level_children_only is true.
330 */ 340 */
331static int find_next_system_ram(struct resource *res, char *name) 341static int find_next_iomem_res(struct resource *res, char *name,
342 bool first_level_children_only)
332{ 343{
333 resource_size_t start, end; 344 resource_size_t start, end;
334 struct resource *p; 345 struct resource *p;
346 bool sibling_only = false;
335 347
336 BUG_ON(!res); 348 BUG_ON(!res);
337 349
@@ -339,9 +351,12 @@ static int find_next_system_ram(struct resource *res, char *name)
339 end = res->end; 351 end = res->end;
340 BUG_ON(start >= end); 352 BUG_ON(start >= end);
341 353
354 if (first_level_children_only)
355 sibling_only = true;
356
342 read_lock(&resource_lock); 357 read_lock(&resource_lock);
343 for (p = iomem_resource.child; p ; p = p->sibling) { 358
344 /* system ram is just marked as IORESOURCE_MEM */ 359 for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
345 if (p->flags != res->flags) 360 if (p->flags != res->flags)
346 continue; 361 continue;
347 if (name && strcmp(p->name, name)) 362 if (name && strcmp(p->name, name))
@@ -353,6 +368,7 @@ static int find_next_system_ram(struct resource *res, char *name)
353 if ((p->end >= start) && (p->start < end)) 368 if ((p->end >= start) && (p->start < end))
354 break; 369 break;
355 } 370 }
371
356 read_unlock(&resource_lock); 372 read_unlock(&resource_lock);
357 if (!p) 373 if (!p)
358 return -1; 374 return -1;
@@ -365,6 +381,70 @@ static int find_next_system_ram(struct resource *res, char *name)
365} 381}
366 382
367/* 383/*
384 * Walks through iomem resources and calls func() with matching resource
385 * ranges. This walks through the whole tree, not just the first-level children.
386 * All the memory ranges that overlap [start, end] and also match flags and
387 * name are valid candidates.
388 *
389 * @name: name of resource
390 * @flags: resource flags
391 * @start: start addr
392 * @end: end addr
393 */
394int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
395 void *arg, int (*func)(u64, u64, void *))
396{
397 struct resource res;
398 u64 orig_end;
399 int ret = -1;
400
401 res.start = start;
402 res.end = end;
403 res.flags = flags;
404 orig_end = res.end;
405 while ((res.start < res.end) &&
406 (!find_next_iomem_res(&res, name, false))) {
407 ret = (*func)(res.start, res.end, arg);
408 if (ret)
409 break;
410 res.start = res.end + 1;
411 res.end = orig_end;
412 }
413 return ret;
414}
415
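
A small usage sketch for the new walker, assuming a caller that wants the total size of a named iomem resource (the "Crash kernel" name is only an example); my_count_bytes() and my_total_crashk_bytes() are not part of the patch.

#include <linux/ioport.h>

static int my_count_bytes(u64 start, u64 end, void *arg)
{
        u64 *total = arg;

        *total += end - start + 1;
        return 0;                       /* returning non-zero stops the walk */
}

static u64 my_total_crashk_bytes(void)
{
        u64 total = 0;

        walk_iomem_res("Crash kernel", IORESOURCE_MEM | IORESOURCE_BUSY,
                       0, (u64)-1, &total, my_count_bytes);
        return total;
}
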
416/*
417 * This function calls the callback against all "System RAM" memory ranges,
418 * i.e. those marked as both IORESOURCE_MEM and IORESOURCE_BUSY.
419 * For now, this function is only for "System RAM". It deals with
420 * full ranges rather than PFNs; if resources are not PFN-aligned, dealing
421 * with PFNs can truncate ranges.
422 */
423int walk_system_ram_res(u64 start, u64 end, void *arg,
424 int (*func)(u64, u64, void *))
425{
426 struct resource res;
427 u64 orig_end;
428 int ret = -1;
429
430 res.start = start;
431 res.end = end;
432 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
433 orig_end = res.end;
434 while ((res.start < res.end) &&
435 (!find_next_iomem_res(&res, "System RAM", true))) {
436 ret = (*func)(res.start, res.end, arg);
437 if (ret)
438 break;
439 res.start = res.end + 1;
440 res.end = orig_end;
441 }
442 return ret;
443}
444
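
And a parallel sketch for walk_system_ram_res(), simply logging every busy "System RAM" range; my_print_ram_range() is hypothetical, and the walk could sit in a debug initcall.

#include <linux/printk.h>

static int my_print_ram_range(u64 start, u64 end, void *arg)
{
        pr_info("System RAM: %#llx-%#llx\n",
                (unsigned long long)start, (unsigned long long)end);
        return 0;
}

/* e.g. from an initcall:  walk_system_ram_res(0, (u64)-1, NULL, my_print_ram_range); */
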
445#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
446
447/*
368 * This function calls the callback against all "System RAM" memory ranges 448 * This function calls the callback against all "System RAM" memory ranges
369 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY. 449 * which are marked as IORESOURCE_MEM and IORESOURCE_BUSY.
370 * Now, this function is only for "System RAM". 450 * Now, this function is only for "System RAM".
@@ -382,7 +462,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
382 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; 462 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
383 orig_end = res.end; 463 orig_end = res.end;
384 while ((res.start < res.end) && 464 while ((res.start < res.end) &&
385 (find_next_system_ram(&res, "System RAM") >= 0)) { 465 (find_next_iomem_res(&res, "System RAM", true) >= 0)) {
386 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; 466 pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
387 end_pfn = (res.end + 1) >> PAGE_SHIFT; 467 end_pfn = (res.end + 1) >> PAGE_SHIFT;
388 if (end_pfn > pfn) 468 if (end_pfn > pfn)
@@ -411,6 +491,42 @@ int __weak page_is_ram(unsigned long pfn)
411} 491}
412EXPORT_SYMBOL_GPL(page_is_ram); 492EXPORT_SYMBOL_GPL(page_is_ram);
413 493
494/*
495 * Search for a resource entry that fully contains the specified region.
496 * If found, return 1 if it is RAM, 0 if not.
497 * If not found, or region is not fully contained, return -1
498 *
499 * Used by the ioremap functions to ensure the user is not remapping RAM; it is
500 * a vast speed-up over walking through the resource table page by page.
501 */
502int region_is_ram(resource_size_t start, unsigned long size)
503{
504 struct resource *p;
505 resource_size_t end = start + size - 1;
506 int flags = IORESOURCE_MEM | IORESOURCE_BUSY;
507 const char *name = "System RAM";
508 int ret = -1;
509
510 read_lock(&resource_lock);
511 for (p = iomem_resource.child; p ; p = p->sibling) {
512 if (end < p->start)
513 continue;
514
515 if (p->start <= start && end <= p->end) {
516 /* resource fully contains region */
517 if ((p->flags != flags) || strcmp(p->name, name))
518 ret = 0;
519 else
520 ret = 1;
521 break;
522 }
523 if (p->end < start)
524 break; /* not found */
525 }
526 read_unlock(&resource_lock);
527 return ret;
528}
529
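
A sketch of the intended ioremap-side check, under the assumption that an architecture wraps its mapping primitive roughly like this; my_ioremap_checked() is invented, ioremap_nocache() is the mapping interface of this era.

#include <linux/io.h>
#include <linux/ioport.h>

static void __iomem *my_ioremap_checked(resource_size_t phys, unsigned long size)
{
        int ram = region_is_ram(phys, size);

        if (ram == 1) {
                WARN_ONCE(1, "refusing to ioremap System RAM at %pa\n", &phys);
                return NULL;
        }
        /* ram == 0 (not RAM) or -1 (no single covering resource): go ahead. */
        return ioremap_nocache(phys, size);
}
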
414void __weak arch_remove_reservations(struct resource *avail) 530void __weak arch_remove_reservations(struct resource *avail)
415{ 531{
416} 532}
@@ -1165,6 +1281,76 @@ int release_mem_region_adjustable(struct resource *parent,
1165/* 1281/*
1166 * Managed region resource 1282 * Managed region resource
1167 */ 1283 */
1284static void devm_resource_release(struct device *dev, void *ptr)
1285{
1286 struct resource **r = ptr;
1287
1288 release_resource(*r);
1289}
1290
1291/**
1292 * devm_request_resource() - request and reserve an I/O or memory resource
1293 * @dev: device for which to request the resource
1294 * @root: root of the resource tree from which to request the resource
1295 * @new: descriptor of the resource to request
1296 *
1297 * This is a device-managed version of request_resource(). There is usually
1298 * no need to release resources requested by this function explicitly since
1299 * that will be taken care of when the device is unbound from its driver.
1300 * If for some reason the resource needs to be released explicitly, because
1301 * of ordering issues for example, drivers must call devm_release_resource()
1302 * rather than the regular release_resource().
1303 *
1304 * When a conflict is detected between any existing resources and the newly
1305 * requested resource, an error message will be printed.
1306 *
1307 * Returns 0 on success or a negative error code on failure.
1308 */
1309int devm_request_resource(struct device *dev, struct resource *root,
1310 struct resource *new)
1311{
1312 struct resource *conflict, **ptr;
1313
1314 ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
1315 if (!ptr)
1316 return -ENOMEM;
1317
1318 *ptr = new;
1319
1320 conflict = request_resource_conflict(root, new);
1321 if (conflict) {
1322 dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
1323 new, conflict->name, conflict);
1324 devres_free(ptr);
1325 return -EBUSY;
1326 }
1327
1328 devres_add(dev, ptr);
1329 return 0;
1330}
1331EXPORT_SYMBOL(devm_request_resource);
1332
1333static int devm_resource_match(struct device *dev, void *res, void *data)
1334{
1335 struct resource **ptr = res;
1336
1337 return *ptr == data;
1338}
1339
1340/**
1341 * devm_release_resource() - release a previously requested resource
1342 * @dev: device for which to release the resource
1343 * @new: descriptor of the resource to release
1344 *
1345 * Releases a resource previously requested using devm_request_resource().
1346 */
1347void devm_release_resource(struct device *dev, struct resource *new)
1348{
1349 WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
1350 new));
1351}
1352EXPORT_SYMBOL(devm_release_resource);
1353
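
For the device-managed helpers, a probe-path sketch under the usual platform-driver assumptions; my_drv_probe() is hypothetical, while the devm calls are the ones added above.

#include <linux/ioport.h>
#include <linux/platform_device.h>

static int my_drv_probe(struct platform_device *pdev)
{
        struct resource *res;
        int err;

        res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
        if (!res)
                return -ENODEV;

        /* Reserved against iomem_resource; released automatically on unbind. */
        err = devm_request_resource(&pdev->dev, &iomem_resource, res);
        if (err)
                return err;

        /* ...map and use the region... */
        return 0;
}
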
1168struct region_devres { 1354struct region_devres {
1169 struct resource *parent; 1355 struct resource *parent;
1170 resource_size_t start; 1356 resource_size_t start;
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
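
The hunk above swaps the old do {} while_each_thread() pair for for_each_thread(). A hedged sketch of the pattern follows, here under RCU rather than the siglock the autogroup path already holds; my_for_all_threads() and fn() are illustrative only.

#include <linux/rcupdate.h>
#include <linux/sched.h>

static void my_for_all_threads(struct task_struct *p,
                               void (*fn)(struct task_struct *))
{
        struct task_struct *t;

        rcu_read_lock();                /* iteration needs RCU, tasklist_lock or siglock */
        for_each_thread(p, t)
                fn(t);
        rcu_read_unlock();
}
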
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 3ef6451e972e..c27e4f8f4879 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -134,7 +134,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
134 134
135static inline struct sched_clock_data *this_scd(void) 135static inline struct sched_clock_data *this_scd(void)
136{ 136{
137 return &__get_cpu_var(sched_clock_data); 137 return this_cpu_ptr(&sched_clock_data);
138} 138}
139 139
140static inline struct sched_clock_data *cpu_sdc(int cpu) 140static inline struct sched_clock_data *cpu_sdc(int cpu)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 1211575a2208..240157c13ddc 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,22 +90,6 @@
90#define CREATE_TRACE_POINTS 90#define CREATE_TRACE_POINTS
91#include <trace/events/sched.h> 91#include <trace/events/sched.h>
92 92
93#ifdef smp_mb__before_atomic
94void __smp_mb__before_atomic(void)
95{
96 smp_mb__before_atomic();
97}
98EXPORT_SYMBOL(__smp_mb__before_atomic);
99#endif
100
101#ifdef smp_mb__after_atomic
102void __smp_mb__after_atomic(void)
103{
104 smp_mb__after_atomic();
105}
106EXPORT_SYMBOL(__smp_mb__after_atomic);
107#endif
108
109void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) 93void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
110{ 94{
111 unsigned long delta; 95 unsigned long delta;
@@ -333,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
333 for (;;) { 317 for (;;) {
334 rq = task_rq(p); 318 rq = task_rq(p);
335 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
336 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
337 return rq; 321 return rq;
338 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
339 } 326 }
340} 327}
341 328
@@ -352,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
352 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
353 rq = task_rq(p); 340 rq = task_rq(p);
354 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
355 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
356 return rq; 343 return rq;
357 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
358 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
359 } 349 }
360} 350}
361 351
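
The retry loops above rely on ->on_rq becoming a small state machine rather than a boolean. The helpers are assumed to be defined in kernel/sched/sched.h elsewhere in this series, roughly as sketched here:

#define TASK_ON_RQ_QUEUED       1       /* queued on its runqueue */
#define TASK_ON_RQ_MIGRATING    2       /* in flight between runqueues */

static inline int task_on_rq_queued(struct task_struct *p)
{
        return p->on_rq == TASK_ON_RQ_QUEUED;
}

static inline int task_on_rq_migrating(struct task_struct *p)
{
        return p->on_rq == TASK_ON_RQ_MIGRATING;
}
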
@@ -449,7 +439,15 @@ static void __hrtick_start(void *arg)
449void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
450{ 440{
451 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
452 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
446 * Don't schedule slices shorter than 10000ns, that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
453 451
454 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
455 453
@@ -1043,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1043 * A queue event has occurred, and we're going to schedule. In 1041 * A queue event has occurred, and we're going to schedule. In
1044 * this case, we can save a useless back to back clock update. 1042 * this case, we can save a useless back to back clock update.
1045 */ 1043 */
1046 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1044 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1047 rq->skip_clock_update = 1; 1045 rq->skip_clock_update = 1;
1048} 1046}
1049 1047
@@ -1088,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1088 1086
1089static void __migrate_swap_task(struct task_struct *p, int cpu) 1087static void __migrate_swap_task(struct task_struct *p, int cpu)
1090{ 1088{
1091 if (p->on_rq) { 1089 if (task_on_rq_queued(p)) {
1092 struct rq *src_rq, *dst_rq; 1090 struct rq *src_rq, *dst_rq;
1093 1091
1094 src_rq = task_rq(p); 1092 src_rq = task_rq(p);
@@ -1214,7 +1212,7 @@ static int migration_cpu_stop(void *data);
1214unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1212unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1215{ 1213{
1216 unsigned long flags; 1214 unsigned long flags;
1217 int running, on_rq; 1215 int running, queued;
1218 unsigned long ncsw; 1216 unsigned long ncsw;
1219 struct rq *rq; 1217 struct rq *rq;
1220 1218
@@ -1252,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1252 rq = task_rq_lock(p, &flags); 1250 rq = task_rq_lock(p, &flags);
1253 trace_sched_wait_task(p); 1251 trace_sched_wait_task(p);
1254 running = task_running(rq, p); 1252 running = task_running(rq, p);
1255 on_rq = p->on_rq; 1253 queued = task_on_rq_queued(p);
1256 ncsw = 0; 1254 ncsw = 0;
1257 if (!match_state || p->state == match_state) 1255 if (!match_state || p->state == match_state)
1258 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1256 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1284,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1284 * running right now), it's preempted, and we should 1282 * running right now), it's preempted, and we should
1285 * yield - it could be a while. 1283 * yield - it could be a while.
1286 */ 1284 */
1287 if (unlikely(on_rq)) { 1285 if (unlikely(queued)) {
1288 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1286 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1289 1287
1290 set_current_state(TASK_UNINTERRUPTIBLE); 1288 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1478,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1478static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1479{ 1477{
1480 activate_task(rq, p, en_flags); 1478 activate_task(rq, p, en_flags);
1481 p->on_rq = 1; 1479 p->on_rq = TASK_ON_RQ_QUEUED;
1482 1480
1483 /* if a worker is waking up, notify workqueue */ 1481 /* if a worker is waking up, notify workqueue */
1484 if (p->flags & PF_WQ_WORKER) 1482 if (p->flags & PF_WQ_WORKER)
@@ -1537,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1537 int ret = 0; 1535 int ret = 0;
1538 1536
1539 rq = __task_rq_lock(p); 1537 rq = __task_rq_lock(p);
1540 if (p->on_rq) { 1538 if (task_on_rq_queued(p)) {
1541 /* check_preempt_curr() may use rq clock */ 1539 /* check_preempt_curr() may use rq clock */
1542 update_rq_clock(rq); 1540 update_rq_clock(rq);
1543 ttwu_do_wakeup(rq, p, wake_flags); 1541 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1620,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1620 } 1618 }
1621} 1619}
1622 1620
1621void wake_up_if_idle(int cpu)
1622{
1623 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags;
1625
1626 if (!is_idle_task(rq->curr))
1627 return;
1628
1629 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu);
1631 } else {
1632 raw_spin_lock_irqsave(&rq->lock, flags);
1633 if (is_idle_task(rq->curr))
1634 smp_send_reschedule(cpu);
1635 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 }
1638}
1639
1623bool cpus_share_cache(int this_cpu, int that_cpu) 1640bool cpus_share_cache(int this_cpu, int that_cpu)
1624{ 1641{
1625 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1642 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
@@ -1742,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1742 if (!(p->state & TASK_NORMAL)) 1759 if (!(p->state & TASK_NORMAL))
1743 goto out; 1760 goto out;
1744 1761
1745 if (!p->on_rq) 1762 if (!task_on_rq_queued(p))
1746 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1763 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1747 1764
1748 ttwu_do_wakeup(rq, p, 0); 1765 ttwu_do_wakeup(rq, p, 0);
@@ -1776,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1776} 1793}
1777 1794
1778/* 1795/*
1796 * This function clears the sched_dl_entity static params.
1797 */
1798void __dl_clear_params(struct task_struct *p)
1799{
1800 struct sched_dl_entity *dl_se = &p->dl;
1801
1802 dl_se->dl_runtime = 0;
1803 dl_se->dl_deadline = 0;
1804 dl_se->dl_period = 0;
1805 dl_se->flags = 0;
1806 dl_se->dl_bw = 0;
1807}
1808
1809/*
1779 * Perform scheduler related setup for a newly forked process p. 1810 * Perform scheduler related setup for a newly forked process p.
1780 * p is forked by current. 1811 * p is forked by current.
1781 * 1812 *
@@ -1799,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1799 1830
1800 RB_CLEAR_NODE(&p->dl.rb_node); 1831 RB_CLEAR_NODE(&p->dl.rb_node);
1801 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1832 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1802 p->dl.dl_runtime = p->dl.runtime = 0; 1833 __dl_clear_params(p);
1803 p->dl.dl_deadline = p->dl.deadline = 0;
1804 p->dl.dl_period = 0;
1805 p->dl.flags = 0;
1806 1834
1807 INIT_LIST_HEAD(&p->rt.run_list); 1835 INIT_LIST_HEAD(&p->rt.run_list);
1808 1836
@@ -1977,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1977#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
1978inline struct dl_bw *dl_bw_of(int i) 2006inline struct dl_bw *dl_bw_of(int i)
1979{ 2007{
2008 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2009 "sched RCU must be held");
1980 return &cpu_rq(i)->rd->dl_bw; 2010 return &cpu_rq(i)->rd->dl_bw;
1981} 2011}
1982 2012
@@ -1985,6 +2015,8 @@ static inline int dl_bw_cpus(int i)
1985 struct root_domain *rd = cpu_rq(i)->rd; 2015 struct root_domain *rd = cpu_rq(i)->rd;
1986 int cpus = 0; 2016 int cpus = 0;
1987 2017
2018 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2019 "sched RCU must be held");
1988 for_each_cpu_and(i, rd->span, cpu_active_mask) 2020 for_each_cpu_and(i, rd->span, cpu_active_mask)
1989 cpus++; 2021 cpus++;
1990 2022
@@ -2095,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p)
2095 init_task_runnable_average(p); 2127 init_task_runnable_average(p);
2096 rq = __task_rq_lock(p); 2128 rq = __task_rq_lock(p);
2097 activate_task(rq, p, 0); 2129 activate_task(rq, p, 0);
2098 p->on_rq = 1; 2130 p->on_rq = TASK_ON_RQ_QUEUED;
2099 trace_sched_wakeup_new(p, true); 2131 trace_sched_wakeup_new(p, true);
2100 check_preempt_curr(rq, p, WF_FORK); 2132 check_preempt_curr(rq, p, WF_FORK);
2101#ifdef CONFIG_SMP 2133#ifdef CONFIG_SMP
@@ -2287,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2287 */ 2319 */
2288 post_schedule(rq); 2320 post_schedule(rq);
2289 2321
2290#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2291 /* In this case, finish_task_switch does not reenable preemption */
2292 preempt_enable();
2293#endif
2294 if (current->set_child_tid) 2322 if (current->set_child_tid)
2295 put_user(task_pid_vnr(current), current->set_child_tid); 2323 put_user(task_pid_vnr(current), current->set_child_tid);
2296} 2324}
@@ -2333,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2333 * of the scheduler it's an obvious special-case), so we 2361 * of the scheduler it's an obvious special-case), so we
2334 * do an early lockdep release here: 2362 * do an early lockdep release here:
2335 */ 2363 */
2336#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2337 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2364 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2338#endif
2339 2365
2340 context_tracking_task_switch(prev, next); 2366 context_tracking_task_switch(prev, next);
2341 /* Here we just switch the register state and the stack. */ 2367 /* Here we just switch the register state and the stack. */
@@ -2366,6 +2392,18 @@ unsigned long nr_running(void)
2366 return sum; 2392 return sum;
2367} 2393}
2368 2394
2395/*
2396 * Check if only the current task is running on the cpu.
2397 */
2398bool single_task_running(void)
2399{
2400 if (cpu_rq(smp_processor_id())->nr_running == 1)
2401 return true;
2402 else
2403 return false;
2404}
2405EXPORT_SYMBOL(single_task_running);
2406
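
single_task_running() is exported, presumably for busy-polling consumers. A hedged sketch of such a caller follows; my_device_done() and the loop are invented, and the answer is only advisory since the task can be migrated right after the check.

static void my_poll_for_completion(struct my_device *dev)
{
        while (!my_device_done(dev)) {          /* hypothetical completion test */
                if (single_task_running())
                        cpu_relax();            /* nobody else wants this CPU */
                else
                        cond_resched();         /* share the CPU when contended */
        }
}
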
2369unsigned long long nr_context_switches(void) 2407unsigned long long nr_context_switches(void)
2370{ 2408{
2371 int i; 2409 int i;
@@ -2393,6 +2431,13 @@ unsigned long nr_iowait_cpu(int cpu)
2393 return atomic_read(&this->nr_iowait); 2431 return atomic_read(&this->nr_iowait);
2394} 2432}
2395 2433
2434void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2435{
2436 struct rq *this = this_rq();
2437 *nr_waiters = atomic_read(&this->nr_iowait);
2438 *load = this->cpu_load[0];
2439}
2440
2396#ifdef CONFIG_SMP 2441#ifdef CONFIG_SMP
2397 2442
2398/* 2443/*
@@ -2444,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2444 * project cycles that may never be accounted to this 2489 * project cycles that may never be accounted to this
2445 * thread, breaking clock_gettime(). 2490 * thread, breaking clock_gettime().
2446 */ 2491 */
2447 if (task_current(rq, p) && p->on_rq) { 2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2448 update_rq_clock(rq); 2493 update_rq_clock(rq);
2449 ns = rq_clock_task(rq) - p->se.exec_start; 2494 ns = rq_clock_task(rq) - p->se.exec_start;
2450 if ((s64)ns < 0) 2495 if ((s64)ns < 0)
@@ -2490,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2490 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2535 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2491 * been accounted, so we're correct here as well. 2536 * been accounted, so we're correct here as well.
2492 */ 2537 */
2493 if (!p->on_cpu || !p->on_rq) 2538 if (!p->on_cpu || !task_on_rq_queued(p))
2494 return p->se.sum_exec_runtime; 2539 return p->se.sum_exec_runtime;
2495#endif 2540#endif
2496 2541
@@ -2653,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2653 */ 2698 */
2654static inline void schedule_debug(struct task_struct *prev) 2699static inline void schedule_debug(struct task_struct *prev)
2655{ 2700{
2701#ifdef CONFIG_SCHED_STACK_END_CHECK
2702 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2703#endif
2656 /* 2704 /*
2657 * Test if we are atomic. Since do_exit() needs to call into 2705 * Test if we are atomic. Since do_exit() needs to call into
2658 * schedule() atomically, we ignore that path. Otherwise whine 2706 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2794,7 +2842,7 @@ need_resched:
2794 switch_count = &prev->nvcsw; 2842 switch_count = &prev->nvcsw;
2795 } 2843 }
2796 2844
2797 if (prev->on_rq || rq->skip_clock_update < 0) 2845 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2798 update_rq_clock(rq); 2846 update_rq_clock(rq);
2799 2847
2800 next = pick_next_task(rq, prev); 2848 next = pick_next_task(rq, prev);
@@ -2903,6 +2951,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
2903} 2951}
2904NOKPROBE_SYMBOL(preempt_schedule); 2952NOKPROBE_SYMBOL(preempt_schedule);
2905EXPORT_SYMBOL(preempt_schedule); 2953EXPORT_SYMBOL(preempt_schedule);
2954
2955#ifdef CONFIG_CONTEXT_TRACKING
2956/**
2957 * preempt_schedule_context - preempt_schedule called by tracing
2958 *
2959 * The tracing infrastructure uses preempt_enable_notrace to prevent
2960 * recursion and tracing preempt enabling caused by the tracing
2961 * infrastructure itself. But as tracing can happen in areas coming
2962 * from userspace or just about to enter userspace, a preempt enable
2963 * can occur before user_exit() is called. This will cause the scheduler
2964 * to be called when the system is still in usermode.
2965 *
2966 * To prevent this, the preempt_enable_notrace will use this function
2967 * instead of preempt_schedule() to exit user context if needed before
2968 * calling the scheduler.
2969 */
2970asmlinkage __visible void __sched notrace preempt_schedule_context(void)
2971{
2972 enum ctx_state prev_ctx;
2973
2974 if (likely(!preemptible()))
2975 return;
2976
2977 do {
2978 __preempt_count_add(PREEMPT_ACTIVE);
2979 /*
2980 * Needs preempt disabled in case user_exit() is traced
2981 * and the tracer calls preempt_enable_notrace() causing
2982 * an infinite recursion.
2983 */
2984 prev_ctx = exception_enter();
2985 __schedule();
2986 exception_exit(prev_ctx);
2987
2988 __preempt_count_sub(PREEMPT_ACTIVE);
2989 barrier();
2990 } while (need_resched());
2991}
2992EXPORT_SYMBOL_GPL(preempt_schedule_context);
2993#endif /* CONFIG_CONTEXT_TRACKING */
2994
2906#endif /* CONFIG_PREEMPT */ 2995#endif /* CONFIG_PREEMPT */
2907 2996
2908/* 2997/*
@@ -2959,7 +3048,7 @@ EXPORT_SYMBOL(default_wake_function);
2959 */ 3048 */
2960void rt_mutex_setprio(struct task_struct *p, int prio) 3049void rt_mutex_setprio(struct task_struct *p, int prio)
2961{ 3050{
2962 int oldprio, on_rq, running, enqueue_flag = 0; 3051 int oldprio, queued, running, enqueue_flag = 0;
2963 struct rq *rq; 3052 struct rq *rq;
2964 const struct sched_class *prev_class; 3053 const struct sched_class *prev_class;
2965 3054
@@ -2988,12 +3077,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2988 trace_sched_pi_setprio(p, prio); 3077 trace_sched_pi_setprio(p, prio);
2989 oldprio = p->prio; 3078 oldprio = p->prio;
2990 prev_class = p->sched_class; 3079 prev_class = p->sched_class;
2991 on_rq = p->on_rq; 3080 queued = task_on_rq_queued(p);
2992 running = task_current(rq, p); 3081 running = task_current(rq, p);
2993 if (on_rq) 3082 if (queued)
2994 dequeue_task(rq, p, 0); 3083 dequeue_task(rq, p, 0);
2995 if (running) 3084 if (running)
2996 p->sched_class->put_prev_task(rq, p); 3085 put_prev_task(rq, p);
2997 3086
2998 /* 3087 /*
2999 * Boosting condition are: 3088 * Boosting condition are:
@@ -3030,7 +3119,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3030 3119
3031 if (running) 3120 if (running)
3032 p->sched_class->set_curr_task(rq); 3121 p->sched_class->set_curr_task(rq);
3033 if (on_rq) 3122 if (queued)
3034 enqueue_task(rq, p, enqueue_flag); 3123 enqueue_task(rq, p, enqueue_flag);
3035 3124
3036 check_class_changed(rq, p, prev_class, oldprio); 3125 check_class_changed(rq, p, prev_class, oldprio);
@@ -3041,7 +3130,7 @@ out_unlock:
3041 3130
3042void set_user_nice(struct task_struct *p, long nice) 3131void set_user_nice(struct task_struct *p, long nice)
3043{ 3132{
3044 int old_prio, delta, on_rq; 3133 int old_prio, delta, queued;
3045 unsigned long flags; 3134 unsigned long flags;
3046 struct rq *rq; 3135 struct rq *rq;
3047 3136
@@ -3062,8 +3151,8 @@ void set_user_nice(struct task_struct *p, long nice)
3062 p->static_prio = NICE_TO_PRIO(nice); 3151 p->static_prio = NICE_TO_PRIO(nice);
3063 goto out_unlock; 3152 goto out_unlock;
3064 } 3153 }
3065 on_rq = p->on_rq; 3154 queued = task_on_rq_queued(p);
3066 if (on_rq) 3155 if (queued)
3067 dequeue_task(rq, p, 0); 3156 dequeue_task(rq, p, 0);
3068 3157
3069 p->static_prio = NICE_TO_PRIO(nice); 3158 p->static_prio = NICE_TO_PRIO(nice);
@@ -3072,7 +3161,7 @@ void set_user_nice(struct task_struct *p, long nice)
3072 p->prio = effective_prio(p); 3161 p->prio = effective_prio(p);
3073 delta = p->prio - old_prio; 3162 delta = p->prio - old_prio;
3074 3163
3075 if (on_rq) { 3164 if (queued) {
3076 enqueue_task(rq, p, 0); 3165 enqueue_task(rq, p, 0);
3077 /* 3166 /*
3078 * If the task increased its priority or is running and 3167 * If the task increased its priority or is running and
@@ -3344,7 +3433,7 @@ static int __sched_setscheduler(struct task_struct *p,
3344{ 3433{
3345 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3434 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3346 MAX_RT_PRIO - 1 - attr->sched_priority; 3435 MAX_RT_PRIO - 1 - attr->sched_priority;
3347 int retval, oldprio, oldpolicy = -1, on_rq, running; 3436 int retval, oldprio, oldpolicy = -1, queued, running;
3348 int policy = attr->sched_policy; 3437 int policy = attr->sched_policy;
3349 unsigned long flags; 3438 unsigned long flags;
3350 const struct sched_class *prev_class; 3439 const struct sched_class *prev_class;
@@ -3541,19 +3630,19 @@ change:
3541 return 0; 3630 return 0;
3542 } 3631 }
3543 3632
3544 on_rq = p->on_rq; 3633 queued = task_on_rq_queued(p);
3545 running = task_current(rq, p); 3634 running = task_current(rq, p);
3546 if (on_rq) 3635 if (queued)
3547 dequeue_task(rq, p, 0); 3636 dequeue_task(rq, p, 0);
3548 if (running) 3637 if (running)
3549 p->sched_class->put_prev_task(rq, p); 3638 put_prev_task(rq, p);
3550 3639
3551 prev_class = p->sched_class; 3640 prev_class = p->sched_class;
3552 __setscheduler(rq, p, attr); 3641 __setscheduler(rq, p, attr);
3553 3642
3554 if (running) 3643 if (running)
3555 p->sched_class->set_curr_task(rq); 3644 p->sched_class->set_curr_task(rq);
3556 if (on_rq) { 3645 if (queued) {
3557 /* 3646 /*
3558 * We enqueue to tail when the priority of a task is 3647 * We enqueue to tail when the priority of a task is
3559 * increased (user space view). 3648 * increased (user space view).
@@ -3977,14 +4066,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3977 rcu_read_lock(); 4066 rcu_read_lock();
3978 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4067 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3979 rcu_read_unlock(); 4068 rcu_read_unlock();
3980 goto out_unlock; 4069 goto out_free_new_mask;
3981 } 4070 }
3982 rcu_read_unlock(); 4071 rcu_read_unlock();
3983 } 4072 }
3984 4073
3985 retval = security_task_setscheduler(p); 4074 retval = security_task_setscheduler(p);
3986 if (retval) 4075 if (retval)
3987 goto out_unlock; 4076 goto out_free_new_mask;
3988 4077
3989 4078
3990 cpuset_cpus_allowed(p, cpus_allowed); 4079 cpuset_cpus_allowed(p, cpus_allowed);
@@ -3997,13 +4086,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3997 * root_domain. 4086 * root_domain.
3998 */ 4087 */
3999#ifdef CONFIG_SMP 4088#ifdef CONFIG_SMP
4000 if (task_has_dl_policy(p)) { 4089 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4001 const struct cpumask *span = task_rq(p)->rd->span; 4090 rcu_read_lock();
4002 4091 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4003 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4004 retval = -EBUSY; 4092 retval = -EBUSY;
4005 goto out_unlock; 4093 rcu_read_unlock();
4094 goto out_free_new_mask;
4006 } 4095 }
4096 rcu_read_unlock();
4007 } 4097 }
4008#endif 4098#endif
4009again: 4099again:
@@ -4021,7 +4111,7 @@ again:
4021 goto again; 4111 goto again;
4022 } 4112 }
4023 } 4113 }
4024out_unlock: 4114out_free_new_mask:
4025 free_cpumask_var(new_mask); 4115 free_cpumask_var(new_mask);
4026out_free_cpus_allowed: 4116out_free_cpus_allowed:
4027 free_cpumask_var(cpus_allowed); 4117 free_cpumask_var(cpus_allowed);
@@ -4505,7 +4595,7 @@ void show_state_filter(unsigned long state_filter)
4505 " task PC stack pid father\n"); 4595 " task PC stack pid father\n");
4506#endif 4596#endif
4507 rcu_read_lock(); 4597 rcu_read_lock();
4508 do_each_thread(g, p) { 4598 for_each_process_thread(g, p) {
4509 /* 4599 /*
4510 * reset the NMI-timeout, listing all files on a slow 4600 * reset the NMI-timeout, listing all files on a slow
4511 * console might take a lot of time: 4601 * console might take a lot of time:
@@ -4513,7 +4603,7 @@ void show_state_filter(unsigned long state_filter)
4513 touch_nmi_watchdog(); 4603 touch_nmi_watchdog();
4514 if (!state_filter || (p->state & state_filter)) 4604 if (!state_filter || (p->state & state_filter))
4515 sched_show_task(p); 4605 sched_show_task(p);
4516 } while_each_thread(g, p); 4606 }
4517 4607
4518 touch_all_softlockup_watchdogs(); 4608 touch_all_softlockup_watchdogs();
4519 4609
@@ -4568,7 +4658,7 @@ void init_idle(struct task_struct *idle, int cpu)
4568 rcu_read_unlock(); 4658 rcu_read_unlock();
4569 4659
4570 rq->curr = rq->idle = idle; 4660 rq->curr = rq->idle = idle;
4571 idle->on_rq = 1; 4661 idle->on_rq = TASK_ON_RQ_QUEUED;
4572#if defined(CONFIG_SMP) 4662#if defined(CONFIG_SMP)
4573 idle->on_cpu = 1; 4663 idle->on_cpu = 1;
4574#endif 4664#endif
@@ -4589,6 +4679,33 @@ void init_idle(struct task_struct *idle, int cpu)
4589} 4679}
4590 4680
4591#ifdef CONFIG_SMP 4681#ifdef CONFIG_SMP
4682/*
4683 * move_queued_task - move a queued task to a new rq.
4684 *
4685 * Returns (locked) new rq. Old rq's lock is released.
4686 */
4687static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4688{
4689 struct rq *rq = task_rq(p);
4690
4691 lockdep_assert_held(&rq->lock);
4692
4693 dequeue_task(rq, p, 0);
4694 p->on_rq = TASK_ON_RQ_MIGRATING;
4695 set_task_cpu(p, new_cpu);
4696 raw_spin_unlock(&rq->lock);
4697
4698 rq = cpu_rq(new_cpu);
4699
4700 raw_spin_lock(&rq->lock);
4701 BUG_ON(task_cpu(p) != new_cpu);
4702 p->on_rq = TASK_ON_RQ_QUEUED;
4703 enqueue_task(rq, p, 0);
4704 check_preempt_curr(rq, p, 0);
4705
4706 return rq;
4707}
4708
4592void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4709void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4593{ 4710{
4594 if (p->sched_class && p->sched_class->set_cpus_allowed) 4711 if (p->sched_class && p->sched_class->set_cpus_allowed)
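
The new move_queued_task() above is built around a strict lock hand-over: dequeue and mark the task TASK_ON_RQ_MIGRATING while holding the old runqueue lock, drop that lock, and only then take the destination runqueue lock for the enqueue, so both rq locks are never held together. A hedged userspace sketch of the same hand-over between two mutex-protected queues (types and helpers are hypothetical):

    #include <pthread.h>
    #include <stdio.h>

    struct item {
        int val;
        struct item *next;
    };

    struct queue {
        pthread_mutex_t lock;
        struct item *head;
    };

    /* Both helpers assume the caller holds q->lock. */
    static void push(struct queue *q, struct item *it)
    {
        it->next = q->head;
        q->head = it;
    }

    static void unlink_item(struct queue *q, struct item *it)
    {
        struct item **p = &q->head;

        while (*p && *p != it)
            p = &(*p)->next;
        if (*p)
            *p = it->next;
        it->next = NULL;
    }

    /* Move 'it' from src to dst without ever holding both locks. */
    static void move_queued_item(struct queue *src, struct queue *dst,
                                 struct item *it)
    {
        pthread_mutex_lock(&src->lock);
        unlink_item(src, it);            /* dequeue under the old lock */
        pthread_mutex_unlock(&src->lock);

        pthread_mutex_lock(&dst->lock);  /* then take the new lock */
        push(dst, it);
        pthread_mutex_unlock(&dst->lock);
    }

    int main(void)
    {
        struct queue a = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct queue b = { PTHREAD_MUTEX_INITIALIZER, NULL };
        struct item it = { 42, NULL };

        pthread_mutex_lock(&a.lock);
        push(&a, &it);
        pthread_mutex_unlock(&a.lock);

        move_queued_item(&a, &b, &it);
        printf("b.head->val = %d\n", b.head->val);   /* 42 */
        return 0;
    }
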
@@ -4645,14 +4762,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4645 goto out; 4762 goto out;
4646 4763
4647 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4764 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4648 if (p->on_rq) { 4765 if (task_running(rq, p) || p->state == TASK_WAKING) {
4649 struct migration_arg arg = { p, dest_cpu }; 4766 struct migration_arg arg = { p, dest_cpu };
4650 /* Need help from migration thread: drop lock and wait. */ 4767 /* Need help from migration thread: drop lock and wait. */
4651 task_rq_unlock(rq, p, &flags); 4768 task_rq_unlock(rq, p, &flags);
4652 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4769 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4653 tlb_migrate_finish(p->mm); 4770 tlb_migrate_finish(p->mm);
4654 return 0; 4771 return 0;
4655 } 4772 } else if (task_on_rq_queued(p))
4773 rq = move_queued_task(p, dest_cpu);
4656out: 4774out:
4657 task_rq_unlock(rq, p, &flags); 4775 task_rq_unlock(rq, p, &flags);
4658 4776
@@ -4673,20 +4791,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4673 */ 4791 */
4674static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4792static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4675{ 4793{
4676 struct rq *rq_dest, *rq_src; 4794 struct rq *rq;
4677 int ret = 0; 4795 int ret = 0;
4678 4796
4679 if (unlikely(!cpu_active(dest_cpu))) 4797 if (unlikely(!cpu_active(dest_cpu)))
4680 return ret; 4798 return ret;
4681 4799
4682 rq_src = cpu_rq(src_cpu); 4800 rq = cpu_rq(src_cpu);
4683 rq_dest = cpu_rq(dest_cpu);
4684 4801
4685 raw_spin_lock(&p->pi_lock); 4802 raw_spin_lock(&p->pi_lock);
4686 double_rq_lock(rq_src, rq_dest); 4803 raw_spin_lock(&rq->lock);
4687 /* Already moved. */ 4804 /* Already moved. */
4688 if (task_cpu(p) != src_cpu) 4805 if (task_cpu(p) != src_cpu)
4689 goto done; 4806 goto done;
4807
4690 /* Affinity changed (again). */ 4808 /* Affinity changed (again). */
4691 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4809 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4692 goto fail; 4810 goto fail;
@@ -4695,16 +4813,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4695 * If we're not on a rq, the next wake-up will ensure we're 4813 * If we're not on a rq, the next wake-up will ensure we're
4696 * placed properly. 4814 * placed properly.
4697 */ 4815 */
4698 if (p->on_rq) { 4816 if (task_on_rq_queued(p))
4699 dequeue_task(rq_src, p, 0); 4817 rq = move_queued_task(p, dest_cpu);
4700 set_task_cpu(p, dest_cpu);
4701 enqueue_task(rq_dest, p, 0);
4702 check_preempt_curr(rq_dest, p, 0);
4703 }
4704done: 4818done:
4705 ret = 1; 4819 ret = 1;
4706fail: 4820fail:
4707 double_rq_unlock(rq_src, rq_dest); 4821 raw_spin_unlock(&rq->lock);
4708 raw_spin_unlock(&p->pi_lock); 4822 raw_spin_unlock(&p->pi_lock);
4709 return ret; 4823 return ret;
4710} 4824}
@@ -4736,22 +4850,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4736{ 4850{
4737 struct rq *rq; 4851 struct rq *rq;
4738 unsigned long flags; 4852 unsigned long flags;
4739 bool on_rq, running; 4853 bool queued, running;
4740 4854
4741 rq = task_rq_lock(p, &flags); 4855 rq = task_rq_lock(p, &flags);
4742 on_rq = p->on_rq; 4856 queued = task_on_rq_queued(p);
4743 running = task_current(rq, p); 4857 running = task_current(rq, p);
4744 4858
4745 if (on_rq) 4859 if (queued)
4746 dequeue_task(rq, p, 0); 4860 dequeue_task(rq, p, 0);
4747 if (running) 4861 if (running)
4748 p->sched_class->put_prev_task(rq, p); 4862 put_prev_task(rq, p);
4749 4863
4750 p->numa_preferred_nid = nid; 4864 p->numa_preferred_nid = nid;
4751 4865
4752 if (running) 4866 if (running)
4753 p->sched_class->set_curr_task(rq); 4867 p->sched_class->set_curr_task(rq);
4754 if (on_rq) 4868 if (queued)
4755 enqueue_task(rq, p, 0); 4869 enqueue_task(rq, p, 0);
4756 task_rq_unlock(rq, p, &flags); 4870 task_rq_unlock(rq, p, &flags);
4757} 4871}
@@ -4771,6 +4885,12 @@ static int migration_cpu_stop(void *data)
4771 * be on another cpu but it doesn't matter. 4885 * be on another cpu but it doesn't matter.
4772 */ 4886 */
4773 local_irq_disable(); 4887 local_irq_disable();
4888 /*
4889 * We need to explicitly wake pending tasks before running
4890 * __migrate_task() such that we will not miss enforcing cpus_allowed
4891 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4892 */
4893 sched_ttwu_pending();
4774 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4894 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4775 local_irq_enable(); 4895 local_irq_enable();
4776 return 0; 4896 return 0;
@@ -5181,6 +5301,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5181{ 5301{
5182 unsigned long flags; 5302 unsigned long flags;
5183 long cpu = (long)hcpu; 5303 long cpu = (long)hcpu;
5304 struct dl_bw *dl_b;
5184 5305
5185 switch (action & ~CPU_TASKS_FROZEN) { 5306 switch (action & ~CPU_TASKS_FROZEN) {
5186 case CPU_DOWN_PREPARE: 5307 case CPU_DOWN_PREPARE:
@@ -5188,15 +5309,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5188 5309
5189 /* explicitly allow suspend */ 5310 /* explicitly allow suspend */
5190 if (!(action & CPU_TASKS_FROZEN)) { 5311 if (!(action & CPU_TASKS_FROZEN)) {
5191 struct dl_bw *dl_b = dl_bw_of(cpu);
5192 bool overflow; 5312 bool overflow;
5193 int cpus; 5313 int cpus;
5194 5314
5315 rcu_read_lock_sched();
5316 dl_b = dl_bw_of(cpu);
5317
5195 raw_spin_lock_irqsave(&dl_b->lock, flags); 5318 raw_spin_lock_irqsave(&dl_b->lock, flags);
5196 cpus = dl_bw_cpus(cpu); 5319 cpus = dl_bw_cpus(cpu);
5197 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5320 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5198 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5321 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5199 5322
5323 rcu_read_unlock_sched();
5324
5200 if (overflow) 5325 if (overflow)
5201 return notifier_from_errno(-EBUSY); 5326 return notifier_from_errno(-EBUSY);
5202 } 5327 }
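
The sched_cpu_inactive() hunk above widens the scope around dl_bw_of() with rcu_read_lock_sched() because the root-domain bandwidth structure it returns is RCU-managed; the admission decision itself is unchanged: refuse CPU_DOWN_PREPARE when the remaining CPUs could not carry the deadline bandwidth already granted. A simplified, stand-alone version of just that overflow arithmetic (field names and units are illustrative, not the kernel's struct dl_bw layout):

    #include <stdint.h>
    #include <stdio.h>

    /* With 'cpus' CPUs left and a per-CPU bandwidth cap 'bw', can the
     * already granted 'total_bw' still be served?  Nonzero means
     * overflow, i.e. the hot-unplug should be refused with -EBUSY. */
    static int dl_would_overflow(uint64_t bw, uint64_t total_bw, unsigned int cpus)
    {
        return (uint64_t)cpus * bw < total_bw;
    }

    int main(void)
    {
        /* cap of 950/1000 per CPU, 1800 units already admitted */
        printf("2 cpus left: %s\n",
               dl_would_overflow(950, 1800, 2) ? "-EBUSY" : "ok");
        printf("1 cpu  left: %s\n",
               dl_would_overflow(950, 1800, 1) ? "-EBUSY" : "ok");
        return 0;
    }
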
@@ -5739,7 +5864,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5739 const struct cpumask *span = sched_domain_span(sd); 5864 const struct cpumask *span = sched_domain_span(sd);
5740 struct cpumask *covered = sched_domains_tmpmask; 5865 struct cpumask *covered = sched_domains_tmpmask;
5741 struct sd_data *sdd = sd->private; 5866 struct sd_data *sdd = sd->private;
5742 struct sched_domain *child; 5867 struct sched_domain *sibling;
5743 int i; 5868 int i;
5744 5869
5745 cpumask_clear(covered); 5870 cpumask_clear(covered);
@@ -5750,10 +5875,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5750 if (cpumask_test_cpu(i, covered)) 5875 if (cpumask_test_cpu(i, covered))
5751 continue; 5876 continue;
5752 5877
5753 child = *per_cpu_ptr(sdd->sd, i); 5878 sibling = *per_cpu_ptr(sdd->sd, i);
5754 5879
5755 /* See the comment near build_group_mask(). */ 5880 /* See the comment near build_group_mask(). */
5756 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5881 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5757 continue; 5882 continue;
5758 5883
5759 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5884 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5763,10 +5888,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5763 goto fail; 5888 goto fail;
5764 5889
5765 sg_span = sched_group_cpus(sg); 5890 sg_span = sched_group_cpus(sg);
5766 if (child->child) { 5891 if (sibling->child)
5767 child = child->child; 5892 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5768 cpumask_copy(sg_span, sched_domain_span(child)); 5893 else
5769 } else
5770 cpumask_set_cpu(i, sg_span); 5894 cpumask_set_cpu(i, sg_span);
5771 5895
5772 cpumask_or(covered, covered, sg_span); 5896 cpumask_or(covered, covered, sg_span);
@@ -7117,13 +7241,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7117 .sched_policy = SCHED_NORMAL, 7241 .sched_policy = SCHED_NORMAL,
7118 }; 7242 };
7119 int old_prio = p->prio; 7243 int old_prio = p->prio;
7120 int on_rq; 7244 int queued;
7121 7245
7122 on_rq = p->on_rq; 7246 queued = task_on_rq_queued(p);
7123 if (on_rq) 7247 if (queued)
7124 dequeue_task(rq, p, 0); 7248 dequeue_task(rq, p, 0);
7125 __setscheduler(rq, p, &attr); 7249 __setscheduler(rq, p, &attr);
7126 if (on_rq) { 7250 if (queued) {
7127 enqueue_task(rq, p, 0); 7251 enqueue_task(rq, p, 0);
7128 resched_curr(rq); 7252 resched_curr(rq);
7129 } 7253 }
@@ -7137,12 +7261,12 @@ void normalize_rt_tasks(void)
7137 unsigned long flags; 7261 unsigned long flags;
7138 struct rq *rq; 7262 struct rq *rq;
7139 7263
7140 read_lock_irqsave(&tasklist_lock, flags); 7264 read_lock(&tasklist_lock);
7141 do_each_thread(g, p) { 7265 for_each_process_thread(g, p) {
7142 /* 7266 /*
7143 * Only normalize user tasks: 7267 * Only normalize user tasks:
7144 */ 7268 */
7145 if (!p->mm) 7269 if (p->flags & PF_KTHREAD)
7146 continue; 7270 continue;
7147 7271
7148 p->se.exec_start = 0; 7272 p->se.exec_start = 0;
@@ -7157,21 +7281,16 @@ void normalize_rt_tasks(void)
7157 * Renice negative nice level userspace 7281 * Renice negative nice level userspace
7158 * tasks back to 0: 7282 * tasks back to 0:
7159 */ 7283 */
7160 if (task_nice(p) < 0 && p->mm) 7284 if (task_nice(p) < 0)
7161 set_user_nice(p, 0); 7285 set_user_nice(p, 0);
7162 continue; 7286 continue;
7163 } 7287 }
7164 7288
7165 raw_spin_lock(&p->pi_lock); 7289 rq = task_rq_lock(p, &flags);
7166 rq = __task_rq_lock(p);
7167
7168 normalize_task(rq, p); 7290 normalize_task(rq, p);
7169 7291 task_rq_unlock(rq, p, &flags);
7170 __task_rq_unlock(rq); 7292 }
7171 raw_spin_unlock(&p->pi_lock); 7293 read_unlock(&tasklist_lock);
7172 } while_each_thread(g, p);
7173
7174 read_unlock_irqrestore(&tasklist_lock, flags);
7175} 7294}
7176 7295
7177#endif /* CONFIG_MAGIC_SYSRQ */ 7296#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7311,19 +7430,19 @@ void sched_offline_group(struct task_group *tg)
7311void sched_move_task(struct task_struct *tsk) 7430void sched_move_task(struct task_struct *tsk)
7312{ 7431{
7313 struct task_group *tg; 7432 struct task_group *tg;
7314 int on_rq, running; 7433 int queued, running;
7315 unsigned long flags; 7434 unsigned long flags;
7316 struct rq *rq; 7435 struct rq *rq;
7317 7436
7318 rq = task_rq_lock(tsk, &flags); 7437 rq = task_rq_lock(tsk, &flags);
7319 7438
7320 running = task_current(rq, tsk); 7439 running = task_current(rq, tsk);
7321 on_rq = tsk->on_rq; 7440 queued = task_on_rq_queued(tsk);
7322 7441
7323 if (on_rq) 7442 if (queued)
7324 dequeue_task(rq, tsk, 0); 7443 dequeue_task(rq, tsk, 0);
7325 if (unlikely(running)) 7444 if (unlikely(running))
7326 tsk->sched_class->put_prev_task(rq, tsk); 7445 put_prev_task(rq, tsk);
7327 7446
7328 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7447 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7329 lockdep_is_held(&tsk->sighand->siglock)), 7448 lockdep_is_held(&tsk->sighand->siglock)),
@@ -7333,14 +7452,14 @@ void sched_move_task(struct task_struct *tsk)
7333 7452
7334#ifdef CONFIG_FAIR_GROUP_SCHED 7453#ifdef CONFIG_FAIR_GROUP_SCHED
7335 if (tsk->sched_class->task_move_group) 7454 if (tsk->sched_class->task_move_group)
7336 tsk->sched_class->task_move_group(tsk, on_rq); 7455 tsk->sched_class->task_move_group(tsk, queued);
7337 else 7456 else
7338#endif 7457#endif
7339 set_task_rq(tsk, task_cpu(tsk)); 7458 set_task_rq(tsk, task_cpu(tsk));
7340 7459
7341 if (unlikely(running)) 7460 if (unlikely(running))
7342 tsk->sched_class->set_curr_task(rq); 7461 tsk->sched_class->set_curr_task(rq);
7343 if (on_rq) 7462 if (queued)
7344 enqueue_task(rq, tsk, 0); 7463 enqueue_task(rq, tsk, 0);
7345 7464
7346 task_rq_unlock(rq, tsk, &flags); 7465 task_rq_unlock(rq, tsk, &flags);
@@ -7358,10 +7477,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7358{ 7477{
7359 struct task_struct *g, *p; 7478 struct task_struct *g, *p;
7360 7479
7361 do_each_thread(g, p) { 7480 for_each_process_thread(g, p) {
7362 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7481 if (rt_task(p) && task_group(p) == tg)
7363 return 1; 7482 return 1;
7364 } while_each_thread(g, p); 7483 }
7365 7484
7366 return 0; 7485 return 0;
7367} 7486}
@@ -7570,6 +7689,7 @@ static int sched_dl_global_constraints(void)
7570 u64 runtime = global_rt_runtime(); 7689 u64 runtime = global_rt_runtime();
7571 u64 period = global_rt_period(); 7690 u64 period = global_rt_period();
7572 u64 new_bw = to_ratio(period, runtime); 7691 u64 new_bw = to_ratio(period, runtime);
7692 struct dl_bw *dl_b;
7573 int cpu, ret = 0; 7693 int cpu, ret = 0;
7574 unsigned long flags; 7694 unsigned long flags;
7575 7695
@@ -7583,13 +7703,16 @@ static int sched_dl_global_constraints(void)
7583 * solutions is welcome! 7703 * solutions is welcome!
7584 */ 7704 */
7585 for_each_possible_cpu(cpu) { 7705 for_each_possible_cpu(cpu) {
7586 struct dl_bw *dl_b = dl_bw_of(cpu); 7706 rcu_read_lock_sched();
7707 dl_b = dl_bw_of(cpu);
7587 7708
7588 raw_spin_lock_irqsave(&dl_b->lock, flags); 7709 raw_spin_lock_irqsave(&dl_b->lock, flags);
7589 if (new_bw < dl_b->total_bw) 7710 if (new_bw < dl_b->total_bw)
7590 ret = -EBUSY; 7711 ret = -EBUSY;
7591 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7712 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7592 7713
7714 rcu_read_unlock_sched();
7715
7593 if (ret) 7716 if (ret)
7594 break; 7717 break;
7595 } 7718 }
@@ -7600,6 +7723,7 @@ static int sched_dl_global_constraints(void)
7600static void sched_dl_do_global(void) 7723static void sched_dl_do_global(void)
7601{ 7724{
7602 u64 new_bw = -1; 7725 u64 new_bw = -1;
7726 struct dl_bw *dl_b;
7603 int cpu; 7727 int cpu;
7604 unsigned long flags; 7728 unsigned long flags;
7605 7729
@@ -7613,11 +7737,14 @@ static void sched_dl_do_global(void)
7613 * FIXME: As above... 7737 * FIXME: As above...
7614 */ 7738 */
7615 for_each_possible_cpu(cpu) { 7739 for_each_possible_cpu(cpu) {
7616 struct dl_bw *dl_b = dl_bw_of(cpu); 7740 rcu_read_lock_sched();
7741 dl_b = dl_bw_of(cpu);
7617 7742
7618 raw_spin_lock_irqsave(&dl_b->lock, flags); 7743 raw_spin_lock_irqsave(&dl_b->lock, flags);
7619 dl_b->bw = new_bw; 7744 dl_b->bw = new_bw;
7620 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7745 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7746
7747 rcu_read_unlock_sched();
7621 } 7748 }
7622} 7749}
7623 7750
@@ -7747,6 +7874,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
7747 sched_offline_group(tg); 7874 sched_offline_group(tg);
7748} 7875}
7749 7876
7877static void cpu_cgroup_fork(struct task_struct *task)
7878{
7879 sched_move_task(task);
7880}
7881
7750static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, 7882static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
7751 struct cgroup_taskset *tset) 7883 struct cgroup_taskset *tset)
7752{ 7884{
@@ -7998,7 +8130,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
7998 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8130 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
7999 8131
8000 quota = normalize_cfs_quota(tg, d); 8132 quota = normalize_cfs_quota(tg, d);
8001 parent_quota = parent_b->hierarchal_quota; 8133 parent_quota = parent_b->hierarchical_quota;
8002 8134
8003 /* 8135 /*
8004 * ensure max(child_quota) <= parent_quota, inherit when no 8136 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8009,7 +8141,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8009 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8141 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8010 return -EINVAL; 8142 return -EINVAL;
8011 } 8143 }
8012 cfs_b->hierarchal_quota = quota; 8144 cfs_b->hierarchical_quota = quota;
8013 8145
8014 return 0; 8146 return 0;
8015} 8147}
@@ -8119,6 +8251,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
8119 .css_free = cpu_cgroup_css_free, 8251 .css_free = cpu_cgroup_css_free,
8120 .css_online = cpu_cgroup_css_online, 8252 .css_online = cpu_cgroup_css_online,
8121 .css_offline = cpu_cgroup_css_offline, 8253 .css_offline = cpu_cgroup_css_offline,
8254 .fork = cpu_cgroup_fork,
8122 .can_attach = cpu_cgroup_can_attach, 8255 .can_attach = cpu_cgroup_can_attach,
8123 .attach = cpu_cgroup_attach, 8256 .attach = cpu_cgroup_attach,
8124 .exit = cpu_cgroup_exit, 8257 .exit = cpu_cgroup_exit,
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
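
The rewritten thread_group_cputime() treats sig->stats_lock as a seqlock: the first pass reads the totals locklessly under a sequence count, and only if need_seqretry() reports that a writer raced does the second pass take the lock. The sketch below shows the lockless half of that pattern with a C11 seqcount; it is deliberately simplified (single writer assumed, no fallback-to-lock pass, and a production seqlock would also need atomic or fenced accesses to the payload):

    #include <stdatomic.h>
    #include <stdio.h>

    /* Seqcount-protected totals: even seq = stable, odd = write in progress. */
    struct group_stats {
        atomic_uint seq;
        unsigned long utime;
        unsigned long stime;
    };

    static void stats_write(struct group_stats *s, unsigned long u, unsigned long st)
    {
        atomic_fetch_add(&s->seq, 1);   /* seq becomes odd */
        s->utime = u;
        s->stime = st;
        atomic_fetch_add(&s->seq, 1);   /* seq becomes even again */
    }

    static void stats_read(struct group_stats *s, unsigned long *u, unsigned long *st)
    {
        unsigned int seq;

        do {
            do {                         /* wait for a stable snapshot */
                seq = atomic_load(&s->seq);
            } while (seq & 1);

            *u = s->utime;
            *st = s->stime;
            /* retry if a writer slipped in while we were reading */
        } while (atomic_load(&s->seq) != seq);
    }

    int main(void)
    {
        struct group_stats s = { 0, 0, 0 };
        unsigned long u, st;

        stats_write(&s, 100, 7);
        stats_read(&s, &u, &st);
        printf("utime=%lu stime=%lu\n", u, st);
        return 0;
    }
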
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
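
cputime_advance() above replaces the old max()-based clamping in cputime_adjust() with a compare-and-swap loop, so concurrent updaters can only ever move prev->utime/stime forward and never publish a smaller, stale value. The same monotonic-advance idiom in portable C11, with atomic_compare_exchange_weak standing in for the kernel's cmpxchg_cputime():

    #include <stdatomic.h>
    #include <stdio.h>

    /* Advance *counter to 'new' only if that moves it forward.  Losing
     * the race is fine: someone else already stored an equal or larger
     * value, which is exactly the monotonicity being enforced. */
    static void counter_advance(atomic_ulong *counter, unsigned long new)
    {
        unsigned long old = atomic_load(counter);

        while (new > old) {
            if (atomic_compare_exchange_weak(counter, &old, new))
                break;
            /* on failure 'old' now holds the current value; re-check */
        }
    }

    int main(void)
    {
        atomic_ulong c = 10;

        counter_advance(&c, 25);   /* moves forward */
        counter_advance(&c, 5);    /* ignored: would go backwards */
        printf("%lu\n", (unsigned long)atomic_load(&c));   /* 25 */
        return 0;
    }
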
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..5285332392d5 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,21 +518,29 @@ again:
518 } 518 }
519 519
520 /* 520 /*
521 * We need to take care of a possible races here. In fact, the 521 * We need to take care of several possible races here:
522 * task might have changed its scheduling policy to something 522 *
523 * different from SCHED_DEADLINE or changed its reservation 523 * - the task might have changed its scheduling policy
524 * parameters (through sched_setattr()). 524 * to something different than SCHED_DEADLINE
525 * - the task might have changed its reservation parameters
526 * (through sched_setattr())
527 * - the task might have been boosted by someone else and
528 * might be in the boosting/deboosting path
529 *
530 * In all these cases we bail out, as the task is already
531 * in the runqueue or is going to be enqueued back anyway.
525 */ 532 */
526 if (!dl_task(p) || dl_se->dl_new) 533 if (!dl_task(p) || dl_se->dl_new ||
534 dl_se->dl_boosted || !dl_se->dl_throttled)
527 goto unlock; 535 goto unlock;
528 536
529 sched_clock_tick(); 537 sched_clock_tick();
530 update_rq_clock(rq); 538 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 539 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 540 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 541 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 542 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 543 if (dl_task(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 544 check_preempt_curr_dl(rq, p, 0);
537 else 545 else
538 resched_curr(rq); 546 resched_curr(rq);
@@ -847,8 +855,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
847 * smaller than our one... OTW we keep our runtime and 855 * smaller than our one... OTW we keep our runtime and
848 * deadline. 856 * deadline.
849 */ 857 */
850 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) 858 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
851 pi_se = &pi_task->dl; 859 pi_se = &pi_task->dl;
860 } else if (!dl_prio(p->normal_prio)) {
861 /*
862 * Special case in which we have a !SCHED_DEADLINE task
863 * that is going to be deboosted, but exceeds its
864 * runtime while doing so. No point in replenishing
865 * it, as it's going to return to its original
866 * scheduling class after this.
867 */
868 BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
869 return;
870 }
852 871
853 /* 872 /*
854 * If p is throttled, we do nothing. In fact, if it exhausted 873 * If p is throttled, we do nothing. In fact, if it exhausted
@@ -997,10 +1016,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 1016#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 1017static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 1018{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 1019 hrtick_start(rq, p->dl.runtime);
1001
1002 if (delta > 10000)
1003 hrtick_start(rq, p->dl.runtime);
1004} 1020}
1005#endif 1021#endif
1006 1022
@@ -1030,7 +1046,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1046 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1047 * re-start task selection.
1032 */ 1048 */
1033 if (rq->stop && rq->stop->on_rq) 1049 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1050 return RETRY_TASK;
1035 } 1051 }
1036 1052
@@ -1124,10 +1140,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1140static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1141{
1126 if (!task_running(rq, p) && 1142 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1143 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1144 return 1;
1130
1131 return 0; 1145 return 0;
1132} 1146}
1133 1147
@@ -1158,7 +1172,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1158static int find_later_rq(struct task_struct *task) 1172static int find_later_rq(struct task_struct *task)
1159{ 1173{
1160 struct sched_domain *sd; 1174 struct sched_domain *sd;
1161 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl); 1175 struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
1162 int this_cpu = smp_processor_id(); 1176 int this_cpu = smp_processor_id();
1163 int best_cpu, cpu = task_cpu(task); 1177 int best_cpu, cpu = task_cpu(task);
1164 1178
@@ -1169,6 +1183,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1183 if (task->nr_cpus_allowed == 1)
1170 return -1; 1184 return -1;
1171 1185
1186 /*
1187 * We have to consider system topology and task affinity
1188 * first, then we can look for a suitable cpu.
1189 */
1190 cpumask_copy(later_mask, task_rq(task)->rd->span);
1191 cpumask_and(later_mask, later_mask, cpu_active_mask);
1192 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1193 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1194 task, later_mask);
1174 if (best_cpu == -1) 1195 if (best_cpu == -1)
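
find_later_rq() now builds the candidate mask itself, root-domain span first, then active CPUs, then the task's affinity, before handing it to cpudl_find(), which (per the cpudeadline.c hunk earlier in this diff) only intersects it with the free-CPU set. The composition is plain bitwise AND; a toy version with 64-bit masks and made-up values:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t cpumask_t;   /* one bit per CPU, 64 CPUs max */

    /* Compose the candidate mask the way find_later_rq() now does:
     * root-domain span, then active CPUs, then the task's affinity. */
    static cpumask_t later_mask(cpumask_t rd_span, cpumask_t active,
                                cpumask_t allowed)
    {
        cpumask_t m = rd_span;

        m &= active;
        m &= allowed;
        return m;
    }

    int main(void)
    {
        /* hypothetical: CPUs 0-3 in the domain, CPU 3 offline,
         * task allowed on CPUs 1-2 only */
        cpumask_t m = later_mask(0xf, 0x7, 0x6);

        printf("candidates: 0x%llx\n", (unsigned long long)m);   /* 0x6 */
        return 0;
    }
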
@@ -1257,7 +1278,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1278 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1279 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1280 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1281 task_running(rq, task) ||
1282 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1283 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1284 later_rq = NULL;
1263 break; 1285 break;
@@ -1296,7 +1318,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1318 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1319 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1320
1299 BUG_ON(!p->on_rq); 1321 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1322 BUG_ON(!dl_task(p));
1301 1323
1302 return p; 1324 return p;
@@ -1443,7 +1465,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1465 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1466 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1467 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1468 WARN_ON(!task_on_rq_queued(p));
1447 1469
1448 /* 1470 /*
1449 * Then we pull iff p has actually an earlier 1471 * Then we pull iff p has actually an earlier
@@ -1569,6 +1591,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1591 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1592 hrtimer_try_to_cancel(&p->dl.dl_timer);
1571 1593
1594 __dl_clear_params(p);
1595
1572#ifdef CONFIG_SMP 1596#ifdef CONFIG_SMP
1573 /* 1597 /*
1574 * Since this might be the only -deadline task on the rq, 1598 * Since this might be the only -deadline task on the rq,
@@ -1596,14 +1620,18 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1620 if (unlikely(p->dl.dl_throttled))
1597 return; 1621 return;
1598 1622
1599 if (p->on_rq && rq->curr != p) { 1623 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1624#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1625 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1626 /* Only reschedule if pushing failed */
1603 check_resched = 0; 1627 check_resched = 0;
1604#endif /* CONFIG_SMP */ 1628#endif /* CONFIG_SMP */
1605 if (check_resched && task_has_dl_policy(rq->curr)) 1629 if (check_resched) {
1606 check_preempt_curr_dl(rq, p, 0); 1630 if (dl_task(rq->curr))
1631 check_preempt_curr_dl(rq, p, 0);
1632 else
1633 resched_curr(rq);
1634 }
1607 } 1635 }
1608} 1636}
1609 1637
@@ -1614,7 +1642,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1642static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1643 int oldprio)
1616{ 1644{
1617 if (p->on_rq || rq->curr == p) { 1645 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1646#ifdef CONFIG_SMP
1619 /* 1647 /*
1620 * This might be too much, but unfortunately 1648 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..ce33780d8f20 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -333,9 +330,7 @@ do { \
333 print_cfs_stats(m, cpu); 330 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 331 print_rt_stats(m, cpu);
335 332
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 333 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 334 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 335 SEQ_printf(m, "\n");
341} 336}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..34baa60f8a7b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -826,11 +828,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
826 828
827static unsigned int task_scan_min(struct task_struct *p) 829static unsigned int task_scan_min(struct task_struct *p)
828{ 830{
831 unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
829 unsigned int scan, floor; 832 unsigned int scan, floor;
830 unsigned int windows = 1; 833 unsigned int windows = 1;
831 834
832 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW) 835 if (scan_size < MAX_SCAN_WINDOW)
833 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size; 836 windows = MAX_SCAN_WINDOW / scan_size;
834 floor = 1000 / windows; 837 floor = 1000 / windows;
835 838
836 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p); 839 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
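
task_scan_min() now snapshots sysctl_numa_balancing_scan_size into a local through ACCESS_ONCE(), so the range check and the division are guaranteed to use the same value even if the sysctl is rewritten concurrently; before, the divisor could differ from the value that had just been checked. The same read-once discipline in C11 (the cap value and names below are illustrative):

    #include <stdatomic.h>
    #include <stdio.h>

    #define MAX_SCAN_WINDOW 2560   /* cap, in MB, standing in for the kernel constant */

    /* A tunable that another thread may rewrite at any time. */
    static atomic_uint scan_size_mb = 256;

    static unsigned int scan_windows(void)
    {
        /* Read the tunable exactly once; the check and the division
         * below are guaranteed to see the same value. */
        unsigned int scan_size = atomic_load(&scan_size_mb);
        unsigned int windows = 1;

        if (scan_size < MAX_SCAN_WINDOW)
            windows = MAX_SCAN_WINDOW / scan_size;
        return windows;
    }

    int main(void)
    {
        printf("%u\n", scan_windows());   /* 10 with the default above */
        return 0;
    }
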
@@ -1038,7 +1041,8 @@ struct numa_stats {
1038 */ 1041 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1042static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1043{
1041 int cpu, cpus = 0; 1044 int smt, cpu, cpus = 0;
1045 unsigned long capacity;
1042 1046
1043 memset(ns, 0, sizeof(*ns)); 1047 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1048 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1066,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1066 if (!cpus)
1063 return; 1067 return;
1064 1068
1065 ns->task_capacity = 1069 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1070 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1071 capacity = cpus / smt; /* cores */
1072
1073 ns->task_capacity = min_t(unsigned, capacity,
1074 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1075 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1076}
1069 1077
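
The update_numa_stats() hunk above estimates the SMT fan-out of a node as smt = ceil(SCHED_CAPACITY_SCALE * cpus / compute_capacity) and caps task_capacity at the resulting core count, so a node full of hyperthreads is no longer credited with one task slot per hardware thread. The arithmetic, worked through for a hypothetical 8-thread, 4-core node (the per-thread capacity figure is invented):

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE    1024
    #define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
    #define DIV_ROUND_CLOSEST(n, d) (((n) + (d) / 2) / (d))

    int main(void)
    {
        /* 8 logical CPUs, each SMT sibling worth ~589 capacity units,
         * so the whole node reports compute_capacity of about 4712 */
        unsigned long cpus = 8;
        unsigned long compute_capacity = 8 * 589;

        unsigned long smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus,
                                         compute_capacity);
        unsigned long cores = cpus / smt;
        unsigned long by_capacity = DIV_ROUND_CLOSEST(compute_capacity,
                                                      SCHED_CAPACITY_SCALE);
        unsigned long task_capacity = cores < by_capacity ? cores : by_capacity;

        /* prints: smt=2 cores=4 task_capacity=4 */
        printf("smt=%lu cores=%lu task_capacity=%lu\n",
               smt, cores, task_capacity);
        return 0;
    }
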
@@ -1157,9 +1165,19 @@ static void task_numa_compare(struct task_numa_env *env,
1157 long moveimp = imp; 1165 long moveimp = imp;
1158 1166
1159 rcu_read_lock(); 1167 rcu_read_lock();
1160 cur = ACCESS_ONCE(dst_rq->curr); 1168
1161 if (cur->pid == 0) /* idle */ 1169 raw_spin_lock_irq(&dst_rq->lock);
1170 cur = dst_rq->curr;
1171 /*
1172 * No need to move the exiting task, and this ensures that ->curr
1173 * wasn't reaped and thus get_task_struct() in task_numa_assign()
1174 * is safe under RCU read lock.
1175 * Note that rcu_read_lock() itself can't protect from the final
1176 * put_task_struct() after the last schedule().
1177 */
1178 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1162 cur = NULL; 1179 cur = NULL;
1180 raw_spin_unlock_irq(&dst_rq->lock);
1163 1181
1164 /* 1182 /*
1165 * "imp" is the fault differential for the source task between the 1183 * "imp" is the fault differential for the source task between the
@@ -1206,7 +1224,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1224
1207 if (!cur) { 1225 if (!cur) {
1208 /* Is there capacity at our destination? */ 1226 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1227 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1228 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1229 goto unlock;
1212 1230
@@ -1252,6 +1270,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1270 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1271 goto unlock;
1254 1272
1273 /*
1274 * One idle CPU per node is evaluated for a task numa move.
1275 * Call select_idle_sibling to maybe find a better one.
1276 */
1277 if (!cur)
1278 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1279
1255assign: 1280assign:
1256 task_numa_assign(env, cur, imp); 1281 task_numa_assign(env, cur, imp);
1257unlock: 1282unlock:
@@ -1506,7 +1531,7 @@ static void update_task_scan_period(struct task_struct *p,
1506 * scanning faster if shared accesses dominate as it may 1531 * scanning faster if shared accesses dominate as it may
1507 * simply bounce migrations uselessly 1532 * simply bounce migrations uselessly
1508 */ 1533 */
1509 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1534 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1510 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1535 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1511 } 1536 }
1512 1537
@@ -1775,7 +1800,7 @@ void task_numa_free(struct task_struct *p)
1775 list_del(&p->numa_entry); 1800 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 1801 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 1802 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 1803 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 1804 put_numa_group(grp);
1780 } 1805 }
1781 1806
@@ -1804,10 +1829,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 1829 if (!p->mm)
1805 return; 1830 return;
1806 1831
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 1832 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 1833 if (unlikely(!p->numa_faults_memory)) {
1813 int size = sizeof(*p->numa_faults_memory) * 1834 int size = sizeof(*p->numa_faults_memory) *
@@ -1946,7 +1967,7 @@ void task_numa_work(struct callback_head *work)
1946 vma = mm->mmap; 1967 vma = mm->mmap;
1947 } 1968 }
1948 for (; vma; vma = vma->vm_next) { 1969 for (; vma; vma = vma->vm_next) {
1949 if (!vma_migratable(vma) || !vma_policy_mof(p, vma)) 1970 if (!vma_migratable(vma) || !vma_policy_mof(vma))
1950 continue; 1971 continue;
1951 1972
1952 /* 1973 /*
@@ -2211,8 +2232,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2232
2212 /* 2233 /*
2213 * As y^PERIOD = 1/2, we can combine 2234 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2235 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2236 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2237 *
2217 * To achieve constant time decay_load. 2238 * To achieve constant time decay_load.
2218 */ 2239 */
@@ -2377,6 +2398,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2398 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2399 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2400
2401 if (!tg_contrib)
2402 return;
2403
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2404 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2405 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2406 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3916,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 3916 resched_curr(rq);
3893 return; 3917 return;
3894 } 3918 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 3919 hrtick_start(rq, delta);
3904 } 3920 }
3905} 3921}
@@ -4087,7 +4103,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4103static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4104{
4089 struct rq *rq = cpu_rq(cpu); 4105 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4106 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4107 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4108
4093 if (nr_running) 4109 if (nr_running)
@@ -4276,8 +4292,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4292static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4293{
4278 s64 this_load, load; 4294 s64 this_load, load;
4295 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4296 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4297 struct task_group *tg;
4282 unsigned long weight; 4298 unsigned long weight;
4283 int balanced; 4299 int balanced;
@@ -4320,47 +4336,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4336 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4337 * task to be woken on this_cpu.
4322 */ 4338 */
4323 if (this_load > 0) { 4339 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4340 this_eff_load *= capacity_of(prev_cpu);
4341
4342 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4343 prev_eff_load *= capacity_of(this_cpu);
4325 4344
4326 this_eff_load = 100; 4345 if (this_load > 0) {
4327 this_eff_load *= capacity_of(prev_cpu);
4328 this_eff_load *= this_load + 4346 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4347 effective_load(tg, this_cpu, weight, weight);
4330 4348
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4349 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4350 }
4334 4351
4335 balanced = this_eff_load <= prev_eff_load; 4352 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4353
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4354 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4355
4350 if (balanced || 4356 if (!balanced)
4351 (this_load <= load && 4357 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4358
4361 return 1; 4359 schedstat_inc(sd, ttwu_move_affine);
4362 } 4360 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4361
4362 return 1;
4364} 4363}
4365 4364
4366/* 4365/*
@@ -4428,20 +4427,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4427find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4428{
4430 unsigned long load, min_load = ULONG_MAX; 4429 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4430 unsigned int min_exit_latency = UINT_MAX;
4431 u64 latest_idle_timestamp = 0;
4432 int least_loaded_cpu = this_cpu;
4433 int shallowest_idle_cpu = -1;
4432 int i; 4434 int i;
4433 4435
4434 /* Traverse only the allowed CPUs */ 4436 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4437 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4438 if (idle_cpu(i)) {
4437 4439 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4440 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4441 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4442 /*
4443 * We give priority to a CPU whose idle state
4444 * has the smallest exit latency irrespective
4445 * of any idle timestamp.
4446 */
4447 min_exit_latency = idle->exit_latency;
4448 latest_idle_timestamp = rq->idle_stamp;
4449 shallowest_idle_cpu = i;
4450 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4451 rq->idle_stamp > latest_idle_timestamp) {
4452 /*
4453 * If equal or no active idle state, then
4454 * the most recently idled CPU might have
4455 * a warmer cache.
4456 */
4457 latest_idle_timestamp = rq->idle_stamp;
4458 shallowest_idle_cpu = i;
4459 }
4460 } else {
4461 load = weighted_cpuload(i);
4462 if (load < min_load || (load == min_load && i == this_cpu)) {
4463 min_load = load;
4464 least_loaded_cpu = i;
4465 }
4441 } 4466 }
4442 } 4467 }
4443 4468
4444 return idlest; 4469 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4470}
4446 4471
4447/* 4472/*
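
The reworked find_idlest_cpu() prefers, among idle CPUs, the one whose idle state has the smallest exit latency, breaks ties in favour of the most recently idled CPU (the warmer cache), and only falls back to the least-loaded CPU when nothing is idle. A condensed userspace rendering of that selection loop over an array of fake per-CPU records; the struct and the sample numbers are invented, and the kernel's extra case of an idle CPU with no cpuidle state is folded into the tie-break here:

    #include <limits.h>
    #include <stdint.h>
    #include <stdio.h>

    struct cpu_info {
        int idle;                    /* is the CPU idle right now? */
        unsigned int exit_latency;   /* of its current idle state */
        uint64_t idle_stamp;         /* when it went idle */
        unsigned long load;          /* weighted load if busy */
    };

    static int pick_idlest_cpu(const struct cpu_info *ci, int nr, int this_cpu)
    {
        unsigned int min_exit_latency = UINT_MAX;
        uint64_t latest_idle_timestamp = 0;
        unsigned long min_load = ULONG_MAX;
        int least_loaded_cpu = this_cpu;
        int shallowest_idle_cpu = -1;

        for (int i = 0; i < nr; i++) {
            if (ci[i].idle) {
                if (ci[i].exit_latency < min_exit_latency) {
                    /* a shallower idle state wins outright */
                    min_exit_latency = ci[i].exit_latency;
                    latest_idle_timestamp = ci[i].idle_stamp;
                    shallowest_idle_cpu = i;
                } else if (ci[i].exit_latency == min_exit_latency &&
                           ci[i].idle_stamp > latest_idle_timestamp) {
                    /* equally shallow: prefer the warmer cache */
                    latest_idle_timestamp = ci[i].idle_stamp;
                    shallowest_idle_cpu = i;
                }
            } else if (ci[i].load < min_load ||
                       (ci[i].load == min_load && i == this_cpu)) {
                min_load = ci[i].load;
                least_loaded_cpu = i;
            }
        }

        return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
    }

    int main(void)
    {
        struct cpu_info ci[] = {
            { 0,  0,  0, 300 },   /* busy, load 300 */
            { 1, 50, 10,   0 },   /* idle, deep C-state */
            { 1, 10,  5,   0 },   /* idle, shallow C-state: expected pick */
        };

        printf("picked cpu %d\n", pick_idlest_cpu(ci, 3, 0));   /* 2 */
        return 0;
    }
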
@@ -4513,11 +4538,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4513 if (p->nr_cpus_allowed == 1) 4538 if (p->nr_cpus_allowed == 1)
4514 return prev_cpu; 4539 return prev_cpu;
4515 4540
4516 if (sd_flag & SD_BALANCE_WAKE) { 4541 if (sd_flag & SD_BALANCE_WAKE)
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 4542 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4543
4522 rcu_read_lock(); 4544 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4545 for_each_domain(cpu, tmp) {
@@ -4704,7 +4726,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4726 return;
4705 4727
4706 /* 4728 /*
4707 * This is possible from callers such as move_task(), in which we 4729 * This is possible from callers such as attach_tasks(), in which we
4708 * unconditionally check_prempt_curr() after an enqueue (which may have 4730 * unconditionally check_prempt_curr() after an enqueue (which may have
4709 * lead to a throttle). This both saves work and prevents false 4731 * lead to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4732 * next-buddy nomination below.
@@ -5112,27 +5134,18 @@ struct lb_env {
5112 unsigned int loop_max; 5134 unsigned int loop_max;
5113 5135
5114 enum fbq_type fbq_type; 5136 enum fbq_type fbq_type;
5137 struct list_head tasks;
5115}; 5138};
5116 5139
5117/* 5140/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5141 * Is this task likely cache-hot:
5131 */ 5142 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5143static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5144{
5134 s64 delta; 5145 s64 delta;
5135 5146
5147 lockdep_assert_held(&env->src_rq->lock);
5148
5136 if (p->sched_class != &fair_sched_class) 5149 if (p->sched_class != &fair_sched_class)
5137 return 0; 5150 return 0;
5138 5151
@@ -5252,6 +5265,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5265int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5266{
5254 int tsk_cache_hot = 0; 5267 int tsk_cache_hot = 0;
5268
5269 lockdep_assert_held(&env->src_rq->lock);
5270
5255 /* 5271 /*
5256 * We do not migrate tasks that are: 5272 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5273 * 1) throttled_lb_pair, or
@@ -5310,24 +5326,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5326 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5327 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5328
5313 if (migrate_improves_locality(p, env)) { 5329 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5330 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) { 5331 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5332 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations); 5333 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 } 5334 }
5330
5331 return 1; 5335 return 1;
5332 } 5336 }
5333 5337
@@ -5336,47 +5340,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5340}
5337 5341
5338/* 5342/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5343 * detach_task() -- detach the task for the migration specified in env
5344 */
5345static void detach_task(struct task_struct *p, struct lb_env *env)
5346{
5347 lockdep_assert_held(&env->src_rq->lock);
5348
5349 deactivate_task(env->src_rq, p, 0);
5350 p->on_rq = TASK_ON_RQ_MIGRATING;
5351 set_task_cpu(p, env->dst_cpu);
5352}
5353
5354/*
5355 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5356 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5357 *
5343 * Called with both runqueues locked. 5358 * Returns a task if successful and NULL otherwise.
5344 */ 5359 */
5345static int move_one_task(struct lb_env *env) 5360static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5361{
5347 struct task_struct *p, *n; 5362 struct task_struct *p, *n;
5348 5363
5364 lockdep_assert_held(&env->src_rq->lock);
5365
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5366 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5367 if (!can_migrate_task(p, env))
5351 continue; 5368 continue;
5352 5369
5353 move_task(p, env); 5370 detach_task(p, env);
5371
5354 /* 5372 /*
5355 * Right now, this is only the second place move_task() 5373 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5374 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5375 * so we can safely collect stats here rather than
5376 * inside detach_tasks().
5358 */ 5377 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5378 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5379 return p;
5361 } 5380 }
5362 return 0; 5381 return NULL;
5363} 5382}
5364 5383
5365static const unsigned int sched_nr_migrate_break = 32; 5384static const unsigned int sched_nr_migrate_break = 32;
5366 5385
5367/* 5386/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5387 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5388 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5389 *
5372 * Called with both runqueues locked. 5390 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5391 */
5374static int move_tasks(struct lb_env *env) 5392static int detach_tasks(struct lb_env *env)
5375{ 5393{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5394 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5395 struct task_struct *p;
5378 unsigned long load; 5396 unsigned long load;
5379 int pulled = 0; 5397 int detached = 0;
5398
5399 lockdep_assert_held(&env->src_rq->lock);
5380 5400
5381 if (env->imbalance <= 0) 5401 if (env->imbalance <= 0)
5382 return 0; 5402 return 0;
@@ -5407,14 +5427,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5427 if ((load / 2) > env->imbalance)
5408 goto next; 5428 goto next;
5409 5429
5410 move_task(p, env); 5430 detach_task(p, env);
5411 pulled++; 5431 list_add(&p->se.group_node, &env->tasks);
5432
5433 detached++;
5412 env->imbalance -= load; 5434 env->imbalance -= load;
5413 5435
5414#ifdef CONFIG_PREEMPT 5436#ifdef CONFIG_PREEMPT
5415 /* 5437 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5438 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5439 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5440 * the critical section.
5419 */ 5441 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5442 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5456,58 @@ next:
5434 } 5456 }
5435 5457
5436 /* 5458 /*
5437 * Right now, this is one of only two places move_task() is called, 5459 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5460 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5461 * than inside detach_one_task().
5440 */ 5462 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5463 schedstat_add(env->sd, lb_gained[env->idle], detached);
5464
5465 return detached;
5466}
5467
5468/*
5469 * attach_task() -- attach the task detached by detach_task() to its new rq.
5470 */
5471static void attach_task(struct rq *rq, struct task_struct *p)
5472{
5473 lockdep_assert_held(&rq->lock);
5474
5475 BUG_ON(task_rq(p) != rq);
5476 p->on_rq = TASK_ON_RQ_QUEUED;
5477 activate_task(rq, p, 0);
5478 check_preempt_curr(rq, p, 0);
5479}
5480
5481/*
5482 * attach_one_task() -- attaches the task returned from detach_one_task() to
5483 * its new rq.
5484 */
5485static void attach_one_task(struct rq *rq, struct task_struct *p)
5486{
5487 raw_spin_lock(&rq->lock);
5488 attach_task(rq, p);
5489 raw_spin_unlock(&rq->lock);
5490}
5491
5492/*
5493 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5494 * new rq.
5495 */
5496static void attach_tasks(struct lb_env *env)
5497{
5498 struct list_head *tasks = &env->tasks;
5499 struct task_struct *p;
5500
5501 raw_spin_lock(&env->dst_rq->lock);
5502
5503 while (!list_empty(tasks)) {
5504 p = list_first_entry(tasks, struct task_struct, se.group_node);
5505 list_del_init(&p->se.group_node);
5506
5507 attach_task(env->dst_rq, p);
5508 }
5442 5509
5443 return pulled; 5510 raw_spin_unlock(&env->dst_rq->lock);
5444} 5511}
5445 5512
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5513#ifdef CONFIG_FAIR_GROUP_SCHED
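
For orientation, move_tasks() used to run with both runqueue locks held; the detach_tasks()/attach_tasks() pair introduced above instead unlinks tasks from the busiest runqueue while only that runqueue's lock is held, parks them on env->tasks, and re-queues them on the destination under its own lock. A minimal user-space sketch of that two-phase hand-off follows; struct rq, detach_all() and attach_all() are invented stand-ins for illustration, not the kernel's types or helpers.

#include <pthread.h>
#include <stdio.h>

struct task { int id; struct task *next; };

struct rq {
        pthread_mutex_t lock;
        struct task *head;
};

/* Phase 1: unlink everything from src while holding only src->lock. */
static struct task *detach_all(struct rq *src)
{
        struct task *detached;

        pthread_mutex_lock(&src->lock);
        detached = src->head;
        src->head = NULL;
        pthread_mutex_unlock(&src->lock);

        return detached;
}

/* Phase 2: requeue the private list while holding only dst->lock. */
static void attach_all(struct rq *dst, struct task *list)
{
        pthread_mutex_lock(&dst->lock);
        while (list) {
                struct task *next = list->next;

                list->next = dst->head;
                dst->head = list;
                printf("attached task %d\n", list->id);
                list = next;
        }
        pthread_mutex_unlock(&dst->lock);
}

int main(void)
{
        struct task t1 = { 1, NULL }, t2 = { 2, &t1 };
        struct rq src = { PTHREAD_MUTEX_INITIALIZER, &t2 };
        struct rq dst = { PTHREAD_MUTEX_INITIALIZER, NULL };

        attach_all(&dst, detach_all(&src));
        return 0;
}

The kernel version plays the same locking shape with rq->lock and the cfs_tasks/env->tasks lists; the toy mutexes and singly linked list above only mirror that structure.
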
@@ -5559,6 +5626,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5626#endif
5560 5627
5561/********** Helpers for find_busiest_group ************************/ 5628/********** Helpers for find_busiest_group ************************/
5629
5630enum group_type {
5631 group_other = 0,
5632 group_imbalanced,
5633 group_overloaded,
5634};
5635
5562/* 5636/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5637 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5638 */
@@ -5572,7 +5646,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5646 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5647 unsigned int idle_cpus;
5574 unsigned int group_weight; 5648 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5649 enum group_type group_type;
5576 int group_has_free_capacity; 5650 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5651#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5652 unsigned int nr_numa_running;
@@ -5610,6 +5684,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5684 .total_capacity = 0UL,
5611 .busiest_stat = { 5685 .busiest_stat = {
5612 .avg_load = 0UL, 5686 .avg_load = 0UL,
5687 .sum_nr_running = 0,
5688 .group_type = group_other,
5613 }, 5689 },
5614 }; 5690 };
5615} 5691}
@@ -5652,19 +5728,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5728 return default_scale_capacity(sd, cpu);
5653} 5729}
5654 5730
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5731static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5732{
5657 unsigned long weight = sd->span_weight; 5733 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5734 return sd->smt_gain / sd->span_weight;
5659
5660 smt_gain /= weight;
5661 5735
5662 return smt_gain; 5736 return SCHED_CAPACITY_SCALE;
5663} 5737}
5664 5738
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5739unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5740{
5667 return default_scale_smt_capacity(sd, cpu); 5741 return default_scale_cpu_capacity(sd, cpu);
5668} 5742}
5669 5743
5670static unsigned long scale_rt_capacity(int cpu) 5744static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5777,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5777
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5778static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5779{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5780 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5781 struct sched_group *sdg = sd->groups;
5709 5782
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5783 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5784 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5785 else
5713 else 5786 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5787
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5788 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5789
5719 sdg->sgc->capacity_orig = capacity; 5790 sdg->sgc->capacity_orig = capacity;
5720 5791
@@ -5891,6 +5962,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 5962 return capacity_factor;
5892} 5963}
5893 5964
5965static enum group_type
5966group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
5967{
5968 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5969 return group_overloaded;
5970
5971 if (sg_imbalanced(group))
5972 return group_imbalanced;
5973
5974 return group_other;
5975}
5976
5894/** 5977/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5978 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 5979 * @env: The load balancing environment.
@@ -5920,7 +6003,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 6003 load = source_load(i, load_idx);
5921 6004
5922 sgs->group_load += load; 6005 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 6006 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 6007
5925 if (rq->nr_running > 1) 6008 if (rq->nr_running > 1)
5926 *overload = true; 6009 *overload = true;
@@ -5942,9 +6025,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6025 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6026
5944 sgs->group_weight = group->group_weight; 6027 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6028 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6029 sgs->group_type = group_classify(group, sgs);
5948 6030
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6031 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6032 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6050,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6050 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6051 struct sg_lb_stats *sgs)
5970{ 6052{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6053 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6054
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6055 if (sgs->group_type > busiest->group_type)
5975 return true; 6056 return true;
5976 6057
5977 if (sgs->group_imb) 6058 if (sgs->group_type < busiest->group_type)
6059 return false;
6060
6061 if (sgs->avg_load <= busiest->avg_load)
6062 return false;
6063
6064 /* This is the busiest node in its class. */
6065 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6066 return true;
5979 6067
5980 /* 6068 /*
@@ -5982,8 +6070,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6070 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6071 * higher than ourself as busy.
5984 */ 6072 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6073 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6074 if (!sds->busiest)
5988 return true; 6075 return true;
5989 6076
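
The rework above drops the boolean group_imb flag in favour of the ordered enum group_type, so update_sd_pick_busiest() can rank groups by how badly off they are and fall back to avg_load only to break ties within a class. A stand-alone sketch of that comparison, with an invented struct rather than the kernel's sg_lb_stats:

#include <stdbool.h>
#include <stdio.h>

/* Ordered from least to most troubled, mirroring the enum in the hunk. */
enum group_type { group_other = 0, group_imbalanced, group_overloaded };

struct sg_stats {
        enum group_type group_type;
        unsigned long avg_load;
};

/* Higher class always wins; avg_load only breaks ties within a class. */
static bool pick_busiest(const struct sg_stats *busiest,
                         const struct sg_stats *candidate)
{
        if (candidate->group_type > busiest->group_type)
                return true;
        if (candidate->group_type < busiest->group_type)
                return false;

        return candidate->avg_load > busiest->avg_load;
}

int main(void)
{
        struct sg_stats busiest = { group_other, 900 };
        struct sg_stats candidate = { group_overloaded, 100 };

        /* An overloaded group is picked even though its avg_load is lower. */
        printf("replace busiest: %s\n",
               pick_busiest(&busiest, &candidate) ? "yes" : "no");
        return 0;
}
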
@@ -6228,7 +6315,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6315 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6316 busiest = &sds->busiest_stat;
6230 6317
6231 if (busiest->group_imb) { 6318 if (busiest->group_type == group_imbalanced) {
6232 /* 6319 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6320 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6321 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6335,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6335 return fix_small_imbalance(env, sds);
6249 } 6336 }
6250 6337
6251 if (!busiest->group_imb) { 6338 /*
6252 /* 6339 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6340 */
6254 * Except of course for the group_imb case, since then we might 6341 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6342 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6343 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6344 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6345
@@ -6337,7 +6423,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6423 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6424 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6425 */
6340 if (busiest->group_imb) 6426 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6427 goto force_balance;
6342 6428
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6429 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6432 goto force_balance;
6347 6433
6348 /* 6434 /*
6349 * If the local group is more busy than the selected busiest group 6435 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6436 * don't try and pull any tasks.
6351 */ 6437 */
6352 if (local->avg_load >= busiest->avg_load) 6438 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6447,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6447
6362 if (env->idle == CPU_IDLE) { 6448 if (env->idle == CPU_IDLE) {
6363 /* 6449 /*
6364 * This cpu is idle. If the busiest group load doesn't 6450 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6451 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6452 * wrt idle cpus, it is balanced. The imbalance becomes
 6367 * wrt to idle cpu's, it is balanced. 6453 * significant if the diff is greater than 1; otherwise we
 6454 * might end up just moving the imbalance to another group.
6368 */ 6455 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6456 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6457 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6458 goto out_balanced;
6372 } else { 6459 } else {
6373 /* 6460 /*
@@ -6539,7 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6539 struct sched_group *group; 6626 struct sched_group *group;
6540 struct rq *busiest; 6627 struct rq *busiest;
6541 unsigned long flags; 6628 unsigned long flags;
6542 struct cpumask *cpus = __get_cpu_var(load_balance_mask); 6629 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
6543 6630
6544 struct lb_env env = { 6631 struct lb_env env = {
6545 .sd = sd, 6632 .sd = sd,
@@ -6550,6 +6637,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6637 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6638 .cpus = cpus,
6552 .fbq_type = all, 6639 .fbq_type = all,
6640 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6641 };
6554 6642
6555 /* 6643 /*
@@ -6599,23 +6687,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6687 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6688
6601more_balance: 6689more_balance:
6602 local_irq_save(flags); 6690 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6691
6605 /* 6692 /*
6606 * cur_ld_moved - load moved in current iteration 6693 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6694 * ld_moved - cumulative load moved across iterations
6608 */ 6695 */
6609 cur_ld_moved = move_tasks(&env); 6696 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6697
6614 /* 6698 /*
6615 * some other cpu did the load balance for us. 6699 * We've detached some tasks from busiest_rq. Every
6700 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
6701 * unlock busiest->lock, and we are able to be sure
6702 * that nobody can manipulate the tasks in parallel.
6703 * See task_rq_lock() family for the details.
6616 */ 6704 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6705
6618 resched_cpu(env.dst_cpu); 6706 raw_spin_unlock(&busiest->lock);
6707
6708 if (cur_ld_moved) {
6709 attach_tasks(&env);
6710 ld_moved += cur_ld_moved;
6711 }
6712
6713 local_irq_restore(flags);
6619 6714
6620 if (env.flags & LBF_NEED_BREAK) { 6715 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6716 env.flags &= ~LBF_NEED_BREAK;
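
The comment added in this hunk carries the main argument: once detach_tasks() has moved tasks onto env.tasks they are marked TASK_ON_RQ_MIGRATING, so busiest->lock can be dropped and nothing else will manipulate those tasks until attach_tasks() queues them again. A toy model of that in-flight marker, plain C rather than kernel code, with can_touch() standing in for the checks done under task_rq_lock():

#include <stdio.h>

enum on_rq_state { NOT_QUEUED = 0, ON_RQ_QUEUED = 1, ON_RQ_MIGRATING = 2 };

struct task { int id; enum on_rq_state on_rq; };

/* Concurrent users only act on tasks they can see as QUEUED. */
static int can_touch(const struct task *p)
{
        return p->on_rq == ON_RQ_QUEUED;
}

int main(void)
{
        struct task p = { 42, ON_RQ_QUEUED };

        p.on_rq = ON_RQ_MIGRATING;      /* detach: leaves the source rq */
        printf("touchable while migrating: %d\n", can_touch(&p));

        p.on_rq = ON_RQ_QUEUED;         /* attach: queued on the destination */
        printf("touchable after attach: %d\n", can_touch(&p));
        return 0;
}
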
@@ -6665,10 +6760,8 @@ more_balance:
6665 if (sd_parent) { 6760 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6761 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6762
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6763 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6764 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6765 }
6673 6766
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6767 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6772,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6772 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6773 goto redo;
6681 } 6774 }
6682 goto out_balanced; 6775 goto out_all_pinned;
6683 } 6776 }
6684 } 6777 }
6685 6778
@@ -6744,7 +6837,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 6837 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 6838 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 6839 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 6840 * detach_tasks).
6748 */ 6841 */
6749 if (sd->balance_interval < sd->max_interval) 6842 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 6843 sd->balance_interval *= 2;
@@ -6753,6 +6846,23 @@ more_balance:
6753 goto out; 6846 goto out;
6754 6847
6755out_balanced: 6848out_balanced:
6849 /*
6850 * We reach balance although we may have faced some affinity
6851 * constraints. Clear the imbalance flag if it was set.
6852 */
6853 if (sd_parent) {
6854 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6855
6856 if (*group_imbalance)
6857 *group_imbalance = 0;
6858 }
6859
6860out_all_pinned:
6861 /*
6862 * We reach balance because all tasks are pinned at this level so
6863 * we can't migrate them. Let the imbalance flag set so parent level
6864 * can try to migrate them.
6865 */
6756 schedstat_inc(sd, lb_balanced[idle]); 6866 schedstat_inc(sd, lb_balanced[idle]);
6757 6867
6758 sd->nr_balance_failed = 0; 6868 sd->nr_balance_failed = 0;
@@ -6914,6 +7024,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7024 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7025 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7026 struct sched_domain *sd;
7027 struct task_struct *p = NULL;
6917 7028
6918 raw_spin_lock_irq(&busiest_rq->lock); 7029 raw_spin_lock_irq(&busiest_rq->lock);
6919 7030
@@ -6933,9 +7044,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7044 */
6934 BUG_ON(busiest_rq == target_rq); 7045 BUG_ON(busiest_rq == target_rq);
6935 7046
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7047 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7048 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7049 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7064,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7064
6957 schedstat_inc(sd, alb_count); 7065 schedstat_inc(sd, alb_count);
6958 7066
6959 if (move_one_task(&env)) 7067 p = detach_one_task(&env);
7068 if (p)
6960 schedstat_inc(sd, alb_pushed); 7069 schedstat_inc(sd, alb_pushed);
6961 else 7070 else
6962 schedstat_inc(sd, alb_failed); 7071 schedstat_inc(sd, alb_failed);
6963 } 7072 }
6964 rcu_read_unlock(); 7073 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7074out_unlock:
6967 busiest_rq->active_balance = 0; 7075 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7076 raw_spin_unlock(&busiest_rq->lock);
7077
7078 if (p)
7079 attach_one_task(target_rq, p);
7080
7081 local_irq_enable();
7082
6969 return 0; 7083 return 0;
6970} 7084}
6971 7085
@@ -7465,7 +7579,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7579static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7580prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7581{
7468 if (!p->se.on_rq) 7582 if (!task_on_rq_queued(p))
7469 return; 7583 return;
7470 7584
7471 /* 7585 /*
@@ -7490,11 +7604,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7604 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7605 * do the right thing.
7492 * 7606 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7607 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7608 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7609 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7610 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7611 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7612 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7613 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7614 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7635,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7635 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7636static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7637{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7638#ifdef CONFIG_FAIR_GROUP_SCHED
7639 struct sched_entity *se = &p->se;
7526 /* 7640 /*
7527 * Since the real-depth could have been changed (only FAIR 7641 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7642 * class maintain depth value), reset depth properly.
7529 */ 7643 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7644 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7645#endif
7532 if (!se->on_rq) 7646 if (!task_on_rq_queued(p))
7533 return; 7647 return;
7534 7648
7535 /* 7649 /*
@@ -7575,7 +7689,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7689}
7576 7690
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7691#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7692static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7693{
7580 struct sched_entity *se = &p->se; 7694 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7695 struct cfs_rq *cfs_rq;
@@ -7594,7 +7708,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7708 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7709 */
7596 /* 7710 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7711 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7712 * But there are some cases where it has already been normalized:
7599 * 7713 *
7600 * - Moving a forked child which is waiting for being woken up by 7714 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7719,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7719 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7720 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7721 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7722 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7723 queued = 1;
7610 7724
7611 if (!on_rq) 7725 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7726 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7727 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7728 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7729 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7730 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7731 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7732#ifdef CONFIG_SMP
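
At the end of this file, task_move_group_fair() keeps its existing vruntime handling: a task that is not queued has its vruntime made relative by subtracting the old cfs_rq's min_vruntime and made absolute again by adding the new one. A few lines of arithmetic, with invented numbers, show why:

#include <stdio.h>

int main(void)
{
        unsigned long long vruntime = 1050;     /* task, on the old cfs_rq   */
        unsigned long long old_min  = 1000;     /* old cfs_rq->min_vruntime  */
        unsigned long long new_min  = 5000;     /* new cfs_rq->min_vruntime  */

        vruntime -= old_min;    /* relative lag: 50                     */
        vruntime += new_min;    /* absolute again on the new queue: 5050 */

        /* The task keeps its 50 units of lag instead of gaining a
         * 4000-unit bonus or penalty from the min_vruntime difference. */
        printf("vruntime on new cfs_rq: %llu\n", vruntime);
        return 0;
}
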
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
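
The two small additions above bracket the cpuidle call: the runqueue's idle_state pointer is published just before cpuidle_enter() and cleared as soon as the cpu is back, so a remote observer can see which idle state an idle cpu planned to enter. A user-space sketch of that publish-then-withdraw bracketing, with invented names:

#include <stddef.h>
#include <stdio.h>
#include <unistd.h>

struct idle_state { const char *name; };

/* Published "current plan" that observers may read while we sleep. */
static struct idle_state *planned_state;

static void enter_idle(struct idle_state *state)
{
        planned_state = state;          /* take note of the planned state */
        sleep(1);                       /* stand-in for cpuidle_enter()   */
        planned_state = NULL;           /* no longer idle                 */
}

int main(void)
{
        struct idle_state deep = { "C6" };

        enter_idle(&deep);
        printf("after wakeup, planned state: %s\n",
               planned_state ? planned_state->name : "(none)");
        return 0;
}
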
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 16f5a30f9c88..8ecd552fe4f2 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -8,13 +8,6 @@
8 8
9#include "sched.h" 9#include "sched.h"
10 10
11unsigned long this_cpu_load(void)
12{
13 struct rq *this = this_rq();
14 return this->cpu_load[0];
15}
16
17
18/* 11/*
19 * Global load-average calculations 12 * Global load-average calculations
20 * 13 *
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..d024e6ce30ba 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1448 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1449 * to re-start task selection.
1450 */ 1450 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1451 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1452 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1453 return RETRY_TASK;
1454 } 1454 }
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1468 p = _pick_next_task_rt(rq);
1469 1469
1470 /* The running task is never eligible for pushing */ 1470 /* The running task is never eligible for pushing */
1471 if (p) 1471 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1472
1474 set_post_schedule(rq); 1473 set_post_schedule(rq);
1475 1474
@@ -1526,7 +1525,7 @@ static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1526static int find_lowest_rq(struct task_struct *task) 1525static int find_lowest_rq(struct task_struct *task)
1527{ 1526{
1528 struct sched_domain *sd; 1527 struct sched_domain *sd;
1529 struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); 1528 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1530 int this_cpu = smp_processor_id(); 1529 int this_cpu = smp_processor_id();
1531 int cpu = task_cpu(task); 1530 int cpu = task_cpu(task);
1532 1531
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1623 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1624 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1625 task_running(rq, task) ||
1627 !task->on_rq)) { 1626 !task_on_rq_queued(task))) {
1628 1627
1629 double_unlock_balance(rq, lowest_rq); 1628 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1629 lowest_rq = NULL;
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1657 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1658 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1659
1661 BUG_ON(!p->on_rq); 1660 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1661 BUG_ON(!rt_task(p));
1663 1662
1664 return p; 1663 return p;
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1808 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1809 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1810 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1811 WARN_ON(!task_on_rq_queued(p));
1813 1812
1814 /* 1813 /*
1815 * There's a chance that p is higher in priority 1814 * There's a chance that p is higher in priority
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1869
1871 BUG_ON(!rt_task(p)); 1870 BUG_ON(!rt_task(p));
1872 1871
1873 if (!p->on_rq) 1872 if (!task_on_rq_queued(p))
1874 return; 1873 return;
1875 1874
1876 weight = cpumask_weight(new_mask); 1875 weight = cpumask_weight(new_mask);
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1935 * we may need to handle the pulling of RT tasks
1937 * now. 1936 * now.
1938 */ 1937 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1938 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1939 return;
1941 1940
1942 if (pull_rt_task(rq)) 1941 if (pull_rt_task(rq))
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1969 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1970 * then see if we can move to another run queue.
1972 */ 1971 */
1973 if (p->on_rq && rq->curr != p) { 1972 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1973#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1974 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1975 /* Don't resched if we changed runqueues */
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1988static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1989prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1990{
1992 if (!p->on_rq) 1991 if (!task_on_rq_queued(p))
1993 return; 1992 return;
1994 1993
1995 if (rq->curr == p) { 1994 if (rq->curr == p) {
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2072 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2073 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2074 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2075 resched_curr(rq);
2077 return; 2076 return;
2078 } 2077 }
2079 } 2078 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..24156c8434d1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -184,7 +192,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 192 raw_spinlock_t lock;
185 ktime_t period; 193 ktime_t period;
186 u64 quota, runtime; 194 u64 quota, runtime;
187 s64 hierarchal_quota; 195 s64 hierarchical_quota;
188 u64 runtime_expires; 196 u64 runtime_expires;
189 197
190 int idle, timer_active; 198 int idle, timer_active;
@@ -636,6 +644,11 @@ struct rq {
636#ifdef CONFIG_SMP 644#ifdef CONFIG_SMP
637 struct llist_head wake_list; 645 struct llist_head wake_list;
638#endif 646#endif
647
648#ifdef CONFIG_CPU_IDLE
649 /* Must be inspected within a rcu lock section */
650 struct cpuidle_state *idle_state;
651#endif
639}; 652};
640 653
641static inline int cpu_of(struct rq *rq) 654static inline int cpu_of(struct rq *rq)
@@ -647,13 +660,13 @@ static inline int cpu_of(struct rq *rq)
647#endif 660#endif
648} 661}
649 662
650DECLARE_PER_CPU(struct rq, runqueues); 663DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 664
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 665#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 666#define this_rq() this_cpu_ptr(&runqueues)
654#define task_rq(p) cpu_rq(task_cpu(p)) 667#define task_rq(p) cpu_rq(task_cpu(p))
655#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 668#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
656#define raw_rq() (&__raw_get_cpu_var(runqueues)) 669#define raw_rq() raw_cpu_ptr(&runqueues)
657 670
658static inline u64 rq_clock(struct rq *rq) 671static inline u64 rq_clock(struct rq *rq)
659{ 672{
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 955#endif
943} 956}
944 957
958static inline int task_on_rq_queued(struct task_struct *p)
959{
960 return p->on_rq == TASK_ON_RQ_QUEUED;
961}
962
963static inline int task_on_rq_migrating(struct task_struct *p)
964{
965 return p->on_rq == TASK_ON_RQ_MIGRATING;
966}
945 967
946#ifndef prepare_arch_switch 968#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 969# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 975# define finish_arch_post_lock_switch() do { } while (0)
954#endif 976#endif
955 977
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 978static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 979{
959#ifdef CONFIG_SMP 980#ifdef CONFIG_SMP
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1012 raw_spin_unlock_irq(&rq->lock);
992} 1013}
993 1014
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1015/*
1024 * wake flags 1016 * wake flags
1025 */ 1017 */
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1172
1181#endif 1173#endif
1182 1174
1175#ifdef CONFIG_CPU_IDLE
1176static inline void idle_set_state(struct rq *rq,
1177 struct cpuidle_state *idle_state)
1178{
1179 rq->idle_state = idle_state;
1180}
1181
1182static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1183{
1184 WARN_ON(!rcu_read_lock_held());
1185 return rq->idle_state;
1186}
1187#else
1188static inline void idle_set_state(struct rq *rq,
1189 struct cpuidle_state *idle_state)
1190{
1191}
1192
1193static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1194{
1195 return NULL;
1196}
1197#endif
1198
1183extern void sysrq_sched_debug_show(void); 1199extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1200extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1201extern void update_max_interval(void);
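
Besides the TASK_ON_RQ_* states, the header now stashes a cpuidle_state pointer in struct rq behind CONFIG_CPU_IDLE and pairs the real idle_set_state()/idle_get_state() accessors with no-op fallbacks, so callers never need their own #ifdef. A compressed illustration of that stub pattern; FEATURE_X and the accessors are invented for the example:

#include <stdio.h>

struct thing { int value; };

#ifdef FEATURE_X
static struct thing *current_thing;

static inline void set_thing(struct thing *t) { current_thing = t; }
static inline struct thing *get_thing(void) { return current_thing; }
#else
/* Fallbacks keep every caller compiling when the feature is off. */
static inline void set_thing(struct thing *t) { (void)t; }
static inline struct thing *get_thing(void) { return NULL; }
#endif

int main(void)
{
        struct thing t = { 7 };

        set_thing(&t);
        printf("have thing: %s\n", get_thing() ? "yes" : "no");
        return 0;
}

Built as-is the program prints "no"; compiling with -DFEATURE_X flips it to "yes", which is the property the header relies on.
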
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..67426e529f59 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..5a62915f47a8 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
343} 343}
344EXPORT_SYMBOL(out_of_line_wait_on_bit); 344EXPORT_SYMBOL(out_of_line_wait_on_bit);
345 345
346int __sched out_of_line_wait_on_bit_timeout(
347 void *word, int bit, wait_bit_action_f *action,
348 unsigned mode, unsigned long timeout)
349{
350 wait_queue_head_t *wq = bit_waitqueue(word, bit);
351 DEFINE_WAIT_BIT(wait, word, bit);
352
353 wait.key.timeout = jiffies + timeout;
354 return __wait_on_bit(wq, &wait, action, mode);
355}
356EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
357
346int __sched 358int __sched
347__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 359__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
348 wait_bit_action_f *action, unsigned mode) 360 wait_bit_action_f *action, unsigned mode)
@@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
520 return 0; 532 return 0;
521} 533}
522EXPORT_SYMBOL(bit_wait_io); 534EXPORT_SYMBOL(bit_wait_io);
535
536__sched int bit_wait_timeout(struct wait_bit_key *word)
537{
538 unsigned long now = ACCESS_ONCE(jiffies);
539 if (signal_pending_state(current->state, current))
540 return 1;
541 if (time_after_eq(now, word->timeout))
542 return -EAGAIN;
543 schedule_timeout(word->timeout - now);
544 return 0;
545}
546EXPORT_SYMBOL_GPL(bit_wait_timeout);
547
548__sched int bit_wait_io_timeout(struct wait_bit_key *word)
549{
550 unsigned long now = ACCESS_ONCE(jiffies);
551 if (signal_pending_state(current->state, current))
552 return 1;
553 if (time_after_eq(now, word->timeout))
554 return -EAGAIN;
555 io_schedule_timeout(word->timeout - now);
556 return 0;
557}
558EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
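
The new timeout-aware wait actions above all follow one shape: return 1 if a signal is pending, return -EAGAIN once the stored deadline has passed, and otherwise sleep only for the time that remains. A user-space model of that decision, with jiffies, the signal check and the sleep all simulated (the real code also uses time_after_eq() to survive jiffies wrap, which this sketch ignores):

#include <errno.h>
#include <stdio.h>

/* Simulated environment: a "jiffies" counter and a pending-signal flag. */
static unsigned long jiffies;
static int signal_pending;

static int bit_wait_timeout_model(unsigned long deadline)
{
        unsigned long now = jiffies;

        if (signal_pending)
                return 1;               /* interrupted                 */
        if (now >= deadline)
                return -EAGAIN;         /* timed out                   */
        jiffies += deadline - now;      /* "sleep" for the remainder   */
        return 0;                       /* woken, or the timer expired */
}

int main(void)
{
        unsigned long deadline = 100;

        printf("first wait:  %d\n", bit_wait_timeout_model(deadline));
        printf("second wait: %d\n", bit_wait_timeout_model(deadline));
        return 0;
}
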
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 25b0043f4755..4ef9687ac115 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -21,10 +21,11 @@
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/syscalls.h> 22#include <linux/syscalls.h>
23 23
24/* #define SECCOMP_DEBUG 1 */ 24#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
25#include <asm/syscall.h>
26#endif
25 27
26#ifdef CONFIG_SECCOMP_FILTER 28#ifdef CONFIG_SECCOMP_FILTER
27#include <asm/syscall.h>
28#include <linux/filter.h> 29#include <linux/filter.h>
29#include <linux/pid.h> 30#include <linux/pid.h>
30#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -172,10 +173,10 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
172 * 173 *
173 * Returns valid seccomp BPF response codes. 174 * Returns valid seccomp BPF response codes.
174 */ 175 */
175static u32 seccomp_run_filters(int syscall) 176static u32 seccomp_run_filters(struct seccomp_data *sd)
176{ 177{
177 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter); 178 struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
178 struct seccomp_data sd; 179 struct seccomp_data sd_local;
179 u32 ret = SECCOMP_RET_ALLOW; 180 u32 ret = SECCOMP_RET_ALLOW;
180 181
181 /* Ensure unexpected behavior doesn't result in failing open. */ 182 /* Ensure unexpected behavior doesn't result in failing open. */
@@ -185,14 +186,17 @@ static u32 seccomp_run_filters(int syscall)
185 /* Make sure cross-thread synced filter points somewhere sane. */ 186 /* Make sure cross-thread synced filter points somewhere sane. */
186 smp_read_barrier_depends(); 187 smp_read_barrier_depends();
187 188
188 populate_seccomp_data(&sd); 189 if (!sd) {
190 populate_seccomp_data(&sd_local);
191 sd = &sd_local;
192 }
189 193
190 /* 194 /*
191 * All filters in the list are evaluated and the lowest BPF return 195 * All filters in the list are evaluated and the lowest BPF return
192 * value always takes priority (ignoring the DATA). 196 * value always takes priority (ignoring the DATA).
193 */ 197 */
194 for (; f; f = f->prev) { 198 for (; f; f = f->prev) {
195 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)&sd); 199 u32 cur_ret = BPF_PROG_RUN(f->prog, (void *)sd);
196 200
197 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) 201 if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
198 ret = cur_ret; 202 ret = cur_ret;
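
The comment retained above states the aggregation rule for stacked filters: every filter in the list is evaluated and the numerically lowest return action takes priority, ignoring the data bits. A small model of that precedence; the action values are written to mirror the uapi seccomp header of this era, but treat them as illustrative:

#include <stdio.h>

#define RET_KILL        0x00000000u
#define RET_TRAP        0x00030000u
#define RET_ERRNO       0x00050000u
#define RET_ALLOW       0x7fff0000u
#define RET_ACTION      0x7fff0000u     /* action mask */

/* Walk every filter result; the numerically lowest action wins. */
static unsigned int run_filters(const unsigned int *results, int n)
{
        unsigned int ret = RET_ALLOW;
        int i;

        for (i = 0; i < n; i++)
                if ((results[i] & RET_ACTION) < (ret & RET_ACTION))
                        ret = results[i];

        return ret;
}

int main(void)
{
        unsigned int stacked[] = { RET_ALLOW, RET_ERRNO | 13, RET_TRAP };

        /* TRAP < ERRNO < ALLOW, so the trap verdict wins. */
        printf("final action: %#x\n", run_filters(stacked, 3) & RET_ACTION);
        return 0;
}
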
@@ -203,7 +207,7 @@ static u32 seccomp_run_filters(int syscall)
203 207
204static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) 208static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
205{ 209{
206 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 210 assert_spin_locked(&current->sighand->siglock);
207 211
208 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode) 212 if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
209 return false; 213 return false;
@@ -214,7 +218,7 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
214static inline void seccomp_assign_mode(struct task_struct *task, 218static inline void seccomp_assign_mode(struct task_struct *task,
215 unsigned long seccomp_mode) 219 unsigned long seccomp_mode)
216{ 220{
217 BUG_ON(!spin_is_locked(&task->sighand->siglock)); 221 assert_spin_locked(&task->sighand->siglock);
218 222
219 task->seccomp.mode = seccomp_mode; 223 task->seccomp.mode = seccomp_mode;
220 /* 224 /*
@@ -253,7 +257,7 @@ static inline pid_t seccomp_can_sync_threads(void)
253 struct task_struct *thread, *caller; 257 struct task_struct *thread, *caller;
254 258
255 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 259 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
256 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 260 assert_spin_locked(&current->sighand->siglock);
257 261
258 /* Validate all threads being eligible for synchronization. */ 262 /* Validate all threads being eligible for synchronization. */
259 caller = current; 263 caller = current;
@@ -294,7 +298,7 @@ static inline void seccomp_sync_threads(void)
294 struct task_struct *thread, *caller; 298 struct task_struct *thread, *caller;
295 299
296 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex)); 300 BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
297 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 301 assert_spin_locked(&current->sighand->siglock);
298 302
299 /* Synchronize all threads. */ 303 /* Synchronize all threads. */
300 caller = current; 304 caller = current;
@@ -395,16 +399,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
395 if (!filter) 399 if (!filter)
396 goto free_prog; 400 goto free_prog;
397 401
398 filter->prog = kzalloc(bpf_prog_size(new_len), 402 filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
399 GFP_KERNEL|__GFP_NOWARN);
400 if (!filter->prog) 403 if (!filter->prog)
401 goto free_filter; 404 goto free_filter;
402 405
403 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len); 406 ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
404 if (ret) 407 if (ret)
405 goto free_filter_prog; 408 goto free_filter_prog;
406 kfree(fp);
407 409
410 kfree(fp);
408 atomic_set(&filter->usage, 1); 411 atomic_set(&filter->usage, 1);
409 filter->prog->len = new_len; 412 filter->prog->len = new_len;
410 413
@@ -413,7 +416,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
413 return filter; 416 return filter;
414 417
415free_filter_prog: 418free_filter_prog:
416 kfree(filter->prog); 419 __bpf_prog_free(filter->prog);
417free_filter: 420free_filter:
418 kfree(filter); 421 kfree(filter);
419free_prog: 422free_prog:
@@ -464,7 +467,7 @@ static long seccomp_attach_filter(unsigned int flags,
464 unsigned long total_insns; 467 unsigned long total_insns;
465 struct seccomp_filter *walker; 468 struct seccomp_filter *walker;
466 469
467 BUG_ON(!spin_is_locked(&current->sighand->siglock)); 470 assert_spin_locked(&current->sighand->siglock);
468 471
469 /* Validate resulting filter length. */ 472 /* Validate resulting filter length. */
470 total_insns = filter->prog->len; 473 total_insns = filter->prog->len;
@@ -564,11 +567,55 @@ static int mode1_syscalls_32[] = {
564}; 567};
565#endif 568#endif
566 569
567int __secure_computing(int this_syscall) 570static void __secure_computing_strict(int this_syscall)
571{
572 int *syscall_whitelist = mode1_syscalls;
573#ifdef CONFIG_COMPAT
574 if (is_compat_task())
575 syscall_whitelist = mode1_syscalls_32;
576#endif
577 do {
578 if (*syscall_whitelist == this_syscall)
579 return;
580 } while (*++syscall_whitelist);
581
582#ifdef SECCOMP_DEBUG
583 dump_stack();
584#endif
585 audit_seccomp(this_syscall, SIGKILL, SECCOMP_RET_KILL);
586 do_exit(SIGKILL);
587}
588
589#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
590void secure_computing_strict(int this_syscall)
591{
592 int mode = current->seccomp.mode;
593
594 if (mode == 0)
595 return;
596 else if (mode == SECCOMP_MODE_STRICT)
597 __secure_computing_strict(this_syscall);
598 else
599 BUG();
600}
601#else
602int __secure_computing(void)
603{
604 u32 phase1_result = seccomp_phase1(NULL);
605
606 if (likely(phase1_result == SECCOMP_PHASE1_OK))
607 return 0;
608 else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
609 return -1;
610 else
611 return seccomp_phase2(phase1_result);
612}
613
614#ifdef CONFIG_SECCOMP_FILTER
615static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
568{ 616{
569 int exit_sig = 0; 617 u32 filter_ret, action;
570 int *syscall; 618 int data;
571 u32 ret;
572 619
573 /* 620 /*
574 * Make sure that any changes to mode from another thread have 621 * Make sure that any changes to mode from another thread have
@@ -576,85 +623,127 @@ int __secure_computing(int this_syscall)
576 */ 623 */
577 rmb(); 624 rmb();
578 625
579 switch (current->seccomp.mode) { 626 filter_ret = seccomp_run_filters(sd);
580 case SECCOMP_MODE_STRICT: 627 data = filter_ret & SECCOMP_RET_DATA;
581 syscall = mode1_syscalls; 628 action = filter_ret & SECCOMP_RET_ACTION;
582#ifdef CONFIG_COMPAT 629
583 if (is_compat_task()) 630 switch (action) {
584 syscall = mode1_syscalls_32; 631 case SECCOMP_RET_ERRNO:
 632 /* Set the low-order 16 bits as an errno. */
633 syscall_set_return_value(current, task_pt_regs(current),
634 -data, 0);
635 goto skip;
636
637 case SECCOMP_RET_TRAP:
638 /* Show the handler the original registers. */
639 syscall_rollback(current, task_pt_regs(current));
640 /* Let the filter pass back 16 bits of data. */
641 seccomp_send_sigsys(this_syscall, data);
642 goto skip;
643
644 case SECCOMP_RET_TRACE:
645 return filter_ret; /* Save the rest for phase 2. */
646
647 case SECCOMP_RET_ALLOW:
648 return SECCOMP_PHASE1_OK;
649
650 case SECCOMP_RET_KILL:
651 default:
652 audit_seccomp(this_syscall, SIGSYS, action);
653 do_exit(SIGSYS);
654 }
655
656 unreachable();
657
658skip:
659 audit_seccomp(this_syscall, 0, action);
660 return SECCOMP_PHASE1_SKIP;
661}
585#endif 662#endif
586 do { 663
587 if (*syscall == this_syscall) 664/**
588 return 0; 665 * seccomp_phase1() - run fast path seccomp checks on the current syscall
 589 } while (*++syscall); 666 * @sd: The seccomp_data or NULL
590 exit_sig = SIGKILL; 667 *
591 ret = SECCOMP_RET_KILL; 668 * This only reads pt_regs via the syscall_xyz helpers. The only change
592 break; 669 * it will make to pt_regs is via syscall_set_return_value, and it will
670 * only do that if it returns SECCOMP_PHASE1_SKIP.
671 *
672 * If sd is provided, it will not read pt_regs at all.
673 *
674 * It may also call do_exit or force a signal; these actions must be
675 * safe.
676 *
677 * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
678 * be processed normally.
679 *
680 * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
681 * invoked. In this case, seccomp_phase1 will have set the return value
682 * using syscall_set_return_value.
683 *
684 * If it returns anything else, then the return value should be passed
685 * to seccomp_phase2 from a context in which ptrace hooks are safe.
686 */
687u32 seccomp_phase1(struct seccomp_data *sd)
688{
689 int mode = current->seccomp.mode;
690 int this_syscall = sd ? sd->nr :
691 syscall_get_nr(current, task_pt_regs(current));
692
693 switch (mode) {
694 case SECCOMP_MODE_STRICT:
695 __secure_computing_strict(this_syscall); /* may call do_exit */
696 return SECCOMP_PHASE1_OK;
593#ifdef CONFIG_SECCOMP_FILTER 697#ifdef CONFIG_SECCOMP_FILTER
594 case SECCOMP_MODE_FILTER: { 698 case SECCOMP_MODE_FILTER:
595 int data; 699 return __seccomp_phase1_filter(this_syscall, sd);
596 struct pt_regs *regs = task_pt_regs(current);
597 ret = seccomp_run_filters(this_syscall);
598 data = ret & SECCOMP_RET_DATA;
599 ret &= SECCOMP_RET_ACTION;
600 switch (ret) {
601 case SECCOMP_RET_ERRNO:
602 /* Set the low-order 16-bits as a errno. */
603 syscall_set_return_value(current, regs,
604 -data, 0);
605 goto skip;
606 case SECCOMP_RET_TRAP:
607 /* Show the handler the original registers. */
608 syscall_rollback(current, regs);
609 /* Let the filter pass back 16 bits of data. */
610 seccomp_send_sigsys(this_syscall, data);
611 goto skip;
612 case SECCOMP_RET_TRACE:
613 /* Skip these calls if there is no tracer. */
614 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
615 syscall_set_return_value(current, regs,
616 -ENOSYS, 0);
617 goto skip;
618 }
619 /* Allow the BPF to provide the event message */
620 ptrace_event(PTRACE_EVENT_SECCOMP, data);
621 /*
622 * The delivery of a fatal signal during event
623 * notification may silently skip tracer notification.
624 * Terminating the task now avoids executing a system
625 * call that may not be intended.
626 */
627 if (fatal_signal_pending(current))
628 break;
629 if (syscall_get_nr(current, regs) < 0)
630 goto skip; /* Explicit request to skip. */
631
632 return 0;
633 case SECCOMP_RET_ALLOW:
634 return 0;
635 case SECCOMP_RET_KILL:
636 default:
637 break;
638 }
639 exit_sig = SIGSYS;
640 break;
641 }
642#endif 700#endif
643 default: 701 default:
644 BUG(); 702 BUG();
645 } 703 }
704}
646 705
647#ifdef SECCOMP_DEBUG 706/**
648 dump_stack(); 707 * seccomp_phase2() - finish slow path seccomp work for the current syscall
649#endif 708 * @phase1_result: The return value from seccomp_phase1()
650 audit_seccomp(this_syscall, exit_sig, ret); 709 *
651 do_exit(exit_sig); 710 * This must be called from a context in which ptrace hooks can be used.
652#ifdef CONFIG_SECCOMP_FILTER 711 *
653skip: 712 * Returns 0 if the syscall should be processed or -1 to skip the syscall.
654 audit_seccomp(this_syscall, exit_sig, ret); 713 */
655#endif 714int seccomp_phase2(u32 phase1_result)
656 return -1; 715{
716 struct pt_regs *regs = task_pt_regs(current);
717 u32 action = phase1_result & SECCOMP_RET_ACTION;
718 int data = phase1_result & SECCOMP_RET_DATA;
719
720 BUG_ON(action != SECCOMP_RET_TRACE);
721
722 audit_seccomp(syscall_get_nr(current, regs), 0, action);
723
724 /* Skip these calls if there is no tracer. */
725 if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
726 syscall_set_return_value(current, regs,
727 -ENOSYS, 0);
728 return -1;
729 }
730
731 /* Allow the BPF to provide the event message */
732 ptrace_event(PTRACE_EVENT_SECCOMP, data);
733 /*
734 * The delivery of a fatal signal during event
735 * notification may silently skip tracer notification.
736 * Terminating the task now avoids executing a system
737 * call that may not be intended.
738 */
739 if (fatal_signal_pending(current))
740 do_exit(SIGSYS);
741 if (syscall_get_nr(current, regs) < 0)
742 return -1; /* Explicit request to skip. */
743
744 return 0;
657} 745}
746#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
658 747
659long prctl_get_seccomp(void) 748long prctl_get_seccomp(void)
660{ 749{
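
The seccomp_phase1()/seccomp_phase2() kerneldoc above describes a two-step protocol: phase 1 is cheap and may allow the syscall, skip it, or hand a TRACE result to phase 2, which must run where ptrace hooks are safe. A compressed model of the dispatch done by the new __secure_computing(); the constants and phase bodies here are placeholders, not the kernel's:

#include <stdio.h>

#define PHASE1_OK       0xffffffffu
#define PHASE1_SKIP     0xfffffffeu

/* Placeholder phase 1: anything else it returns is saved for phase 2. */
static unsigned int phase1(int syscall_nr)
{
        if (syscall_nr == 1)
                return PHASE1_OK;
        if (syscall_nr == 2)
                return PHASE1_SKIP;
        return 0x7ff00000u | (unsigned int)syscall_nr;  /* "trace" + data */
}

static int phase2(unsigned int phase1_result)
{
        printf("phase2 notified tracer, data=%u\n", phase1_result & 0xffffu);
        return 0;               /* tracer let the syscall proceed */
}

/* 0 = run the syscall, -1 = skip it. */
static int secure_computing_model(int syscall_nr)
{
        unsigned int r = phase1(syscall_nr);

        if (r == PHASE1_OK)
                return 0;
        if (r == PHASE1_SKIP)
                return -1;
        return phase2(r);
}

int main(void)
{
        printf("nr 1 -> %d\n", secure_computing_model(1));
        printf("nr 2 -> %d\n", secure_computing_model(2));
        printf("nr 3 -> %d\n", secure_computing_model(3));
        return 0;
}
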
diff --git a/kernel/signal.c b/kernel/signal.c
index 40b76e351e64..8f0876f9f6dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2170,8 +2170,7 @@ static int ptrace_signal(int signr, siginfo_t *info)
2170 return signr; 2170 return signr;
2171} 2171}
2172 2172
2173int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, 2173int get_signal(struct ksignal *ksig)
2174 struct pt_regs *regs, void *cookie)
2175{ 2174{
2176 struct sighand_struct *sighand = current->sighand; 2175 struct sighand_struct *sighand = current->sighand;
2177 struct signal_struct *signal = current->signal; 2176 struct signal_struct *signal = current->signal;
@@ -2241,13 +2240,13 @@ relock:
2241 goto relock; 2240 goto relock;
2242 } 2241 }
2243 2242
2244 signr = dequeue_signal(current, &current->blocked, info); 2243 signr = dequeue_signal(current, &current->blocked, &ksig->info);
2245 2244
2246 if (!signr) 2245 if (!signr)
2247 break; /* will return 0 */ 2246 break; /* will return 0 */
2248 2247
2249 if (unlikely(current->ptrace) && signr != SIGKILL) { 2248 if (unlikely(current->ptrace) && signr != SIGKILL) {
2250 signr = ptrace_signal(signr, info); 2249 signr = ptrace_signal(signr, &ksig->info);
2251 if (!signr) 2250 if (!signr)
2252 continue; 2251 continue;
2253 } 2252 }
@@ -2255,13 +2254,13 @@ relock:
2255 ka = &sighand->action[signr-1]; 2254 ka = &sighand->action[signr-1];
2256 2255
2257 /* Trace actually delivered signals. */ 2256 /* Trace actually delivered signals. */
2258 trace_signal_deliver(signr, info, ka); 2257 trace_signal_deliver(signr, &ksig->info, ka);
2259 2258
2260 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 2259 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
2261 continue; 2260 continue;
2262 if (ka->sa.sa_handler != SIG_DFL) { 2261 if (ka->sa.sa_handler != SIG_DFL) {
2263 /* Run the handler. */ 2262 /* Run the handler. */
2264 *return_ka = *ka; 2263 ksig->ka = *ka;
2265 2264
2266 if (ka->sa.sa_flags & SA_ONESHOT) 2265 if (ka->sa.sa_flags & SA_ONESHOT)
2267 ka->sa.sa_handler = SIG_DFL; 2266 ka->sa.sa_handler = SIG_DFL;
@@ -2311,7 +2310,7 @@ relock:
2311 spin_lock_irq(&sighand->siglock); 2310 spin_lock_irq(&sighand->siglock);
2312 } 2311 }
2313 2312
2314 if (likely(do_signal_stop(info->si_signo))) { 2313 if (likely(do_signal_stop(ksig->info.si_signo))) {
2315 /* It released the siglock. */ 2314 /* It released the siglock. */
2316 goto relock; 2315 goto relock;
2317 } 2316 }
@@ -2332,7 +2331,7 @@ relock:
2332 2331
2333 if (sig_kernel_coredump(signr)) { 2332 if (sig_kernel_coredump(signr)) {
2334 if (print_fatal_signals) 2333 if (print_fatal_signals)
2335 print_fatal_signal(info->si_signo); 2334 print_fatal_signal(ksig->info.si_signo);
2336 proc_coredump_connector(current); 2335 proc_coredump_connector(current);
2337 /* 2336 /*
2338 * If it was able to dump core, this kills all 2337 * If it was able to dump core, this kills all
@@ -2342,34 +2341,32 @@ relock:
2342 * first and our do_group_exit call below will use 2341 * first and our do_group_exit call below will use
2343 * that value and ignore the one we pass it. 2342 * that value and ignore the one we pass it.
2344 */ 2343 */
2345 do_coredump(info); 2344 do_coredump(&ksig->info);
2346 } 2345 }
2347 2346
2348 /* 2347 /*
2349 * Death signals, no core dump. 2348 * Death signals, no core dump.
2350 */ 2349 */
2351 do_group_exit(info->si_signo); 2350 do_group_exit(ksig->info.si_signo);
2352 /* NOTREACHED */ 2351 /* NOTREACHED */
2353 } 2352 }
2354 spin_unlock_irq(&sighand->siglock); 2353 spin_unlock_irq(&sighand->siglock);
2355 return signr; 2354
2355 ksig->sig = signr;
2356 return ksig->sig > 0;
2356} 2357}
2357 2358
2358/** 2359/**
2359 * signal_delivered - 2360 * signal_delivered -
2360 * @sig: number of signal being delivered 2361 * @ksig: kernel signal struct
2361 * @info: siginfo_t of signal being delivered
2362 * @ka: sigaction setting that chose the handler
2363 * @regs: user register state
2364 * @stepping: nonzero if debugger single-step or block-step in use 2362 * @stepping: nonzero if debugger single-step or block-step in use
2365 * 2363 *
2366 * This function should be called when a signal has successfully been 2364 * This function should be called when a signal has successfully been
2367 * delivered. It updates the blocked signals accordingly (@ka->sa.sa_mask 2365 * delivered. It updates the blocked signals accordingly (@ksig->ka.sa.sa_mask
2368 * is always blocked, and the signal itself is blocked unless %SA_NODEFER 2366 * is always blocked, and the signal itself is blocked unless %SA_NODEFER
2369 * is set in @ka->sa.sa_flags. Tracing is notified. 2367 * is set in @ksig->ka.sa.sa_flags. Tracing is notified.
2370 */ 2368 */
2371void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka, 2369static void signal_delivered(struct ksignal *ksig, int stepping)
2372 struct pt_regs *regs, int stepping)
2373{ 2370{
2374 sigset_t blocked; 2371 sigset_t blocked;
2375 2372
@@ -2379,11 +2376,11 @@ void signal_delivered(int sig, siginfo_t *info, struct k_sigaction *ka,
2379 simply clear the restore sigmask flag. */ 2376 simply clear the restore sigmask flag. */
2380 clear_restore_sigmask(); 2377 clear_restore_sigmask();
2381 2378
2382 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); 2379 sigorsets(&blocked, &current->blocked, &ksig->ka.sa.sa_mask);
2383 if (!(ka->sa.sa_flags & SA_NODEFER)) 2380 if (!(ksig->ka.sa.sa_flags & SA_NODEFER))
2384 sigaddset(&blocked, sig); 2381 sigaddset(&blocked, ksig->sig);
2385 set_current_blocked(&blocked); 2382 set_current_blocked(&blocked);
2386 tracehook_signal_handler(sig, info, ka, regs, stepping); 2383 tracehook_signal_handler(stepping);
2387} 2384}
2388 2385
2389void signal_setup_done(int failed, struct ksignal *ksig, int stepping) 2386void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
@@ -2391,8 +2388,7 @@ void signal_setup_done(int failed, struct ksignal *ksig, int stepping)
2391 if (failed) 2388 if (failed)
2392 force_sigsegv(ksig->sig, current); 2389 force_sigsegv(ksig->sig, current);
2393 else 2390 else
2394 signal_delivered(ksig->sig, &ksig->info, &ksig->ka, 2391 signal_delivered(ksig, stepping);
2395 signal_pt_regs(), stepping);
2396} 2392}
2397 2393
2398/* 2394/*
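The hunks above fold the signal number, siginfo and k_sigaction into struct ksignal: the dequeue path now stores the result in ksig->sig and returns whether a handler must run, and signal_delivered() becomes a static helper behind signal_setup_done(). A minimal sketch of how an architecture's delivery path might use the new interface; arch_setup_rt_frame() is a hypothetical stand-in for the real, arch-specific frame builder, and the stepping flag would typically come from something like TIF_SINGLESTEP:

	/* Sketch only, not taken from any particular architecture. */
	static void handle_signal(struct ksignal *ksig, struct pt_regs *regs,
				  int stepping)
	{
		/* Write the user-space signal frame for ksig->sig / ksig->ka. */
		int failed = arch_setup_rt_frame(ksig, sigmask_to_save(), regs);

		/*
		 * On success this blocks ksig->ka.sa.sa_mask (and the signal
		 * itself unless SA_NODEFER is set); on failure it forces
		 * SIGSEGV on the current task.
		 */
		signal_setup_done(failed, ksig, stepping);
	}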
diff --git a/kernel/smp.c b/kernel/smp.c
index 487653b5844f..f38a1e692259 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -164,7 +165,7 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
164 if (!csd) { 165 if (!csd) {
165 csd = &csd_stack; 166 csd = &csd_stack;
166 if (!wait) 167 if (!wait)
167 csd = &__get_cpu_var(csd_data); 168 csd = this_cpu_ptr(&csd_data);
168 } 169 }
169 170
170 csd_lock(csd); 171 csd_lock(csd);
@@ -229,7 +230,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
229 230
230 WARN_ON(!irqs_disabled()); 231 WARN_ON(!irqs_disabled());
231 232
232 head = &__get_cpu_var(call_single_queue); 233 head = this_cpu_ptr(&call_single_queue);
233 entry = llist_del_all(head); 234 entry = llist_del_all(head);
234 entry = llist_reverse_order(entry); 235 entry = llist_reverse_order(entry);
235 236
@@ -419,7 +420,7 @@ void smp_call_function_many(const struct cpumask *mask,
419 return; 420 return;
420 } 421 }
421 422
422 cfd = &__get_cpu_var(cfd_data); 423 cfd = this_cpu_ptr(&cfd_data);
423 424
424 cpumask_and(cfd->cpumask, mask, cpu_online_mask); 425 cpumask_and(cfd->cpumask, mask, cpu_online_mask);
425 cpumask_clear_cpu(this_cpu, cfd->cpumask); 426 cpumask_clear_cpu(this_cpu, cfd->cpumask);
@@ -670,7 +671,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info),
670 if (cond_func(cpu, info)) { 671 if (cond_func(cpu, info)) {
671 ret = smp_call_function_single(cpu, func, 672 ret = smp_call_function_single(cpu, func,
672 info, wait); 673 info, wait);
673 WARN_ON_ONCE(!ret); 674 WARN_ON_ONCE(ret);
674 } 675 }
675 preempt_enable(); 676 preempt_enable();
676 } 677 }
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
 706 * wake_up_all_idle_cpus tries to break all cpus out of the idle state,
 707 * including cpus that are idle polling; cpus that are not idle are left
 708 * alone.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
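wake_up_all_idle_cpus(), added above, kicks every online cpu that is currently idle (including idle pollers) so its idle loop re-evaluates state; busy cpus are left untouched. A hedged sketch of a caller, where update_my_latency_limit() is a hypothetical driver helper that has just tightened a global constraint the idle cpus need to notice:

	#include <linux/smp.h>

	static void my_driver_set_latency(unsigned int new_limit_us)
	{
		update_my_latency_limit(new_limit_us);	/* hypothetical */
		wake_up_all_idle_cpus();	/* idle cpus re-read the limit */
	}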
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 5918d227730f..0699add19164 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -278,7 +278,7 @@ restart:
278 pending >>= softirq_bit; 278 pending >>= softirq_bit;
279 } 279 }
280 280
281 rcu_bh_qs(smp_processor_id()); 281 rcu_bh_qs();
282 local_irq_disable(); 282 local_irq_disable();
283 283
284 pending = local_softirq_pending(); 284 pending = local_softirq_pending();
@@ -485,7 +485,7 @@ static void tasklet_action(struct softirq_action *a)
485 local_irq_disable(); 485 local_irq_disable();
486 list = __this_cpu_read(tasklet_vec.head); 486 list = __this_cpu_read(tasklet_vec.head);
487 __this_cpu_write(tasklet_vec.head, NULL); 487 __this_cpu_write(tasklet_vec.head, NULL);
488 __this_cpu_write(tasklet_vec.tail, &__get_cpu_var(tasklet_vec).head); 488 __this_cpu_write(tasklet_vec.tail, this_cpu_ptr(&tasklet_vec.head));
489 local_irq_enable(); 489 local_irq_enable();
490 490
491 while (list) { 491 while (list) {
@@ -521,7 +521,7 @@ static void tasklet_hi_action(struct softirq_action *a)
521 local_irq_disable(); 521 local_irq_disable();
522 list = __this_cpu_read(tasklet_hi_vec.head); 522 list = __this_cpu_read(tasklet_hi_vec.head);
523 __this_cpu_write(tasklet_hi_vec.head, NULL); 523 __this_cpu_write(tasklet_hi_vec.head, NULL);
524 __this_cpu_write(tasklet_hi_vec.tail, &__get_cpu_var(tasklet_hi_vec).head); 524 __this_cpu_write(tasklet_hi_vec.tail, this_cpu_ptr(&tasklet_hi_vec.head));
525 local_irq_enable(); 525 local_irq_enable();
526 526
527 while (list) { 527 while (list) {
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..1eaa2f0b0246 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
62#include <asm/unistd.h> 62#include <asm/unistd.h>
63 63
64#ifndef SET_UNALIGN_CTL 64#ifndef SET_UNALIGN_CTL
65# define SET_UNALIGN_CTL(a,b) (-EINVAL) 65# define SET_UNALIGN_CTL(a, b) (-EINVAL)
66#endif 66#endif
67#ifndef GET_UNALIGN_CTL 67#ifndef GET_UNALIGN_CTL
68# define GET_UNALIGN_CTL(a,b) (-EINVAL) 68# define GET_UNALIGN_CTL(a, b) (-EINVAL)
69#endif 69#endif
70#ifndef SET_FPEMU_CTL 70#ifndef SET_FPEMU_CTL
71# define SET_FPEMU_CTL(a,b) (-EINVAL) 71# define SET_FPEMU_CTL(a, b) (-EINVAL)
72#endif 72#endif
73#ifndef GET_FPEMU_CTL 73#ifndef GET_FPEMU_CTL
74# define GET_FPEMU_CTL(a,b) (-EINVAL) 74# define GET_FPEMU_CTL(a, b) (-EINVAL)
75#endif 75#endif
76#ifndef SET_FPEXC_CTL 76#ifndef SET_FPEXC_CTL
77# define SET_FPEXC_CTL(a,b) (-EINVAL) 77# define SET_FPEXC_CTL(a, b) (-EINVAL)
78#endif 78#endif
79#ifndef GET_FPEXC_CTL 79#ifndef GET_FPEXC_CTL
80# define GET_FPEXC_CTL(a,b) (-EINVAL) 80# define GET_FPEXC_CTL(a, b) (-EINVAL)
81#endif 81#endif
82#ifndef GET_ENDIAN 82#ifndef GET_ENDIAN
83# define GET_ENDIAN(a,b) (-EINVAL) 83# define GET_ENDIAN(a, b) (-EINVAL)
84#endif 84#endif
85#ifndef SET_ENDIAN 85#ifndef SET_ENDIAN
86# define SET_ENDIAN(a,b) (-EINVAL) 86# define SET_ENDIAN(a, b) (-EINVAL)
87#endif 87#endif
88#ifndef GET_TSC_CTL 88#ifndef GET_TSC_CTL
89# define GET_TSC_CTL(a) (-EINVAL) 89# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
182 rcu_read_lock(); 182 rcu_read_lock();
183 read_lock(&tasklist_lock); 183 read_lock(&tasklist_lock);
184 switch (which) { 184 switch (which) {
185 case PRIO_PROCESS: 185 case PRIO_PROCESS:
186 if (who) 186 if (who)
187 p = find_task_by_vpid(who); 187 p = find_task_by_vpid(who);
188 else 188 else
189 p = current; 189 p = current;
190 if (p) 190 if (p)
191 error = set_one_prio(p, niceval, error); 191 error = set_one_prio(p, niceval, error);
192 break; 192 break;
193 case PRIO_PGRP: 193 case PRIO_PGRP:
194 if (who) 194 if (who)
195 pgrp = find_vpid(who); 195 pgrp = find_vpid(who);
196 else 196 else
197 pgrp = task_pgrp(current); 197 pgrp = task_pgrp(current);
198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 198 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
199 error = set_one_prio(p, niceval, error); 199 error = set_one_prio(p, niceval, error);
200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 200 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
201 break; 201 break;
202 case PRIO_USER: 202 case PRIO_USER:
203 uid = make_kuid(cred->user_ns, who); 203 uid = make_kuid(cred->user_ns, who);
204 user = cred->user; 204 user = cred->user;
205 if (!who) 205 if (!who)
206 uid = cred->uid; 206 uid = cred->uid;
207 else if (!uid_eq(uid, cred->uid) && 207 else if (!uid_eq(uid, cred->uid)) {
208 !(user = find_user(uid))) 208 user = find_user(uid);
209 if (!user)
209 goto out_unlock; /* No processes for this user */ 210 goto out_unlock; /* No processes for this user */
210 211 }
211 do_each_thread(g, p) { 212 do_each_thread(g, p) {
212 if (uid_eq(task_uid(p), uid)) 213 if (uid_eq(task_uid(p), uid))
213 error = set_one_prio(p, niceval, error); 214 error = set_one_prio(p, niceval, error);
214 } while_each_thread(g, p); 215 } while_each_thread(g, p);
215 if (!uid_eq(uid, cred->uid)) 216 if (!uid_eq(uid, cred->uid))
216 free_uid(user); /* For find_user() */ 217 free_uid(user); /* For find_user() */
217 break; 218 break;
218 } 219 }
219out_unlock: 220out_unlock:
220 read_unlock(&tasklist_lock); 221 read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
244 rcu_read_lock(); 245 rcu_read_lock();
245 read_lock(&tasklist_lock); 246 read_lock(&tasklist_lock);
246 switch (which) { 247 switch (which) {
247 case PRIO_PROCESS: 248 case PRIO_PROCESS:
248 if (who) 249 if (who)
249 p = find_task_by_vpid(who); 250 p = find_task_by_vpid(who);
250 else 251 else
251 p = current; 252 p = current;
252 if (p) { 253 if (p) {
254 niceval = nice_to_rlimit(task_nice(p));
255 if (niceval > retval)
256 retval = niceval;
257 }
258 break;
259 case PRIO_PGRP:
260 if (who)
261 pgrp = find_vpid(who);
262 else
263 pgrp = task_pgrp(current);
264 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
265 niceval = nice_to_rlimit(task_nice(p));
266 if (niceval > retval)
267 retval = niceval;
268 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
269 break;
270 case PRIO_USER:
271 uid = make_kuid(cred->user_ns, who);
272 user = cred->user;
273 if (!who)
274 uid = cred->uid;
275 else if (!uid_eq(uid, cred->uid)) {
276 user = find_user(uid);
277 if (!user)
278 goto out_unlock; /* No processes for this user */
279 }
280 do_each_thread(g, p) {
281 if (uid_eq(task_uid(p), uid)) {
253 niceval = nice_to_rlimit(task_nice(p)); 282 niceval = nice_to_rlimit(task_nice(p));
254 if (niceval > retval) 283 if (niceval > retval)
255 retval = niceval; 284 retval = niceval;
256 } 285 }
257 break; 286 } while_each_thread(g, p);
258 case PRIO_PGRP: 287 if (!uid_eq(uid, cred->uid))
259 if (who) 288 free_uid(user); /* for find_user() */
260 pgrp = find_vpid(who); 289 break;
261 else
262 pgrp = task_pgrp(current);
263 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
264 niceval = nice_to_rlimit(task_nice(p));
265 if (niceval > retval)
266 retval = niceval;
267 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
268 break;
269 case PRIO_USER:
270 uid = make_kuid(cred->user_ns, who);
271 user = cred->user;
272 if (!who)
273 uid = cred->uid;
274 else if (!uid_eq(uid, cred->uid) &&
275 !(user = find_user(uid)))
276 goto out_unlock; /* No processes for this user */
277
278 do_each_thread(g, p) {
279 if (uid_eq(task_uid(p), uid)) {
280 niceval = nice_to_rlimit(task_nice(p));
281 if (niceval > retval)
282 retval = niceval;
283 }
284 } while_each_thread(g, p);
285 if (!uid_eq(uid, cred->uid))
286 free_uid(user); /* for find_user() */
287 break;
288 } 290 }
289out_unlock: 291out_unlock:
290 read_unlock(&tasklist_lock); 292 read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
306 * 308 *
307 * The general idea is that a program which uses just setregid() will be 309 * The general idea is that a program which uses just setregid() will be
308 * 100% compatible with BSD. A program which uses just setgid() will be 310 * 100% compatible with BSD. A program which uses just setgid() will be
309 * 100% compatible with POSIX with saved IDs. 311 * 100% compatible with POSIX with saved IDs.
310 * 312 *
311 * SMP: There are not races, the GIDs are checked only by filesystem 313 * SMP: There are not races, the GIDs are checked only by filesystem
312 * operations (as far as semantic preservation is concerned). 314 * operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
364} 366}
365 367
366/* 368/*
367 * setgid() is implemented like SysV w/ SAVED_IDS 369 * setgid() is implemented like SysV w/ SAVED_IDS
368 * 370 *
369 * SMP: Same implicit races as above. 371 * SMP: Same implicit races as above.
370 */ 372 */
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
442 * 444 *
443 * The general idea is that a program which uses just setreuid() will be 445 * The general idea is that a program which uses just setreuid() will be
444 * 100% compatible with BSD. A program which uses just setuid() will be 446 * 100% compatible with BSD. A program which uses just setuid() will be
445 * 100% compatible with POSIX with saved IDs. 447 * 100% compatible with POSIX with saved IDs.
446 */ 448 */
447SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) 449SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
448{ 450{
@@ -503,17 +505,17 @@ error:
503 abort_creds(new); 505 abort_creds(new);
504 return retval; 506 return retval;
505} 507}
506 508
507/* 509/*
508 * setuid() is implemented like SysV with SAVED_IDS 510 * setuid() is implemented like SysV with SAVED_IDS
509 * 511 *
510 * Note that SAVED_ID's is deficient in that a setuid root program 512 * Note that SAVED_ID's is deficient in that a setuid root program
511 * like sendmail, for example, cannot set its uid to be a normal 513 * like sendmail, for example, cannot set its uid to be a normal
512 * user and then switch back, because if you're root, setuid() sets 514 * user and then switch back, because if you're root, setuid() sets
513 * the saved uid too. If you don't like this, blame the bright people 515 * the saved uid too. If you don't like this, blame the bright people
514 * in the POSIX committee and/or USG. Note that the BSD-style setreuid() 516 * in the POSIX committee and/or USG. Note that the BSD-style setreuid()
515 * will allow a root program to temporarily drop privileges and be able to 517 * will allow a root program to temporarily drop privileges and be able to
516 * regain them by swapping the real and effective uid. 518 * regain them by swapping the real and effective uid.
517 */ 519 */
518SYSCALL_DEFINE1(setuid, uid_t, uid) 520SYSCALL_DEFINE1(setuid, uid_t, uid)
519{ 521{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
637 euid = from_kuid_munged(cred->user_ns, cred->euid); 639 euid = from_kuid_munged(cred->user_ns, cred->euid);
638 suid = from_kuid_munged(cred->user_ns, cred->suid); 640 suid = from_kuid_munged(cred->user_ns, cred->suid);
639 641
640 if (!(retval = put_user(ruid, ruidp)) && 642 retval = put_user(ruid, ruidp);
641 !(retval = put_user(euid, euidp))) 643 if (!retval) {
642 retval = put_user(suid, suidp); 644 retval = put_user(euid, euidp);
643 645 if (!retval)
646 return put_user(suid, suidp);
647 }
644 return retval; 648 return retval;
645} 649}
646 650
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
709 egid = from_kgid_munged(cred->user_ns, cred->egid); 713 egid = from_kgid_munged(cred->user_ns, cred->egid);
710 sgid = from_kgid_munged(cred->user_ns, cred->sgid); 714 sgid = from_kgid_munged(cred->user_ns, cred->sgid);
711 715
712 if (!(retval = put_user(rgid, rgidp)) && 716 retval = put_user(rgid, rgidp);
713 !(retval = put_user(egid, egidp))) 717 if (!retval) {
714 retval = put_user(sgid, sgidp); 718 retval = put_user(egid, egidp);
719 if (!retval)
720 retval = put_user(sgid, sgidp);
721 }
715 722
716 return retval; 723 return retval;
717} 724}
@@ -862,11 +869,9 @@ void do_sys_times(struct tms *tms)
862{ 869{
863 cputime_t tgutime, tgstime, cutime, cstime; 870 cputime_t tgutime, tgstime, cutime, cstime;
864 871
865 spin_lock_irq(&current->sighand->siglock);
866 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 872 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
867 cutime = current->signal->cutime; 873 cutime = current->signal->cutime;
868 cstime = current->signal->cstime; 874 cstime = current->signal->cstime;
869 spin_unlock_irq(&current->sighand->siglock);
870 tms->tms_utime = cputime_to_clock_t(tgutime); 875 tms->tms_utime = cputime_to_clock_t(tgutime);
871 tms->tms_stime = cputime_to_clock_t(tgstime); 876 tms->tms_stime = cputime_to_clock_t(tgstime);
872 tms->tms_cutime = cputime_to_clock_t(cutime); 877 tms->tms_cutime = cputime_to_clock_t(cutime);
@@ -1284,7 +1289,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
1284/* 1289/*
1285 * Back compatibility for getrlimit. Needed for some apps. 1290 * Back compatibility for getrlimit. Needed for some apps.
1286 */ 1291 */
1287
1288SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, 1292SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1289 struct rlimit __user *, rlim) 1293 struct rlimit __user *, rlim)
1290{ 1294{
@@ -1299,7 +1303,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
1299 x.rlim_cur = 0x7FFFFFFF; 1303 x.rlim_cur = 0x7FFFFFFF;
1300 if (x.rlim_max > 0x7FFFFFFF) 1304 if (x.rlim_max > 0x7FFFFFFF)
1301 x.rlim_max = 0x7FFFFFFF; 1305 x.rlim_max = 0x7FFFFFFF;
1302 return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0; 1306 return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
1303} 1307}
1304 1308
1305#endif 1309#endif
@@ -1527,7 +1531,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1527 cputime_t tgutime, tgstime, utime, stime; 1531 cputime_t tgutime, tgstime, utime, stime;
1528 unsigned long maxrss = 0; 1532 unsigned long maxrss = 0;
1529 1533
1530 memset((char *) r, 0, sizeof *r); 1534 memset((char *)r, 0, sizeof (*r));
1531 utime = stime = 0; 1535 utime = stime = 0;
1532 1536
1533 if (who == RUSAGE_THREAD) { 1537 if (who == RUSAGE_THREAD) {
@@ -1541,41 +1545,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1541 return; 1545 return;
1542 1546
1543 switch (who) { 1547 switch (who) {
1544 case RUSAGE_BOTH: 1548 case RUSAGE_BOTH:
1545 case RUSAGE_CHILDREN: 1549 case RUSAGE_CHILDREN:
1546 utime = p->signal->cutime; 1550 utime = p->signal->cutime;
1547 stime = p->signal->cstime; 1551 stime = p->signal->cstime;
1548 r->ru_nvcsw = p->signal->cnvcsw; 1552 r->ru_nvcsw = p->signal->cnvcsw;
1549 r->ru_nivcsw = p->signal->cnivcsw; 1553 r->ru_nivcsw = p->signal->cnivcsw;
1550 r->ru_minflt = p->signal->cmin_flt; 1554 r->ru_minflt = p->signal->cmin_flt;
1551 r->ru_majflt = p->signal->cmaj_flt; 1555 r->ru_majflt = p->signal->cmaj_flt;
1552 r->ru_inblock = p->signal->cinblock; 1556 r->ru_inblock = p->signal->cinblock;
1553 r->ru_oublock = p->signal->coublock; 1557 r->ru_oublock = p->signal->coublock;
1554 maxrss = p->signal->cmaxrss; 1558 maxrss = p->signal->cmaxrss;
1555 1559
1556 if (who == RUSAGE_CHILDREN) 1560 if (who == RUSAGE_CHILDREN)
1557 break;
1558
1559 case RUSAGE_SELF:
1560 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1561 utime += tgutime;
1562 stime += tgstime;
1563 r->ru_nvcsw += p->signal->nvcsw;
1564 r->ru_nivcsw += p->signal->nivcsw;
1565 r->ru_minflt += p->signal->min_flt;
1566 r->ru_majflt += p->signal->maj_flt;
1567 r->ru_inblock += p->signal->inblock;
1568 r->ru_oublock += p->signal->oublock;
1569 if (maxrss < p->signal->maxrss)
1570 maxrss = p->signal->maxrss;
1571 t = p;
1572 do {
1573 accumulate_thread_rusage(t, r);
1574 } while_each_thread(p, t);
1575 break; 1561 break;
1576 1562
1577 default: 1563 case RUSAGE_SELF:
1578 BUG(); 1564 thread_group_cputime_adjusted(p, &tgutime, &tgstime);
1565 utime += tgutime;
1566 stime += tgstime;
1567 r->ru_nvcsw += p->signal->nvcsw;
1568 r->ru_nivcsw += p->signal->nivcsw;
1569 r->ru_minflt += p->signal->min_flt;
1570 r->ru_majflt += p->signal->maj_flt;
1571 r->ru_inblock += p->signal->inblock;
1572 r->ru_oublock += p->signal->oublock;
1573 if (maxrss < p->signal->maxrss)
1574 maxrss = p->signal->maxrss;
1575 t = p;
1576 do {
1577 accumulate_thread_rusage(t, r);
1578 } while_each_thread(p, t);
1579 break;
1580
1581 default:
1582 BUG();
1579 } 1583 }
1580 unlock_task_sighand(p, &flags); 1584 unlock_task_sighand(p, &flags);
1581 1585
@@ -1585,6 +1589,7 @@ out:
1585 1589
1586 if (who != RUSAGE_CHILDREN) { 1590 if (who != RUSAGE_CHILDREN) {
1587 struct mm_struct *mm = get_task_mm(p); 1591 struct mm_struct *mm = get_task_mm(p);
1592
1588 if (mm) { 1593 if (mm) {
1589 setmax_mm_hiwater_rss(&maxrss, mm); 1594 setmax_mm_hiwater_rss(&maxrss, mm);
1590 mmput(mm); 1595 mmput(mm);
@@ -1596,6 +1601,7 @@ out:
1596int getrusage(struct task_struct *p, int who, struct rusage __user *ru) 1601int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
1597{ 1602{
1598 struct rusage r; 1603 struct rusage r;
1604
1599 k_getrusage(p, who, &r); 1605 k_getrusage(p, who, &r);
1600 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0; 1606 return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
1601} 1607}
@@ -1628,12 +1634,14 @@ SYSCALL_DEFINE1(umask, int, mask)
1628 return mask; 1634 return mask;
1629} 1635}
1630 1636
1631static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) 1637static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
1632{ 1638{
1633 struct fd exe; 1639 struct fd exe;
1634 struct inode *inode; 1640 struct inode *inode;
1635 int err; 1641 int err;
1636 1642
1643 VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
1644
1637 exe = fdget(fd); 1645 exe = fdget(fd);
1638 if (!exe.file) 1646 if (!exe.file)
1639 return -EBADF; 1647 return -EBADF;
@@ -1654,8 +1662,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1654 if (err) 1662 if (err)
1655 goto exit; 1663 goto exit;
1656 1664
1657 down_write(&mm->mmap_sem);
1658
1659 /* 1665 /*
1660 * Forbid mm->exe_file change if old file still mapped. 1666 * Forbid mm->exe_file change if old file still mapped.
1661 */ 1667 */
@@ -1667,7 +1673,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1667 if (vma->vm_file && 1673 if (vma->vm_file &&
1668 path_equal(&vma->vm_file->f_path, 1674 path_equal(&vma->vm_file->f_path,
1669 &mm->exe_file->f_path)) 1675 &mm->exe_file->f_path))
1670 goto exit_unlock; 1676 goto exit;
1671 } 1677 }
1672 1678
1673 /* 1679 /*
@@ -1678,34 +1684,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
1678 */ 1684 */
1679 err = -EPERM; 1685 err = -EPERM;
1680 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) 1686 if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
1681 goto exit_unlock; 1687 goto exit;
1682 1688
1683 err = 0; 1689 err = 0;
1684 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */ 1690 set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
1685exit_unlock:
1686 up_write(&mm->mmap_sem);
1687
1688exit: 1691exit:
1689 fdput(exe); 1692 fdput(exe);
1690 return err; 1693 return err;
1691} 1694}
1692 1695
1696#ifdef CONFIG_CHECKPOINT_RESTORE
1697/*
 1698 * WARNING: we don't require any capability here, so be very careful
 1699 * about what userspace is allowed to modify.
1700 */
1701static int validate_prctl_map(struct prctl_mm_map *prctl_map)
1702{
1703 unsigned long mmap_max_addr = TASK_SIZE;
1704 struct mm_struct *mm = current->mm;
1705 int error = -EINVAL, i;
1706
1707 static const unsigned char offsets[] = {
1708 offsetof(struct prctl_mm_map, start_code),
1709 offsetof(struct prctl_mm_map, end_code),
1710 offsetof(struct prctl_mm_map, start_data),
1711 offsetof(struct prctl_mm_map, end_data),
1712 offsetof(struct prctl_mm_map, start_brk),
1713 offsetof(struct prctl_mm_map, brk),
1714 offsetof(struct prctl_mm_map, start_stack),
1715 offsetof(struct prctl_mm_map, arg_start),
1716 offsetof(struct prctl_mm_map, arg_end),
1717 offsetof(struct prctl_mm_map, env_start),
1718 offsetof(struct prctl_mm_map, env_end),
1719 };
1720
1721 /*
 1722 * Make sure the members are not somewhere outside
 1723 * of the allowed address space.
1724 */
1725 for (i = 0; i < ARRAY_SIZE(offsets); i++) {
1726 u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
1727
1728 if ((unsigned long)val >= mmap_max_addr ||
1729 (unsigned long)val < mmap_min_addr)
1730 goto out;
1731 }
1732
1733 /*
1734 * Make sure the pairs are ordered.
1735 */
1736#define __prctl_check_order(__m1, __op, __m2) \
1737 ((unsigned long)prctl_map->__m1 __op \
1738 (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
1739 error = __prctl_check_order(start_code, <, end_code);
1740 error |= __prctl_check_order(start_data, <, end_data);
1741 error |= __prctl_check_order(start_brk, <=, brk);
1742 error |= __prctl_check_order(arg_start, <=, arg_end);
1743 error |= __prctl_check_order(env_start, <=, env_end);
1744 if (error)
1745 goto out;
1746#undef __prctl_check_order
1747
1748 error = -EINVAL;
1749
1750 /*
1751 * @brk should be after @end_data in traditional maps.
1752 */
1753 if (prctl_map->start_brk <= prctl_map->end_data ||
1754 prctl_map->brk <= prctl_map->end_data)
1755 goto out;
1756
1757 /*
 1758 * Nor should we allow the limits to be overridden if they are set.
1759 */
1760 if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
1761 prctl_map->start_brk, prctl_map->end_data,
1762 prctl_map->start_data))
1763 goto out;
1764
1765 /*
1766 * Someone is trying to cheat the auxv vector.
1767 */
1768 if (prctl_map->auxv_size) {
1769 if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
1770 goto out;
1771 }
1772
1773 /*
1774 * Finally, make sure the caller has the rights to
1775 * change /proc/pid/exe link: only local root should
1776 * be allowed to.
1777 */
1778 if (prctl_map->exe_fd != (u32)-1) {
1779 struct user_namespace *ns = current_user_ns();
1780 const struct cred *cred = current_cred();
1781
1782 if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
1783 !gid_eq(cred->gid, make_kgid(ns, 0)))
1784 goto out;
1785 }
1786
1787 error = 0;
1788out:
1789 return error;
1790}
1791
1792static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
1793{
1794 struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
1795 unsigned long user_auxv[AT_VECTOR_SIZE];
1796 struct mm_struct *mm = current->mm;
1797 int error;
1798
1799 BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
1800 BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
1801
1802 if (opt == PR_SET_MM_MAP_SIZE)
1803 return put_user((unsigned int)sizeof(prctl_map),
1804 (unsigned int __user *)addr);
1805
1806 if (data_size != sizeof(prctl_map))
1807 return -EINVAL;
1808
1809 if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
1810 return -EFAULT;
1811
1812 error = validate_prctl_map(&prctl_map);
1813 if (error)
1814 return error;
1815
1816 if (prctl_map.auxv_size) {
1817 memset(user_auxv, 0, sizeof(user_auxv));
1818 if (copy_from_user(user_auxv,
1819 (const void __user *)prctl_map.auxv,
1820 prctl_map.auxv_size))
1821 return -EFAULT;
1822
1823 /* Last entry must be AT_NULL as specification requires */
1824 user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
1825 user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
1826 }
1827
1828 down_write(&mm->mmap_sem);
1829 if (prctl_map.exe_fd != (u32)-1)
1830 error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
1831 downgrade_write(&mm->mmap_sem);
1832 if (error)
1833 goto out;
1834
1835 /*
 1836 * We don't validate whether these members point to real,
 1837 * present VMAs, because the application may already have the
 1838 * corresponding VMAs unmapped, and the kernel mostly uses these
 1839 * members for statistics output in procfs, except for
 1840 *
 1841 * - @start_brk/@brk, which are used in do_brk, but the kernel looks
 1842 * up the VMAs when updating these members, so anything wrong written
 1843 * here makes the kernel swear at the userspace program but won't
 1844 * lead to any problem in the kernel itself
1845 */
1846
1847 mm->start_code = prctl_map.start_code;
1848 mm->end_code = prctl_map.end_code;
1849 mm->start_data = prctl_map.start_data;
1850 mm->end_data = prctl_map.end_data;
1851 mm->start_brk = prctl_map.start_brk;
1852 mm->brk = prctl_map.brk;
1853 mm->start_stack = prctl_map.start_stack;
1854 mm->arg_start = prctl_map.arg_start;
1855 mm->arg_end = prctl_map.arg_end;
1856 mm->env_start = prctl_map.env_start;
1857 mm->env_end = prctl_map.env_end;
1858
1859 /*
 1860 * Note this update of @saved_auxv is lockless, thus
 1861 * if someone reads this member in procfs while we're
 1862 * updating it, they may get partly updated results. It's
 1863 * a known and acceptable trade-off: we leave it as is so as
 1864 * not to introduce additional locks here, which would make the
 1865 * kernel more complex.
1866 */
1867 if (prctl_map.auxv_size)
1868 memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
1869
1870 error = 0;
1871out:
1872 up_read(&mm->mmap_sem);
1873 return error;
1874}
1875#endif /* CONFIG_CHECKPOINT_RESTORE */
1876
1693static int prctl_set_mm(int opt, unsigned long addr, 1877static int prctl_set_mm(int opt, unsigned long addr,
1694 unsigned long arg4, unsigned long arg5) 1878 unsigned long arg4, unsigned long arg5)
1695{ 1879{
1696 unsigned long rlim = rlimit(RLIMIT_DATA);
1697 struct mm_struct *mm = current->mm; 1880 struct mm_struct *mm = current->mm;
1698 struct vm_area_struct *vma; 1881 struct vm_area_struct *vma;
1699 int error; 1882 int error;
1700 1883
1701 if (arg5 || (arg4 && opt != PR_SET_MM_AUXV)) 1884 if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
1885 opt != PR_SET_MM_MAP &&
1886 opt != PR_SET_MM_MAP_SIZE)))
1702 return -EINVAL; 1887 return -EINVAL;
1703 1888
1889#ifdef CONFIG_CHECKPOINT_RESTORE
1890 if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
1891 return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
1892#endif
1893
1704 if (!capable(CAP_SYS_RESOURCE)) 1894 if (!capable(CAP_SYS_RESOURCE))
1705 return -EPERM; 1895 return -EPERM;
1706 1896
1707 if (opt == PR_SET_MM_EXE_FILE) 1897 if (opt == PR_SET_MM_EXE_FILE) {
1708 return prctl_set_mm_exe_file(mm, (unsigned int)addr); 1898 down_write(&mm->mmap_sem);
1899 error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
1900 up_write(&mm->mmap_sem);
1901 return error;
1902 }
1709 1903
1710 if (addr >= TASK_SIZE || addr < mmap_min_addr) 1904 if (addr >= TASK_SIZE || addr < mmap_min_addr)
1711 return -EINVAL; 1905 return -EINVAL;
@@ -1733,9 +1927,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1733 if (addr <= mm->end_data) 1927 if (addr <= mm->end_data)
1734 goto out; 1928 goto out;
1735 1929
1736 if (rlim < RLIM_INFINITY && 1930 if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
1737 (mm->brk - addr) + 1931 mm->end_data, mm->start_data))
1738 (mm->end_data - mm->start_data) > rlim)
1739 goto out; 1932 goto out;
1740 1933
1741 mm->start_brk = addr; 1934 mm->start_brk = addr;
@@ -1745,9 +1938,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
1745 if (addr <= mm->end_data) 1938 if (addr <= mm->end_data)
1746 goto out; 1939 goto out;
1747 1940
1748 if (rlim < RLIM_INFINITY && 1941 if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
1749 (addr - mm->start_brk) + 1942 mm->end_data, mm->start_data))
1750 (mm->end_data - mm->start_data) > rlim)
1751 goto out; 1943 goto out;
1752 1944
1753 mm->brk = addr; 1945 mm->brk = addr;
@@ -2023,6 +2215,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
2023{ 2215{
2024 int err = 0; 2216 int err = 0;
2025 int cpu = raw_smp_processor_id(); 2217 int cpu = raw_smp_processor_id();
2218
2026 if (cpup) 2219 if (cpup)
2027 err |= put_user(cpu, cpup); 2220 err |= put_user(cpu, cpup);
2028 if (nodep) 2221 if (nodep)
@@ -2135,7 +2328,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
2135 /* Check to see if any memory value is too large for 32-bit and scale 2328 /* Check to see if any memory value is too large for 32-bit and scale
2136 * down if needed 2329 * down if needed
2137 */ 2330 */
2138 if ((s.totalram >> 32) || (s.totalswap >> 32)) { 2331 if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
2139 int bitcount = 0; 2332 int bitcount = 0;
2140 2333
2141 while (s.mem_unit < PAGE_SIZE) { 2334 while (s.mem_unit < PAGE_SIZE) {
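The sys.c changes above add prctl_set_mm_map(), which (under CONFIG_CHECKPOINT_RESTORE) validates an entire struct prctl_mm_map and applies it under mmap_sem in one call, instead of one PR_SET_MM_* field at a time. A user-space sketch of the expected calling sequence, assuming the PR_SET_MM_MAP/PR_SET_MM_MAP_SIZE constants and struct prctl_mm_map that this series exposes via <linux/prctl.h>; error handling is minimal and the restored values come from a previously captured map:

	#include <string.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int restore_mm_layout(const struct prctl_mm_map *saved)
	{
		struct prctl_mm_map map;
		unsigned int size = 0;

		/* Ask the kernel which struct size it expects (ABI check). */
		if (prctl(PR_SET_MM, PR_SET_MM_MAP_SIZE, &size, 0, 0))
			return -1;
		if (size != sizeof(map))
			return -1;	/* header and kernel disagree */

		memcpy(&map, saved, sizeof(map));
		map.exe_fd = -1;	/* skip the /proc/self/exe change */

		/* Checked by validate_prctl_map(), then applied in one go. */
		return prctl(PR_SET_MM, PR_SET_MM_MAP, &map, sizeof(map), 0);
	}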
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2904a2105914..02aa4185b17e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -25,6 +25,7 @@ cond_syscall(sys_swapon);
25cond_syscall(sys_swapoff); 25cond_syscall(sys_swapoff);
26cond_syscall(sys_kexec_load); 26cond_syscall(sys_kexec_load);
27cond_syscall(compat_sys_kexec_load); 27cond_syscall(compat_sys_kexec_load);
28cond_syscall(sys_kexec_file_load);
28cond_syscall(sys_init_module); 29cond_syscall(sys_init_module);
29cond_syscall(sys_finit_module); 30cond_syscall(sys_finit_module);
30cond_syscall(sys_delete_module); 31cond_syscall(sys_delete_module);
@@ -155,6 +156,9 @@ cond_syscall(sys_process_vm_writev);
155cond_syscall(compat_sys_process_vm_readv); 156cond_syscall(compat_sys_process_vm_readv);
156cond_syscall(compat_sys_process_vm_writev); 157cond_syscall(compat_sys_process_vm_writev);
157cond_syscall(sys_uselib); 158cond_syscall(sys_uselib);
159cond_syscall(sys_fadvise64);
160cond_syscall(sys_fadvise64_64);
161cond_syscall(sys_madvise);
158 162
159/* arch-specific weak syscall entries */ 163/* arch-specific weak syscall entries */
160cond_syscall(sys_pciconfig_read); 164cond_syscall(sys_pciconfig_read);
@@ -197,6 +201,7 @@ cond_syscall(compat_sys_timerfd_settime);
197cond_syscall(compat_sys_timerfd_gettime); 201cond_syscall(compat_sys_timerfd_gettime);
198cond_syscall(sys_eventfd); 202cond_syscall(sys_eventfd);
199cond_syscall(sys_eventfd2); 203cond_syscall(sys_eventfd2);
204cond_syscall(sys_memfd_create);
200 205
201/* performance counters: */ 206/* performance counters: */
202cond_syscall(sys_perf_event_open); 207cond_syscall(sys_perf_event_open);
@@ -216,3 +221,6 @@ cond_syscall(sys_kcmp);
216 221
217/* operate on Secure Computing state */ 222/* operate on Secure Computing state */
218cond_syscall(sys_seccomp); 223cond_syscall(sys_seccomp);
224
225/* access BPF programs and maps */
226cond_syscall(sys_bpf);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 75b22e22a72c..15f2511a1b7c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -387,7 +387,8 @@ static struct ctl_table kern_table[] = {
387 .data = &sysctl_numa_balancing_scan_size, 387 .data = &sysctl_numa_balancing_scan_size,
388 .maxlen = sizeof(unsigned int), 388 .maxlen = sizeof(unsigned int),
389 .mode = 0644, 389 .mode = 0644,
390 .proc_handler = proc_dointvec, 390 .proc_handler = proc_dointvec_minmax,
391 .extra1 = &one,
391 }, 392 },
392 { 393 {
393 .procname = "numa_balancing", 394 .procname = "numa_balancing",
@@ -1055,15 +1056,6 @@ static struct ctl_table kern_table[] = {
1055 .child = key_sysctls, 1056 .child = key_sysctls,
1056 }, 1057 },
1057#endif 1058#endif
1058#ifdef CONFIG_RCU_TORTURE_TEST
1059 {
1060 .procname = "rcutorture_runnable",
1061 .data = &rcutorture_runnable,
1062 .maxlen = sizeof(int),
1063 .mode = 0644,
1064 .proc_handler = proc_dointvec,
1065 },
1066#endif
1067#ifdef CONFIG_PERF_EVENTS 1059#ifdef CONFIG_PERF_EVENTS
1068 /* 1060 /*
1069 * User-space scripts rely on the existence of this file 1061 * User-space scripts rely on the existence of this file
@@ -1240,8 +1232,7 @@ static struct ctl_table vm_table[] = {
1240 .maxlen = sizeof(unsigned long), 1232 .maxlen = sizeof(unsigned long),
1241 .mode = 0644, 1233 .mode = 0644,
1242 .proc_handler = hugetlb_sysctl_handler, 1234 .proc_handler = hugetlb_sysctl_handler,
1243 .extra1 = (void *)&hugetlb_zero, 1235 .extra1 = &zero,
1244 .extra2 = (void *)&hugetlb_infinity,
1245 }, 1236 },
1246#ifdef CONFIG_NUMA 1237#ifdef CONFIG_NUMA
1247 { 1238 {
@@ -1250,8 +1241,7 @@ static struct ctl_table vm_table[] = {
1250 .maxlen = sizeof(unsigned long), 1241 .maxlen = sizeof(unsigned long),
1251 .mode = 0644, 1242 .mode = 0644,
1252 .proc_handler = &hugetlb_mempolicy_sysctl_handler, 1243 .proc_handler = &hugetlb_mempolicy_sysctl_handler,
1253 .extra1 = (void *)&hugetlb_zero, 1244 .extra1 = &zero,
1254 .extra2 = (void *)&hugetlb_infinity,
1255 }, 1245 },
1256#endif 1246#endif
1257 { 1247 {
@@ -1274,8 +1264,7 @@ static struct ctl_table vm_table[] = {
1274 .maxlen = sizeof(unsigned long), 1264 .maxlen = sizeof(unsigned long),
1275 .mode = 0644, 1265 .mode = 0644,
1276 .proc_handler = hugetlb_overcommit_handler, 1266 .proc_handler = hugetlb_overcommit_handler,
1277 .extra1 = (void *)&hugetlb_zero, 1267 .extra1 = &zero,
1278 .extra2 = (void *)&hugetlb_infinity,
1279 }, 1268 },
1280#endif 1269#endif
1281 { 1270 {
@@ -1463,13 +1452,6 @@ static struct ctl_table vm_table[] = {
1463 .extra2 = &one, 1452 .extra2 = &one,
1464 }, 1453 },
1465#endif 1454#endif
1466 {
1467 .procname = "scan_unevictable_pages",
1468 .data = &scan_unevictable_pages,
1469 .maxlen = sizeof(scan_unevictable_pages),
1470 .mode = 0644,
1471 .proc_handler = scan_unevictable_handler,
1472 },
1473#ifdef CONFIG_MEMORY_FAILURE 1455#ifdef CONFIG_MEMORY_FAILURE
1474 { 1456 {
1475 .procname = "memory_failure_early_kill", 1457 .procname = "memory_failure_early_kill",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e4ba9a5a5ccb..9a4f750a2963 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" }, 390 { CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" }, 391 { CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" }, 392 { CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
393 { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
394 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" }, 393 { CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
395 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" }, 394 { CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
396 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" }, 395 { CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 13d2f7cd65db..b312fcc73024 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -638,7 +638,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
638 fill_tgid_exit(tsk); 638 fill_tgid_exit(tsk);
639 } 639 }
640 640
641 listeners = __this_cpu_ptr(&listener_array); 641 listeners = raw_cpu_ptr(&listener_array);
642 if (list_empty(&listeners->list)) 642 if (list_empty(&listeners->list))
643 return; 643 return;
644 644
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
index 12d6ebbfdd83..0dbab6d1acb4 100644
--- a/kernel/test_kprobes.c
+++ b/kernel/test_kprobes.c
@@ -14,6 +14,8 @@
14 * the GNU General Public License for more details. 14 * the GNU General Public License for more details.
15 */ 15 */
16 16
17#define pr_fmt(fmt) "Kprobe smoke test: " fmt
18
17#include <linux/kernel.h> 19#include <linux/kernel.h>
18#include <linux/kprobes.h> 20#include <linux/kprobes.h>
19#include <linux/random.h> 21#include <linux/random.h>
@@ -41,8 +43,7 @@ static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
41{ 43{
42 if (preh_val != (rand1 / div_factor)) { 44 if (preh_val != (rand1 / div_factor)) {
43 handler_errors++; 45 handler_errors++;
44 printk(KERN_ERR "Kprobe smoke test failed: " 46 pr_err("incorrect value in post_handler\n");
45 "incorrect value in post_handler\n");
46 } 47 }
47 posth_val = preh_val + div_factor; 48 posth_val = preh_val + div_factor;
48} 49}
@@ -59,8 +60,7 @@ static int test_kprobe(void)
59 60
60 ret = register_kprobe(&kp); 61 ret = register_kprobe(&kp);
61 if (ret < 0) { 62 if (ret < 0) {
62 printk(KERN_ERR "Kprobe smoke test failed: " 63 pr_err("register_kprobe returned %d\n", ret);
63 "register_kprobe returned %d\n", ret);
64 return ret; 64 return ret;
65 } 65 }
66 66
@@ -68,14 +68,12 @@ static int test_kprobe(void)
68 unregister_kprobe(&kp); 68 unregister_kprobe(&kp);
69 69
70 if (preh_val == 0) { 70 if (preh_val == 0) {
71 printk(KERN_ERR "Kprobe smoke test failed: " 71 pr_err("kprobe pre_handler not called\n");
72 "kprobe pre_handler not called\n");
73 handler_errors++; 72 handler_errors++;
74 } 73 }
75 74
76 if (posth_val == 0) { 75 if (posth_val == 0) {
77 printk(KERN_ERR "Kprobe smoke test failed: " 76 pr_err("kprobe post_handler not called\n");
78 "kprobe post_handler not called\n");
79 handler_errors++; 77 handler_errors++;
80 } 78 }
81 79
@@ -98,8 +96,7 @@ static void kp_post_handler2(struct kprobe *p, struct pt_regs *regs,
98{ 96{
99 if (preh_val != (rand1 / div_factor) + 1) { 97 if (preh_val != (rand1 / div_factor) + 1) {
100 handler_errors++; 98 handler_errors++;
101 printk(KERN_ERR "Kprobe smoke test failed: " 99 pr_err("incorrect value in post_handler2\n");
102 "incorrect value in post_handler2\n");
103 } 100 }
104 posth_val = preh_val + div_factor; 101 posth_val = preh_val + div_factor;
105} 102}
@@ -120,8 +117,7 @@ static int test_kprobes(void)
120 kp.flags = 0; 117 kp.flags = 0;
121 ret = register_kprobes(kps, 2); 118 ret = register_kprobes(kps, 2);
122 if (ret < 0) { 119 if (ret < 0) {
123 printk(KERN_ERR "Kprobe smoke test failed: " 120 pr_err("register_kprobes returned %d\n", ret);
124 "register_kprobes returned %d\n", ret);
125 return ret; 121 return ret;
126 } 122 }
127 123
@@ -130,14 +126,12 @@ static int test_kprobes(void)
130 ret = target(rand1); 126 ret = target(rand1);
131 127
132 if (preh_val == 0) { 128 if (preh_val == 0) {
133 printk(KERN_ERR "Kprobe smoke test failed: " 129 pr_err("kprobe pre_handler not called\n");
134 "kprobe pre_handler not called\n");
135 handler_errors++; 130 handler_errors++;
136 } 131 }
137 132
138 if (posth_val == 0) { 133 if (posth_val == 0) {
139 printk(KERN_ERR "Kprobe smoke test failed: " 134 pr_err("kprobe post_handler not called\n");
140 "kprobe post_handler not called\n");
141 handler_errors++; 135 handler_errors++;
142 } 136 }
143 137
@@ -146,14 +140,12 @@ static int test_kprobes(void)
146 ret = target2(rand1); 140 ret = target2(rand1);
147 141
148 if (preh_val == 0) { 142 if (preh_val == 0) {
149 printk(KERN_ERR "Kprobe smoke test failed: " 143 pr_err("kprobe pre_handler2 not called\n");
150 "kprobe pre_handler2 not called\n");
151 handler_errors++; 144 handler_errors++;
152 } 145 }
153 146
154 if (posth_val == 0) { 147 if (posth_val == 0) {
155 printk(KERN_ERR "Kprobe smoke test failed: " 148 pr_err("kprobe post_handler2 not called\n");
156 "kprobe post_handler2 not called\n");
157 handler_errors++; 149 handler_errors++;
158 } 150 }
159 151
@@ -166,8 +158,7 @@ static u32 j_kprobe_target(u32 value)
166{ 158{
167 if (value != rand1) { 159 if (value != rand1) {
168 handler_errors++; 160 handler_errors++;
169 printk(KERN_ERR "Kprobe smoke test failed: " 161 pr_err("incorrect value in jprobe handler\n");
170 "incorrect value in jprobe handler\n");
171 } 162 }
172 163
173 jph_val = rand1; 164 jph_val = rand1;
@@ -186,16 +177,14 @@ static int test_jprobe(void)
186 177
187 ret = register_jprobe(&jp); 178 ret = register_jprobe(&jp);
188 if (ret < 0) { 179 if (ret < 0) {
189 printk(KERN_ERR "Kprobe smoke test failed: " 180 pr_err("register_jprobe returned %d\n", ret);
190 "register_jprobe returned %d\n", ret);
191 return ret; 181 return ret;
192 } 182 }
193 183
194 ret = target(rand1); 184 ret = target(rand1);
195 unregister_jprobe(&jp); 185 unregister_jprobe(&jp);
196 if (jph_val == 0) { 186 if (jph_val == 0) {
197 printk(KERN_ERR "Kprobe smoke test failed: " 187 pr_err("jprobe handler not called\n");
198 "jprobe handler not called\n");
199 handler_errors++; 188 handler_errors++;
200 } 189 }
201 190
@@ -217,24 +206,21 @@ static int test_jprobes(void)
217 jp.kp.flags = 0; 206 jp.kp.flags = 0;
218 ret = register_jprobes(jps, 2); 207 ret = register_jprobes(jps, 2);
219 if (ret < 0) { 208 if (ret < 0) {
220 printk(KERN_ERR "Kprobe smoke test failed: " 209 pr_err("register_jprobes returned %d\n", ret);
221 "register_jprobes returned %d\n", ret);
222 return ret; 210 return ret;
223 } 211 }
224 212
225 jph_val = 0; 213 jph_val = 0;
226 ret = target(rand1); 214 ret = target(rand1);
227 if (jph_val == 0) { 215 if (jph_val == 0) {
228 printk(KERN_ERR "Kprobe smoke test failed: " 216 pr_err("jprobe handler not called\n");
229 "jprobe handler not called\n");
230 handler_errors++; 217 handler_errors++;
231 } 218 }
232 219
233 jph_val = 0; 220 jph_val = 0;
234 ret = target2(rand1); 221 ret = target2(rand1);
235 if (jph_val == 0) { 222 if (jph_val == 0) {
236 printk(KERN_ERR "Kprobe smoke test failed: " 223 pr_err("jprobe handler2 not called\n");
237 "jprobe handler2 not called\n");
238 handler_errors++; 224 handler_errors++;
239 } 225 }
240 unregister_jprobes(jps, 2); 226 unregister_jprobes(jps, 2);
@@ -256,13 +242,11 @@ static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
256 242
257 if (ret != (rand1 / div_factor)) { 243 if (ret != (rand1 / div_factor)) {
258 handler_errors++; 244 handler_errors++;
259 printk(KERN_ERR "Kprobe smoke test failed: " 245 pr_err("incorrect value in kretprobe handler\n");
260 "incorrect value in kretprobe handler\n");
261 } 246 }
262 if (krph_val == 0) { 247 if (krph_val == 0) {
263 handler_errors++; 248 handler_errors++;
264 printk(KERN_ERR "Kprobe smoke test failed: " 249 pr_err("call to kretprobe entry handler failed\n");
265 "call to kretprobe entry handler failed\n");
266 } 250 }
267 251
268 krph_val = rand1; 252 krph_val = rand1;
@@ -281,16 +265,14 @@ static int test_kretprobe(void)
281 265
282 ret = register_kretprobe(&rp); 266 ret = register_kretprobe(&rp);
283 if (ret < 0) { 267 if (ret < 0) {
284 printk(KERN_ERR "Kprobe smoke test failed: " 268 pr_err("register_kretprobe returned %d\n", ret);
285 "register_kretprobe returned %d\n", ret);
286 return ret; 269 return ret;
287 } 270 }
288 271
289 ret = target(rand1); 272 ret = target(rand1);
290 unregister_kretprobe(&rp); 273 unregister_kretprobe(&rp);
291 if (krph_val != rand1) { 274 if (krph_val != rand1) {
292 printk(KERN_ERR "Kprobe smoke test failed: " 275 pr_err("kretprobe handler not called\n");
293 "kretprobe handler not called\n");
294 handler_errors++; 276 handler_errors++;
295 } 277 }
296 278
@@ -303,13 +285,11 @@ static int return_handler2(struct kretprobe_instance *ri, struct pt_regs *regs)
303 285
304 if (ret != (rand1 / div_factor) + 1) { 286 if (ret != (rand1 / div_factor) + 1) {
305 handler_errors++; 287 handler_errors++;
306 printk(KERN_ERR "Kprobe smoke test failed: " 288 pr_err("incorrect value in kretprobe handler2\n");
307 "incorrect value in kretprobe handler2\n");
308 } 289 }
309 if (krph_val == 0) { 290 if (krph_val == 0) {
310 handler_errors++; 291 handler_errors++;
311 printk(KERN_ERR "Kprobe smoke test failed: " 292 pr_err("call to kretprobe entry handler failed\n");
312 "call to kretprobe entry handler failed\n");
313 } 293 }
314 294
315 krph_val = rand1; 295 krph_val = rand1;
@@ -332,24 +312,21 @@ static int test_kretprobes(void)
332 rp.kp.flags = 0; 312 rp.kp.flags = 0;
333 ret = register_kretprobes(rps, 2); 313 ret = register_kretprobes(rps, 2);
334 if (ret < 0) { 314 if (ret < 0) {
335 printk(KERN_ERR "Kprobe smoke test failed: " 315 pr_err("register_kretprobe returned %d\n", ret);
336 "register_kretprobe returned %d\n", ret);
337 return ret; 316 return ret;
338 } 317 }
339 318
340 krph_val = 0; 319 krph_val = 0;
341 ret = target(rand1); 320 ret = target(rand1);
342 if (krph_val != rand1) { 321 if (krph_val != rand1) {
343 printk(KERN_ERR "Kprobe smoke test failed: " 322 pr_err("kretprobe handler not called\n");
344 "kretprobe handler not called\n");
345 handler_errors++; 323 handler_errors++;
346 } 324 }
347 325
348 krph_val = 0; 326 krph_val = 0;
349 ret = target2(rand1); 327 ret = target2(rand1);
350 if (krph_val != rand1) { 328 if (krph_val != rand1) {
351 printk(KERN_ERR "Kprobe smoke test failed: " 329 pr_err("kretprobe handler2 not called\n");
352 "kretprobe handler2 not called\n");
353 handler_errors++; 330 handler_errors++;
354 } 331 }
355 unregister_kretprobes(rps, 2); 332 unregister_kretprobes(rps, 2);
@@ -368,7 +345,7 @@ int init_test_probes(void)
368 rand1 = prandom_u32(); 345 rand1 = prandom_u32();
369 } while (rand1 <= div_factor); 346 } while (rand1 <= div_factor);
370 347
371 printk(KERN_INFO "Kprobe smoke test started\n"); 348 pr_info("started\n");
372 num_tests++; 349 num_tests++;
373 ret = test_kprobe(); 350 ret = test_kprobe();
374 if (ret < 0) 351 if (ret < 0)
@@ -402,13 +379,11 @@ int init_test_probes(void)
402#endif /* CONFIG_KRETPROBES */ 379#endif /* CONFIG_KRETPROBES */
403 380
404 if (errors) 381 if (errors)
405 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " 382 pr_err("BUG: %d out of %d tests failed\n", errors, num_tests);
406 "%d tests failed\n", errors, num_tests);
407 else if (handler_errors) 383 else if (handler_errors)
408 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " 384 pr_err("BUG: %d error(s) running handlers\n", handler_errors);
409 "running handlers\n", handler_errors);
410 else 385 else
411 printk(KERN_INFO "Kprobe smoke test passed successfully\n"); 386 pr_info("passed successfully\n");
412 387
413 return 0; 388 return 0;
414} 389}
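The test_kprobes.c conversion relies on the pr_fmt() convention: defining the macro before the printk headers are pulled in makes every pr_err()/pr_info() in the file carry the prefix automatically, which is what lets the repeated "Kprobe smoke test failed:" strings collapse. A minimal illustration with an example prefix:

	#define pr_fmt(fmt) "my_module: " fmt	/* must precede the includes */

	#include <linux/kernel.h>
	#include <linux/printk.h>

	static void report_failure(int err)
	{
		/* Logs e.g. "my_module: probe failed: -22" at KERN_ERR level. */
		pr_err("probe failed: %d\n", err);
	}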
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 4aec4a457431..a7077d3ae52f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -464,18 +464,26 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid)
464static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, 464static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
465 ktime_t now) 465 ktime_t now)
466{ 466{
467 unsigned long flags;
467 struct k_itimer *ptr = container_of(alarm, struct k_itimer, 468 struct k_itimer *ptr = container_of(alarm, struct k_itimer,
468 it.alarm.alarmtimer); 469 it.alarm.alarmtimer);
469 if (posix_timer_event(ptr, 0) != 0) 470 enum alarmtimer_restart result = ALARMTIMER_NORESTART;
470 ptr->it_overrun++; 471
472 spin_lock_irqsave(&ptr->it_lock, flags);
473 if ((ptr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) {
474 if (posix_timer_event(ptr, 0) != 0)
475 ptr->it_overrun++;
476 }
471 477
472 /* Re-add periodic timers */ 478 /* Re-add periodic timers */
473 if (ptr->it.alarm.interval.tv64) { 479 if (ptr->it.alarm.interval.tv64) {
474 ptr->it_overrun += alarm_forward(alarm, now, 480 ptr->it_overrun += alarm_forward(alarm, now,
475 ptr->it.alarm.interval); 481 ptr->it.alarm.interval);
476 return ALARMTIMER_RESTART; 482 result = ALARMTIMER_RESTART;
477 } 483 }
478 return ALARMTIMER_NORESTART; 484 spin_unlock_irqrestore(&ptr->it_lock, flags);
485
486 return result;
479} 487}
480 488
481/** 489/**
@@ -541,18 +549,22 @@ static int alarm_timer_create(struct k_itimer *new_timer)
541 * @new_timer: k_itimer pointer 549 * @new_timer: k_itimer pointer
542 * @cur_setting: itimerspec data to fill 550 * @cur_setting: itimerspec data to fill
543 * 551 *
544 * Copies the itimerspec data out from the k_itimer 552 * Copies out the current itimerspec data
545 */ 553 */
546static void alarm_timer_get(struct k_itimer *timr, 554static void alarm_timer_get(struct k_itimer *timr,
547 struct itimerspec *cur_setting) 555 struct itimerspec *cur_setting)
548{ 556{
549 memset(cur_setting, 0, sizeof(struct itimerspec)); 557 ktime_t relative_expiry_time =
558 alarm_expires_remaining(&(timr->it.alarm.alarmtimer));
559
560 if (ktime_to_ns(relative_expiry_time) > 0) {
561 cur_setting->it_value = ktime_to_timespec(relative_expiry_time);
562 } else {
563 cur_setting->it_value.tv_sec = 0;
564 cur_setting->it_value.tv_nsec = 0;
565 }
550 566
551 cur_setting->it_interval = 567 cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval);
552 ktime_to_timespec(timr->it.alarm.interval);
553 cur_setting->it_value =
554 ktime_to_timespec(timr->it.alarm.alarmtimer.node.expires);
555 return;
556} 568}
557 569
558/** 570/**
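The alarm_timer_get() rework above makes timer_gettime() on the alarm clocks report the time remaining until expiry (clamped at zero) instead of leaking the absolute expiry value stored in the hrtimer node. A hedged user-space check of that behaviour; it assumes CLOCK_REALTIME_ALARM is usable (creating alarm timers needs CAP_WAKE_ALARM, and older glibc needs -lrt) and skips error handling:

	#include <signal.h>
	#include <stdio.h>
	#include <time.h>

	#ifndef CLOCK_REALTIME_ALARM
	#define CLOCK_REALTIME_ALARM 8	/* kernel UAPI value */
	#endif

	int main(void)
	{
		timer_t t;
		struct sigevent sev = { .sigev_notify = SIGEV_SIGNAL,
					.sigev_signo  = SIGALRM };
		struct itimerspec its = { .it_value.tv_sec = 30 };	/* one-shot */
		struct itimerspec cur;

		timer_create(CLOCK_REALTIME_ALARM, &sev, &t);
		timer_settime(t, 0, &its, NULL);
		timer_gettime(t, &cur);

		/* With the fix, it_value counts down from ~30s. */
		printf("%lld.%09ld s remaining\n",
		       (long long)cur.it_value.tv_sec, cur.it_value.tv_nsec);
		return 0;
	}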
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 9c94c19f1305..55449909f114 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,7 +72,7 @@ static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt,
72 * Also omit the add if it would overflow the u64 boundary. 72 * Also omit the add if it would overflow the u64 boundary.
73 */ 73 */
74 if ((~0ULL - clc > rnd) && 74 if ((~0ULL - clc > rnd) &&
75 (!ismax || evt->mult <= (1U << evt->shift))) 75 (!ismax || evt->mult <= (1ULL << evt->shift)))
76 clc += rnd; 76 clc += rnd;
77 77
78 do_div(clc, evt->mult); 78 do_div(clc, evt->mult);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..37e50aadd471 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -558,7 +558,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
558static int hrtimer_reprogram(struct hrtimer *timer, 558static int hrtimer_reprogram(struct hrtimer *timer,
559 struct hrtimer_clock_base *base) 559 struct hrtimer_clock_base *base)
560{ 560{
561 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 561 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); 562 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
563 int res; 563 int res;
564 564
@@ -629,7 +629,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
629 */ 629 */
630static void retrigger_next_event(void *arg) 630static void retrigger_next_event(void *arg)
631{ 631{
632 struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); 632 struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
633 633
634 if (!hrtimer_hres_active()) 634 if (!hrtimer_hres_active())
635 return; 635 return;
@@ -903,7 +903,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
903 */ 903 */
904 debug_deactivate(timer); 904 debug_deactivate(timer);
905 timer_stats_hrtimer_clear_start_info(timer); 905 timer_stats_hrtimer_clear_start_info(timer);
906 reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases); 906 reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
907 /* 907 /*
908 * We must preserve the CALLBACK state flag here, 908 * We must preserve the CALLBACK state flag here,
909 * otherwise we could move the timer base in 909 * otherwise we could move the timer base in
@@ -963,7 +963,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
963 * on dynticks target. 963 * on dynticks target.
964 */ 964 */
965 wake_up_nohz_cpu(new_base->cpu_base->cpu); 965 wake_up_nohz_cpu(new_base->cpu_base->cpu);
966 } else if (new_base->cpu_base == &__get_cpu_var(hrtimer_bases) && 966 } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
967 hrtimer_reprogram(timer, new_base)) { 967 hrtimer_reprogram(timer, new_base)) {
968 /* 968 /*
969 * Only allow reprogramming if the new base is on this CPU. 969 * Only allow reprogramming if the new base is on this CPU.
@@ -1103,7 +1103,7 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1103 */ 1103 */
1104ktime_t hrtimer_get_next_event(void) 1104ktime_t hrtimer_get_next_event(void)
1105{ 1105{
1106 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1106 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1107 struct hrtimer_clock_base *base = cpu_base->clock_base; 1107 struct hrtimer_clock_base *base = cpu_base->clock_base;
1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX }; 1108 ktime_t delta, mindelta = { .tv64 = KTIME_MAX };
1109 unsigned long flags; 1109 unsigned long flags;
@@ -1144,7 +1144,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
1144 1144
1145 memset(timer, 0, sizeof(struct hrtimer)); 1145 memset(timer, 0, sizeof(struct hrtimer));
1146 1146
1147 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1147 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1148 1148
1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) 1149 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
1150 clock_id = CLOCK_MONOTONIC; 1150 clock_id = CLOCK_MONOTONIC;
@@ -1187,7 +1187,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1187 struct hrtimer_cpu_base *cpu_base; 1187 struct hrtimer_cpu_base *cpu_base;
1188 int base = hrtimer_clockid_to_base(which_clock); 1188 int base = hrtimer_clockid_to_base(which_clock);
1189 1189
1190 cpu_base = &__raw_get_cpu_var(hrtimer_bases); 1190 cpu_base = raw_cpu_ptr(&hrtimer_bases);
1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); 1191 *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
1192 1192
1193 return 0; 1193 return 0;
@@ -1242,7 +1242,7 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
1242 */ 1242 */
1243void hrtimer_interrupt(struct clock_event_device *dev) 1243void hrtimer_interrupt(struct clock_event_device *dev)
1244{ 1244{
1245 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1245 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1246 ktime_t expires_next, now, entry_time, delta; 1246 ktime_t expires_next, now, entry_time, delta;
1247 int i, retries = 0; 1247 int i, retries = 0;
1248 1248
@@ -1376,7 +1376,7 @@ static void __hrtimer_peek_ahead_timers(void)
1376 if (!hrtimer_hres_active()) 1376 if (!hrtimer_hres_active())
1377 return; 1377 return;
1378 1378
1379 td = &__get_cpu_var(tick_cpu_device); 1379 td = this_cpu_ptr(&tick_cpu_device);
1380 if (td && td->evtdev) 1380 if (td && td->evtdev)
1381 hrtimer_interrupt(td->evtdev); 1381 hrtimer_interrupt(td->evtdev);
1382} 1382}
@@ -1440,7 +1440,7 @@ void hrtimer_run_pending(void)
1440void hrtimer_run_queues(void) 1440void hrtimer_run_queues(void)
1441{ 1441{
1442 struct timerqueue_node *node; 1442 struct timerqueue_node *node;
1443 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1443 struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
1444 struct hrtimer_clock_base *base; 1444 struct hrtimer_clock_base *base;
1445 int index, gettime = 1; 1445 int index, gettime = 1;
1446 1446
@@ -1679,7 +1679,7 @@ static void migrate_hrtimers(int scpu)
1679 1679
1680 local_irq_disable(); 1680 local_irq_disable();
1681 old_base = &per_cpu(hrtimer_bases, scpu); 1681 old_base = &per_cpu(hrtimer_bases, scpu);
1682 new_base = &__get_cpu_var(hrtimer_bases); 1682 new_base = this_cpu_ptr(&hrtimer_bases);
1683 /* 1683 /*
1684 * The caller is globally serialized and nobody else 1684 * The caller is globally serialized and nobody else
1685 * takes two locks at once, deadlock is not possible. 1685 * takes two locks at once, deadlock is not possible.
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..492b986195d5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 42b463ad90f2..31ea01f42e1f 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -636,6 +636,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
636 goto out; 636 goto out;
637 } 637 }
638 } else { 638 } else {
639 memset(&event.sigev_value, 0, sizeof(event.sigev_value));
639 event.sigev_notify = SIGEV_SIGNAL; 640 event.sigev_notify = SIGEV_SIGNAL;
640 event.sigev_signo = SIGALRM; 641 event.sigev_signo = SIGALRM;
641 event.sigev_value.sival_int = new_timer->it_id; 642 event.sigev_value.sival_int = new_timer->it_id;
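
The new memset in the SIGEV_SIGNAL fallback matters because sigev_value is a union: assigning sival_int alone initializes only part of it, and the remaining bytes would otherwise hold stale stack contents that can later reach userspace via the delivered signal's value. A minimal userspace sketch of the same pattern (42 is a stand-in for the timer id, not the kernel's value):

#include <signal.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct sigevent event;

    /* Zero the whole union first; writing sival_int alone would leave
     * the rest of sigev_value (e.g. the pointer half on 64-bit)
     * uninitialized. */
    memset(&event.sigev_value, 0, sizeof(event.sigev_value));
    event.sigev_notify = SIGEV_SIGNAL;
    event.sigev_signo = SIGALRM;
    event.sigev_value.sival_int = 42;   /* stand-in for the timer id */

    printf("sival_int=%d\n", event.sigev_value.sival_int);
    return 0;
}
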
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 64c5990fd500..066f0ec05e48 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -554,7 +554,7 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
554void tick_check_oneshot_broadcast_this_cpu(void) 554void tick_check_oneshot_broadcast_this_cpu(void)
555{ 555{
556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { 556 if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) {
557 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 557 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
558 558
559 /* 559 /*
560 * We might be in the middle of switching over from 560 * We might be in the middle of switching over from
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..7efeedf53ebd 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -224,7 +224,7 @@ static void tick_setup_device(struct tick_device *td,
224 224
225void tick_install_replacement(struct clock_event_device *newdev) 225void tick_install_replacement(struct clock_event_device *newdev)
226{ 226{
227 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 227 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
228 int cpu = smp_processor_id(); 228 int cpu = smp_processor_id();
229 229
230 clockevents_exchange_device(td->evtdev, newdev); 230 clockevents_exchange_device(td->evtdev, newdev);
@@ -374,14 +374,14 @@ void tick_shutdown(unsigned int *cpup)
374 374
375void tick_suspend(void) 375void tick_suspend(void)
376{ 376{
377 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 377 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
378 378
379 clockevents_shutdown(td->evtdev); 379 clockevents_shutdown(td->evtdev);
380} 380}
381 381
382void tick_resume(void) 382void tick_resume(void)
383{ 383{
384 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 384 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
385 int broadcast = tick_resume_broadcast(); 385 int broadcast = tick_resume_broadcast();
386 386
387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); 387 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME);
@@ -400,4 +400,5 @@ void tick_resume(void)
400void __init tick_init(void) 400void __init tick_init(void)
401{ 401{
402 tick_broadcast_init(); 402 tick_broadcast_init();
403 tick_nohz_init();
403} 404}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
99static inline bool tick_broadcast_oneshot_available(void) { return false; } 99static inline bool tick_broadcast_oneshot_available(void) { return false; }
100#endif /* !TICK_ONESHOT */ 100#endif /* !TICK_ONESHOT */
101 101
102/* NO_HZ_FULL internal */
103#ifdef CONFIG_NO_HZ_FULL
104extern void tick_nohz_init(void);
105# else
106static inline void tick_nohz_init(void) { }
107#endif
108
102/* 109/*
103 * Broadcasting support 110 * Broadcasting support
104 */ 111 */
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 824109060a33..7ce740e78e1b 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -59,7 +59,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
59 */ 59 */
60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) 60int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
61{ 61{
62 struct tick_device *td = &__get_cpu_var(tick_cpu_device); 62 struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
63 struct clock_event_device *dev = td->evtdev; 63 struct clock_event_device *dev = td->evtdev;
64 64
65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) || 65 if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT) ||
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 99aa6ee3908f..7b5741fc4110 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -205,7 +205,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now);
205 */ 205 */
206void __tick_nohz_full_check(void) 206void __tick_nohz_full_check(void)
207{ 207{
208 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 208 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
209 209
210 if (tick_nohz_full_cpu(smp_processor_id())) { 210 if (tick_nohz_full_cpu(smp_processor_id())) {
211 if (ts->tick_stopped && !is_idle_task(current)) { 211 if (ts->tick_stopped && !is_idle_task(current)) {
@@ -225,6 +225,20 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
225}; 225};
226 226
227/* 227/*
228 * Kick this CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary.
230 * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
231 * is NMI safe.
232 */
233void tick_nohz_full_kick(void)
234{
235 if (!tick_nohz_full_cpu(smp_processor_id()))
236 return;
237
238 irq_work_queue(&__get_cpu_var(nohz_full_kick_work));
239}
240
241/*
228 * Kick the CPU if it's full dynticks in order to force it to 242 * Kick the CPU if it's full dynticks in order to force it to
229 * re-evaluate its dependency on the tick and restart it if necessary. 243 * re-evaluate its dependency on the tick and restart it if necessary.
230 */ 244 */
@@ -281,22 +295,12 @@ out:
281/* Parse the boot-time nohz CPU list from the kernel parameters. */ 295/* Parse the boot-time nohz CPU list from the kernel parameters. */
282static int __init tick_nohz_full_setup(char *str) 296static int __init tick_nohz_full_setup(char *str)
283{ 297{
284 int cpu;
285
286 alloc_bootmem_cpumask_var(&tick_nohz_full_mask); 298 alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
287 alloc_bootmem_cpumask_var(&housekeeping_mask);
288 if (cpulist_parse(str, tick_nohz_full_mask) < 0) { 299 if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
289 pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); 300 pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
301 free_bootmem_cpumask_var(tick_nohz_full_mask);
290 return 1; 302 return 1;
291 } 303 }
292
293 cpu = smp_processor_id();
294 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
295 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
296 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
297 }
298 cpumask_andnot(housekeeping_mask,
299 cpu_possible_mask, tick_nohz_full_mask);
300 tick_nohz_full_running = true; 304 tick_nohz_full_running = true;
301 305
302 return 1; 306 return 1;
@@ -335,18 +339,11 @@ static int tick_nohz_init_all(void)
335 339
336#ifdef CONFIG_NO_HZ_FULL_ALL 340#ifdef CONFIG_NO_HZ_FULL_ALL
337 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { 341 if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
338 pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); 342 WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
339 return err;
340 }
341 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
342 pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
343 return err; 343 return err;
344 } 344 }
345 err = 0; 345 err = 0;
346 cpumask_setall(tick_nohz_full_mask); 346 cpumask_setall(tick_nohz_full_mask);
347 cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
348 cpumask_clear(housekeeping_mask);
349 cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
350 tick_nohz_full_running = true; 347 tick_nohz_full_running = true;
351#endif 348#endif
352 return err; 349 return err;
@@ -361,6 +358,37 @@ void __init tick_nohz_init(void)
361 return; 358 return;
362 } 359 }
363 360
361 if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
362 WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
363 cpumask_clear(tick_nohz_full_mask);
364 tick_nohz_full_running = false;
365 return;
366 }
367
368 /*
369 * Full dynticks uses irq work to drive the tick rescheduling on safe
370 * locking contexts. But then we need irq work to raise its own
371 * interrupts to avoid circular dependency on the tick
372 */
373 if (!arch_irq_work_has_interrupt()) {
374 pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
375 "support irq work self-IPIs\n");
376 cpumask_clear(tick_nohz_full_mask);
377 cpumask_copy(housekeeping_mask, cpu_possible_mask);
378 tick_nohz_full_running = false;
379 return;
380 }
381
382 cpu = smp_processor_id();
383
384 if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
385 pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
386 cpumask_clear_cpu(cpu, tick_nohz_full_mask);
387 }
388
389 cpumask_andnot(housekeeping_mask,
390 cpu_possible_mask, tick_nohz_full_mask);
391
364 for_each_cpu(cpu, tick_nohz_full_mask) 392 for_each_cpu(cpu, tick_nohz_full_mask)
365 context_tracking_cpu_set(cpu); 393 context_tracking_cpu_set(cpu);
366 394
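
The setup relocated into tick_nohz_init() above comes down to cpumask arithmetic: keep the boot CPU ticking for timekeeping and derive the housekeeping set as every possible CPU not in nohz_full. A toy sketch with one byte standing in for a cpumask on a hypothetical 8-CPU machine:

#include <stdio.h>

int main(void)
{
    unsigned char cpu_possible = 0xff;  /* CPUs 0-7 */
    unsigned char nohz_full = 0xff;     /* "nohz_full=0-7" on the command line */
    unsigned int boot_cpu = 0;

    /* The boot CPU must keep the tick for timekeeping, so drop it
     * from the full-dynticks set if it was listed. */
    nohz_full &= (unsigned char)~(1u << boot_cpu);

    /* Everything that is not tickless does the housekeeping work. */
    unsigned char housekeeping = cpu_possible & (unsigned char)~nohz_full;

    printf("nohz_full=%#x housekeeping=%#x\n", nohz_full, housekeeping);
    return 0;
}
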
@@ -545,7 +573,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
545 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; 573 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
546 ktime_t last_update, expires, ret = { .tv64 = 0 }; 574 ktime_t last_update, expires, ret = { .tv64 = 0 };
547 unsigned long rcu_delta_jiffies; 575 unsigned long rcu_delta_jiffies;
548 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 576 struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
549 u64 time_delta; 577 u64 time_delta;
550 578
551 time_delta = timekeeping_max_deferment(); 579 time_delta = timekeeping_max_deferment();
@@ -558,7 +586,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
558 } while (read_seqretry(&jiffies_lock, seq)); 586 } while (read_seqretry(&jiffies_lock, seq));
559 587
560 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || 588 if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) ||
561 arch_needs_cpu(cpu) || irq_work_needs_cpu()) { 589 arch_needs_cpu() || irq_work_needs_cpu()) {
562 next_jiffies = last_jiffies + 1; 590 next_jiffies = last_jiffies + 1;
563 delta_jiffies = 1; 591 delta_jiffies = 1;
564 } else { 592 } else {
@@ -813,7 +841,7 @@ void tick_nohz_idle_enter(void)
813 841
814 local_irq_disable(); 842 local_irq_disable();
815 843
816 ts = &__get_cpu_var(tick_cpu_sched); 844 ts = this_cpu_ptr(&tick_cpu_sched);
817 ts->inidle = 1; 845 ts->inidle = 1;
818 __tick_nohz_idle_enter(ts); 846 __tick_nohz_idle_enter(ts);
819 847
@@ -831,7 +859,7 @@ EXPORT_SYMBOL_GPL(tick_nohz_idle_enter);
831 */ 859 */
832void tick_nohz_irq_exit(void) 860void tick_nohz_irq_exit(void)
833{ 861{
834 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 862 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
835 863
836 if (ts->inidle) 864 if (ts->inidle)
837 __tick_nohz_idle_enter(ts); 865 __tick_nohz_idle_enter(ts);
@@ -846,7 +874,7 @@ void tick_nohz_irq_exit(void)
846 */ 874 */
847ktime_t tick_nohz_get_sleep_length(void) 875ktime_t tick_nohz_get_sleep_length(void)
848{ 876{
849 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 877 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
850 878
851 return ts->sleep_length; 879 return ts->sleep_length;
852} 880}
@@ -924,7 +952,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
924 */ 952 */
925void tick_nohz_idle_exit(void) 953void tick_nohz_idle_exit(void)
926{ 954{
927 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 955 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
928 ktime_t now; 956 ktime_t now;
929 957
930 local_irq_disable(); 958 local_irq_disable();
@@ -959,7 +987,7 @@ static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
959 */ 987 */
960static void tick_nohz_handler(struct clock_event_device *dev) 988static void tick_nohz_handler(struct clock_event_device *dev)
961{ 989{
962 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 990 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
963 struct pt_regs *regs = get_irq_regs(); 991 struct pt_regs *regs = get_irq_regs();
964 ktime_t now = ktime_get(); 992 ktime_t now = ktime_get();
965 993
@@ -968,6 +996,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
968 tick_sched_do_timer(now); 996 tick_sched_do_timer(now);
969 tick_sched_handle(ts, regs); 997 tick_sched_handle(ts, regs);
970 998
999 /* No need to reprogram if we are running tickless */
1000 if (unlikely(ts->tick_stopped))
1001 return;
1002
971 while (tick_nohz_reprogram(ts, now)) { 1003 while (tick_nohz_reprogram(ts, now)) {
972 now = ktime_get(); 1004 now = ktime_get();
973 tick_do_update_jiffies64(now); 1005 tick_do_update_jiffies64(now);
@@ -979,7 +1011,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
979 */ 1011 */
980static void tick_nohz_switch_to_nohz(void) 1012static void tick_nohz_switch_to_nohz(void)
981{ 1013{
982 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1014 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
983 ktime_t next; 1015 ktime_t next;
984 1016
985 if (!tick_nohz_enabled) 1017 if (!tick_nohz_enabled)
@@ -1041,7 +1073,7 @@ static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now)
1041 1073
1042static inline void tick_nohz_irq_enter(void) 1074static inline void tick_nohz_irq_enter(void)
1043{ 1075{
1044 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1076 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1045 ktime_t now; 1077 ktime_t now;
1046 1078
1047 if (!ts->idle_active && !ts->tick_stopped) 1079 if (!ts->idle_active && !ts->tick_stopped)
@@ -1095,6 +1127,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
1095 if (regs) 1127 if (regs)
1096 tick_sched_handle(ts, regs); 1128 tick_sched_handle(ts, regs);
1097 1129
1130 /* No need to reprogram if we are in idle or full dynticks mode */
1131 if (unlikely(ts->tick_stopped))
1132 return HRTIMER_NORESTART;
1133
1098 hrtimer_forward(timer, now, tick_period); 1134 hrtimer_forward(timer, now, tick_period);
1099 1135
1100 return HRTIMER_RESTART; 1136 return HRTIMER_RESTART;
@@ -1115,7 +1151,7 @@ early_param("skew_tick", skew_tick);
1115 */ 1151 */
1116void tick_setup_sched_timer(void) 1152void tick_setup_sched_timer(void)
1117{ 1153{
1118 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1154 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1119 ktime_t now = ktime_get(); 1155 ktime_t now = ktime_get();
1120 1156
1121 /* 1157 /*
@@ -1184,7 +1220,7 @@ void tick_clock_notify(void)
1184 */ 1220 */
1185void tick_oneshot_notify(void) 1221void tick_oneshot_notify(void)
1186{ 1222{
1187 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1223 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1188 1224
1189 set_bit(0, &ts->check_clocks); 1225 set_bit(0, &ts->check_clocks);
1190} 1226}
@@ -1199,7 +1235,7 @@ void tick_oneshot_notify(void)
1199 */ 1235 */
1200int tick_check_oneshot_change(int allow_nohz) 1236int tick_check_oneshot_change(int allow_nohz)
1201{ 1237{
1202 struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); 1238 struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
1203 1239
1204 if (!test_and_clear_bit(0, &ts->check_clocks)) 1240 if (!test_and_clear_bit(0, &ts->check_clocks))
1205 return 0; 1241 return 0;
diff --git a/kernel/time/time.c b/kernel/time/time.c
index f0294ba14634..a9ae20fb0b11 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -559,17 +559,20 @@ EXPORT_SYMBOL(usecs_to_jiffies);
559 * that a remainder subtract here would not do the right thing as the 559 * that a remainder subtract here would not do the right thing as the
560 * resolution values don't fall on second boundries. I.e. the line: 560 * resolution values don't fall on second boundries. I.e. the line:
561 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding. 561 * nsec -= nsec % TICK_NSEC; is NOT a correct resolution rounding.
562 * Note that due to the small error in the multiplier here, this
563 * rounding is incorrect for sufficiently large values of tv_nsec, but
564 * well formed timespecs should have tv_nsec < NSEC_PER_SEC, so we're
565 * OK.
562 * 566 *
563 * Rather, we just shift the bits off the right. 567 * Rather, we just shift the bits off the right.
564 * 568 *
565 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec 569 * The >> (NSEC_JIFFIE_SC - SEC_JIFFIE_SC) converts the scaled nsec
566 * value to a scaled second value. 570 * value to a scaled second value.
567 */ 571 */
568unsigned long 572static unsigned long
569timespec_to_jiffies(const struct timespec *value) 573__timespec_to_jiffies(unsigned long sec, long nsec)
570{ 574{
571 unsigned long sec = value->tv_sec; 575 nsec = nsec + TICK_NSEC - 1;
572 long nsec = value->tv_nsec + TICK_NSEC - 1;
573 576
574 if (sec >= MAX_SEC_IN_JIFFIES){ 577 if (sec >= MAX_SEC_IN_JIFFIES){
575 sec = MAX_SEC_IN_JIFFIES; 578 sec = MAX_SEC_IN_JIFFIES;
@@ -580,6 +583,13 @@ timespec_to_jiffies(const struct timespec *value)
580 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC; 583 (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
581 584
582} 585}
586
587unsigned long
588timespec_to_jiffies(const struct timespec *value)
589{
590 return __timespec_to_jiffies(value->tv_sec, value->tv_nsec);
591}
592
583EXPORT_SYMBOL(timespec_to_jiffies); 593EXPORT_SYMBOL(timespec_to_jiffies);
584 594
585void 595void
@@ -596,31 +606,27 @@ jiffies_to_timespec(const unsigned long jiffies, struct timespec *value)
596} 606}
597EXPORT_SYMBOL(jiffies_to_timespec); 607EXPORT_SYMBOL(jiffies_to_timespec);
598 608
599/* Same for "timeval" 609/*
600 * 610 * We could use a similar algorithm to timespec_to_jiffies (with a
601 * Well, almost. The problem here is that the real system resolution is 611 * different multiplier for usec instead of nsec). But this has a
602 * in nanoseconds and the value being converted is in micro seconds. 612 * problem with rounding: we can't exactly add TICK_NSEC - 1 to the
603 * Also for some machines (those that use HZ = 1024, in-particular), 613 * usec value, since it's not necessarily integral.
604 * there is a LARGE error in the tick size in microseconds. 614 *
605 615 * We could instead round in the intermediate scaled representation
606 * The solution we use is to do the rounding AFTER we convert the 616 * (i.e. in units of 1/2^(large scale) jiffies) but that's also
607 * microsecond part. Thus the USEC_ROUND, the bits to be shifted off. 617 * perilous: the scaling introduces a small positive error, which
608 * Instruction wise, this should cost only an additional add with carry 618 * combined with a division-rounding-upward (i.e. adding 2^(scale) - 1
609 * instruction above the way it was done above. 619 * units to the intermediate before shifting) leads to accidental
620 * overflow and overestimates.
621 *
622 * At the cost of one additional multiplication by a constant, just
623 * use the timespec implementation.
610 */ 624 */
611unsigned long 625unsigned long
612timeval_to_jiffies(const struct timeval *value) 626timeval_to_jiffies(const struct timeval *value)
613{ 627{
614 unsigned long sec = value->tv_sec; 628 return __timespec_to_jiffies(value->tv_sec,
615 long usec = value->tv_usec; 629 value->tv_usec * NSEC_PER_USEC);
616
617 if (sec >= MAX_SEC_IN_JIFFIES){
618 sec = MAX_SEC_IN_JIFFIES;
619 usec = 0;
620 }
621 return (((u64)sec * SEC_CONVERSION) +
622 (((u64)usec * USEC_CONVERSION + USEC_ROUND) >>
623 (USEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
624} 630}
625EXPORT_SYMBOL(timeval_to_jiffies); 631EXPORT_SYMBOL(timeval_to_jiffies);
626 632
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index f36b02838a47..ec1791fae965 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -338,10 +338,11 @@ EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns);
338 338
339static inline void update_vsyscall(struct timekeeper *tk) 339static inline void update_vsyscall(struct timekeeper *tk)
340{ 340{
341 struct timespec xt; 341 struct timespec xt, wm;
342 342
343 xt = timespec64_to_timespec(tk_xtime(tk)); 343 xt = timespec64_to_timespec(tk_xtime(tk));
344 update_vsyscall_old(&xt, &tk->wall_to_monotonic, tk->tkr.clock, tk->tkr.mult, 344 wm = timespec64_to_timespec(tk->wall_to_monotonic);
345 update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
345 tk->tkr.cycle_last); 346 tk->tkr.cycle_last);
346} 347}
347 348
@@ -441,11 +442,12 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
441 tk->ntp_error = 0; 442 tk->ntp_error = 0;
442 ntp_clear(); 443 ntp_clear();
443 } 444 }
444 update_vsyscall(tk);
445 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
446 445
447 tk_update_ktime_data(tk); 446 tk_update_ktime_data(tk);
448 447
448 update_vsyscall(tk);
449 update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
450
449 if (action & TK_MIRROR) 451 if (action & TK_MIRROR)
450 memcpy(&shadow_timekeeper, &tk_core.timekeeper, 452 memcpy(&shadow_timekeeper, &tk_core.timekeeper,
451 sizeof(tk_core.timekeeper)); 453 sizeof(tk_core.timekeeper));
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..3260ffdb368f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -655,7 +655,7 @@ static inline void debug_assert_init(struct timer_list *timer)
655static void do_init_timer(struct timer_list *timer, unsigned int flags, 655static void do_init_timer(struct timer_list *timer, unsigned int flags,
656 const char *name, struct lock_class_key *key) 656 const char *name, struct lock_class_key *key)
657{ 657{
658 struct tvec_base *base = __raw_get_cpu_var(tvec_bases); 658 struct tvec_base *base = raw_cpu_read(tvec_bases);
659 659
660 timer->entry.next = NULL; 660 timer->entry.next = NULL;
661 timer->base = (void *)((unsigned long)base | flags); 661 timer->base = (void *)((unsigned long)base | flags);
@@ -1385,7 +1385,7 @@ void update_process_times(int user_tick)
1385 rcu_check_callbacks(cpu, user_tick); 1385 rcu_check_callbacks(cpu, user_tick);
1386#ifdef CONFIG_IRQ_WORK 1386#ifdef CONFIG_IRQ_WORK
1387 if (in_irq()) 1387 if (in_irq())
1388 irq_work_run(); 1388 irq_work_tick();
1389#endif 1389#endif
1390 scheduler_tick(); 1390 scheduler_tick();
1391 run_posix_cpu_timers(p); 1391 run_posix_cpu_timers(p);
diff --git a/kernel/torture.c b/kernel/torture.c
index d600af21f022..dd70993c266c 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -211,18 +211,16 @@ EXPORT_SYMBOL_GPL(torture_onoff_cleanup);
211/* 211/*
212 * Print online/offline testing statistics. 212 * Print online/offline testing statistics.
213 */ 213 */
214char *torture_onoff_stats(char *page) 214void torture_onoff_stats(void)
215{ 215{
216#ifdef CONFIG_HOTPLUG_CPU 216#ifdef CONFIG_HOTPLUG_CPU
217 page += sprintf(page, 217 pr_cont("onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ",
218 "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", 218 n_online_successes, n_online_attempts,
219 n_online_successes, n_online_attempts, 219 n_offline_successes, n_offline_attempts,
220 n_offline_successes, n_offline_attempts, 220 min_online, max_online,
221 min_online, max_online, 221 min_offline, max_offline,
222 min_offline, max_offline, 222 sum_online, sum_offline, HZ);
223 sum_online, sum_offline, HZ);
224#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 223#endif /* #ifdef CONFIG_HOTPLUG_CPU */
225 return page;
226} 224}
227EXPORT_SYMBOL_GPL(torture_onoff_stats); 225EXPORT_SYMBOL_GPL(torture_onoff_stats);
228 226
@@ -635,8 +633,13 @@ EXPORT_SYMBOL_GPL(torture_init_end);
635 * 633 *
636 * This must be called before the caller starts shutting down its own 634 * This must be called before the caller starts shutting down its own
637 * kthreads. 635 * kthreads.
636 *
637 * Both torture_cleanup_begin() and torture_cleanup_end() must be paired,
638 * in order to correctly perform the cleanup. They are separated because
639 * threads can still need to reference the torture_type type, thus nullify
640 * only after completing all other relevant calls.
638 */ 641 */
639bool torture_cleanup(void) 642bool torture_cleanup_begin(void)
640{ 643{
641 mutex_lock(&fullstop_mutex); 644 mutex_lock(&fullstop_mutex);
642 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { 645 if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) {
@@ -651,12 +654,17 @@ bool torture_cleanup(void)
651 torture_shuffle_cleanup(); 654 torture_shuffle_cleanup();
652 torture_stutter_cleanup(); 655 torture_stutter_cleanup();
653 torture_onoff_cleanup(); 656 torture_onoff_cleanup();
657 return false;
658}
659EXPORT_SYMBOL_GPL(torture_cleanup_begin);
660
661void torture_cleanup_end(void)
662{
654 mutex_lock(&fullstop_mutex); 663 mutex_lock(&fullstop_mutex);
655 torture_type = NULL; 664 torture_type = NULL;
656 mutex_unlock(&fullstop_mutex); 665 mutex_unlock(&fullstop_mutex);
657 return false;
658} 666}
659EXPORT_SYMBOL_GPL(torture_cleanup); 667EXPORT_SYMBOL_GPL(torture_cleanup_end);
660 668
661/* 669/*
662 * Is it time for the current torture test to stop? 670 * Is it time for the current torture test to stop?
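
The split documented above is easiest to read as a calling pattern: torture_cleanup_begin() tears down the shared infrastructure while torture_type is still valid, the caller then stops its own kthreads, and only torture_cleanup_end() clears torture_type. A sketch with stub functions standing in for the real API:

#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for the real torture API, so the pairing can be shown. */
static bool torture_cleanup_begin(void) { puts("begin: stop shared kthreads"); return false; }
static void torture_cleanup_end(void)   { puts("end: clear torture_type"); }
static void my_test_cleanup(void)       { puts("stop this test's own kthreads"); }

static void my_torture_cleanup(void)
{
    if (torture_cleanup_begin())
        return;             /* a shutdown path already did the work */
    my_test_cleanup();      /* may still reference torture_type here */
    torture_cleanup_end();  /* only now is torture_type nulled out */
}

int main(void)
{
    my_torture_cleanup();
    return 0;
}
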
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1654b12c891a..31c90fec4158 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -65,15 +65,21 @@
65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) 65#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL)
66 66
67#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
68#define INIT_REGEX_LOCK(opsname) \ 68#define INIT_OPS_HASH(opsname) \
69 .regex_lock = __MUTEX_INITIALIZER(opsname.regex_lock), 69 .func_hash = &opsname.local_hash, \
70 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
71#define ASSIGN_OPS_HASH(opsname, val) \
72 .func_hash = val, \
73 .local_hash.regex_lock = __MUTEX_INITIALIZER(opsname.local_hash.regex_lock),
70#else 74#else
71#define INIT_REGEX_LOCK(opsname) 75#define INIT_OPS_HASH(opsname)
76#define ASSIGN_OPS_HASH(opsname, val)
72#endif 77#endif
73 78
74static struct ftrace_ops ftrace_list_end __read_mostly = { 79static struct ftrace_ops ftrace_list_end __read_mostly = {
75 .func = ftrace_stub, 80 .func = ftrace_stub,
76 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, 81 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
82 INIT_OPS_HASH(ftrace_list_end)
77}; 83};
78 84
79/* ftrace_enabled is a method to turn ftrace on or off */ 85/* ftrace_enabled is a method to turn ftrace on or off */
@@ -107,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
107static struct ftrace_ops global_ops; 113static struct ftrace_ops global_ops;
108static struct ftrace_ops control_ops; 114static struct ftrace_ops control_ops;
109 115
116static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
117 struct ftrace_ops *op, struct pt_regs *regs);
118
110#if ARCH_SUPPORTS_FTRACE_OPS 119#if ARCH_SUPPORTS_FTRACE_OPS
111static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, 120static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
112 struct ftrace_ops *op, struct pt_regs *regs); 121 struct ftrace_ops *op, struct pt_regs *regs);
@@ -140,7 +149,8 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops)
140{ 149{
141#ifdef CONFIG_DYNAMIC_FTRACE 150#ifdef CONFIG_DYNAMIC_FTRACE
142 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) { 151 if (!(ops->flags & FTRACE_OPS_FL_INITIALIZED)) {
143 mutex_init(&ops->regex_lock); 152 mutex_init(&ops->local_hash.regex_lock);
153 ops->func_hash = &ops->local_hash;
144 ops->flags |= FTRACE_OPS_FL_INITIALIZED; 154 ops->flags |= FTRACE_OPS_FL_INITIALIZED;
145 } 155 }
146#endif 156#endif
@@ -244,18 +254,24 @@ static void update_ftrace_function(void)
244 ftrace_func_t func; 254 ftrace_func_t func;
245 255
246 /* 256 /*
257 * Prepare the ftrace_ops that the arch callback will use.
258 * If there's only one ftrace_ops registered, the ftrace_ops_list
259 * will point to the ops we want.
260 */
261 set_function_trace_op = ftrace_ops_list;
262
263 /* If there's no ftrace_ops registered, just call the stub function */
264 if (ftrace_ops_list == &ftrace_list_end) {
265 func = ftrace_stub;
266
267 /*
247 * If we are at the end of the list and this ops is 268 * If we are at the end of the list and this ops is
248 * recursion safe and not dynamic and the arch supports passing ops, 269 * recursion safe and not dynamic and the arch supports passing ops,
249 * then have the mcount trampoline call the function directly. 270 * then have the mcount trampoline call the function directly.
250 */ 271 */
251 if (ftrace_ops_list == &ftrace_list_end || 272 } else if (ftrace_ops_list->next == &ftrace_list_end) {
252 (ftrace_ops_list->next == &ftrace_list_end && 273 func = ftrace_ops_get_func(ftrace_ops_list);
253 !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) && 274
254 (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
255 !FTRACE_FORCE_LIST_FUNC)) {
256 /* Set the ftrace_ops that the arch callback uses */
257 set_function_trace_op = ftrace_ops_list;
258 func = ftrace_ops_list->func;
259 } else { 275 } else {
260 /* Just use the default ftrace_ops */ 276 /* Just use the default ftrace_ops */
261 set_function_trace_op = &ftrace_list_end; 277 set_function_trace_op = &ftrace_list_end;
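
The restructured update_ftrace_function() above picks the callback from the shape of the ops list: an empty list means the stub, a single entry can be called directly, anything else goes through the generic list walker. A reduced userspace sketch of that choice (the recursion-safety and saved-regs checks behind ftrace_ops_get_func() are omitted):

#include <stdio.h>

struct toy_ops {
    void (*func)(unsigned long ip);
    struct toy_ops *next;
};

static void toy_stub(unsigned long ip)      { (void)ip; }
static void toy_list_func(unsigned long ip) { printf("walk all ops for %#lx\n", ip); }
static void toy_tracer(unsigned long ip)    { printf("trace %#lx\n", ip); }

static struct toy_ops toy_list_end = { toy_stub, NULL };

static void (*pick_func(struct toy_ops *list))(unsigned long)
{
    if (list == &toy_list_end)
        return toy_stub;        /* nothing registered */
    if (list->next == &toy_list_end)
        return list->func;      /* single ops: call it directly */
    return toy_list_func;       /* several ops: iterate them all */
}

int main(void)
{
    struct toy_ops one = { toy_tracer, &toy_list_end };

    pick_func(&toy_list_end)(0x1000);   /* stub, prints nothing */
    pick_func(&one)(0x1000);            /* direct call */
    return 0;
}
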
@@ -899,7 +915,7 @@ static void unregister_ftrace_profiler(void)
899static struct ftrace_ops ftrace_profile_ops __read_mostly = { 915static struct ftrace_ops ftrace_profile_ops __read_mostly = {
900 .func = function_profile_call, 916 .func = function_profile_call,
901 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 917 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
902 INIT_REGEX_LOCK(ftrace_profile_ops) 918 INIT_OPS_HASH(ftrace_profile_ops)
903}; 919};
904 920
905static int register_ftrace_profiler(void) 921static int register_ftrace_profiler(void)
@@ -1041,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
1041 1057
1042static struct ftrace_ops *removed_ops; 1058static struct ftrace_ops *removed_ops;
1043 1059
1060/*
1061 * Set when doing a global update, like enabling all recs or disabling them.
1062 * It is not set when just updating a single ftrace_ops.
1063 */
1064static bool update_all_ops;
1065
1044#ifndef CONFIG_FTRACE_MCOUNT_RECORD 1066#ifndef CONFIG_FTRACE_MCOUNT_RECORD
1045# error Dynamic ftrace depends on MCOUNT_RECORD 1067# error Dynamic ftrace depends on MCOUNT_RECORD
1046#endif 1068#endif
@@ -1081,11 +1103,12 @@ static const struct ftrace_hash empty_hash = {
1081#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash) 1103#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
1082 1104
1083static struct ftrace_ops global_ops = { 1105static struct ftrace_ops global_ops = {
1084 .func = ftrace_stub, 1106 .func = ftrace_stub,
1085 .notrace_hash = EMPTY_HASH, 1107 .local_hash.notrace_hash = EMPTY_HASH,
1086 .filter_hash = EMPTY_HASH, 1108 .local_hash.filter_hash = EMPTY_HASH,
1087 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 1109 INIT_OPS_HASH(global_ops)
1088 INIT_REGEX_LOCK(global_ops) 1110 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
1111 FTRACE_OPS_FL_INITIALIZED,
1089}; 1112};
1090 1113
1091struct ftrace_page { 1114struct ftrace_page {
@@ -1226,8 +1249,8 @@ static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
1226void ftrace_free_filter(struct ftrace_ops *ops) 1249void ftrace_free_filter(struct ftrace_ops *ops)
1227{ 1250{
1228 ftrace_ops_init(ops); 1251 ftrace_ops_init(ops);
1229 free_ftrace_hash(ops->filter_hash); 1252 free_ftrace_hash(ops->func_hash->filter_hash);
1230 free_ftrace_hash(ops->notrace_hash); 1253 free_ftrace_hash(ops->func_hash->notrace_hash);
1231} 1254}
1232 1255
1233static struct ftrace_hash *alloc_ftrace_hash(int size_bits) 1256static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
@@ -1288,9 +1311,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
1288} 1311}
1289 1312
1290static void 1313static void
1291ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash); 1314ftrace_hash_rec_disable_modify(struct ftrace_ops *ops, int filter_hash);
1292static void 1315static void
1293ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash); 1316ftrace_hash_rec_enable_modify(struct ftrace_ops *ops, int filter_hash);
1294 1317
1295static int 1318static int
1296ftrace_hash_move(struct ftrace_ops *ops, int enable, 1319ftrace_hash_move(struct ftrace_ops *ops, int enable,
@@ -1299,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
1299 struct ftrace_func_entry *entry; 1322 struct ftrace_func_entry *entry;
1300 struct hlist_node *tn; 1323 struct hlist_node *tn;
1301 struct hlist_head *hhd; 1324 struct hlist_head *hhd;
1302 struct ftrace_hash *old_hash;
1303 struct ftrace_hash *new_hash; 1325 struct ftrace_hash *new_hash;
1304 int size = src->count; 1326 int size = src->count;
1305 int bits = 0; 1327 int bits = 0;
@@ -1342,17 +1364,30 @@ update:
1342 * Remove the current set, update the hash and add 1364 * Remove the current set, update the hash and add
1343 * them back. 1365 * them back.
1344 */ 1366 */
1345 ftrace_hash_rec_disable(ops, enable); 1367 ftrace_hash_rec_disable_modify(ops, enable);
1346 1368
1347 old_hash = *dst;
1348 rcu_assign_pointer(*dst, new_hash); 1369 rcu_assign_pointer(*dst, new_hash);
1349 free_ftrace_hash_rcu(old_hash);
1350 1370
1351 ftrace_hash_rec_enable(ops, enable); 1371 ftrace_hash_rec_enable_modify(ops, enable);
1352 1372
1353 return 0; 1373 return 0;
1354} 1374}
1355 1375
1376static bool hash_contains_ip(unsigned long ip,
1377 struct ftrace_ops_hash *hash)
1378{
1379 /*
1380 * The function record is a match if it exists in the filter
 1381	 * hash and not in the notrace hash. Note, an empty hash is

1382 * considered a match for the filter hash, but an empty
1383 * notrace hash is considered not in the notrace hash.
1384 */
1385 return (ftrace_hash_empty(hash->filter_hash) ||
1386 ftrace_lookup_ip(hash->filter_hash, ip)) &&
1387 (ftrace_hash_empty(hash->notrace_hash) ||
1388 !ftrace_lookup_ip(hash->notrace_hash, ip));
1389}
1390
1356/* 1391/*
1357 * Test the hashes for this ops to see if we want to call 1392 * Test the hashes for this ops to see if we want to call
1358 * the ops->func or not. 1393 * the ops->func or not.
@@ -1368,8 +1403,7 @@ update:
1368static int 1403static int
1369ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs) 1404ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1370{ 1405{
1371 struct ftrace_hash *filter_hash; 1406 struct ftrace_ops_hash hash;
1372 struct ftrace_hash *notrace_hash;
1373 int ret; 1407 int ret;
1374 1408
1375#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS 1409#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1382,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
1382 return 0; 1416 return 0;
1383#endif 1417#endif
1384 1418
1385 filter_hash = rcu_dereference_raw_notrace(ops->filter_hash); 1419 hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
1386 notrace_hash = rcu_dereference_raw_notrace(ops->notrace_hash); 1420 hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
1387 1421
1388 if ((ftrace_hash_empty(filter_hash) || 1422 if (hash_contains_ip(ip, &hash))
1389 ftrace_lookup_ip(filter_hash, ip)) &&
1390 (ftrace_hash_empty(notrace_hash) ||
1391 !ftrace_lookup_ip(notrace_hash, ip)))
1392 ret = 1; 1423 ret = 1;
1393 else 1424 else
1394 ret = 0; 1425 ret = 0;
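
hash_contains_ip() and its use in ftrace_ops_test() above reduce to one predicate: the ip must be in the filter hash (where an empty filter hash means "trace everything") and must not be in the notrace hash (where an empty notrace hash means "exclude nothing"). A userspace sketch of the same rule, with toy array-backed sets standing in for the ftrace hashes:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_hash {
    const unsigned long *ips;
    size_t count;               /* count of 0 == empty set */
};

static bool toy_hash_empty(const struct toy_hash *h)
{
    return h->count == 0;
}

static bool toy_lookup_ip(const struct toy_hash *h, unsigned long ip)
{
    for (size_t i = 0; i < h->count; i++)
        if (h->ips[i] == ip)
            return true;
    return false;
}

static bool toy_hash_contains_ip(unsigned long ip,
                                 const struct toy_hash *filter,
                                 const struct toy_hash *notrace)
{
    return (toy_hash_empty(filter) || toy_lookup_ip(filter, ip)) &&
           (toy_hash_empty(notrace) || !toy_lookup_ip(notrace, ip));
}

int main(void)
{
    unsigned long filtered[] = { 0x1000 };
    struct toy_hash filter = { filtered, 1 };
    struct toy_hash notrace = { NULL, 0 };

    printf("0x1000 traced: %d\n", toy_hash_contains_ip(0x1000, &filter, &notrace));
    printf("0x2000 traced: %d\n", toy_hash_contains_ip(0x2000, &filter, &notrace));
    return 0;
}
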
@@ -1500,33 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
1500 return keep_regs; 1531 return keep_regs;
1501} 1532}
1502 1533
1503static void ftrace_remove_tramp(struct ftrace_ops *ops,
1504 struct dyn_ftrace *rec)
1505{
1506 struct ftrace_func_entry *entry;
1507
1508 entry = ftrace_lookup_ip(ops->tramp_hash, rec->ip);
1509 if (!entry)
1510 return;
1511
1512 /*
1513 * The tramp_hash entry will be removed at time
1514 * of update.
1515 */
1516 ops->nr_trampolines--;
1517 rec->flags &= ~FTRACE_FL_TRAMP;
1518}
1519
1520static void ftrace_clear_tramps(struct dyn_ftrace *rec)
1521{
1522 struct ftrace_ops *op;
1523
1524 do_for_each_ftrace_op(op, ftrace_ops_list) {
1525 if (op->nr_trampolines)
1526 ftrace_remove_tramp(op, rec);
1527 } while_for_each_ftrace_op(op);
1528}
1529
1530static void __ftrace_hash_rec_update(struct ftrace_ops *ops, 1534static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1531 int filter_hash, 1535 int filter_hash,
1532 bool inc) 1536 bool inc)
@@ -1554,14 +1558,14 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1554 * gets inversed. 1558 * gets inversed.
1555 */ 1559 */
1556 if (filter_hash) { 1560 if (filter_hash) {
1557 hash = ops->filter_hash; 1561 hash = ops->func_hash->filter_hash;
1558 other_hash = ops->notrace_hash; 1562 other_hash = ops->func_hash->notrace_hash;
1559 if (ftrace_hash_empty(hash)) 1563 if (ftrace_hash_empty(hash))
1560 all = 1; 1564 all = 1;
1561 } else { 1565 } else {
1562 inc = !inc; 1566 inc = !inc;
1563 hash = ops->notrace_hash; 1567 hash = ops->func_hash->notrace_hash;
1564 other_hash = ops->filter_hash; 1568 other_hash = ops->func_hash->filter_hash;
1565 /* 1569 /*
1566 * If the notrace hash has no items, 1570 * If the notrace hash has no items,
1567 * then there's nothing to do. 1571 * then there's nothing to do.
@@ -1615,22 +1619,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1615 * function, and the ops has a trampoline registered 1619 * function, and the ops has a trampoline registered
1616 * for it, then we can call it directly. 1620 * for it, then we can call it directly.
1617 */ 1621 */
1618 if (ftrace_rec_count(rec) == 1 && ops->trampoline) { 1622 if (ftrace_rec_count(rec) == 1 && ops->trampoline)
1619 rec->flags |= FTRACE_FL_TRAMP; 1623 rec->flags |= FTRACE_FL_TRAMP;
1620 ops->nr_trampolines++; 1624 else
1621 } else {
1622 /* 1625 /*
1623 * If we are adding another function callback 1626 * If we are adding another function callback
1624 * to this function, and the previous had a 1627 * to this function, and the previous had a
1625 * trampoline used, then we need to go back to 1628 * custom trampoline in use, then we need to go
1626 * the default trampoline. 1629 * back to the default trampoline.
1627 */ 1630 */
1628 rec->flags &= ~FTRACE_FL_TRAMP; 1631 rec->flags &= ~FTRACE_FL_TRAMP;
1629 1632
1630 /* remove trampolines from any ops for this rec */
1631 ftrace_clear_tramps(rec);
1632 }
1633
1634 /* 1633 /*
1635 * If any ops wants regs saved for this function 1634 * If any ops wants regs saved for this function
1636 * then all ops will get saved regs. 1635 * then all ops will get saved regs.
@@ -1642,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1642 return; 1641 return;
1643 rec->flags--; 1642 rec->flags--;
1644 1643
1645 if (ops->trampoline && !ftrace_rec_count(rec))
1646 ftrace_remove_tramp(ops, rec);
1647
1648 /* 1644 /*
1649 * If the rec had REGS enabled and the ops that is 1645 * If the rec had REGS enabled and the ops that is
1650 * being removed had REGS set, then see if there is 1646 * being removed had REGS set, then see if there is
@@ -1659,6 +1655,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
1659 } 1655 }
1660 1656
1661 /* 1657 /*
1658 * If the rec had TRAMP enabled, then it needs to
1659 * be cleared. As TRAMP can only be enabled iff
1660 * there is only a single ops attached to it.
 1661	 * In other words, always disable it on decrementing.
1662 * In the future, we may set it if rec count is
1663 * decremented to one, and the ops that is left
1664 * has a trampoline.
1665 */
1666 rec->flags &= ~FTRACE_FL_TRAMP;
1667
1668 /*
1662 * flags will be cleared in ftrace_check_record() 1669 * flags will be cleared in ftrace_check_record()
1663 * if rec count is zero. 1670 * if rec count is zero.
1664 */ 1671 */
@@ -1682,6 +1689,41 @@ static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
1682 __ftrace_hash_rec_update(ops, filter_hash, 1); 1689 __ftrace_hash_rec_update(ops, filter_hash, 1);
1683} 1690}
1684 1691
1692static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops,
1693 int filter_hash, int inc)
1694{
1695 struct ftrace_ops *op;
1696
1697 __ftrace_hash_rec_update(ops, filter_hash, inc);
1698
1699 if (ops->func_hash != &global_ops.local_hash)
1700 return;
1701
1702 /*
1703 * If the ops shares the global_ops hash, then we need to update
1704 * all ops that are enabled and use this hash.
1705 */
1706 do_for_each_ftrace_op(op, ftrace_ops_list) {
1707 /* Already done */
1708 if (op == ops)
1709 continue;
1710 if (op->func_hash == &global_ops.local_hash)
1711 __ftrace_hash_rec_update(op, filter_hash, inc);
1712 } while_for_each_ftrace_op(op);
1713}
1714
1715static void ftrace_hash_rec_disable_modify(struct ftrace_ops *ops,
1716 int filter_hash)
1717{
1718 ftrace_hash_rec_update_modify(ops, filter_hash, 0);
1719}
1720
1721static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
1722 int filter_hash)
1723{
1724 ftrace_hash_rec_update_modify(ops, filter_hash, 1);
1725}
1726
1685static void print_ip_ins(const char *fmt, unsigned char *p) 1727static void print_ip_ins(const char *fmt, unsigned char *p)
1686{ 1728{
1687 int i; 1729 int i;
@@ -1842,21 +1884,86 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
1842} 1884}
1843 1885
1844static struct ftrace_ops * 1886static struct ftrace_ops *
1887ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
1888{
1889 struct ftrace_ops *op;
1890 unsigned long ip = rec->ip;
1891
1892 do_for_each_ftrace_op(op, ftrace_ops_list) {
1893
1894 if (!op->trampoline)
1895 continue;
1896
1897 if (hash_contains_ip(ip, op->func_hash))
1898 return op;
1899 } while_for_each_ftrace_op(op);
1900
1901 return NULL;
1902}
1903
1904static struct ftrace_ops *
1845ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) 1905ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
1846{ 1906{
1847 struct ftrace_ops *op; 1907 struct ftrace_ops *op;
1908 unsigned long ip = rec->ip;
1848 1909
1849 /* Removed ops need to be tested first */ 1910 /*
1850 if (removed_ops && removed_ops->tramp_hash) { 1911 * Need to check removed ops first.
1851 if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip)) 1912 * If they are being removed, and this rec has a tramp,
1913 * and this rec is in the ops list, then it would be the
1914 * one with the tramp.
1915 */
1916 if (removed_ops) {
1917 if (hash_contains_ip(ip, &removed_ops->old_hash))
1852 return removed_ops; 1918 return removed_ops;
1853 } 1919 }
1854 1920
1921 /*
1922 * Need to find the current trampoline for a rec.
1923 * Now, a trampoline is only attached to a rec if there
1924 * was a single 'ops' attached to it. But this can be called
1925 * when we are adding another op to the rec or removing the
1926 * current one. Thus, if the op is being added, we can
1927 * ignore it because it hasn't attached itself to the rec
1928 * yet.
1929 *
1930 * If an ops is being modified (hooking to different functions)
1931 * then we don't care about the new functions that are being
1932 * added, just the old ones (that are probably being removed).
1933 *
1934 * If we are adding an ops to a function that already is using
1935 * a trampoline, it needs to be removed (trampolines are only
1936 * for single ops connected), then an ops that is not being
1937 * modified also needs to be checked.
1938 */
1855 do_for_each_ftrace_op(op, ftrace_ops_list) { 1939 do_for_each_ftrace_op(op, ftrace_ops_list) {
1856 if (!op->tramp_hash) 1940
1941 if (!op->trampoline)
1857 continue; 1942 continue;
1858 1943
1859 if (ftrace_lookup_ip(op->tramp_hash, rec->ip)) 1944 /*
1945 * If the ops is being added, it hasn't gotten to
1946 * the point to be removed from this tree yet.
1947 */
1948 if (op->flags & FTRACE_OPS_FL_ADDING)
1949 continue;
1950
1951
1952 /*
1953 * If the ops is being modified and is in the old
1954 * hash, then it is probably being removed from this
1955 * function.
1956 */
1957 if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
1958 hash_contains_ip(ip, &op->old_hash))
1959 return op;
1960 /*
1961 * If the ops is not being added or modified, and it's
1962 * in its normal filter hash, then this must be the one
1963 * we want!
1964 */
1965 if (!(op->flags & FTRACE_OPS_FL_MODIFYING) &&
1966 hash_contains_ip(ip, op->func_hash))
1860 return op; 1967 return op;
1861 1968
1862 } while_for_each_ftrace_op(op); 1969 } while_for_each_ftrace_op(op);
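
The comment block above encodes a priority order for deciding which ops currently owns a record's trampoline: a just-removed ops wins if its old hash matched, ops still being added are ignored, an ops being modified is checked against its old hash, and any other ops against its current hash. A reduced sketch of that order, with the flags and hash lookups collapsed to booleans (the names are illustrative, not the real ftrace types):

#include <stdbool.h>
#include <stdio.h>

struct toy_op {
    const char *name;
    bool has_trampoline;
    bool adding;        /* like FTRACE_OPS_FL_ADDING */
    bool modifying;     /* like FTRACE_OPS_FL_MODIFYING */
    bool old_hash_hit;  /* ip found in the old hash */
    bool cur_hash_hit;  /* ip found in the current hash */
};

static const struct toy_op *find_tramp_owner(const struct toy_op *removed,
                                             const struct toy_op *ops, int n)
{
    if (removed && removed->old_hash_hit)
        return removed;

    for (int i = 0; i < n; i++) {
        const struct toy_op *op = &ops[i];

        if (!op->has_trampoline || op->adding)
            continue;
        if (op->modifying && op->old_hash_hit)
            return op;
        if (!op->modifying && op->cur_hash_hit)
            return op;
    }
    return NULL;
}

int main(void)
{
    struct toy_op ops[] = {
        { "being_added", true, true,  false, false, true },
        { "stable",      true, false, false, false, true },
    };
    const struct toy_op *owner = find_tramp_owner(NULL, ops, 2);

    printf("owner: %s\n", owner ? owner->name : "none");
    return 0;
}
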
@@ -1868,10 +1975,11 @@ static struct ftrace_ops *
1868ftrace_find_tramp_ops_new(struct dyn_ftrace *rec) 1975ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
1869{ 1976{
1870 struct ftrace_ops *op; 1977 struct ftrace_ops *op;
1978 unsigned long ip = rec->ip;
1871 1979
1872 do_for_each_ftrace_op(op, ftrace_ops_list) { 1980 do_for_each_ftrace_op(op, ftrace_ops_list) {
1873 /* pass rec in as regs to have non-NULL val */ 1981 /* pass rec in as regs to have non-NULL val */
1874 if (ftrace_ops_test(op, rec->ip, rec)) 1982 if (hash_contains_ip(ip, op->func_hash))
1875 return op; 1983 return op;
1876 } while_for_each_ftrace_op(op); 1984 } while_for_each_ftrace_op(op);
1877 1985
@@ -1896,8 +2004,8 @@ unsigned long ftrace_get_addr_new(struct dyn_ftrace *rec)
1896 if (rec->flags & FTRACE_FL_TRAMP) { 2004 if (rec->flags & FTRACE_FL_TRAMP) {
1897 ops = ftrace_find_tramp_ops_new(rec); 2005 ops = ftrace_find_tramp_ops_new(rec);
1898 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) { 2006 if (FTRACE_WARN_ON(!ops || !ops->trampoline)) {
1899 pr_warning("Bad trampoline accounting at: %p (%pS)\n", 2007 pr_warn("Bad trampoline accounting at: %p (%pS) (%lx)\n",
1900 (void *)rec->ip, (void *)rec->ip); 2008 (void *)rec->ip, (void *)rec->ip, rec->flags);
1901 /* Ftrace is shutting down, return anything */ 2009 /* Ftrace is shutting down, return anything */
1902 return (unsigned long)FTRACE_ADDR; 2010 return (unsigned long)FTRACE_ADDR;
1903 } 2011 }
@@ -1964,7 +2072,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1964 return ftrace_make_call(rec, ftrace_addr); 2072 return ftrace_make_call(rec, ftrace_addr);
1965 2073
1966 case FTRACE_UPDATE_MAKE_NOP: 2074 case FTRACE_UPDATE_MAKE_NOP:
1967 return ftrace_make_nop(NULL, rec, ftrace_addr); 2075 return ftrace_make_nop(NULL, rec, ftrace_old_addr);
1968 2076
1969 case FTRACE_UPDATE_MODIFY_CALL: 2077 case FTRACE_UPDATE_MODIFY_CALL:
1970 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); 2078 return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr);
@@ -2178,89 +2286,6 @@ void __weak arch_ftrace_update_code(int command)
2178 ftrace_run_stop_machine(command); 2286 ftrace_run_stop_machine(command);
2179} 2287}
2180 2288
2181static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
2182{
2183 struct ftrace_page *pg;
2184 struct dyn_ftrace *rec;
2185 int size, bits;
2186 int ret;
2187
2188 size = ops->nr_trampolines;
2189 bits = 0;
2190 /*
2191 * Make the hash size about 1/2 the # found
2192 */
2193 for (size /= 2; size; size >>= 1)
2194 bits++;
2195
2196 ops->tramp_hash = alloc_ftrace_hash(bits);
2197 /*
2198 * TODO: a failed allocation is going to screw up
2199 * the accounting of what needs to be modified
2200 * and not. For now, we kill ftrace if we fail
2201 * to allocate here. But there are ways around this,
2202 * but that will take a little more work.
2203 */
2204 if (!ops->tramp_hash)
2205 return -ENOMEM;
2206
2207 do_for_each_ftrace_rec(pg, rec) {
2208 if (ftrace_rec_count(rec) == 1 &&
2209 ftrace_ops_test(ops, rec->ip, rec)) {
2210
2211 /*
2212 * If another ops adds to a rec, the rec will
2213 * lose its trampoline and never get it back
2214 * until all ops are off of it.
2215 */
2216 if (!(rec->flags & FTRACE_FL_TRAMP))
2217 continue;
2218
2219 /* This record had better have a trampoline */
2220 if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
2221 return -1;
2222
2223 ret = add_hash_entry(ops->tramp_hash, rec->ip);
2224 if (ret < 0)
2225 return ret;
2226 }
2227 } while_for_each_ftrace_rec();
2228
2229 /* The number of recs in the hash must match nr_trampolines */
2230 FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines);
2231
2232 return 0;
2233}
2234
2235static int ftrace_save_tramp_hashes(void)
2236{
2237 struct ftrace_ops *op;
2238 int ret;
2239
2240 /*
2241 * Now that any trampoline is being used, we need to save the
2242 * hashes for the ops that have them. This allows the mapping
2243 * back from the record to the ops that has the trampoline to
2244 * know what code is being replaced. Modifying code must always
2245 * verify what it is changing.
2246 */
2247 do_for_each_ftrace_op(op, ftrace_ops_list) {
2248
2249 /* The tramp_hash is recreated each time. */
2250 free_ftrace_hash(op->tramp_hash);
2251 op->tramp_hash = NULL;
2252
2253 if (op->nr_trampolines) {
2254 ret = ftrace_save_ops_tramp_hash(op);
2255 if (ret)
2256 return ret;
2257 }
2258
2259 } while_for_each_ftrace_op(op);
2260
2261 return 0;
2262}
2263
2264static void ftrace_run_update_code(int command) 2289static void ftrace_run_update_code(int command)
2265{ 2290{
2266 int ret; 2291 int ret;
@@ -2280,9 +2305,16 @@ static void ftrace_run_update_code(int command)
2280 2305
2281 ret = ftrace_arch_code_modify_post_process(); 2306 ret = ftrace_arch_code_modify_post_process();
2282 FTRACE_WARN_ON(ret); 2307 FTRACE_WARN_ON(ret);
2308}
2283 2309
2284 ret = ftrace_save_tramp_hashes(); 2310static void ftrace_run_modify_code(struct ftrace_ops *ops, int command,
2285 FTRACE_WARN_ON(ret); 2311 struct ftrace_hash *old_hash)
2312{
2313 ops->flags |= FTRACE_OPS_FL_MODIFYING;
2314 ops->old_hash.filter_hash = old_hash;
2315 ftrace_run_update_code(command);
2316 ops->old_hash.filter_hash = NULL;
2317 ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
2286} 2318}
2287 2319
2288static ftrace_func_t saved_ftrace_func; 2320static ftrace_func_t saved_ftrace_func;
@@ -2306,6 +2338,13 @@ static void ftrace_startup_enable(int command)
2306 ftrace_run_update_code(command); 2338 ftrace_run_update_code(command);
2307} 2339}
2308 2340
2341static void ftrace_startup_all(int command)
2342{
2343 update_all_ops = true;
2344 ftrace_startup_enable(command);
2345 update_all_ops = false;
2346}
2347
2309static int ftrace_startup(struct ftrace_ops *ops, int command) 2348static int ftrace_startup(struct ftrace_ops *ops, int command)
2310{ 2349{
2311 int ret; 2350 int ret;
@@ -2320,12 +2359,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
2320 ftrace_start_up++; 2359 ftrace_start_up++;
2321 command |= FTRACE_UPDATE_CALLS; 2360 command |= FTRACE_UPDATE_CALLS;
2322 2361
2323 ops->flags |= FTRACE_OPS_FL_ENABLED; 2362 /*
2363 * Note that ftrace probes uses this to start up
2364 * and modify functions it will probe. But we still
2365 * set the ADDING flag for modification, as probes
2366 * do not have trampolines. If they add them in the
2367 * future, then the probes will need to distinguish
2368 * between adding and updating probes.
2369 */
2370 ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
2324 2371
2325 ftrace_hash_rec_enable(ops, 1); 2372 ftrace_hash_rec_enable(ops, 1);
2326 2373
2327 ftrace_startup_enable(command); 2374 ftrace_startup_enable(command);
2328 2375
2376 ops->flags &= ~FTRACE_OPS_FL_ADDING;
2377
2329 return 0; 2378 return 0;
2330} 2379}
2331 2380
@@ -2375,11 +2424,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
2375 * If the ops uses a trampoline, then it needs to be 2424 * If the ops uses a trampoline, then it needs to be
2376 * tested first on update. 2425 * tested first on update.
2377 */ 2426 */
2427 ops->flags |= FTRACE_OPS_FL_REMOVING;
2378 removed_ops = ops; 2428 removed_ops = ops;
2379 2429
2430 /* The trampoline logic checks the old hashes */
2431 ops->old_hash.filter_hash = ops->func_hash->filter_hash;
2432 ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
2433
2380 ftrace_run_update_code(command); 2434 ftrace_run_update_code(command);
2381 2435
2436 /*
2437 * If there's no more ops registered with ftrace, run a
2438 * sanity check to make sure all rec flags are cleared.
2439 */
2440 if (ftrace_ops_list == &ftrace_list_end) {
2441 struct ftrace_page *pg;
2442 struct dyn_ftrace *rec;
2443
2444 do_for_each_ftrace_rec(pg, rec) {
2445 if (FTRACE_WARN_ON_ONCE(rec->flags))
2446 pr_warn(" %pS flags:%lx\n",
2447 (void *)rec->ip, rec->flags);
2448 } while_for_each_ftrace_rec();
2449 }
2450
2451 ops->old_hash.filter_hash = NULL;
2452 ops->old_hash.notrace_hash = NULL;
2453
2382 removed_ops = NULL; 2454 removed_ops = NULL;
2455 ops->flags &= ~FTRACE_OPS_FL_REMOVING;
2383 2456
2384 /* 2457 /*
2385 * Dynamic ops may be freed, we must make sure that all 2458 * Dynamic ops may be freed, we must make sure that all
@@ -2436,8 +2509,8 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
2436 * Filter_hash being empty will default to trace module. 2509 * Filter_hash being empty will default to trace module.
2437 * But notrace hash requires a test of individual module functions. 2510 * But notrace hash requires a test of individual module functions.
2438 */ 2511 */
2439 return ftrace_hash_empty(ops->filter_hash) && 2512 return ftrace_hash_empty(ops->func_hash->filter_hash) &&
2440 ftrace_hash_empty(ops->notrace_hash); 2513 ftrace_hash_empty(ops->func_hash->notrace_hash);
2441} 2514}
2442 2515
2443/* 2516/*
@@ -2459,12 +2532,12 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
2459 return 0; 2532 return 0;
2460 2533
2461 /* The function must be in the filter */ 2534 /* The function must be in the filter */
2462 if (!ftrace_hash_empty(ops->filter_hash) && 2535 if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
2463 !ftrace_lookup_ip(ops->filter_hash, rec->ip)) 2536 !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
2464 return 0; 2537 return 0;
2465 2538
2466 /* If in notrace hash, we ignore it too */ 2539 /* If in notrace hash, we ignore it too */
2467 if (ftrace_lookup_ip(ops->notrace_hash, rec->ip)) 2540 if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
2468 return 0; 2541 return 0;
2469 2542
2470 return 1; 2543 return 1;
@@ -2785,10 +2858,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
2785 } else { 2858 } else {
2786 rec = &iter->pg->records[iter->idx++]; 2859 rec = &iter->pg->records[iter->idx++];
2787 if (((iter->flags & FTRACE_ITER_FILTER) && 2860 if (((iter->flags & FTRACE_ITER_FILTER) &&
2788 !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) || 2861 !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
2789 2862
2790 ((iter->flags & FTRACE_ITER_NOTRACE) && 2863 ((iter->flags & FTRACE_ITER_NOTRACE) &&
2791 !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) || 2864 !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
2792 2865
2793 ((iter->flags & FTRACE_ITER_ENABLED) && 2866 ((iter->flags & FTRACE_ITER_ENABLED) &&
2794 !(rec->flags & FTRACE_FL_ENABLED))) { 2867 !(rec->flags & FTRACE_FL_ENABLED))) {
@@ -2837,9 +2910,9 @@ static void *t_start(struct seq_file *m, loff_t *pos)
2837 * functions are enabled. 2910 * functions are enabled.
2838 */ 2911 */
2839 if ((iter->flags & FTRACE_ITER_FILTER && 2912 if ((iter->flags & FTRACE_ITER_FILTER &&
2840 ftrace_hash_empty(ops->filter_hash)) || 2913 ftrace_hash_empty(ops->func_hash->filter_hash)) ||
2841 (iter->flags & FTRACE_ITER_NOTRACE && 2914 (iter->flags & FTRACE_ITER_NOTRACE &&
2842 ftrace_hash_empty(ops->notrace_hash))) { 2915 ftrace_hash_empty(ops->func_hash->notrace_hash))) {
2843 if (*pos > 0) 2916 if (*pos > 0)
2844 return t_hash_start(m, pos); 2917 return t_hash_start(m, pos);
2845 iter->flags |= FTRACE_ITER_PRINTALL; 2918 iter->flags |= FTRACE_ITER_PRINTALL;
@@ -2904,8 +2977,8 @@ static int t_show(struct seq_file *m, void *v)
2904 if (rec->flags & FTRACE_FL_TRAMP_EN) { 2977 if (rec->flags & FTRACE_FL_TRAMP_EN) {
2905 struct ftrace_ops *ops; 2978 struct ftrace_ops *ops;
2906 2979
2907 ops = ftrace_find_tramp_ops_curr(rec); 2980 ops = ftrace_find_tramp_ops_any(rec);
2908 if (ops && ops->trampoline) 2981 if (ops)
2909 seq_printf(m, "\ttramp: %pS", 2982 seq_printf(m, "\ttramp: %pS",
2910 (void *)ops->trampoline); 2983 (void *)ops->trampoline);
2911 else 2984 else
@@ -3001,12 +3074,12 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3001 iter->ops = ops; 3074 iter->ops = ops;
3002 iter->flags = flag; 3075 iter->flags = flag;
3003 3076
3004 mutex_lock(&ops->regex_lock); 3077 mutex_lock(&ops->func_hash->regex_lock);
3005 3078
3006 if (flag & FTRACE_ITER_NOTRACE) 3079 if (flag & FTRACE_ITER_NOTRACE)
3007 hash = ops->notrace_hash; 3080 hash = ops->func_hash->notrace_hash;
3008 else 3081 else
3009 hash = ops->filter_hash; 3082 hash = ops->func_hash->filter_hash;
3010 3083
3011 if (file->f_mode & FMODE_WRITE) { 3084 if (file->f_mode & FMODE_WRITE) {
3012 const int size_bits = FTRACE_HASH_DEFAULT_BITS; 3085 const int size_bits = FTRACE_HASH_DEFAULT_BITS;
@@ -3041,7 +3114,7 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
3041 file->private_data = iter; 3114 file->private_data = iter;
3042 3115
3043 out_unlock: 3116 out_unlock:
3044 mutex_unlock(&ops->regex_lock); 3117 mutex_unlock(&ops->func_hash->regex_lock);
3045 3118
3046 return ret; 3119 return ret;
3047} 3120}
@@ -3279,12 +3352,12 @@ static struct ftrace_ops trace_probe_ops __read_mostly =
3279{ 3352{
3280 .func = function_trace_probe_call, 3353 .func = function_trace_probe_call,
3281 .flags = FTRACE_OPS_FL_INITIALIZED, 3354 .flags = FTRACE_OPS_FL_INITIALIZED,
3282 INIT_REGEX_LOCK(trace_probe_ops) 3355 INIT_OPS_HASH(trace_probe_ops)
3283}; 3356};
3284 3357
3285static int ftrace_probe_registered; 3358static int ftrace_probe_registered;
3286 3359
3287static void __enable_ftrace_function_probe(void) 3360static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash)
3288{ 3361{
3289 int ret; 3362 int ret;
3290 int i; 3363 int i;
@@ -3292,7 +3365,8 @@ static void __enable_ftrace_function_probe(void)
3292 if (ftrace_probe_registered) { 3365 if (ftrace_probe_registered) {
3293 /* still need to update the function call sites */ 3366 /* still need to update the function call sites */
3294 if (ftrace_enabled) 3367 if (ftrace_enabled)
3295 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3368 ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
3369 old_hash);
3296 return; 3370 return;
3297 } 3371 }
3298 3372
@@ -3342,7 +3416,8 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3342 void *data) 3416 void *data)
3343{ 3417{
3344 struct ftrace_func_probe *entry; 3418 struct ftrace_func_probe *entry;
3345 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; 3419 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3420 struct ftrace_hash *old_hash = *orig_hash;
3346 struct ftrace_hash *hash; 3421 struct ftrace_hash *hash;
3347 struct ftrace_page *pg; 3422 struct ftrace_page *pg;
3348 struct dyn_ftrace *rec; 3423 struct dyn_ftrace *rec;
@@ -3359,9 +3434,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3359 if (WARN_ON(not)) 3434 if (WARN_ON(not))
3360 return -EINVAL; 3435 return -EINVAL;
3361 3436
3362 mutex_lock(&trace_probe_ops.regex_lock); 3437 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3363 3438
3364 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3439 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
3365 if (!hash) { 3440 if (!hash) {
3366 count = -ENOMEM; 3441 count = -ENOMEM;
3367 goto out; 3442 goto out;
@@ -3420,15 +3495,18 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3420 } while_for_each_ftrace_rec(); 3495 } while_for_each_ftrace_rec();
3421 3496
3422 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3497 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3423 if (ret < 0)
3424 count = ret;
3425 3498
3426 __enable_ftrace_function_probe(); 3499 __enable_ftrace_function_probe(old_hash);
3500
3501 if (!ret)
3502 free_ftrace_hash_rcu(old_hash);
3503 else
3504 count = ret;
3427 3505
3428 out_unlock: 3506 out_unlock:
3429 mutex_unlock(&ftrace_lock); 3507 mutex_unlock(&ftrace_lock);
3430 out: 3508 out:
3431 mutex_unlock(&trace_probe_ops.regex_lock); 3509 mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
3432 free_ftrace_hash(hash); 3510 free_ftrace_hash(hash);
3433 3511
3434 return count; 3512 return count;
@@ -3446,7 +3524,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3446 struct ftrace_func_entry *rec_entry; 3524 struct ftrace_func_entry *rec_entry;
3447 struct ftrace_func_probe *entry; 3525 struct ftrace_func_probe *entry;
3448 struct ftrace_func_probe *p; 3526 struct ftrace_func_probe *p;
3449 struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; 3527 struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
3528 struct ftrace_hash *old_hash = *orig_hash;
3450 struct list_head free_list; 3529 struct list_head free_list;
3451 struct ftrace_hash *hash; 3530 struct ftrace_hash *hash;
3452 struct hlist_node *tmp; 3531 struct hlist_node *tmp;
@@ -3454,6 +3533,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3454 int type = MATCH_FULL; 3533 int type = MATCH_FULL;
3455 int i, len = 0; 3534 int i, len = 0;
3456 char *search; 3535 char *search;
3536 int ret;
3457 3537
3458 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) 3538 if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
3459 glob = NULL; 3539 glob = NULL;
@@ -3468,7 +3548,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3468 return; 3548 return;
3469 } 3549 }
3470 3550
3471 mutex_lock(&trace_probe_ops.regex_lock); 3551 mutex_lock(&trace_probe_ops.func_hash->regex_lock);
3472 3552
3473 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); 3553 hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
3474 if (!hash) 3554 if (!hash)
@@ -3512,8 +3592,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3512 * Remove after the disable is called. Otherwise, if the last 3592 * Remove after the disable is called. Otherwise, if the last
3513 * probe is removed, a null hash means *all enabled*. 3593 * probe is removed, a null hash means *all enabled*.
3514 */ 3594 */
3515 ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); 3595 ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
3516 synchronize_sched(); 3596 synchronize_sched();
3597 if (!ret)
3598 free_ftrace_hash_rcu(old_hash);
3599
3517 list_for_each_entry_safe(entry, p, &free_list, free_list) { 3600 list_for_each_entry_safe(entry, p, &free_list, free_list) {
3518 list_del(&entry->free_list); 3601 list_del(&entry->free_list);
3519 ftrace_free_entry(entry); 3602 ftrace_free_entry(entry);
@@ -3521,7 +3604,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
3521 mutex_unlock(&ftrace_lock); 3604 mutex_unlock(&ftrace_lock);
3522 3605
3523 out_unlock: 3606 out_unlock:
3524 mutex_unlock(&trace_probe_ops.regex_lock); 3607 mutex_unlock(&trace_probe_ops.func_hash->regex_lock);
3525 free_ftrace_hash(hash); 3608 free_ftrace_hash(hash);
3526} 3609}
3527 3610
@@ -3700,10 +3783,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
3700 return add_hash_entry(hash, ip); 3783 return add_hash_entry(hash, ip);
3701} 3784}
3702 3785
3703static void ftrace_ops_update_code(struct ftrace_ops *ops) 3786static void ftrace_ops_update_code(struct ftrace_ops *ops,
3787 struct ftrace_hash *old_hash)
3704{ 3788{
3705 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) 3789 if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
3706 ftrace_run_update_code(FTRACE_UPDATE_CALLS); 3790 ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash);
3707} 3791}
3708 3792
3709static int 3793static int
@@ -3711,18 +3795,19 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3711 unsigned long ip, int remove, int reset, int enable) 3795 unsigned long ip, int remove, int reset, int enable)
3712{ 3796{
3713 struct ftrace_hash **orig_hash; 3797 struct ftrace_hash **orig_hash;
3798 struct ftrace_hash *old_hash;
3714 struct ftrace_hash *hash; 3799 struct ftrace_hash *hash;
3715 int ret; 3800 int ret;
3716 3801
3717 if (unlikely(ftrace_disabled)) 3802 if (unlikely(ftrace_disabled))
3718 return -ENODEV; 3803 return -ENODEV;
3719 3804
3720 mutex_lock(&ops->regex_lock); 3805 mutex_lock(&ops->func_hash->regex_lock);
3721 3806
3722 if (enable) 3807 if (enable)
3723 orig_hash = &ops->filter_hash; 3808 orig_hash = &ops->func_hash->filter_hash;
3724 else 3809 else
3725 orig_hash = &ops->notrace_hash; 3810 orig_hash = &ops->func_hash->notrace_hash;
3726 3811
3727 if (reset) 3812 if (reset)
3728 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS); 3813 hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
@@ -3745,14 +3830,16 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
3745 } 3830 }
3746 3831
3747 mutex_lock(&ftrace_lock); 3832 mutex_lock(&ftrace_lock);
3833 old_hash = *orig_hash;
3748 ret = ftrace_hash_move(ops, enable, orig_hash, hash); 3834 ret = ftrace_hash_move(ops, enable, orig_hash, hash);
3749 if (!ret) 3835 if (!ret) {
3750 ftrace_ops_update_code(ops); 3836 ftrace_ops_update_code(ops, old_hash);
3751 3837 free_ftrace_hash_rcu(old_hash);
3838 }
3752 mutex_unlock(&ftrace_lock); 3839 mutex_unlock(&ftrace_lock);
3753 3840
3754 out_regex_unlock: 3841 out_regex_unlock:
3755 mutex_unlock(&ops->regex_lock); 3842 mutex_unlock(&ops->func_hash->regex_lock);
3756 3843
3757 free_ftrace_hash(hash); 3844 free_ftrace_hash(hash);
3758 return ret; 3845 return ret;
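The old_hash bookkeeping added to ftrace_set_hash() follows the usual RCU replace-then-retire pattern: the previous hash may still be in use by code paths being traced, so the new hash is published into the ops first and the old one is only freed after a grace period via free_ftrace_hash_rcu(). A hedged, generic sketch of that pattern for a module-private pointer (struct my_cfg, cfg and cfg_lock are made-up names, not ftrace code):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/mutex.h>

struct my_cfg { int value; };

static struct my_cfg __rcu *cfg;
static DEFINE_MUTEX(cfg_lock);

/* Replace the published pointer, then retire the old copy after a grace period. */
static int cfg_replace(int value)
{
        struct my_cfg *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->value = value;

        mutex_lock(&cfg_lock);
        old = rcu_dereference_protected(cfg, lockdep_is_held(&cfg_lock));
        rcu_assign_pointer(cfg, new);   /* publish to readers */
        mutex_unlock(&cfg_lock);

        synchronize_rcu();              /* wait out pre-existing readers */
        kfree(old);                     /* role of free_ftrace_hash_rcu() here */
        return 0;
}

ftrace follows the same shape, with ftrace_hash_move() doing the publish step and free_ftrace_hash_rcu() deferring the actual free.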
@@ -3957,6 +4044,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3957 struct seq_file *m = (struct seq_file *)file->private_data; 4044 struct seq_file *m = (struct seq_file *)file->private_data;
3958 struct ftrace_iterator *iter; 4045 struct ftrace_iterator *iter;
3959 struct ftrace_hash **orig_hash; 4046 struct ftrace_hash **orig_hash;
4047 struct ftrace_hash *old_hash;
3960 struct trace_parser *parser; 4048 struct trace_parser *parser;
3961 int filter_hash; 4049 int filter_hash;
3962 int ret; 4050 int ret;
@@ -3975,26 +4063,28 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
3975 4063
3976 trace_parser_put(parser); 4064 trace_parser_put(parser);
3977 4065
3978 mutex_lock(&iter->ops->regex_lock); 4066 mutex_lock(&iter->ops->func_hash->regex_lock);
3979 4067
3980 if (file->f_mode & FMODE_WRITE) { 4068 if (file->f_mode & FMODE_WRITE) {
3981 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER); 4069 filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
3982 4070
3983 if (filter_hash) 4071 if (filter_hash)
3984 orig_hash = &iter->ops->filter_hash; 4072 orig_hash = &iter->ops->func_hash->filter_hash;
3985 else 4073 else
3986 orig_hash = &iter->ops->notrace_hash; 4074 orig_hash = &iter->ops->func_hash->notrace_hash;
3987 4075
3988 mutex_lock(&ftrace_lock); 4076 mutex_lock(&ftrace_lock);
4077 old_hash = *orig_hash;
3989 ret = ftrace_hash_move(iter->ops, filter_hash, 4078 ret = ftrace_hash_move(iter->ops, filter_hash,
3990 orig_hash, iter->hash); 4079 orig_hash, iter->hash);
3991 if (!ret) 4080 if (!ret) {
3992 ftrace_ops_update_code(iter->ops); 4081 ftrace_ops_update_code(iter->ops, old_hash);
3993 4082 free_ftrace_hash_rcu(old_hash);
4083 }
3994 mutex_unlock(&ftrace_lock); 4084 mutex_unlock(&ftrace_lock);
3995 } 4085 }
3996 4086
3997 mutex_unlock(&iter->ops->regex_lock); 4087 mutex_unlock(&iter->ops->func_hash->regex_lock);
3998 free_ftrace_hash(iter->hash); 4088 free_ftrace_hash(iter->hash);
3999 kfree(iter); 4089 kfree(iter);
4000 4090
@@ -4611,7 +4701,6 @@ void __init ftrace_init(void)
4611static struct ftrace_ops global_ops = { 4701static struct ftrace_ops global_ops = {
4612 .func = ftrace_stub, 4702 .func = ftrace_stub,
4613 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 4703 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4614 INIT_REGEX_LOCK(global_ops)
4615}; 4704};
4616 4705
4617static int __init ftrace_nodyn_init(void) 4706static int __init ftrace_nodyn_init(void)
@@ -4623,6 +4712,7 @@ core_initcall(ftrace_nodyn_init);
4623 4712
4624static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } 4713static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
4625static inline void ftrace_startup_enable(int command) { } 4714static inline void ftrace_startup_enable(int command) { }
4715static inline void ftrace_startup_all(int command) { }
4626/* Keep as macros so we do not need to define the commands */ 4716/* Keep as macros so we do not need to define the commands */
4627# define ftrace_startup(ops, command) \ 4717# define ftrace_startup(ops, command) \
4628 ({ \ 4718 ({ \
@@ -4713,7 +4803,7 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4713static struct ftrace_ops control_ops = { 4803static struct ftrace_ops control_ops = {
4714 .func = ftrace_ops_control_func, 4804 .func = ftrace_ops_control_func,
4715 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, 4805 .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED,
4716 INIT_REGEX_LOCK(control_ops) 4806 INIT_OPS_HASH(control_ops)
4717}; 4807};
4718 4808
4719static inline void 4809static inline void
@@ -4772,6 +4862,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
4772} 4862}
4773#endif 4863#endif
4774 4864
4865/*
4866 * If there's only one function registered but it does not support
4867 * recursion, this function will be called by the mcount trampoline.
4868 * This function will handle recursion protection.
4869 */
4870static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
4871 struct ftrace_ops *op, struct pt_regs *regs)
4872{
4873 int bit;
4874
4875 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4876 if (bit < 0)
4877 return;
4878
4879 op->func(ip, parent_ip, op, regs);
4880
4881 trace_clear_recursion(bit);
4882}
4883
4884/**
4885 * ftrace_ops_get_func - get the function a trampoline should call
4886 * @ops: the ops to get the function for
4887 *
4888 * Normally the mcount trampoline will call the ops->func, but there
4889 * are times that it should not. For example, if the ops does not
4890 * have its own recursion protection, then it should call the
4891 * ftrace_ops_recurs_func() instead.
4892 *
4893 * Returns the function that the trampoline should call for @ops.
4894 */
4895ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
4896{
4897 /*
4898 * If this is a dynamic ops or we force list func,
4899 * then it needs to call the list anyway.
4900 */
4901 if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
4902 return ftrace_ops_list_func;
4903
4904 /*
4905 * If the func handles its own recursion, call it directly.
4906 * Otherwise call the recursion protected function that
4907 * will call the ftrace ops function.
4908 */
4909 if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
4910 return ftrace_ops_recurs_func;
4911
4912 return ops->func;
4913}
4914
4775static void clear_ftrace_swapper(void) 4915static void clear_ftrace_swapper(void)
4776{ 4916{
4777 struct task_struct *p; 4917 struct task_struct *p;
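The ftrace_ops_recurs_func()/ftrace_ops_get_func() pair added in this hunk wraps callbacks that are not flagged RECURSION_SAFE, so a callback that ends up being traced itself bails out instead of recursing without bound. A runnable userspace analogue of that guard, using a thread-local flag in place of the per-context recursion bits (guarded_call and callback are illustrative names):

#include <stdio.h>
#include <stdbool.h>

static _Thread_local bool in_callback;

/* Analogue of ftrace_ops_recurs_func(): drop re-entrant invocations. */
static void guarded_call(void (*cb)(int), int arg)
{
        if (in_callback)
                return;                 /* already inside the callback */
        in_callback = true;
        cb(arg);
        in_callback = false;
}

static void callback(int depth)
{
        printf("callback depth %d\n", depth);
        if (depth < 3)
                guarded_call(callback, depth + 1);      /* suppressed re-entry */
}

int main(void)
{
        guarded_call(callback, 0);      /* prints exactly one line */
        return 0;
}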
@@ -4872,7 +5012,8 @@ static int ftrace_pid_add(int p)
4872 set_ftrace_pid_task(pid); 5012 set_ftrace_pid_task(pid);
4873 5013
4874 ftrace_update_pid_func(); 5014 ftrace_update_pid_func();
4875 ftrace_startup_enable(0); 5015
5016 ftrace_startup_all(0);
4876 5017
4877 mutex_unlock(&ftrace_lock); 5018 mutex_unlock(&ftrace_lock);
4878 return 0; 5019 return 0;
@@ -4901,7 +5042,7 @@ static void ftrace_pid_reset(void)
4901 } 5042 }
4902 5043
4903 ftrace_update_pid_func(); 5044 ftrace_update_pid_func();
4904 ftrace_startup_enable(0); 5045 ftrace_startup_all(0);
4905 5046
4906 mutex_unlock(&ftrace_lock); 5047 mutex_unlock(&ftrace_lock);
4907} 5048}
@@ -5145,6 +5286,17 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
5145 5286
5146#ifdef CONFIG_FUNCTION_GRAPH_TRACER 5287#ifdef CONFIG_FUNCTION_GRAPH_TRACER
5147 5288
5289static struct ftrace_ops graph_ops = {
5290 .func = ftrace_stub,
5291 .flags = FTRACE_OPS_FL_RECURSION_SAFE |
5292 FTRACE_OPS_FL_INITIALIZED |
5293 FTRACE_OPS_FL_STUB,
5294#ifdef FTRACE_GRAPH_TRAMP_ADDR
5295 .trampoline = FTRACE_GRAPH_TRAMP_ADDR,
5296#endif
5297 ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash)
5298};
5299
5148static int ftrace_graph_active; 5300static int ftrace_graph_active;
5149 5301
5150int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) 5302int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -5307,12 +5459,28 @@ static int ftrace_graph_entry_test(struct ftrace_graph_ent *trace)
5307 */ 5459 */
5308static void update_function_graph_func(void) 5460static void update_function_graph_func(void)
5309{ 5461{
5310 if (ftrace_ops_list == &ftrace_list_end || 5462 struct ftrace_ops *op;
5311 (ftrace_ops_list == &global_ops && 5463 bool do_test = false;
5312 global_ops.next == &ftrace_list_end)) 5464
5313 ftrace_graph_entry = __ftrace_graph_entry; 5465 /*
5314 else 5466 * The graph and global ops share the same set of functions
5467 * to test. If any other ops is on the list, then
5468 * the graph tracing needs to test if it's the function
5469 * it should call.
5470 */
5471 do_for_each_ftrace_op(op, ftrace_ops_list) {
5472 if (op != &global_ops && op != &graph_ops &&
5473 op != &ftrace_list_end) {
5474 do_test = true;
5475 /* in double loop, break out with goto */
5476 goto out;
5477 }
5478 } while_for_each_ftrace_op(op);
5479 out:
5480 if (do_test)
5315 ftrace_graph_entry = ftrace_graph_entry_test; 5481 ftrace_graph_entry = ftrace_graph_entry_test;
5482 else
5483 ftrace_graph_entry = __ftrace_graph_entry;
5316} 5484}
5317 5485
5318static struct notifier_block ftrace_suspend_notifier = { 5486static struct notifier_block ftrace_suspend_notifier = {
@@ -5353,16 +5521,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
5353 ftrace_graph_entry = ftrace_graph_entry_test; 5521 ftrace_graph_entry = ftrace_graph_entry_test;
5354 update_function_graph_func(); 5522 update_function_graph_func();
5355 5523
5356 /* Function graph doesn't use the .func field of global_ops */ 5524 ret = ftrace_startup(&graph_ops, FTRACE_START_FUNC_RET);
5357 global_ops.flags |= FTRACE_OPS_FL_STUB;
5358
5359#ifdef CONFIG_DYNAMIC_FTRACE
5360 /* Optimize function graph calling (if implemented by arch) */
5361 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5362 global_ops.trampoline = FTRACE_GRAPH_TRAMP_ADDR;
5363#endif
5364
5365 ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
5366 5525
5367out: 5526out:
5368 mutex_unlock(&ftrace_lock); 5527 mutex_unlock(&ftrace_lock);
@@ -5380,12 +5539,7 @@ void unregister_ftrace_graph(void)
5380 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 5539 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
5381 ftrace_graph_entry = ftrace_graph_entry_stub; 5540 ftrace_graph_entry = ftrace_graph_entry_stub;
5382 __ftrace_graph_entry = ftrace_graph_entry_stub; 5541 __ftrace_graph_entry = ftrace_graph_entry_stub;
5383 ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET); 5542 ftrace_shutdown(&graph_ops, FTRACE_STOP_FUNC_RET);
5384 global_ops.flags &= ~FTRACE_OPS_FL_STUB;
5385#ifdef CONFIG_DYNAMIC_FTRACE
5386 if (FTRACE_GRAPH_TRAMP_ADDR != 0)
5387 global_ops.trampoline = 0;
5388#endif
5389 unregister_pm_notifier(&ftrace_suspend_notifier); 5543 unregister_pm_notifier(&ftrace_suspend_notifier);
5390 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); 5544 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
5391 5545
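With this change the function graph tracer registers its own graph_ops through ftrace_startup()/ftrace_shutdown() instead of temporarily mutating global_ops. For code outside the tracer the equivalent public entry points are register_ftrace_function()/unregister_ftrace_function(); a hedged module-style sketch using the callback signature of kernels from this era (my_ops, my_callback and the module hooks are placeholders, not part of this patch):

#include <linux/ftrace.h>
#include <linux/module.h>

static void my_callback(unsigned long ip, unsigned long parent_ip,
                        struct ftrace_ops *op, struct pt_regs *regs)
{
        /* runs for every traced function entry this ops filters on */
}

static struct ftrace_ops my_ops = {
        .func = my_callback,
        /*
         * No FTRACE_OPS_FL_RECURSION_SAFE: per ftrace_ops_get_func() above,
         * the core then routes calls through ftrace_ops_recurs_func() for us.
         */
};

static int __init my_init(void)
{
        return register_ftrace_function(&my_ops);       /* ftrace_startup() inside */
}

static void __exit my_exit(void)
{
        unregister_ftrace_function(&my_ops);            /* ftrace_shutdown() inside */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");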
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 925f629658d6..a56e07c8d15b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -538,16 +538,18 @@ static void rb_wake_up_waiters(struct irq_work *work)
538 * ring_buffer_wait - wait for input to the ring buffer 538 * ring_buffer_wait - wait for input to the ring buffer
539 * @buffer: buffer to wait on 539 * @buffer: buffer to wait on
540 * @cpu: the cpu buffer to wait on 540 * @cpu: the cpu buffer to wait on
541 * @full: wait until a full page is available, if @cpu != RING_BUFFER_ALL_CPUS
541 * 542 *
542 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon 543 * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
543 * as data is added to any of the @buffer's cpu buffers. Otherwise 544 * as data is added to any of the @buffer's cpu buffers. Otherwise
544 * it will wait for data to be added to a specific cpu buffer. 545 * it will wait for data to be added to a specific cpu buffer.
545 */ 546 */
546int ring_buffer_wait(struct ring_buffer *buffer, int cpu) 547int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full)
547{ 548{
548 struct ring_buffer_per_cpu *cpu_buffer; 549 struct ring_buffer_per_cpu *uninitialized_var(cpu_buffer);
549 DEFINE_WAIT(wait); 550 DEFINE_WAIT(wait);
550 struct rb_irq_work *work; 551 struct rb_irq_work *work;
552 int ret = 0;
551 553
552 /* 554 /*
553 * Depending on what the caller is waiting for, either any 555 * Depending on what the caller is waiting for, either any
@@ -564,36 +566,61 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
564 } 566 }
565 567
566 568
567 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); 569 while (true) {
570 prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
568 571
569 /* 572 /*
570 * The events can happen in critical sections where 573 * The events can happen in critical sections where
571 * checking a work queue can cause deadlocks. 574 * checking a work queue can cause deadlocks.
572 * After adding a task to the queue, this flag is set 575 * After adding a task to the queue, this flag is set
573 * only to notify events to try to wake up the queue 576 * only to notify events to try to wake up the queue
574 * using irq_work. 577 * using irq_work.
575 * 578 *
576 * We don't clear it even if the buffer is no longer 579 * We don't clear it even if the buffer is no longer
577 * empty. The flag only causes the next event to run 580 * empty. The flag only causes the next event to run
578 * irq_work to do the work queue wake up. The worst 581 * irq_work to do the work queue wake up. The worst
579 * that can happen if we race with !trace_empty() is that 582 * that can happen if we race with !trace_empty() is that
580 * an event will cause an irq_work to try to wake up 583 * an event will cause an irq_work to try to wake up
581 * an empty queue. 584 * an empty queue.
582 * 585 *
583 * There's no reason to protect this flag either, as 586 * There's no reason to protect this flag either, as
584 * the work queue and irq_work logic will do the necessary 587 * the work queue and irq_work logic will do the necessary
585 * synchronization for the wake ups. The only thing 588 * synchronization for the wake ups. The only thing
586 * that is necessary is that the wake up happens after 589 * that is necessary is that the wake up happens after
587 * a task has been queued. It's OK for spurious wake ups. 590 * a task has been queued. It's OK for spurious wake ups.
588 */ 591 */
589 work->waiters_pending = true; 592 work->waiters_pending = true;
593
594 if (signal_pending(current)) {
595 ret = -EINTR;
596 break;
597 }
598
599 if (cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer))
600 break;
601
602 if (cpu != RING_BUFFER_ALL_CPUS &&
603 !ring_buffer_empty_cpu(buffer, cpu)) {
604 unsigned long flags;
605 bool pagebusy;
606
607 if (!full)
608 break;
609
610 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
611 pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
612 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
613
614 if (!pagebusy)
615 break;
616 }
590 617
591 if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
592 (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
593 schedule(); 618 schedule();
619 }
594 620
595 finish_wait(&work->waiters, &wait); 621 finish_wait(&work->waiters, &wait);
596 return 0; 622
623 return ret;
597} 624}
598 625
599/** 626/**
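ring_buffer_wait() is now an open-coded wait loop: it re-queues itself with prepare_to_wait() before every test, honours pending signals by returning -EINTR, and with full set only breaks out once the reader page is no longer the commit page. The generic shape of that loop, as a hedged kernel-style sketch (wq and condition are placeholders; the real function additionally sets work->waiters_pending and takes reader_lock for the full-page test):

#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(wq);

/* 'condition' stands in for whatever state the caller is waiting on. */
static int wait_for_condition(bool (*condition)(void))
{
        DEFINE_WAIT(wait);
        int ret = 0;

        for (;;) {
                /*
                 * Queue ourselves *before* testing, so a wake-up that races
                 * with the test is not lost between it and schedule().
                 */
                prepare_to_wait(&wq, &wait, TASK_INTERRUPTIBLE);

                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }
                if (condition())
                        break;

                schedule();
        }
        finish_wait(&wq, &wait);
        return ret;
}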
@@ -626,8 +653,22 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
626 work = &cpu_buffer->irq_work; 653 work = &cpu_buffer->irq_work;
627 } 654 }
628 655
629 work->waiters_pending = true;
630 poll_wait(filp, &work->waiters, poll_table); 656 poll_wait(filp, &work->waiters, poll_table);
657 work->waiters_pending = true;
658 /*
659 * There's a tight race between setting the waiters_pending and
660 * checking if the ring buffer is empty. Once the waiters_pending bit
661 * is set, the next event will wake the task up, but we can get stuck
662 * if there's only a single event in.
663 *
664 * FIXME: Ideally, we need a memory barrier on the writer side as well,
665 * but adding a memory barrier to all events will cause too much of a
666 * performance hit in the fast path. We only need a memory barrier when
667 * the buffer goes from empty to having content. But as this race is
668 * extremely small, and it's not a problem if another event comes in, we
669 * will fix it later.
670 */
671 smp_mb();
631 672
632 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || 673 if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
633 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) 674 (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
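Moving the waiters_pending store after poll_wait() but before the emptiness test, with an smp_mb() in between, is the classic store-my-flag / full-barrier / read-the-other-side handshake; the FIXME notes that the writer side deliberately omits the matching barrier to keep the fast path cheap. A compilable C11 sketch of the waiter-side ordering requirement (both function names and should_wake() as shown are illustrative):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool waiters_pending;
static atomic_int  nr_events;

/* Waiter side: publish intent first, then check state. */
static bool should_sleep(void)
{
        atomic_store_explicit(&waiters_pending, true, memory_order_relaxed);
        /* Role of smp_mb(): order the store above before the load below. */
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&nr_events, memory_order_relaxed) == 0;
}

/* Writer side (idealised): add the event, then decide whether to wake. */
static bool should_wake(void)
{
        atomic_fetch_add_explicit(&nr_events, 1, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&waiters_pending, memory_order_relaxed);
}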
@@ -1968,7 +2009,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
1968 2009
1969/** 2010/**
1970 * rb_update_event - update event type and data 2011 * rb_update_event - update event type and data
1971 * @event: the even to update 2012 * @event: the event to update
1972 * @type: the type of event 2013 * @type: the type of event
1973 * @length: the size of the event field in the ring buffer 2014 * @length: the size of the event field in the ring buffer
1974 * 2015 *
@@ -3341,21 +3382,16 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
3341 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3382 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3342 3383
3343 /* Iterator usage is expected to have record disabled */ 3384 /* Iterator usage is expected to have record disabled */
3344 if (list_empty(&cpu_buffer->reader_page->list)) { 3385 iter->head_page = cpu_buffer->reader_page;
3345 iter->head_page = rb_set_head_page(cpu_buffer); 3386 iter->head = cpu_buffer->reader_page->read;
3346 if (unlikely(!iter->head_page)) 3387
3347 return; 3388 iter->cache_reader_page = iter->head_page;
3348 iter->head = iter->head_page->read; 3389 iter->cache_read = cpu_buffer->read;
3349 } else { 3390
3350 iter->head_page = cpu_buffer->reader_page;
3351 iter->head = cpu_buffer->reader_page->read;
3352 }
3353 if (iter->head) 3391 if (iter->head)
3354 iter->read_stamp = cpu_buffer->read_stamp; 3392 iter->read_stamp = cpu_buffer->read_stamp;
3355 else 3393 else
3356 iter->read_stamp = iter->head_page->page->time_stamp; 3394 iter->read_stamp = iter->head_page->page->time_stamp;
3357 iter->cache_reader_page = cpu_buffer->reader_page;
3358 iter->cache_read = cpu_buffer->read;
3359} 3395}
3360 3396
3361/** 3397/**
@@ -3748,12 +3784,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3748 return NULL; 3784 return NULL;
3749 3785
3750 /* 3786 /*
3751 * We repeat when a time extend is encountered. 3787 * We repeat when a time extend is encountered or we hit
3752 * Since the time extend is always attached to a data event, 3788 * the end of the page. Since the time extend is always attached
3753 * we should never loop more than once. 3789 * to a data event, we should never loop more than three times.
3754 * (We never hit the following condition more than twice). 3790 * Once for going to next page, once on time extend, and
3791 * finally once to get the event.
3792 * (We never hit the following condition more than thrice).
3755 */ 3793 */
3756 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2)) 3794 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 3))
3757 return NULL; 3795 return NULL;
3758 3796
3759 if (rb_per_cpu_empty(cpu_buffer)) 3797 if (rb_per_cpu_empty(cpu_buffer))
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f3ef80c8914c..0fa2d2070bd4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1076,13 +1076,14 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
1076} 1076}
1077#endif /* CONFIG_TRACER_MAX_TRACE */ 1077#endif /* CONFIG_TRACER_MAX_TRACE */
1078 1078
1079static int wait_on_pipe(struct trace_iterator *iter) 1079static int wait_on_pipe(struct trace_iterator *iter, bool full)
1080{ 1080{
1081 /* Iterators are static, they should be filled or empty */ 1081 /* Iterators are static, they should be filled or empty */
1082 if (trace_buffer_iter(iter, iter->cpu_file)) 1082 if (trace_buffer_iter(iter, iter->cpu_file))
1083 return 0; 1083 return 0;
1084 1084
1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); 1085 return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file,
1086 full);
1086} 1087}
1087 1088
1088#ifdef CONFIG_FTRACE_STARTUP_TEST 1089#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -4434,15 +4435,12 @@ static int tracing_wait_pipe(struct file *filp)
4434 4435
4435 mutex_unlock(&iter->mutex); 4436 mutex_unlock(&iter->mutex);
4436 4437
4437 ret = wait_on_pipe(iter); 4438 ret = wait_on_pipe(iter, false);
4438 4439
4439 mutex_lock(&iter->mutex); 4440 mutex_lock(&iter->mutex);
4440 4441
4441 if (ret) 4442 if (ret)
4442 return ret; 4443 return ret;
4443
4444 if (signal_pending(current))
4445 return -EINTR;
4446 } 4444 }
4447 4445
4448 return 1; 4446 return 1;
@@ -5372,16 +5370,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
5372 goto out_unlock; 5370 goto out_unlock;
5373 } 5371 }
5374 mutex_unlock(&trace_types_lock); 5372 mutex_unlock(&trace_types_lock);
5375 ret = wait_on_pipe(iter); 5373 ret = wait_on_pipe(iter, false);
5376 mutex_lock(&trace_types_lock); 5374 mutex_lock(&trace_types_lock);
5377 if (ret) { 5375 if (ret) {
5378 size = ret; 5376 size = ret;
5379 goto out_unlock; 5377 goto out_unlock;
5380 } 5378 }
5381 if (signal_pending(current)) {
5382 size = -EINTR;
5383 goto out_unlock;
5384 }
5385 goto again; 5379 goto again;
5386 } 5380 }
5387 size = 0; 5381 size = 0;
@@ -5500,7 +5494,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5500 }; 5494 };
5501 struct buffer_ref *ref; 5495 struct buffer_ref *ref;
5502 int entries, size, i; 5496 int entries, size, i;
5503 ssize_t ret; 5497 ssize_t ret = 0;
5504 5498
5505 mutex_lock(&trace_types_lock); 5499 mutex_lock(&trace_types_lock);
5506 5500
@@ -5538,13 +5532,16 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5538 int r; 5532 int r;
5539 5533
5540 ref = kzalloc(sizeof(*ref), GFP_KERNEL); 5534 ref = kzalloc(sizeof(*ref), GFP_KERNEL);
5541 if (!ref) 5535 if (!ref) {
5536 ret = -ENOMEM;
5542 break; 5537 break;
5538 }
5543 5539
5544 ref->ref = 1; 5540 ref->ref = 1;
5545 ref->buffer = iter->trace_buffer->buffer; 5541 ref->buffer = iter->trace_buffer->buffer;
5546 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); 5542 ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
5547 if (!ref->page) { 5543 if (!ref->page) {
5544 ret = -ENOMEM;
5548 kfree(ref); 5545 kfree(ref);
5549 break; 5546 break;
5550 } 5547 }
@@ -5582,19 +5579,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
5582 5579
5583 /* did we read anything? */ 5580 /* did we read anything? */
5584 if (!spd.nr_pages) { 5581 if (!spd.nr_pages) {
5582 if (ret)
5583 goto out;
5584
5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { 5585 if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
5586 ret = -EAGAIN; 5586 ret = -EAGAIN;
5587 goto out; 5587 goto out;
5588 } 5588 }
5589 mutex_unlock(&trace_types_lock); 5589 mutex_unlock(&trace_types_lock);
5590 ret = wait_on_pipe(iter); 5590 ret = wait_on_pipe(iter, true);
5591 mutex_lock(&trace_types_lock); 5591 mutex_lock(&trace_types_lock);
5592 if (ret) 5592 if (ret)
5593 goto out; 5593 goto out;
5594 if (signal_pending(current)) { 5594
5595 ret = -EINTR;
5596 goto out;
5597 }
5598 goto again; 5595 goto again;
5599 } 5596 }
5600 5597
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ef06ce7e9cf8..0cc51edde3a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2513,8 +2513,11 @@ static __init int event_test_thread(void *unused)
2513 kfree(test_malloc); 2513 kfree(test_malloc);
2514 2514
2515 set_current_state(TASK_INTERRUPTIBLE); 2515 set_current_state(TASK_INTERRUPTIBLE);
2516 while (!kthread_should_stop()) 2516 while (!kthread_should_stop()) {
2517 schedule(); 2517 schedule();
2518 set_current_state(TASK_INTERRUPTIBLE);
2519 }
2520 __set_current_state(TASK_RUNNING);
2518 2521
2519 return 0; 2522 return 0;
2520} 2523}
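The event_test_thread fix above is the canonical idle-kthread wait loop: the task state must be re-armed to TASK_INTERRUPTIBLE after every schedule() (otherwise a single spurious wakeup turns the loop into a busy spin), and TASK_RUNNING must be restored on exit. The same rule explains the __set_current_state(TASK_RUNNING) removals in ring_buffer_benchmark.c above. As a generic, hedged sketch (my_thread is a placeholder):

#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread(void *data)
{
        set_current_state(TASK_INTERRUPTIBLE);
        while (!kthread_should_stop()) {
                schedule();                             /* sleep until woken */
                set_current_state(TASK_INTERRUPTIBLE);  /* re-arm before re-checking */
        }
        __set_current_state(TASK_RUNNING);              /* never return while sleeping */
        return 0;
}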
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
382 382
383 /* check the trace buffer */ 383 /* check the trace buffer */
384 ret = trace_test_buffer(&tr->trace_buffer, &count); 384 ret = trace_test_buffer(&tr->trace_buffer, &count);
385
386 ftrace_enabled = 1;
385 tracing_start(); 387 tracing_start();
386 388
387 /* we should only have one item */ 389 /* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
679 681
680 /* check the trace buffer */ 682 /* check the trace buffer */
681 ret = trace_test_buffer(&tr->trace_buffer, &count); 683 ret = trace_test_buffer(&tr->trace_buffer, &count);
684
685 ftrace_enabled = 1;
682 trace->reset(tr); 686 trace->reset(tr);
683 tracing_start(); 687 tracing_start();
684 688
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
1025#endif 1029#endif
1026 1030
1027#ifdef CONFIG_SCHED_TRACER 1031#ifdef CONFIG_SCHED_TRACER
1032
1033struct wakeup_test_data {
1034 struct completion is_ready;
1035 int go;
1036};
1037
1028static int trace_wakeup_test_thread(void *data) 1038static int trace_wakeup_test_thread(void *data)
1029{ 1039{
1030 /* Make this a -deadline thread */ 1040 /* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
1034 .sched_deadline = 10000000ULL, 1044 .sched_deadline = 10000000ULL,
1035 .sched_period = 10000000ULL 1045 .sched_period = 10000000ULL
1036 }; 1046 };
1037 struct completion *x = data; 1047 struct wakeup_test_data *x = data;
1038 1048
1039 sched_setattr(current, &attr); 1049 sched_setattr(current, &attr);
1040 1050
1041 /* Make it know we have a new prio */ 1051 /* Make it know we have a new prio */
1042 complete(x); 1052 complete(&x->is_ready);
1043 1053
1044 /* now go to sleep and let the test wake us up */ 1054 /* now go to sleep and let the test wake us up */
1045 set_current_state(TASK_INTERRUPTIBLE); 1055 set_current_state(TASK_INTERRUPTIBLE);
1046 schedule(); 1056 while (!x->go) {
1057 schedule();
1058 set_current_state(TASK_INTERRUPTIBLE);
1059 }
1047 1060
1048 complete(x); 1061 complete(&x->is_ready);
1062
1063 set_current_state(TASK_INTERRUPTIBLE);
1049 1064
1050 /* we are awake, now wait to disappear */ 1065 /* we are awake, now wait to disappear */
1051 while (!kthread_should_stop()) { 1066 while (!kthread_should_stop()) {
1052 /* 1067 schedule();
1053 * This will likely be the system top priority 1068 set_current_state(TASK_INTERRUPTIBLE);
1054 * task, do short sleeps to let others run.
1055 */
1056 msleep(100);
1057 } 1069 }
1058 1070
1071 __set_current_state(TASK_RUNNING);
1072
1059 return 0; 1073 return 0;
1060} 1074}
1061
1062int 1075int
1063trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) 1076trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1064{ 1077{
1065 unsigned long save_max = tr->max_latency; 1078 unsigned long save_max = tr->max_latency;
1066 struct task_struct *p; 1079 struct task_struct *p;
1067 struct completion is_ready; 1080 struct wakeup_test_data data;
1068 unsigned long count; 1081 unsigned long count;
1069 int ret; 1082 int ret;
1070 1083
1071 init_completion(&is_ready); 1084 memset(&data, 0, sizeof(data));
1085
1086 init_completion(&data.is_ready);
1072 1087
1073 /* create a -deadline thread */ 1088 /* create a -deadline thread */
1074 p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test"); 1089 p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
1075 if (IS_ERR(p)) { 1090 if (IS_ERR(p)) {
1076 printk(KERN_CONT "Failed to create ftrace wakeup test thread "); 1091 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
1077 return -1; 1092 return -1;
1078 } 1093 }
1079 1094
1080 /* make sure the thread is running at -deadline policy */ 1095 /* make sure the thread is running at -deadline policy */
1081 wait_for_completion(&is_ready); 1096 wait_for_completion(&data.is_ready);
1082 1097
1083 /* start the tracing */ 1098 /* start the tracing */
1084 ret = tracer_init(trace, tr); 1099 ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
1099 msleep(100); 1114 msleep(100);
1100 } 1115 }
1101 1116
1102 init_completion(&is_ready); 1117 init_completion(&data.is_ready);
1118
1119 data.go = 1;
1120 /* memory barrier is in the wake_up_process() */
1103 1121
1104 wake_up_process(p); 1122 wake_up_process(p);
1105 1123
1106 /* Wait for the task to wake up */ 1124 /* Wait for the task to wake up */
1107 wait_for_completion(&is_ready); 1125 wait_for_completion(&data.is_ready);
1108 1126
1109 /* stop the tracing. */ 1127 /* stop the tracing. */
1110 tracing_stop(); 1128 tracing_stop();
1111 /* check both trace buffers */ 1129 /* check both trace buffers */
1112 ret = trace_test_buffer(&tr->trace_buffer, NULL); 1130 ret = trace_test_buffer(&tr->trace_buffer, NULL);
1113 printk("ret = %d\n", ret);
1114 if (!ret) 1131 if (!ret)
1115 ret = trace_test_buffer(&tr->max_buffer, &count); 1132 ret = trace_test_buffer(&tr->max_buffer, &count);
1116 1133
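The wakeup selftest now pairs its completion with an explicit data.go flag and loops until the flag is set, so a spurious or early wakeup of the -deadline thread cannot let it run ahead of the tracer. The same rule in runnable pthread form, waiting on a predicate rather than on the bare wakeup (all names are illustrative):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static bool go;

static void wait_for_go(void)
{
        pthread_mutex_lock(&lock);
        while (!go)                       /* tolerate spurious wakeups */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
}

static void release_waiter(void)
{
        pthread_mutex_lock(&lock);
        go = true;                        /* set the predicate first... */
        pthread_cond_signal(&cond);       /* ...then wake the waiter */
        pthread_mutex_unlock(&lock);
}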
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..29228c4d5696 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -313,7 +313,7 @@ static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
313 int size; 313 int size;
314 314
315 syscall_nr = trace_get_syscall_nr(current, regs); 315 syscall_nr = trace_get_syscall_nr(current, regs);
316 if (syscall_nr < 0) 316 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
317 return; 317 return;
318 318
319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */ 319 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE) */
@@ -360,7 +360,7 @@ static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
360 int syscall_nr; 360 int syscall_nr;
361 361
362 syscall_nr = trace_get_syscall_nr(current, regs); 362 syscall_nr = trace_get_syscall_nr(current, regs);
363 if (syscall_nr < 0) 363 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
364 return; 364 return;
365 365
366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */ 366 /* Here we're inside tp handler's rcu_read_lock_sched (__DO_TRACE()) */
@@ -425,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
425 return; 425 return;
426 mutex_lock(&syscall_trace_lock); 426 mutex_lock(&syscall_trace_lock);
427 tr->sys_refcount_enter--; 427 tr->sys_refcount_enter--;
428 rcu_assign_pointer(tr->enter_syscall_files[num], NULL); 428 RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
429 if (!tr->sys_refcount_enter) 429 if (!tr->sys_refcount_enter)
430 unregister_trace_sys_enter(ftrace_syscall_enter, tr); 430 unregister_trace_sys_enter(ftrace_syscall_enter, tr);
431 mutex_unlock(&syscall_trace_lock); 431 mutex_unlock(&syscall_trace_lock);
@@ -463,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
463 return; 463 return;
464 mutex_lock(&syscall_trace_lock); 464 mutex_lock(&syscall_trace_lock);
465 tr->sys_refcount_exit--; 465 tr->sys_refcount_exit--;
466 rcu_assign_pointer(tr->exit_syscall_files[num], NULL); 466 RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
467 if (!tr->sys_refcount_exit) 467 if (!tr->sys_refcount_exit)
468 unregister_trace_sys_exit(ftrace_syscall_exit, tr); 468 unregister_trace_sys_exit(ftrace_syscall_exit, tr);
469 mutex_unlock(&syscall_trace_lock); 469 mutex_unlock(&syscall_trace_lock);
@@ -567,7 +567,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
567 int size; 567 int size;
568 568
569 syscall_nr = trace_get_syscall_nr(current, regs); 569 syscall_nr = trace_get_syscall_nr(current, regs);
570 if (syscall_nr < 0) 570 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
571 return; 571 return;
572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) 572 if (!test_bit(syscall_nr, enabled_perf_enter_syscalls))
573 return; 573 return;
@@ -641,7 +641,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
641 int size; 641 int size;
642 642
643 syscall_nr = trace_get_syscall_nr(current, regs); 643 syscall_nr = trace_get_syscall_nr(current, regs);
644 if (syscall_nr < 0) 644 if (syscall_nr < 0 || syscall_nr >= NR_syscalls)
645 return; 645 return;
646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) 646 if (!test_bit(syscall_nr, enabled_perf_exit_syscalls))
647 return; 647 return;
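Two patterns recur in the trace_syscalls.c hunks: syscall_nr is checked against NR_syscalls before it indexes the per-syscall bitmaps and file arrays (compat or unknown syscalls can legitimately report an out-of-range number), and pointers being cleared use RCU_INIT_POINTER() because storing NULL needs no publication ordering. The bounds-check half as a tiny runnable C example (table and lookup are illustrative):

#include <stddef.h>
#include <stdio.h>

#define NR_ENTRIES 16
static const char *table[NR_ENTRIES];

/* Reject negative and too-large indices before touching the table. */
static const char *lookup(int nr)
{
        if (nr < 0 || nr >= NR_ENTRIES)
                return NULL;
        return table[nr];
}

int main(void)
{
        table[3] = "example";
        printf("%s\n", lookup(3)  ? lookup(3)  : "(out of range)");
        printf("%s\n", lookup(99) ? lookup(99) : "(out of range)");
        return 0;
}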
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 394f70b17162..9586b670a5b2 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -14,7 +14,7 @@ static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
14void user_return_notifier_register(struct user_return_notifier *urn) 14void user_return_notifier_register(struct user_return_notifier *urn)
15{ 15{
16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 16 set_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
17 hlist_add_head(&urn->link, &__get_cpu_var(return_notifier_list)); 17 hlist_add_head(&urn->link, this_cpu_ptr(&return_notifier_list));
18} 18}
19EXPORT_SYMBOL_GPL(user_return_notifier_register); 19EXPORT_SYMBOL_GPL(user_return_notifier_register);
20 20
@@ -25,7 +25,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
25void user_return_notifier_unregister(struct user_return_notifier *urn) 25void user_return_notifier_unregister(struct user_return_notifier *urn)
26{ 26{
27 hlist_del(&urn->link); 27 hlist_del(&urn->link);
28 if (hlist_empty(&__get_cpu_var(return_notifier_list))) 28 if (hlist_empty(this_cpu_ptr(&return_notifier_list)))
29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY); 29 clear_tsk_thread_flag(current, TIF_USER_RETURN_NOTIFY);
30} 30}
31EXPORT_SYMBOL_GPL(user_return_notifier_unregister); 31EXPORT_SYMBOL_GPL(user_return_notifier_unregister);
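__get_cpu_var() is being retired in this cycle; the replacements used here are this_cpu_ptr()/raw_cpu_ptr() when the address of this CPU's instance is needed, and the this_cpu_*/raw_cpu_*/__this_cpu_* accessors for plain reads and writes. A hedged sketch of the pairing (my_counter and my_list are made-up per-CPU variables, not part of this patch):

#include <linux/percpu.h>
#include <linux/list.h>
#include <linux/preempt.h>

static DEFINE_PER_CPU(unsigned long, my_counter);
static DEFINE_PER_CPU(struct list_head, my_list);

static void touch_this_cpu(struct list_head *node)
{
        /* Plain value update: this_cpu_write() is preemption safe on its own. */
        this_cpu_write(my_counter, 0);

        /*
         * Address access: this_cpu_ptr() replaces &__get_cpu_var(). Preemption
         * must stay disabled while the pointer is used so it keeps referring
         * to the current CPU's instance.
         */
        preempt_disable();
        list_add(node, this_cpu_ptr(&my_list));
        preempt_enable();
}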
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index fcc02560fd6b..aa312b0dc3ec 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -526,21 +526,21 @@ static void m_stop(struct seq_file *seq, void *v)
526 return; 526 return;
527} 527}
528 528
529struct seq_operations proc_uid_seq_operations = { 529const struct seq_operations proc_uid_seq_operations = {
530 .start = uid_m_start, 530 .start = uid_m_start,
531 .stop = m_stop, 531 .stop = m_stop,
532 .next = m_next, 532 .next = m_next,
533 .show = uid_m_show, 533 .show = uid_m_show,
534}; 534};
535 535
536struct seq_operations proc_gid_seq_operations = { 536const struct seq_operations proc_gid_seq_operations = {
537 .start = gid_m_start, 537 .start = gid_m_start,
538 .stop = m_stop, 538 .stop = m_stop,
539 .next = m_next, 539 .next = m_next,
540 .show = gid_m_show, 540 .show = gid_m_show,
541}; 541};
542 542
543struct seq_operations proc_projid_seq_operations = { 543const struct seq_operations proc_projid_seq_operations = {
544 .start = projid_m_start, 544 .start = projid_m_start,
545 .stop = m_stop, 545 .stop = m_stop,
546 .next = m_next, 546 .next = m_next,
diff --git a/kernel/utsname.c b/kernel/utsname.c
index fd393124e507..883aaaa7de8a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -93,13 +93,13 @@ static void *utsns_get(struct task_struct *task)
93 struct uts_namespace *ns = NULL; 93 struct uts_namespace *ns = NULL;
94 struct nsproxy *nsproxy; 94 struct nsproxy *nsproxy;
95 95
96 rcu_read_lock(); 96 task_lock(task);
97 nsproxy = task_nsproxy(task); 97 nsproxy = task->nsproxy;
98 if (nsproxy) { 98 if (nsproxy) {
99 ns = nsproxy->uts_ns; 99 ns = nsproxy->uts_ns;
100 get_uts_ns(ns); 100 get_uts_ns(ns);
101 } 101 }
102 rcu_read_unlock(); 102 task_unlock(task);
103 103
104 return ns; 104 return ns;
105} 105}
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index c3319bd1b040..70bf11815f84 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -15,11 +15,6 @@
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/nmi.h> 16#include <linux/nmi.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/delay.h>
19#include <linux/freezer.h>
20#include <linux/kthread.h>
21#include <linux/lockdep.h>
22#include <linux/notifier.h>
23#include <linux/module.h> 18#include <linux/module.h>
24#include <linux/sysctl.h> 19#include <linux/sysctl.h>
25#include <linux/smpboot.h> 20#include <linux/smpboot.h>
@@ -47,6 +42,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
47static DEFINE_PER_CPU(bool, soft_watchdog_warn); 42static DEFINE_PER_CPU(bool, soft_watchdog_warn);
48static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); 43static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
49static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt); 44static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
45static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
50#ifdef CONFIG_HARDLOCKUP_DETECTOR 46#ifdef CONFIG_HARDLOCKUP_DETECTOR
51static DEFINE_PER_CPU(bool, hard_watchdog_warn); 47static DEFINE_PER_CPU(bool, hard_watchdog_warn);
52static DEFINE_PER_CPU(bool, watchdog_nmi_touch); 48static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -63,6 +59,25 @@ static unsigned long soft_lockup_nmi_warn;
63static int hardlockup_panic = 59static int hardlockup_panic =
64 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE; 60 CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
65 61
62static bool hardlockup_detector_enabled = true;
63/*
64 * We may not want to enable hard lockup detection by default in all cases,
65 * for example when running the kernel as a guest on a hypervisor. In these
66 * cases this function can be called to disable hard lockup detection. This
67 * function should only be executed once by the boot processor before the
68 * kernel command line parameters are parsed, because otherwise it is not
69 * possible to override this in hardlockup_panic_setup().
70 */
71void watchdog_enable_hardlockup_detector(bool val)
72{
73 hardlockup_detector_enabled = val;
74}
75
76bool watchdog_hardlockup_detector_is_enabled(void)
77{
78 return hardlockup_detector_enabled;
79}
80
66static int __init hardlockup_panic_setup(char *str) 81static int __init hardlockup_panic_setup(char *str)
67{ 82{
68 if (!strncmp(str, "panic", 5)) 83 if (!strncmp(str, "panic", 5))
@@ -71,6 +86,14 @@ static int __init hardlockup_panic_setup(char *str)
71 hardlockup_panic = 0; 86 hardlockup_panic = 0;
72 else if (!strncmp(str, "0", 1)) 87 else if (!strncmp(str, "0", 1))
73 watchdog_user_enabled = 0; 88 watchdog_user_enabled = 0;
89 else if (!strncmp(str, "1", 1) || !strncmp(str, "2", 1)) {
90 /*
91 * Setting 'nmi_watchdog=1' or 'nmi_watchdog=2' (legacy option)
92 * has the same effect.
93 */
94 watchdog_user_enabled = 1;
95 watchdog_enable_hardlockup_detector(true);
96 }
74 return 1; 97 return 1;
75} 98}
76__setup("nmi_watchdog=", hardlockup_panic_setup); 99__setup("nmi_watchdog=", hardlockup_panic_setup);
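watchdog_enable_hardlockup_detector() is meant to be called once, on the boot CPU, before the command line is parsed, so that for example a guest kernel can default the perf-based hard lockup detector to off while nmi_watchdog=1 on the command line can still re-enable it (which is what the new '1'/'2' branch above preserves). A hedged sketch of such a caller (my_guest_early_init is a placeholder for platform init code; the declaration lives in <linux/nmi.h> in kernels carrying this change):

#include <linux/nmi.h>
#include <linux/init.h>

/* Runs on the boot CPU, before command-line parameters are handled. */
static void __init my_guest_early_init(void)
{
        /*
         * Perf counters are emulated or unavailable under this hypervisor,
         * so default the hard lockup detector to off; "nmi_watchdog=1" on
         * the command line can still turn it back on.
         */
        watchdog_enable_hardlockup_detector(false);
}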
@@ -185,7 +208,7 @@ void touch_nmi_watchdog(void)
185 * case we shouldn't have to worry about the watchdog 208 * case we shouldn't have to worry about the watchdog
186 * going off. 209 * going off.
187 */ 210 */
188 __raw_get_cpu_var(watchdog_nmi_touch) = true; 211 raw_cpu_write(watchdog_nmi_touch, true);
189 touch_softlockup_watchdog(); 212 touch_softlockup_watchdog();
190} 213}
191EXPORT_SYMBOL(touch_nmi_watchdog); 214EXPORT_SYMBOL(touch_nmi_watchdog);
@@ -194,8 +217,8 @@ EXPORT_SYMBOL(touch_nmi_watchdog);
194 217
195void touch_softlockup_watchdog_sync(void) 218void touch_softlockup_watchdog_sync(void)
196{ 219{
197 __raw_get_cpu_var(softlockup_touch_sync) = true; 220 __this_cpu_write(softlockup_touch_sync, true);
198 __raw_get_cpu_var(watchdog_touch_ts) = 0; 221 __this_cpu_write(watchdog_touch_ts, 0);
199} 222}
200 223
201#ifdef CONFIG_HARDLOCKUP_DETECTOR 224#ifdef CONFIG_HARDLOCKUP_DETECTOR
@@ -260,9 +283,11 @@ static void watchdog_overflow_callback(struct perf_event *event,
260 return; 283 return;
261 284
262 if (hardlockup_panic) 285 if (hardlockup_panic)
263 panic("Watchdog detected hard LOCKUP on cpu %d", this_cpu); 286 panic("Watchdog detected hard LOCKUP on cpu %d",
287 this_cpu);
264 else 288 else
265 WARN(1, "Watchdog detected hard LOCKUP on cpu %d", this_cpu); 289 WARN(1, "Watchdog detected hard LOCKUP on cpu %d",
290 this_cpu);
266 291
267 __this_cpu_write(hard_watchdog_warn, true); 292 __this_cpu_write(hard_watchdog_warn, true);
268 return; 293 return;
@@ -331,8 +356,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
331 return HRTIMER_RESTART; 356 return HRTIMER_RESTART;
332 357
333 /* only warn once */ 358 /* only warn once */
334 if (__this_cpu_read(soft_watchdog_warn) == true) 359 if (__this_cpu_read(soft_watchdog_warn) == true) {
360 /*
361 * When multiple processes are causing softlockups the
362 * softlockup detector only warns on the first one
363 * because the code relies on a full quiet cycle to
364 * re-arm. The second process prevents the quiet cycle
365 * and never gets reported. Use task pointers to detect
366 * this.
367 */
368 if (__this_cpu_read(softlockup_task_ptr_saved) !=
369 current) {
370 __this_cpu_write(soft_watchdog_warn, false);
371 __touch_watchdog();
372 }
335 return HRTIMER_RESTART; 373 return HRTIMER_RESTART;
374 }
336 375
337 if (softlockup_all_cpu_backtrace) { 376 if (softlockup_all_cpu_backtrace) {
338 /* Prevent multiple soft-lockup reports if one cpu is already 377 /* Prevent multiple soft-lockup reports if one cpu is already
@@ -345,9 +384,10 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
345 } 384 }
346 } 385 }
347 386
348 printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n", 387 pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
349 smp_processor_id(), duration, 388 smp_processor_id(), duration,
350 current->comm, task_pid_nr(current)); 389 current->comm, task_pid_nr(current));
390 __this_cpu_write(softlockup_task_ptr_saved, current);
351 print_modules(); 391 print_modules();
352 print_irqtrace_events(current); 392 print_irqtrace_events(current);
353 if (regs) 393 if (regs)
@@ -366,6 +406,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
366 smp_mb__after_atomic(); 406 smp_mb__after_atomic();
367 } 407 }
368 408
409 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
369 if (softlockup_panic) 410 if (softlockup_panic)
370 panic("softlockup: hung tasks"); 411 panic("softlockup: hung tasks");
371 __this_cpu_write(soft_watchdog_warn, true); 412 __this_cpu_write(soft_watchdog_warn, true);
@@ -384,7 +425,7 @@ static void watchdog_set_prio(unsigned int policy, unsigned int prio)
384 425
385static void watchdog_enable(unsigned int cpu) 426static void watchdog_enable(unsigned int cpu)
386{ 427{
387 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 428 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
388 429
389 /* kick off the timer for the hardlockup detector */ 430 /* kick off the timer for the hardlockup detector */
390 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 431 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
@@ -404,7 +445,7 @@ static void watchdog_enable(unsigned int cpu)
404 445
405static void watchdog_disable(unsigned int cpu) 446static void watchdog_disable(unsigned int cpu)
406{ 447{
407 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 448 struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
408 449
409 watchdog_set_prio(SCHED_NORMAL, 0); 450 watchdog_set_prio(SCHED_NORMAL, 0);
410 hrtimer_cancel(hrtimer); 451 hrtimer_cancel(hrtimer);
@@ -451,6 +492,15 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	struct perf_event_attr *wd_attr;
 	struct perf_event *event = per_cpu(watchdog_ev, cpu);
 
+	/*
+	 * Some kernels need to default hard lockup detection to
+	 * 'disabled', for example a guest on a hypervisor.
+	 */
+	if (!watchdog_hardlockup_detector_is_enabled()) {
+		event = ERR_PTR(-ENOENT);
+		goto handle_err;
+	}
+
 	/* is it already setup and enabled? */
 	if (event && event->state > PERF_EVENT_STATE_OFF)
 		goto out;
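The early return above routes a disabled hard lockup detector through the same -ENOENT handling as hardware without usable perf events. The diff only adds the kernel-side switch; a hypothetical caller on the guest side (the hook name and header are assumptions, not part of this patch) might look like:

#include <linux/init.h>
#include <linux/nmi.h>	/* assumed home of the new prototypes */

/* Hypothetical guest init hook: opt out of hard lockup detection by
 * default, since a hypervisor can steal time from the perf NMI period. */
static void __init guest_watchdog_init(void)
{
	watchdog_enable_hardlockup_detector(false);
}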
@@ -465,6 +515,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 
+handle_err:
 	/* save cpu0 error for future comparision */
 	if (cpu == 0 && IS_ERR(event))
 		cpu0_err = PTR_ERR(event);
@@ -484,7 +535,7 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	if (PTR_ERR(event) == -EOPNOTSUPP)
 		pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
 	else if (PTR_ERR(event) == -ENOENT)
-		pr_warning("disabled (cpu%i): hardware events not enabled\n",
+		pr_warn("disabled (cpu%i): hardware events not enabled\n",
 			 cpu);
 	else
 		pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
@@ -511,7 +562,10 @@ static void watchdog_nmi_disable(unsigned int cpu)
 		/* should be in cleanup, but blocks oprofile */
 		perf_event_release_kernel(event);
 	}
-	return;
+	if (cpu == 0) {
+		/* watchdog_nmi_enable() expects this to be zero initially. */
+		cpu0_err = 0;
+	}
 }
 #else
 static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
@@ -531,7 +585,7 @@ static struct smp_hotplug_thread watchdog_threads = {
 
 static void restart_watchdog_hrtimer(void *info)
 {
-	struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
+	struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
 	int ret;
 
 	/*
@@ -607,11 +661,13 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 		       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
 	int err, old_thresh, old_enabled;
+	bool old_hardlockup;
 	static DEFINE_MUTEX(watchdog_proc_mutex);
 
 	mutex_lock(&watchdog_proc_mutex);
 	old_thresh = ACCESS_ONCE(watchdog_thresh);
 	old_enabled = ACCESS_ONCE(watchdog_user_enabled);
+	old_hardlockup = watchdog_hardlockup_detector_is_enabled();
 
 	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 	if (err || !write)
@@ -623,15 +679,22 @@ int proc_dowatchdog(struct ctl_table *table, int write,
 	 * disabled. The 'watchdog_running' variable check in
 	 * watchdog_*_all_cpus() function takes care of this.
 	 */
-	if (watchdog_user_enabled && watchdog_thresh)
+	if (watchdog_user_enabled && watchdog_thresh) {
+		/*
+		 * Prevent a change in watchdog_thresh accidentally overriding
+		 * the enablement of the hardlockup detector.
+		 */
+		if (watchdog_user_enabled != old_enabled)
+			watchdog_enable_hardlockup_detector(true);
 		err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh);
-	else
+	} else
 		watchdog_disable_all_cpus();
 
 	/* Restore old values on failure */
 	if (err) {
 		watchdog_thresh = old_thresh;
 		watchdog_user_enabled = old_enabled;
+		watchdog_enable_hardlockup_detector(old_hardlockup);
 	}
 out:
 	mutex_unlock(&watchdog_proc_mutex);
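proc_dowatchdog() now snapshots the hard lockup default alongside the old threshold and enable flag, and restores all three if applying the new values fails, while an explicit 0 to 1 flip of the watchdog sysctl re-enables the hard lockup detector. A hypothetical user-space probe of that path through the standard /proc/sys/kernel/watchdog entry:

#include <stdio.h>

/* Write a string to a sysctl file; returns 0 on success. */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* Both writes land in proc_dowatchdog(); with this patch the
	 * 0 -> 1 transition also re-enables the hard lockup default. */
	write_sysctl("/proc/sys/kernel/watchdog", "0");
	write_sysctl("/proc/sys/kernel/watchdog", "1");
	return 0;
}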
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5dbe22aa3efd..09b685daee3d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2043,9 +2043,10 @@ __acquires(&pool->lock)
 	 * kernels, where a requeueing work item waiting for something to
 	 * happen could deadlock with stop_machine as such work item could
 	 * indefinitely requeue itself while all other CPUs are trapped in
-	 * stop_machine.
+	 * stop_machine. At the same time, report a quiescent RCU state so
+	 * the same condition doesn't freeze RCU.
 	 */
-	cond_resched();
+	cond_resched_rcu_qs();
 
 	spin_lock_irq(&pool->lock);
 
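cond_resched_rcu_qs() behaves like cond_resched() but additionally notes a voluntary context switch to RCU, so a work item that keeps requeueing itself without ever sleeping cannot stall grace periods. A sketch of the same pattern in an unrelated long-running kernel loop (the helper and list names are illustrative):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative long-running loop: without the quiescent-state hint a
 * tight kernel loop that never blocks could hold up RCU indefinitely. */
static void drain_items(struct list_head *items)
{
	struct list_head *pos;

	list_for_each(pos, items) {
		/* ... process one item ... */
		cond_resched_rcu_qs();	/* may schedule(); also reports an RCU QS */
	}
}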