aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile2
-rw-r--r--kernel/acct.c37
-rw-r--r--kernel/capability.c1
-rw-r--r--kernel/cgroup.c66
-rw-r--r--kernel/cgroup_freezer.c26
-rw-r--r--kernel/compat.c25
-rw-r--r--kernel/cpu.c26
-rw-r--r--kernel/cpuset.c67
-rw-r--r--kernel/cred-internals.h21
-rw-r--r--kernel/cred.c9
-rw-r--r--kernel/exit.c1
-rw-r--r--kernel/fork.c4
-rw-r--r--kernel/groups.c6
-rw-r--r--kernel/hrtimer.c67
-rw-r--r--kernel/hw_breakpoint.c196
-rw-r--r--kernel/irq/handle.c3
-rw-r--r--kernel/irq/manage.c89
-rw-r--r--kernel/irq/proc.c60
-rw-r--r--kernel/kexec.c6
-rw-r--r--kernel/kprobes.c132
-rw-r--r--kernel/lockdep.c88
-rw-r--r--kernel/lockdep_internals.h72
-rw-r--r--kernel/lockdep_proc.c58
-rw-r--r--kernel/module.c22
-rw-r--r--kernel/perf_event.c379
-rw-r--r--kernel/pm_qos_params.c218
-rw-r--r--kernel/posix-cpu-timers.c298
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/block_io.c103
-rw-r--r--kernel/power/power.h27
-rw-r--r--kernel/power/snapshot.c145
-rw-r--r--kernel/power/swap.c333
-rw-r--r--kernel/power/user.c37
-rw-r--r--kernel/profile.c4
-rw-r--r--kernel/ptrace.c12
-rw-r--r--kernel/rcupdate.c30
-rw-r--r--kernel/rcutiny.c35
-rw-r--r--kernel/rcutiny_plugin.h39
-rw-r--r--kernel/rcutorture.c4
-rw-r--r--kernel/rcutree.c131
-rw-r--r--kernel/rcutree.h2
-rw-r--r--kernel/rcutree_plugin.h69
-rw-r--r--kernel/rcutree_trace.c4
-rw-r--r--kernel/sched.c789
-rw-r--r--kernel/sched_debug.c110
-rw-r--r--kernel/sched_fair.c350
-rw-r--r--kernel/sched_features.h55
-rw-r--r--kernel/sched_idletask.c8
-rw-r--r--kernel/sched_rt.c15
-rw-r--r--kernel/softirq.c2
-rw-r--r--kernel/stop_machine.c537
-rw-r--r--kernel/sys.c33
-rw-r--r--kernel/sysctl.c5
-rw-r--r--kernel/time.c11
-rw-r--r--kernel/time/clocksource.c48
-rw-r--r--kernel/time/ntp.c2
-rw-r--r--kernel/time/tick-sched.c84
-rw-r--r--kernel/time/timekeeping.c35
-rw-r--r--kernel/time/timer_list.c1
-rw-r--r--kernel/timer.c137
-rw-r--r--kernel/trace/Kconfig11
-rw-r--r--kernel/trace/Makefile1
-rw-r--r--kernel/trace/ftrace.c33
-rw-r--r--kernel/trace/ring_buffer.c179
-rw-r--r--kernel/trace/ring_buffer_benchmark.c5
-rw-r--r--kernel/trace/trace.c127
-rw-r--r--kernel/trace/trace.h47
-rw-r--r--kernel/trace/trace_entries.h12
-rw-r--r--kernel/trace/trace_events_filter.c2
-rw-r--r--kernel/trace/trace_functions_graph.c169
-rw-r--r--kernel/trace/trace_hw_branches.c312
-rw-r--r--kernel/trace/trace_irqsoff.c271
-rw-r--r--kernel/trace/trace_kprobe.c535
-rw-r--r--kernel/trace/trace_ksym.c26
-rw-r--r--kernel/trace/trace_output.c2
-rw-r--r--kernel/trace/trace_sched_switch.c5
-rw-r--r--kernel/trace/trace_sched_wakeup.c5
-rw-r--r--kernel/trace/trace_selftest.c64
-rw-r--r--kernel/user.c11
-rw-r--r--kernel/workqueue.c38
80 files changed, 3901 insertions, 3133 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..149e18ef1ab1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,7 +68,7 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48d..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
@@ -353,17 +343,18 @@ restart:
353 343
354void acct_exit_ns(struct pid_namespace *ns) 344void acct_exit_ns(struct pid_namespace *ns)
355{ 345{
356 struct bsd_acct_struct *acct; 346 struct bsd_acct_struct *acct = ns->bacct;
357 347
358 spin_lock(&acct_lock); 348 if (acct == NULL)
359 acct = ns->bacct; 349 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 350
364 kfree(acct); 351 del_timer_sync(&acct->timer);
365 } 352 spin_lock(&acct_lock);
353 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 355 spin_unlock(&acct_lock);
356
357 kfree(acct);
367} 358}
368 359
369/* 360/*
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980c..291775021b2e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1647{ 1647{
1648 char *start; 1648 char *start;
1649 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1650 1652
1651 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1652 /* 1654 /*
@@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1662 *--start = '\0'; 1664 *--start = '\0';
1663 for (;;) { 1665 for (;;) {
1664 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1665 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1666 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1667 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1668 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1669 if (!cgrp) 1672 if (!cgrp)
1670 break; 1673 break;
1671 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1672 if (!cgrp->parent) 1678 if (!cgrp->parent)
1673 continue; 1679 continue;
1674 if (--start < buf) 1680 if (--start < buf)
@@ -3010,7 +3016,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3010 unsigned long flags = (unsigned long)key; 3016 unsigned long flags = (unsigned long)key;
3011 3017
3012 if (flags & POLLHUP) { 3018 if (flags & POLLHUP) {
3013 remove_wait_queue_locked(event->wqh, &event->wait); 3019 __remove_wait_queue(event->wqh, &event->wait);
3014 spin_lock(&cgrp->event_list_lock); 3020 spin_lock(&cgrp->event_list_lock);
3015 list_del(&event->list); 3021 list_del(&event->list);
3016 spin_unlock(&cgrp->event_list_lock); 3022 spin_unlock(&cgrp->event_list_lock);
@@ -3609,7 +3615,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3609 * @ss: the subsystem to load 3615 * @ss: the subsystem to load
3610 * 3616 *
3611 * This function should be called in a modular subsystem's initcall. If the 3617 * This function should be called in a modular subsystem's initcall. If the
3612 * subsytem is built as a module, it will be assigned a new subsys_id and set 3618 * subsystem is built as a module, it will be assigned a new subsys_id and set
3613 * up for use. If the subsystem is built-in anyway, work is delegated to the 3619 * up for use. If the subsystem is built-in anyway, work is delegated to the
3614 * simpler cgroup_init_subsys. 3620 * simpler cgroup_init_subsys.
3615 */ 3621 */
@@ -4429,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable);
4429 */ 4435 */
4430unsigned short css_id(struct cgroup_subsys_state *css) 4436unsigned short css_id(struct cgroup_subsys_state *css)
4431{ 4437{
4432 struct css_id *cssid = rcu_dereference(css->id); 4438 struct css_id *cssid;
4439
4440 /*
4441 * This css_id() can return correct value when somone has refcnt
4442 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4443 * it's unchanged until freed.
4444 */
4445 cssid = rcu_dereference_check(css->id,
4446 rcu_read_lock_held() || atomic_read(&css->refcnt));
4433 4447
4434 if (cssid) 4448 if (cssid)
4435 return cssid->id; 4449 return cssid->id;
@@ -4439,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id);
4439 4453
4440unsigned short css_depth(struct cgroup_subsys_state *css) 4454unsigned short css_depth(struct cgroup_subsys_state *css)
4441{ 4455{
4442 struct css_id *cssid = rcu_dereference(css->id); 4456 struct css_id *cssid;
4457
4458 cssid = rcu_dereference_check(css->id,
4459 rcu_read_lock_held() || atomic_read(&css->refcnt));
4443 4460
4444 if (cssid) 4461 if (cssid)
4445 return cssid->depth; 4462 return cssid->depth;
@@ -4447,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4447} 4464}
4448EXPORT_SYMBOL_GPL(css_depth); 4465EXPORT_SYMBOL_GPL(css_depth);
4449 4466
4467/**
4468 * css_is_ancestor - test "root" css is an ancestor of "child"
4469 * @child: the css to be tested.
4470 * @root: the css supporsed to be an ancestor of the child.
4471 *
4472 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4473 * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
4474 * But, considering usual usage, the csses should be valid objects after test.
4475 * Assuming that the caller will do some action to the child if this returns
4476 * returns true, the caller must take "child";s reference count.
4477 * If "child" is valid object and this returns true, "root" is valid, too.
4478 */
4479
4450bool css_is_ancestor(struct cgroup_subsys_state *child, 4480bool css_is_ancestor(struct cgroup_subsys_state *child,
4451 const struct cgroup_subsys_state *root) 4481 const struct cgroup_subsys_state *root)
4452{ 4482{
4453 struct css_id *child_id = rcu_dereference(child->id); 4483 struct css_id *child_id;
4454 struct css_id *root_id = rcu_dereference(root->id); 4484 struct css_id *root_id;
4485 bool ret = true;
4455 4486
4456 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4487 rcu_read_lock();
4457 return false; 4488 child_id = rcu_dereference(child->id);
4458 return child_id->stack[root_id->depth] == root_id->id; 4489 root_id = rcu_dereference(root->id);
4490 if (!child_id
4491 || !root_id
4492 || (child_id->depth < root_id->depth)
4493 || (child_id->stack[root_id->depth] != root_id->id))
4494 ret = false;
4495 rcu_read_unlock();
4496 return ret;
4459} 4497}
4460 4498
4461static void __free_css_id_cb(struct rcu_head *head) 4499static void __free_css_id_cb(struct rcu_head *head)
@@ -4555,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4555{ 4593{
4556 int subsys_id, i, depth = 0; 4594 int subsys_id, i, depth = 0;
4557 struct cgroup_subsys_state *parent_css, *child_css; 4595 struct cgroup_subsys_state *parent_css, *child_css;
4558 struct css_id *child_id, *parent_id = NULL; 4596 struct css_id *child_id, *parent_id;
4559 4597
4560 subsys_id = ss->subsys_id; 4598 subsys_id = ss->subsys_id;
4561 parent_css = parent->subsys[subsys_id]; 4599 parent_css = parent->subsys[subsys_id];
4562 child_css = child->subsys[subsys_id]; 4600 child_css = child->subsys[subsys_id];
4563 depth = css_depth(parent_css) + 1;
4564 parent_id = parent_css->id; 4601 parent_id = parent_css->id;
4602 depth = parent_id->depth;
4565 4603
4566 child_id = get_new_cssid(ss, depth); 4604 child_id = get_new_cssid(ss, depth);
4567 if (IS_ERR(child_id)) 4605 if (IS_ERR(child_id))
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index da5e13975531..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
89 89
90/* Locks taken and their ordering 90/* Locks taken and their ordering
91 * ------------------------------ 91 * ------------------------------
92 * css_set_lock
93 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
94 * task->alloc_lock (AKA task_lock)
95 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
96 * task->sighand->siglock 96 * task->sighand->siglock
97 * 97 *
98 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
100 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
101 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
102 * 102 *
103 * can_attach(): 103 * freezer_can_attach():
104 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
105 * 105 *
106 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
107 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
108 * 108 *
109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
110 * task->alloc_lock (to get task's cgroup)
111 * freezer->lock 110 * freezer->lock
112 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
113 * 112 *
114 * freezer_read(): 113 * freezer_read():
115 * cgroup_mutex 114 * cgroup_mutex
116 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
117 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
118 * 119 *
119 * freezer_write() (freeze): 120 * freezer_write() (freeze):
120 * cgroup_mutex 121 * cgroup_mutex
121 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
122 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
123 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
124 * 127 *
125 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
126 * cgroup_mutex 129 * cgroup_mutex
127 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
128 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
129 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
130 * sighand->siglock 135 * sighand->siglock
131 */ 136 */
132static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -205,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
205 * No lock is needed, since the task isn't on tasklist yet, 210 * No lock is needed, since the task isn't on tasklist yet,
206 * so it can't be moved to another cgroup, which means the 211 * so it can't be moved to another cgroup, which means the
207 * freezer won't be removed and will be valid during this 212 * freezer won't be removed and will be valid during this
208 * function call. 213 * function call. Nevertheless, apply RCU read-side critical
214 * section to suppress RCU lockdep false positives.
209 */ 215 */
216 rcu_read_lock();
210 freezer = task_freezer(task); 217 freezer = task_freezer(task);
218 rcu_read_unlock();
211 219
212 /* 220 /*
213 * The root cgroup is non-freezable, so we can skip the 221 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
495{ 495{
496 int ret; 496 int ret;
497 cpumask_var_t mask; 497 cpumask_var_t mask;
498 unsigned long *k;
499 unsigned int min_length = cpumask_size();
500
501 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
502 min_length = sizeof(compat_ulong_t);
503 498
504 if (len < min_length) 499 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
500 return -EINVAL;
501 if (len & (sizeof(compat_ulong_t)-1))
505 return -EINVAL; 502 return -EINVAL;
506 503
507 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 504 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
508 return -ENOMEM; 505 return -ENOMEM;
509 506
510 ret = sched_getaffinity(pid, mask); 507 ret = sched_getaffinity(pid, mask);
511 if (ret < 0) 508 if (ret == 0) {
512 goto out; 509 size_t retlen = min_t(size_t, len, cpumask_size());
513 510
514 k = cpumask_bits(mask); 511 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
515 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 512 ret = -EFAULT;
516 if (ret == 0) 513 else
517 ret = min_length; 514 ret = retlen;
518 515 }
519out:
520 free_cpumask_var(mask); 516 free_cpumask_var(mask);
517
521 return ret; 518 return ret;
522} 519}
523 520
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
164} 164}
165 165
166struct take_cpu_down_param { 166struct take_cpu_down_param {
167 struct task_struct *caller;
167 unsigned long mod; 168 unsigned long mod;
168 void *hcpu; 169 void *hcpu;
169}; 170};
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
172static int __ref take_cpu_down(void *_param) 173static int __ref take_cpu_down(void *_param)
173{ 174{
174 struct take_cpu_down_param *param = _param; 175 struct take_cpu_down_param *param = _param;
176 unsigned int cpu = (unsigned long)param->hcpu;
175 int err; 177 int err;
176 178
177 /* Ensure this CPU doesn't handle any more interrupts. */ 179 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
182 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 184 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
183 param->hcpu); 185 param->hcpu);
184 186
187 if (task_cpu(param->caller) == cpu)
188 move_task_off_dead_cpu(cpu, param->caller);
185 /* Force idle task to run as soon as we yield: it should 189 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 190 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 191 sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 196static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 197{
194 int err, nr_calls = 0; 198 int err, nr_calls = 0;
195 cpumask_var_t old_allowed;
196 void *hcpu = (void *)(long)cpu; 199 void *hcpu = (void *)(long)cpu;
197 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 200 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
198 struct take_cpu_down_param tcd_param = { 201 struct take_cpu_down_param tcd_param = {
202 .caller = current,
199 .mod = mod, 203 .mod = mod,
200 .hcpu = hcpu, 204 .hcpu = hcpu,
201 }; 205 };
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
206 if (!cpu_online(cpu)) 210 if (!cpu_online(cpu))
207 return -EINVAL; 211 return -EINVAL;
208 212
209 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
210 return -ENOMEM;
211
212 cpu_hotplug_begin(); 213 cpu_hotplug_begin();
213 set_cpu_active(cpu, false); 214 set_cpu_active(cpu, false);
214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 215 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
225 goto out_release; 226 goto out_release;
226 } 227 }
227 228
228 /* Ensure that we are not runnable on dying cpu */
229 cpumask_copy(old_allowed, &current->cpus_allowed);
230 set_cpus_allowed_ptr(current, cpu_active_mask);
231
232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 229 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
233 if (err) { 230 if (err) {
234 set_cpu_active(cpu, true); 231 set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
237 hcpu) == NOTIFY_BAD) 234 hcpu) == NOTIFY_BAD)
238 BUG(); 235 BUG();
239 236
240 goto out_allowed; 237 goto out_release;
241 } 238 }
242 BUG_ON(cpu_online(cpu)); 239 BUG_ON(cpu_online(cpu));
243 240
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
255 252
256 check_for_tasks(cpu); 253 check_for_tasks(cpu);
257 254
258out_allowed:
259 set_cpus_allowed_ptr(current, old_allowed);
260out_release: 255out_release:
261 cpu_hotplug_done(); 256 cpu_hotplug_done();
262 if (!err) { 257 if (!err) {
@@ -264,7 +259,6 @@ out_release:
264 hcpu) == NOTIFY_BAD) 259 hcpu) == NOTIFY_BAD)
265 BUG(); 260 BUG();
266 } 261 }
267 free_cpumask_var(old_allowed);
268 return err; 262 return err;
269} 263}
270 264
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
272{ 266{
273 int err; 267 int err;
274 268
275 err = stop_machine_create();
276 if (err)
277 return err;
278 cpu_maps_update_begin(); 269 cpu_maps_update_begin();
279 270
280 if (cpu_hotplug_disabled) { 271 if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
286 277
287out: 278out:
288 cpu_maps_update_done(); 279 cpu_maps_update_done();
289 stop_machine_destroy();
290 return err; 280 return err;
291} 281}
292EXPORT_SYMBOL(cpu_down); 282EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
367{ 357{
368 int cpu, first_cpu, error; 358 int cpu, first_cpu, error;
369 359
370 error = stop_machine_create();
371 if (error)
372 return error;
373 cpu_maps_update_begin(); 360 cpu_maps_update_begin();
374 first_cpu = cpumask_first(cpu_online_mask); 361 first_cpu = cpumask_first(cpu_online_mask);
375 /* 362 /*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
400 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 387 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
401 } 388 }
402 cpu_maps_update_done(); 389 cpu_maps_update_done();
403 stop_machine_destroy();
404 return error; 390 return error;
405} 391}
406 392
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2183{ 2183{
2184 mutex_lock(&callback_mutex); 2184 mutex_lock(&callback_mutex);
2185 cpuset_cpus_allowed_locked(tsk, pmask); 2185 task_lock(tsk);
2186 guarantee_online_cpus(task_cs(tsk), pmask);
2187 task_unlock(tsk);
2186 mutex_unlock(&callback_mutex); 2188 mutex_unlock(&callback_mutex);
2187} 2189}
2188 2190
2189/** 2191int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2190 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2191 * Must be called with callback_mutex held.
2192 **/
2193void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2194{ 2192{
2195 task_lock(tsk); 2193 const struct cpuset *cs;
2196 guarantee_online_cpus(task_cs(tsk), pmask); 2194 int cpu;
2197 task_unlock(tsk); 2195
2196 rcu_read_lock();
2197 cs = task_cs(tsk);
2198 if (cs)
2199 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2200 rcu_read_unlock();
2201
2202 /*
2203 * We own tsk->cpus_allowed, nobody can change it under us.
2204 *
2205 * But we used cs && cs->cpus_allowed lockless and thus can
2206 * race with cgroup_attach_task() or update_cpumask() and get
2207 * the wrong tsk->cpus_allowed. However, both cases imply the
2208 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2209 * which takes task_rq_lock().
2210 *
2211 * If we are called after it dropped the lock we must see all
2212 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2213 * set any mask even if it is not right from task_cs() pov,
2214 * the pending set_cpus_allowed_ptr() will fix things.
2215 */
2216
2217 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2218 if (cpu >= nr_cpu_ids) {
2219 /*
2220 * Either tsk->cpus_allowed is wrong (see above) or it
2221 * is actually empty. The latter case is only possible
2222 * if we are racing with remove_tasks_in_empty_cpuset().
2223 * Like above we can temporary set any mask and rely on
2224 * set_cpus_allowed_ptr() as synchronization point.
2225 */
2226 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2227 cpu = cpumask_any(cpu_active_mask);
2228 }
2229
2230 return cpu;
2198} 2231}
2199 2232
2200void cpuset_init_current_mems_allowed(void) 2233void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2383} 2416}
2384 2417
2385/** 2418/**
2386 * cpuset_lock - lock out any changes to cpuset structures
2387 *
2388 * The out of memory (oom) code needs to mutex_lock cpusets
2389 * from being changed while it scans the tasklist looking for a
2390 * task in an overlapping cpuset. Expose callback_mutex via this
2391 * cpuset_lock() routine, so the oom code can lock it, before
2392 * locking the task list. The tasklist_lock is a spinlock, so
2393 * must be taken inside callback_mutex.
2394 */
2395
2396void cpuset_lock(void)
2397{
2398 mutex_lock(&callback_mutex);
2399}
2400
2401/**
2402 * cpuset_unlock - release lock on cpuset changes 2419 * cpuset_unlock - release lock on cpuset changes
2403 * 2420 *
2404 * Undo the lock taken in a previous cpuset_lock() call. 2421 * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index e1dbe9eef800..2c24870c55d1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
20#include "cred-internals.h"
21 20
22#if 0 21#if 0
23#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -398,6 +397,8 @@ struct cred *prepare_usermodehelper_creds(void)
398 397
399error: 398error:
400 put_cred(new); 399 put_cred(new);
400 return NULL;
401
401free_tgcred: 402free_tgcred:
402#ifdef CONFIG_KEYS 403#ifdef CONFIG_KEYS
403 kfree(tgcred); 404 kfree(tgcred);
@@ -521,8 +522,6 @@ int commit_creds(struct cred *new)
521#endif 522#endif
522 BUG_ON(atomic_read(&new->usage) < 1); 523 BUG_ON(atomic_read(&new->usage) < 1);
523 524
524 security_commit_creds(new, old);
525
526 get_cred(new); /* we will require a ref for the subj creds too */ 525 get_cred(new); /* we will require a ref for the subj creds too */
527 526
528 /* dumpability changes */ 527 /* dumpability changes */
@@ -558,8 +557,6 @@ int commit_creds(struct cred *new)
558 atomic_dec(&old->user->processes); 557 atomic_dec(&old->user->processes);
559 alter_cred_subscribers(old, -2); 558 alter_cred_subscribers(old, -2);
560 559
561 sched_switch_user(task);
562
563 /* send notifications */ 560 /* send notifications */
564 if (new->uid != old->uid || 561 if (new->uid != old->uid ||
565 new->euid != old->euid || 562 new->euid != old->euid ||
@@ -791,8 +788,6 @@ bool creds_are_invalid(const struct cred *cred)
791{ 788{
792 if (cred->magic != CRED_MAGIC) 789 if (cred->magic != CRED_MAGIC)
793 return true; 790 return true;
794 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
795 return true;
796#ifdef CONFIG_SECURITY_SELINUX 791#ifdef CONFIG_SECURITY_SELINUX
797 if (selinux_is_enabled()) { 792 if (selinux_is_enabled()) {
798 if ((unsigned long) cred->security < PAGE_SIZE) 793 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..eabca5a73a85 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791b0a2e..4d57d9e3a6e9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,10 +1112,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1112 p->memcg_batch.memcg = NULL; 1112 p->memcg_batch.memcg = NULL;
1113#endif 1113#endif
1114 1114
1115 p->bts = NULL;
1116
1117 p->stack_start = stack_start;
1118
1119 /* Perform scheduler related setup. Assign this task to a CPU. */ 1115 /* Perform scheduler related setup. Assign this task to a CPU. */
1120 sched_fork(p, clone_flags); 1116 sched_fork(p, clone_flags);
1121 1117
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..b9b134b35088 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel give the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a6..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 91 * have in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
180 * in a same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
288 enum bp_type_idx type;
289 int weight;
249 290
250 fetch_bp_busy_slots(&slots, bp); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
294
295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
251 305
252 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 308 return -ENOSPC;
255 309
256 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
257 311
258 return 0; 312 return 0;
259} 313}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
273 327
274static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp)
275{ 329{
276 toggle_bp_slot(bp, false); 330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
277} 336}
278 337
279void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 367 return 0;
309} 368}
310 369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
391
311int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 393{
313 int ret; 394 int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 397 if (ret)
317 return ret; 398 return ret;
318 399
319 /* 400 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 401
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 403 if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 444 if (attr->disabled)
374 goto end; 445 goto end;
375 446
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
377 if (!err) 448 if (!err)
378 perf_event_enable(bp); 449 perf_event_enable(bp);
379 450
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 551
481static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
482{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
484} 584}
485core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
486 586
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488730a5..3164ba7ce151 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -757,16 +773,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
757 if (new->flags & IRQF_ONESHOT) 773 if (new->flags & IRQF_ONESHOT)
758 desc->status |= IRQ_ONESHOT; 774 desc->status |= IRQ_ONESHOT;
759 775
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
770 if (!(desc->status & IRQ_NOAUTOEN)) { 776 if (!(desc->status & IRQ_NOAUTOEN)) {
771 desc->depth = 0; 777 desc->depth = 0;
772 desc->status &= ~IRQ_DISABLED; 778 desc->status &= ~IRQ_DISABLED;
@@ -916,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
916 desc->chip->disable(irq); 922 desc->chip->disable(irq);
917 } 923 }
918 924
925#ifdef CONFIG_SMP
926 /* make sure affinity_hint is cleaned up */
927 if (WARN_ON_ONCE(desc->affinity_hint))
928 desc->affinity_hint = NULL;
929#endif
930
919 raw_spin_unlock_irqrestore(&desc->lock, flags); 931 raw_spin_unlock_irqrestore(&desc->lock, flags);
920 932
921 unregister_handler_proc(irq, action); 933 unregister_handler_proc(irq, action);
@@ -1027,7 +1039,6 @@ EXPORT_SYMBOL(free_irq);
1027 * Flags: 1039 * Flags:
1028 * 1040 *
1029 * IRQF_SHARED Interrupt is shared 1041 * IRQF_SHARED Interrupt is shared
1030 * IRQF_DISABLED Disable local interrupts while processing
1031 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1042 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1032 * IRQF_TRIGGER_* Specify active edge(s) or level 1043 * IRQF_TRIGGER_* Specify active edge(s) or level
1033 * 1044 *
@@ -1041,25 +1052,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1041 int retval; 1052 int retval;
1042 1053
1043 /* 1054 /*
1044 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1045 * the _first_ irqaction (sigh). That can cause oopsing, but
1046 * the behavior is classified as "will not fix" so we need to
1047 * start nudging drivers away from using that idiom.
1048 */
1049 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1050 (IRQF_SHARED|IRQF_DISABLED)) {
1051 pr_warning(
1052 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1053 irq, devname);
1054 }
1055
1056#ifdef CONFIG_LOCKDEP
1057 /*
1058 * Lockdep wants atomic interrupt handlers:
1059 */
1060 irqflags |= IRQF_DISABLED;
1061#endif
1062 /*
1063 * Sanity-check: shared interrupts must pass in a real dev-ID, 1055 * Sanity-check: shared interrupts must pass in a real dev-ID,
1064 * otherwise we'll have trouble later trying to figure out 1056 * otherwise we'll have trouble later trying to figure out
1065 * which interrupt is which (messes up the interrupt freeing 1057 * which interrupt is which (messes up the interrupt freeing
@@ -1120,3 +1112,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1120 return retval; 1112 return retval;
1121} 1113}
1122EXPORT_SYMBOL(request_threaded_irq); 1114EXPORT_SYMBOL(request_threaded_irq);
1115
1116/**
1117 * request_any_context_irq - allocate an interrupt line
1118 * @irq: Interrupt line to allocate
1119 * @handler: Function to be called when the IRQ occurs.
1120 * Threaded handler for threaded interrupts.
1121 * @flags: Interrupt type flags
1122 * @name: An ascii name for the claiming device
1123 * @dev_id: A cookie passed back to the handler function
1124 *
1125 * This call allocates interrupt resources and enables the
1126 * interrupt line and IRQ handling. It selects either a
1127 * hardirq or threaded handling method depending on the
1128 * context.
1129 *
1130 * On failure, it returns a negative value. On success,
1131 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1132 */
1133int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1134 unsigned long flags, const char *name, void *dev_id)
1135{
1136 struct irq_desc *desc = irq_to_desc(irq);
1137 int ret;
1138
1139 if (!desc)
1140 return -EINVAL;
1141
1142 if (desc->status & IRQ_NESTED_THREAD) {
1143 ret = request_threaded_irq(irq, NULL, handler,
1144 flags, name, dev_id);
1145 return !ret ? IRQC_IS_NESTED : ret;
1146 }
1147
1148 ret = request_irq(irq, handler, flags, name, dev_id);
1149 return !ret ? IRQC_IS_HARDIRQ : ret;
1150}
1151EXPORT_SYMBOL_GPL(request_any_context_irq);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04ef6b5..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
32 return 0; 32 return 0;
33} 33}
34 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
35#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
36#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
37#endif 58#endif
@@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
84 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
85} 106}
86 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
87static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
88 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
89 .read = seq_read, 115 .read = seq_read,
@@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
92 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
93}; 119};
94 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
95static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
96{ 129{
97 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
147 .release = single_release, 180 .release = single_release,
148 .write = default_affinity_write, 181 .write = default_affinity_write,
149}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
150#endif 203#endif
151 204
152static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
231 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
232 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
233 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
234#endif 294#endif
235 295
236 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc474..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1134
1135 free_reserved_phys_range(end, crashk_res.end); 1135 free_reserved_phys_range(end, crashk_res.end);
1136 1136
1137 if (start == end) { 1137 if (start == end)
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1138 release_resource(&crashk_res);
1140 } else 1139 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1140
1143unlock: 1141unlock:
1144 mutex_unlock(&kexec_mutex); 1142 mutex_unlock(&kexec_mutex);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e9..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1588 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1589} 1589}
1590 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone, we couldn't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1591void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1592{ 1658{
1593 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1805 .release = seq_release, 1871 .release = seq_release,
1806}; 1872};
1807 1873
1808/* Disable one kprobe */
1809int __kprobes disable_kprobe(struct kprobe *kp)
1810{
1811 int ret = 0;
1812 struct kprobe *p;
1813
1814 mutex_lock(&kprobe_mutex);
1815
1816 /* Check whether specified probe is valid. */
1817 p = __get_valid_kprobe(kp);
1818 if (unlikely(p == NULL)) {
1819 ret = -EINVAL;
1820 goto out;
1821 }
1822
1823 /* If the probe is already disabled (or gone), just return */
1824 if (kprobe_disabled(kp))
1825 goto out;
1826
1827 kp->flags |= KPROBE_FLAG_DISABLED;
1828 if (p != kp)
1829 /* When kp != p, p is always enabled. */
1830 try_to_disable_aggr_kprobe(p);
1831
1832 if (!kprobes_all_disarmed && kprobe_disabled(p))
1833 disarm_kprobe(p);
1834out:
1835 mutex_unlock(&kprobe_mutex);
1836 return ret;
1837}
1838EXPORT_SYMBOL_GPL(disable_kprobe);
1839
1840/* Enable one kprobe */
1841int __kprobes enable_kprobe(struct kprobe *kp)
1842{
1843 int ret = 0;
1844 struct kprobe *p;
1845
1846 mutex_lock(&kprobe_mutex);
1847
1848 /* Check whether specified probe is valid. */
1849 p = __get_valid_kprobe(kp);
1850 if (unlikely(p == NULL)) {
1851 ret = -EINVAL;
1852 goto out;
1853 }
1854
1855 if (kprobe_gone(kp)) {
1856 /* This kprobe has gone, we couldn't enable it. */
1857 ret = -EINVAL;
1858 goto out;
1859 }
1860
1861 if (p != kp)
1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1868out:
1869 mutex_unlock(&kprobe_mutex);
1870 return ret;
1871}
1872EXPORT_SYMBOL_GPL(enable_kprobe);
1873
1874static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1875{ 1875{
1876 struct hlist_head *head; 1876 struct hlist_head *head;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41cb..ec21304856d1 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
431/* 431/*
432 * Various lockdep statistics: 432 * Various lockdep statistics:
433 */ 433 */
434atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
435atomic_t chain_lookup_misses;
436atomic_t hardirqs_on_events;
437atomic_t hardirqs_off_events;
438atomic_t redundant_hardirqs_on;
439atomic_t redundant_hardirqs_off;
440atomic_t softirqs_on_events;
441atomic_t softirqs_off_events;
442atomic_t redundant_softirqs_on;
443atomic_t redundant_softirqs_off;
444atomic_t nr_unused_locks;
445atomic_t nr_cyclic_checks;
446atomic_t nr_find_usage_forwards_checks;
447atomic_t nr_find_usage_backwards_checks;
448#endif 435#endif
449 436
450/* 437/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
748 return NULL; 735 return NULL;
749 } 736 }
750 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
751 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
752 class->key = key; 739 class->key = key;
753 class->name = lock->name; 740 class->name = lock->name;
754 class->subclass = subclass; 741 class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
818 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
819 */ 806 */
820static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
821 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
822{ 810{
823 struct lock_list *entry; 811 struct lock_list *entry;
824 /* 812 /*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
829 if (!entry) 817 if (!entry)
830 return 0; 818 return 0;
831 819
832 if (!save_trace(&entry->trace))
833 return 0;
834
835 entry->class = this; 820 entry->class = this;
836 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
837 /* 823 /*
838 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
839 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1205{ 1191{
1206 int result; 1192 int result;
1207 1193
1208 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1209 1195
1210 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1211 1197
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1242{ 1228{
1243 int result; 1229 int result;
1244 1230
1245 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1246 1232
1247 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1248 1234
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1265{ 1251{
1266 int result; 1252 int result;
1267 1253
1268 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1269 1255
1270 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1271 1257
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1635 */ 1621 */
1636static int 1622static int
1637check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1638 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1639{ 1625{
1640 struct lock_list *entry; 1626 struct lock_list *entry;
1641 int ret; 1627 int ret;
1642 struct lock_list this; 1628 struct lock_list this;
1643 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1644 1638
1645 /* 1639 /*
1646 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1688 } 1682 }
1689 } 1683 }
1690 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1691 /* 1688 /*
1692 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1693 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1694 */ 1691 */
1695 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1696 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1697 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1698 1695
1699 if (!ret) 1696 if (!ret)
1700 return 0; 1697 return 0;
1701 1698
1702 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1703 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1704 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1705 if (!ret) 1702 if (!ret)
1706 return 0; 1703 return 0;
1707 1704
@@ -1731,6 +1728,7 @@ static int
1731check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1732{ 1729{
1733 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1734 struct held_lock *hlock; 1732 struct held_lock *hlock;
1735 1733
1736 /* 1734 /*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1756 * added: 1754 * added:
1757 */ 1755 */
1758 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1759 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1760 return 0; 1759 return 0;
1761 /* 1760 /*
1762 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1779 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1780 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1781 break; 1780 break;
1781 trylock_loop = 1;
1782 } 1782 }
1783 return 1; 1783 return 1;
1784out_bug: 1784out_bug:
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1825 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1826 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1827cache_hit: 1827cache_hit:
1828 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1829 if (very_verbose(class)) 1829 if (very_verbose(class))
1830 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1831 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
1890 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1891 } 1891 }
1892 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1893 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1894 inc_chains(); 1894 inc_chains();
1895 1895
1896 return 1; 1896 return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2311 return; 2311 return;
2312 2312
2313 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2314 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
2316 * so this is racy by nature but loosing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2315 return; 2320 return;
2316 } 2321 }
2317 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2338 2343
2339 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2340 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2341 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2342} 2347}
2343EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2344 2349
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2370 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2371 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2372 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2373 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2374 } else 2379 } else
2375 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2376} 2381}
2377EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2378 2383
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2396 return; 2401 return;
2397 2402
2398 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2399 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2400 return; 2405 return;
2401 } 2406 }
2402 2407
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2406 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2407 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2408 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2409 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2410 /* 2415 /*
2411 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2412 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2436 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2437 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2438 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2439 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2440 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2441 } else 2446 } else
2442 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2443} 2448}
2444 2449
2445static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2644 return 0; 2649 return 0;
2645 break; 2650 break;
2646 case LOCK_USED: 2651 case LOCK_USED:
2647 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2648 break; 2653 break;
2649 default: 2654 default:
2650 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2750,7 +2755,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 if (!class) 2755 if (!class)
2751 return 0; 2756 return 0;
2752 } 2757 }
2753 debug_atomic_inc((atomic_t *)&class->ops); 2758 atomic_inc((atomic_t *)&class->ops);
2754 if (very_verbose(class)) { 2759 if (very_verbose(class)) {
2755 printk("\nacquire class [%p] %s", class->key, class->name); 2760 printk("\nacquire class [%p] %s", class->key, class->name);
2756 if (class->name_version > 1) 2761 if (class->name_version > 1)
@@ -3227,7 +3232,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3227 raw_local_irq_save(flags); 3232 raw_local_irq_save(flags);
3228 check_flags(flags); 3233 check_flags(flags);
3229 current->lockdep_recursion = 1; 3234 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip); 3235 trace_lock_release(lock, ip);
3231 __lock_release(lock, nested, ip); 3236 __lock_release(lock, nested, ip);
3232 current->lockdep_recursion = 0; 3237 current->lockdep_recursion = 0;
3233 raw_local_irq_restore(flags); 3238 raw_local_irq_restore(flags);
@@ -3380,7 +3385,7 @@ found_it:
3380 hlock->holdtime_stamp = now; 3385 hlock->holdtime_stamp = now;
3381 } 3386 }
3382 3387
3383 trace_lock_acquired(lock, ip, waittime); 3388 trace_lock_acquired(lock, ip);
3384 3389
3385 stats = get_lock_stats(hlock_class(hlock)); 3390 stats = get_lock_stats(hlock_class(hlock));
3386 if (waittime) { 3391 if (waittime) {
@@ -3801,8 +3806,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3801{ 3806{
3802 struct task_struct *curr = current; 3807 struct task_struct *curr = current;
3803 3808
3809#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3804 if (!debug_locks_off()) 3810 if (!debug_locks_off())
3805 return; 3811 return;
3812#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
3813 /* Note: the following can be executed concurrently, so be careful. */
3806 printk("\n===================================================\n"); 3814 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3815 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n"); 3816 printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..e2564580f3f1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -515,6 +513,9 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 513static char last_unloaded_module[MODULE_NAME_LEN+1];
516 514
517#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
516
517EXPORT_TRACEPOINT_SYMBOL(module_get);
518
518/* Init the unload section of the module. */ 519/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 520static void module_unload_init(struct module *mod)
520{ 521{
@@ -723,16 +724,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 724 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 725 name[MODULE_NAME_LEN-1] = '\0';
725 726
726 /* Create stop_machine threads since free_module relies on 727 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 728 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 729
737 mod = find_module(name); 730 mod = find_module(name);
738 if (!mod) { 731 if (!mod) {
@@ -792,8 +785,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 785
793 out: 786 out:
794 mutex_unlock(&module_mutex); 787 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 788 return ret;
798} 789}
799 790
@@ -867,8 +858,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 858 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 859 __this_cpu_inc(module->refptr->decs);
869 860
870 trace_module_put(module, _RET_IP_, 861 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 862 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 863 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 864 wake_up_process(module->waiter);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2f3fbf84215a..a4fa381db3c2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/hash.h>
19#include <linux/sysfs.h> 20#include <linux/sysfs.h>
20#include <linux/dcache.h> 21#include <linux/dcache.h>
21#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
82void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
83void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
84 85
85int __weak
86hw_perf_group_sched_in(struct perf_event *group_leader,
87 struct perf_cpu_context *cpuctx,
88 struct perf_event_context *ctx)
89{
90 return 0;
91}
92
93void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
94 87
95static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
262 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
263} 256}
264 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
265static struct list_head * 270static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{ 272{
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
315static void 320static void
316list_del_event(struct perf_event *event, struct perf_event_context *ctx) 321list_del_event(struct perf_event *event, struct perf_event_context *ctx)
317{ 322{
318 struct perf_event *sibling, *tmp;
319
320 if (list_empty(&event->group_entry)) 323 if (list_empty(&event->group_entry))
321 return; 324 return;
322 ctx->nr_events--; 325 ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
329 if (event->group_leader != event) 332 if (event->group_leader != event)
330 event->group_leader->nr_siblings--; 333 event->group_leader->nr_siblings--;
331 334
332 update_event_times(event); 335 update_group_times(event);
333 336
334 /* 337 /*
335 * If event was in error state, then keep it 338 * If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 */ 343 */
341 if (event->state > PERF_EVENT_STATE_OFF) 344 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF; 345 event->state = PERF_EVENT_STATE_OFF;
346}
347
348static void
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{
351 struct perf_event *sibling, *tmp;
343 352
344 /* 353 /*
345 * If this was a group event with sibling events then 354 * If this was a group event with sibling events then
@@ -505,18 +514,6 @@ retry:
505} 514}
506 515
507/* 516/*
508 * Update total_time_enabled and total_time_running for all events in a group.
509 */
510static void update_group_times(struct perf_event *leader)
511{
512 struct perf_event *event;
513
514 update_event_times(leader);
515 list_for_each_entry(event, &leader->sibling_list, group_entry)
516 update_event_times(event);
517}
518
519/*
520 * Cross CPU call to disable a performance event 517 * Cross CPU call to disable a performance event
521 */ 518 */
522static void __perf_event_disable(void *info) 519static void __perf_event_disable(void *info)
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
640 struct perf_cpu_context *cpuctx, 637 struct perf_cpu_context *cpuctx,
641 struct perf_event_context *ctx) 638 struct perf_event_context *ctx)
642{ 639{
643 struct perf_event *event, *partial_group; 640 struct perf_event *event, *partial_group = NULL;
641 const struct pmu *pmu = group_event->pmu;
642 bool txn = false;
644 int ret; 643 int ret;
645 644
646 if (group_event->state == PERF_EVENT_STATE_OFF) 645 if (group_event->state == PERF_EVENT_STATE_OFF)
647 return 0; 646 return 0;
648 647
649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 648 /* Check if group transaction availabe */
650 if (ret) 649 if (pmu->start_txn)
651 return ret < 0 ? ret : 0; 650 txn = true;
651
652 if (txn)
653 pmu->start_txn(pmu);
652 654
653 if (event_sched_in(group_event, cpuctx, ctx)) 655 if (event_sched_in(group_event, cpuctx, ctx))
654 return -EAGAIN; 656 return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
663 } 665 }
664 } 666 }
665 667
666 return 0; 668 if (!txn)
669 return 0;
670
671 ret = pmu->commit_txn(pmu);
672 if (!ret) {
673 pmu->cancel_txn(pmu);
674 return 0;
675 }
667 676
668group_error: 677group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
669 /* 681 /*
670 * Groups can be scheduled in as one unit only, so undo any 682 * Groups can be scheduled in as one unit only, so undo any
671 * partial group before returning: 683 * partial group before returning:
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1367 if (cpuctx->task_ctx == ctx) 1379 if (cpuctx->task_ctx == ctx)
1368 return; 1380 return;
1369 1381
1382 perf_disable();
1383
1370 /* 1384 /*
1371 * We want to keep the following priority order: 1385 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned, 1386 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1393 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380 1394
1381 cpuctx->task_ctx = ctx; 1395 cpuctx->task_ctx = ctx;
1396
1397 perf_enable();
1382} 1398}
1383 1399
1384#define MAX_INTERRUPTS (~0ULL) 1400#define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
1856{ 1872{
1857 struct perf_event_context *ctx = event->ctx; 1873 struct perf_event_context *ctx = event->ctx;
1858 1874
1875 /*
1876 * Remove from the PMU, can't get re-enabled since we got
1877 * here because the last ref went.
1878 */
1879 perf_event_disable(event);
1880
1859 WARN_ON_ONCE(ctx->parent_ctx); 1881 WARN_ON_ONCE(ctx->parent_ctx);
1860 mutex_lock(&ctx->mutex); 1882 /*
1861 perf_event_remove_from_context(event); 1883 * There are two ways this annotation is useful:
1884 *
1885 * 1) there is a lock recursion from perf_event_exit_task
1886 * see the comment there.
1887 *
1888 * 2) there is a lock-inversion with mmap_sem through
1889 * perf_event_read_group(), which takes faults while
1890 * holding ctx->mutex, however this is called after
1891 * the last filedesc died, so there is no possibility
1892 * to trigger the AB-BA case.
1893 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock);
1896 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock);
1862 mutex_unlock(&ctx->mutex); 1899 mutex_unlock(&ctx->mutex);
1863 1900
1864 mutex_lock(&event->owner->perf_event_mutex); 1901 mutex_lock(&event->owner->perf_event_mutex);
@@ -2642,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2642} 2679}
2643 2680
2644static const struct file_operations perf_fops = { 2681static const struct file_operations perf_fops = {
2682 .llseek = no_llseek,
2645 .release = perf_release, 2683 .release = perf_release,
2646 .read = perf_read, 2684 .read = perf_read,
2647 .poll = perf_poll, 2685 .poll = perf_poll,
@@ -2792,6 +2830,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
2792 2830
2793 2831
2794/* 2832/*
2833 * We assume there is only KVM supporting the callbacks.
2834 * Later on, we might change it to a list if there is
2835 * another virtualization implementation supporting the callbacks.
2836 */
2837struct perf_guest_info_callbacks *perf_guest_cbs;
2838
2839int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2840{
2841 perf_guest_cbs = cbs;
2842 return 0;
2843}
2844EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2845
2846int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2847{
2848 perf_guest_cbs = NULL;
2849 return 0;
2850}
2851EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2852
2853/*
2795 * Output 2854 * Output
2796 */ 2855 */
2797static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2856static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
@@ -3743,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3743 .event_id = { 3802 .event_id = {
3744 .header = { 3803 .header = {
3745 .type = PERF_RECORD_MMAP, 3804 .type = PERF_RECORD_MMAP,
3746 .misc = 0, 3805 .misc = PERF_RECORD_MISC_USER,
3747 /* .size */ 3806 /* .size */
3748 }, 3807 },
3749 /* .pid */ 3808 /* .pid */
@@ -3961,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3961 perf_swevent_overflow(event, 0, nmi, data, regs); 4020 perf_swevent_overflow(event, 0, nmi, data, regs);
3962} 4021}
3963 4022
3964static int perf_swevent_is_counting(struct perf_event *event)
3965{
3966 /*
3967 * The event is active, we're good!
3968 */
3969 if (event->state == PERF_EVENT_STATE_ACTIVE)
3970 return 1;
3971
3972 /*
3973 * The event is off/error, not counting.
3974 */
3975 if (event->state != PERF_EVENT_STATE_INACTIVE)
3976 return 0;
3977
3978 /*
3979 * The event is inactive, if the context is active
3980 * we're part of a group that didn't make it on the 'pmu',
3981 * not counting.
3982 */
3983 if (event->ctx->is_active)
3984 return 0;
3985
3986 /*
3987 * We're inactive and the context is too, this means the
3988 * task is scheduled out, we're counting events that happen
3989 * to us, like migration events.
3990 */
3991 return 1;
3992}
3993
3994static int perf_tp_event_match(struct perf_event *event, 4023static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data); 4024 struct perf_sample_data *data);
3996 4025
@@ -4014,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event,
4014 struct perf_sample_data *data, 4043 struct perf_sample_data *data,
4015 struct pt_regs *regs) 4044 struct pt_regs *regs)
4016{ 4045{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
4020 if (!perf_swevent_is_counting(event))
4021 return 0;
4022
4023 if (event->attr.type != type) 4046 if (event->attr.type != type)
4024 return 0; 4047 return 0;
4025 4048
@@ -4036,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event,
4036 return 1; 4059 return 1;
4037} 4060}
4038 4061
4039static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4062static inline u64 swevent_hash(u64 type, u32 event_id)
4040 enum perf_type_id type, 4063{
4041 u32 event_id, u64 nr, int nmi, 4064 u64 val = event_id | (type << 32);
4042 struct perf_sample_data *data, 4065
4043 struct pt_regs *regs) 4066 return hash_64(val, SWEVENT_HLIST_BITS);
4067}
4068
4069static struct hlist_head *
4070find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4071{
4072 u64 hash;
4073 struct swevent_hlist *hlist;
4074
4075 hash = swevent_hash(type, event_id);
4076
4077 hlist = rcu_dereference(ctx->swevent_hlist);
4078 if (!hlist)
4079 return NULL;
4080
4081 return &hlist->heads[hash];
4082}
4083
4084static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4085 u64 nr, int nmi,
4086 struct perf_sample_data *data,
4087 struct pt_regs *regs)
4044{ 4088{
4089 struct perf_cpu_context *cpuctx;
4045 struct perf_event *event; 4090 struct perf_event *event;
4091 struct hlist_node *node;
4092 struct hlist_head *head;
4046 4093
4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4094 cpuctx = &__get_cpu_var(perf_cpu_context);
4095
4096 rcu_read_lock();
4097
4098 head = find_swevent_head(cpuctx, type, event_id);
4099
4100 if (!head)
4101 goto end;
4102
4103 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4048 if (perf_swevent_match(event, type, event_id, data, regs)) 4104 if (perf_swevent_match(event, type, event_id, data, regs))
4049 perf_swevent_add(event, nr, nmi, data, regs); 4105 perf_swevent_add(event, nr, nmi, data, regs);
4050 } 4106 }
4107end:
4108 rcu_read_unlock();
4051} 4109}
4052 4110
4053int perf_swevent_get_recursion_context(void) 4111int perf_swevent_get_recursion_context(void)
@@ -4085,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx)
4085} 4143}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4144EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4087 4145
4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4089 u64 nr, int nmi,
4090 struct perf_sample_data *data,
4091 struct pt_regs *regs)
4092{
4093 struct perf_cpu_context *cpuctx;
4094 struct perf_event_context *ctx;
4095
4096 cpuctx = &__get_cpu_var(perf_cpu_context);
4097 rcu_read_lock();
4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4099 nr, nmi, data, regs);
4100 /*
4101 * doesn't really matter which of the child contexts the
4102 * events ends up in.
4103 */
4104 ctx = rcu_dereference(current->perf_event_ctxp);
4105 if (ctx)
4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4107 rcu_read_unlock();
4108}
4109 4146
4110void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4147void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4111 struct pt_regs *regs, u64 addr) 4148 struct pt_regs *regs, u64 addr)
@@ -4131,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event)
4131static int perf_swevent_enable(struct perf_event *event) 4168static int perf_swevent_enable(struct perf_event *event)
4132{ 4169{
4133 struct hw_perf_event *hwc = &event->hw; 4170 struct hw_perf_event *hwc = &event->hw;
4171 struct perf_cpu_context *cpuctx;
4172 struct hlist_head *head;
4173
4174 cpuctx = &__get_cpu_var(perf_cpu_context);
4134 4175
4135 if (hwc->sample_period) { 4176 if (hwc->sample_period) {
4136 hwc->last_period = hwc->sample_period; 4177 hwc->last_period = hwc->sample_period;
4137 perf_swevent_set_period(event); 4178 perf_swevent_set_period(event);
4138 } 4179 }
4180
4181 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
4182 if (WARN_ON_ONCE(!head))
4183 return -EINVAL;
4184
4185 hlist_add_head_rcu(&event->hlist_entry, head);
4186
4139 return 0; 4187 return 0;
4140} 4188}
4141 4189
4142static void perf_swevent_disable(struct perf_event *event) 4190static void perf_swevent_disable(struct perf_event *event)
4143{ 4191{
4192 hlist_del_rcu(&event->hlist_entry);
4144} 4193}
4145 4194
4146static const struct pmu perf_ops_generic = { 4195static const struct pmu perf_ops_generic = {
@@ -4168,15 +4217,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4168 perf_sample_data_init(&data, 0); 4217 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period; 4218 data.period = event->hw.last_period;
4170 regs = get_irq_regs(); 4219 regs = get_irq_regs();
4171 /*
4172 * In case we exclude kernel IPs or are somehow not in interrupt
4173 * context, provide the next best thing, the user IP.
4174 */
4175 if ((event->attr.exclude_kernel || !regs) &&
4176 !event->attr.exclude_user)
4177 regs = task_pt_regs(current);
4178 4220
4179 if (regs) { 4221 if (regs && !perf_exclude_event(event, regs)) {
4180 if (!(event->attr.exclude_idle && current->pid == 0)) 4222 if (!(event->attr.exclude_idle && current->pid == 0))
4181 if (perf_event_overflow(event, 0, &data, regs)) 4223 if (perf_event_overflow(event, 0, &data, regs))
4182 ret = HRTIMER_NORESTART; 4224 ret = HRTIMER_NORESTART;
@@ -4324,6 +4366,105 @@ static const struct pmu perf_ops_task_clock = {
4324 .read = task_clock_perf_event_read, 4366 .read = task_clock_perf_event_read,
4325}; 4367};
4326 4368
4369static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4370{
4371 struct swevent_hlist *hlist;
4372
4373 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4374 kfree(hlist);
4375}
4376
4377static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4378{
4379 struct swevent_hlist *hlist;
4380
4381 if (!cpuctx->swevent_hlist)
4382 return;
4383
4384 hlist = cpuctx->swevent_hlist;
4385 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4386 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4387}
4388
4389static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4390{
4391 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4392
4393 mutex_lock(&cpuctx->hlist_mutex);
4394
4395 if (!--cpuctx->hlist_refcount)
4396 swevent_hlist_release(cpuctx);
4397
4398 mutex_unlock(&cpuctx->hlist_mutex);
4399}
4400
4401static void swevent_hlist_put(struct perf_event *event)
4402{
4403 int cpu;
4404
4405 if (event->cpu != -1) {
4406 swevent_hlist_put_cpu(event, event->cpu);
4407 return;
4408 }
4409
4410 for_each_possible_cpu(cpu)
4411 swevent_hlist_put_cpu(event, cpu);
4412}
4413
4414static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4415{
4416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4417 int err = 0;
4418
4419 mutex_lock(&cpuctx->hlist_mutex);
4420
4421 if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
4422 struct swevent_hlist *hlist;
4423
4424 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4425 if (!hlist) {
4426 err = -ENOMEM;
4427 goto exit;
4428 }
4429 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4430 }
4431 cpuctx->hlist_refcount++;
4432 exit:
4433 mutex_unlock(&cpuctx->hlist_mutex);
4434
4435 return err;
4436}
4437
4438static int swevent_hlist_get(struct perf_event *event)
4439{
4440 int err;
4441 int cpu, failed_cpu;
4442
4443 if (event->cpu != -1)
4444 return swevent_hlist_get_cpu(event, event->cpu);
4445
4446 get_online_cpus();
4447 for_each_possible_cpu(cpu) {
4448 err = swevent_hlist_get_cpu(event, cpu);
4449 if (err) {
4450 failed_cpu = cpu;
4451 goto fail;
4452 }
4453 }
4454 put_online_cpus();
4455
4456 return 0;
4457 fail:
4458 for_each_possible_cpu(cpu) {
4459 if (cpu == failed_cpu)
4460 break;
4461 swevent_hlist_put_cpu(event, cpu);
4462 }
4463
4464 put_online_cpus();
4465 return err;
4466}
4467
4327#ifdef CONFIG_EVENT_TRACING 4468#ifdef CONFIG_EVENT_TRACING
4328 4469
4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4470void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
@@ -4357,10 +4498,13 @@ static int perf_tp_event_match(struct perf_event *event,
4357static void tp_perf_event_destroy(struct perf_event *event) 4498static void tp_perf_event_destroy(struct perf_event *event)
4358{ 4499{
4359 perf_trace_disable(event->attr.config); 4500 perf_trace_disable(event->attr.config);
4501 swevent_hlist_put(event);
4360} 4502}
4361 4503
4362static const struct pmu *tp_perf_event_init(struct perf_event *event) 4504static const struct pmu *tp_perf_event_init(struct perf_event *event)
4363{ 4505{
4506 int err;
4507
4364 /* 4508 /*
4365 * Raw tracepoint data is a severe data leak, only allow root to 4509 * Raw tracepoint data is a severe data leak, only allow root to
4366 * have these. 4510 * have these.
@@ -4374,6 +4518,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4374 return NULL; 4518 return NULL;
4375 4519
4376 event->destroy = tp_perf_event_destroy; 4520 event->destroy = tp_perf_event_destroy;
4521 err = swevent_hlist_get(event);
4522 if (err) {
4523 perf_trace_disable(event->attr.config);
4524 return ERR_PTR(err);
4525 }
4377 4526
4378 return &perf_ops_generic; 4527 return &perf_ops_generic;
4379} 4528}
@@ -4474,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4474 WARN_ON(event->parent); 4623 WARN_ON(event->parent);
4475 4624
4476 atomic_dec(&perf_swevent_enabled[event_id]); 4625 atomic_dec(&perf_swevent_enabled[event_id]);
4626 swevent_hlist_put(event);
4477} 4627}
4478 4628
4479static const struct pmu *sw_perf_event_init(struct perf_event *event) 4629static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4662 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS: 4663 case PERF_COUNT_SW_EMULATION_FAULTS:
4514 if (!event->parent) { 4664 if (!event->parent) {
4665 int err;
4666
4667 err = swevent_hlist_get(event);
4668 if (err)
4669 return ERR_PTR(err);
4670
4515 atomic_inc(&perf_swevent_enabled[event_id]); 4671 atomic_inc(&perf_swevent_enabled[event_id]);
4516 event->destroy = sw_perf_event_destroy; 4672 event->destroy = sw_perf_event_destroy;
4517 } 4673 }
@@ -4897,7 +5053,7 @@ err_fput_free_put_context:
4897 5053
4898err_free_put_context: 5054err_free_put_context:
4899 if (err < 0) 5055 if (err < 0)
4900 kfree(event); 5056 free_event(event);
4901 5057
4902err_put_context: 5058err_put_context:
4903 if (err < 0) 5059 if (err < 0)
@@ -5176,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child)
5176 * 5332 *
5177 * But since its the parent context it won't be the same instance. 5333 * But since its the parent context it won't be the same instance.
5178 */ 5334 */
5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5335 mutex_lock(&child_ctx->mutex);
5180 5336
5181again: 5337again:
5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5338 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5540,7 @@ static void __init perf_event_init_all_cpus(void)
5384 5540
5385 for_each_possible_cpu(cpu) { 5541 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu); 5542 cpuctx = &per_cpu(perf_cpu_context, cpu);
5543 mutex_init(&cpuctx->hlist_mutex);
5387 __perf_event_init_context(&cpuctx->ctx, NULL); 5544 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 } 5545 }
5389} 5546}
@@ -5397,6 +5554,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5397 spin_lock(&perf_resource_lock); 5554 spin_lock(&perf_resource_lock);
5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5555 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5399 spin_unlock(&perf_resource_lock); 5556 spin_unlock(&perf_resource_lock);
5557
5558 mutex_lock(&cpuctx->hlist_mutex);
5559 if (cpuctx->hlist_refcount > 0) {
5560 struct swevent_hlist *hlist;
5561
5562 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5563 WARN_ON_ONCE(!hlist);
5564 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5565 }
5566 mutex_unlock(&cpuctx->hlist_mutex);
5400} 5567}
5401 5568
5402#ifdef CONFIG_HOTPLUG_CPU 5569#ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu)
5416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5583 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5417 struct perf_event_context *ctx = &cpuctx->ctx; 5584 struct perf_event_context *ctx = &cpuctx->ctx;
5418 5585
5586 mutex_lock(&cpuctx->hlist_mutex);
5587 swevent_hlist_release(cpuctx);
5588 mutex_unlock(&cpuctx->hlist_mutex);
5589
5419 mutex_lock(&ctx->mutex); 5590 mutex_lock(&ctx->mutex);
5420 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5591 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5421 mutex_unlock(&ctx->mutex); 5592 mutex_unlock(&ctx->mutex);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bc7704b3a443..00bb252f29a2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User don't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -1066,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig)
1066 struct thread_group_cputimer *cputimer = &sig->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1003 unsigned long flags;
1068 1004
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1006 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1079} 1008}
1080 1009
1081static u32 onecputick; 1010static u32 onecputick;
@@ -1112,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1112 } 1041 }
1113} 1042}
1114 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1115/* 1061/*
1116 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1117 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1129,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk,
1129 unsigned long soft; 1075 unsigned long soft;
1130 1076
1131 /* 1077 /*
1132 * Don't sample the current process CPU clocks if there are no timers.
1133 */
1134 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1135 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1136 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1140 stop_process_timers(sig);
1141 return;
1142 }
1143
1144 /*
1145 * Collect the current process totals. 1078 * Collect the current process totals.
1146 */ 1079 */
1147 thread_group_cputimer(tsk, &cputime); 1080 thread_group_cputimer(tsk, &cputime);
@@ -1230,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk,
1230 } 1163 }
1231 } 1164 }
1232 1165
1233 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1234 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1235 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1236 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1237 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1238 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1239 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1240 sig->cputime_expires.virt_exp = virt_expires;
1241 if (sched_expires != 0 &&
1242 (sig->cputime_expires.sched_exp == 0 ||
1243 sig->cputime_expires.sched_exp > sched_expires))
1244 sig->cputime_expires.sched_exp = sched_expires;
1245} 1171}
1246 1172
1247/* 1173/*
@@ -1270,6 +1196,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 goto out; 1196 goto out;
1271 } 1197 }
1272 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1273 } else { 1200 } else {
1274 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1275 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->signal == NULL)) {
@@ -1290,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1291 goto out_unlock; 1218 goto out_unlock;
1292 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1293 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1294 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1295 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1298,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1298 /* 1226 /*
1299 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1300 */ 1228 */
1301 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1302 1232
1303out_unlock: 1233out_unlock:
1304 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1310,23 +1240,6 @@ out:
1310} 1240}
1311 1241
1312/** 1242/**
1313 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1314 *
1315 * @cputime: The struct to compare.
1316 *
1317 * Checks @cputime to see if all fields are zero. Returns true if all fields
1318 * are zero, false if any field is nonzero.
1319 */
1320static inline int task_cputime_zero(const struct task_cputime *cputime)
1321{
1322 if (cputime_eq(cputime->utime, cputime_zero) &&
1323 cputime_eq(cputime->stime, cputime_zero) &&
1324 cputime->sum_exec_runtime == 0)
1325 return 1;
1326 return 0;
1327}
1328
1329/**
1330 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1331 * 1244 *
1332 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1382,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 } 1295 }
1383 1296
1384 sig = tsk->signal; 1297 sig = tsk->signal;
1385 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1386 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1387 1300
1388 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1390,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1390 return 1; 1303 return 1;
1391 } 1304 }
1392 1305
1393 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1394} 1307}
1395 1308
1396/* 1309/*
@@ -1419,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1419 * put them on the firing list. 1332 * put them on the firing list.
1420 */ 1333 */
1421 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1422 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1423 1341
1424 /* 1342 /*
1425 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1456,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1456} 1374}
1457 1375
1458/* 1376/*
1459 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1460 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1461 * The *newval argument is relative and we update it to be absolute, *oldval
1462 * is absolute and we update it to be relative.
1463 */ 1379 */
1464void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1465 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1466{ 1382{
1467 union cpu_time_count now; 1383 union cpu_time_count now;
1468 struct list_head *head;
1469 1384
1470 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1471 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1472 1387
1473 if (oldval) { 1388 if (oldval) {
1389 /*
1390 * We are setting itimer. The *oldval is absolute and we update
1391 * it to be relative, *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1474 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1475 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1476 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1483,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1483 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1484 return; 1404 return;
1485 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1486
1487 /*
1488 * If the RLIMIT_CPU timer will expire before the
1489 * ITIMER_PROF timer, we have nothing else to do.
1490 */
1491 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1492 < cputime_to_secs(*newval))
1493 return;
1494 } 1406 }
1495 1407
1496 /* 1408 /*
1497 * Check whether there are any process timers already set to fire 1409 * Update expiration cache if we are the earliest timer, or eventually
1498 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
1499 */ 1411 */
1500 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1501 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1502 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1503 struct cpu_timer_list, entry)->expires.cpu,
1504 *newval)) {
1505 switch (clock_idx) {
1506 case CPUCLOCK_PROF:
1507 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1508 break; 1416 break;
1509 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1510 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1511 break; 1420 break;
1512 }
1513 } 1421 }
1514} 1422}
1515 1423
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..524e058dcf06 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
20 * @off physical offset of page.
21 * @page: page we're reading or writing.
22 * @bio_chain: list of pending biod (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1606 * 1606 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1610 *
1616 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1614 * any more.
1620 */ 1615 */
1621 1616
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1618{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1620 return 0;
@@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1625 if (!buffer)
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 if (!handle->offset) { 1628 if (!handle->cur) {
1634 int error; 1629 int error;
1635 1630
1636 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1634 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1642 } 1637 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1646 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1647 } else {
1648 struct page *page;
1649 1642
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1655 */ 1648 */
1656 void *kaddr; 1649 void *kaddr;
1657 1650
1658 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1654 handle->buffer = buffer;
1662 } else { 1655 } else {
1663 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1664 }
1665 } 1657 }
1666 handle->prev = handle->cur;
1667 }
1668 handle->buf_offset = handle->cur_offset;
1669 if (handle->cur_offset + count >= PAGE_SIZE) {
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 } 1658 }
1676 handle->offset += count; 1659 handle->cur++;
1677 return count; 1660 return PAGE_SIZE;
1678} 1661}
1679 1662
1680/** 1663/**
@@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2135 * 2118 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2122 *
2145 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2126 * any more.
2149 */ 2127 */
2150 2128
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2130{
2153 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2154 int error = 0; 2132 int error = 0;
2155 2133
2156 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2136 return 0;
2159 2137
2160 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2161 if (!buffer) 2141 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2146 return -ENOMEM;
2167 2147
2168 handle->buffer = buffer; 2148 handle->buffer = buffer;
2169 } 2149 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2150 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2151 if (error)
2172 if (handle->prev == 0) { 2152 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2153
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2155 if (error)
2179 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2180 2162
2181 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2165 if (error)
2184 return error; 2166 return error;
2185 2167
2186 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2188 if (error) 2170 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2175 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2176 } else {
2215 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2216 } 2183 }
2217 handle->offset += count; 2184 handle->cur++;
2218 return count; 2185 return PAGE_SIZE;
2219} 2186}
2220 2187
2221/** 2188/**
@@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2197{
2231 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2202 free_highmem_data();
2236 } 2203 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
56 * a file-alike way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..6af9cdd558b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -666,10 +664,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 664 struct task_struct *child;
667 long ret; 665 long ret;
668 666
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 667 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 668 ret = ptrace_traceme();
675 if (!ret) 669 if (!ret)
@@ -703,7 +697,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 697 out_put_task_struct:
704 put_task_struct(child); 698 put_task_struct(child);
705 out: 699 out:
706 unlock_kernel();
707 return ret; 700 return ret;
708} 701}
709 702
@@ -813,10 +806,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 806 struct task_struct *child;
814 long ret; 807 long ret;
815 808
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 809 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 810 ret = ptrace_traceme();
822 goto out; 811 goto out;
@@ -846,7 +835,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 835 out_put_task_struct:
847 put_task_struct(child); 836 put_task_struct(child);
848 out: 837 out:
849 unlock_kernel();
850 return ret; 838 return ret;
851} 839}
852#endif /* CONFIG_COMPAT */ 840#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 03a7ea1579f6..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h> 47#include <linux/hardirq.h>
49 48
50#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
65#endif 64#endif
66 65
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 67
72int debug_lockdep_rcu_enabled(void) 68int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98 94
99/* 95/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed. 97 * grace period has elapsed.
117 */ 98 */
@@ -122,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
122 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
123 complete(&rcu->completion); 104 complete(&rcu->completion);
124} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 672 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 673 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 674 .stats = NULL,
673 .irq_capable = 1, 675 .irq_capable = 1,
674 .name = "sched_expedited" 676 .name = "sched_expedited"
675}; 677};
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/sched.c b/kernel/sched.c
index 6af210a7de70..1d93cd0ae4d3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
324static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
325{ 325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
326#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
327 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
328 p->se.parent = task_group(p)->se[cpu]; 337 p->se.parent = task_group(p)->se[cpu];
@@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
332 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
333 p->rt.parent = task_group(p)->rt_se[cpu]; 342 p->rt.parent = task_group(p)->rt_se[cpu];
334#endif 343#endif
344 rcu_read_unlock();
335} 345}
336 346
337#else 347#else
@@ -493,8 +503,11 @@ struct rq {
493 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
494 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
495#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
496 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
497#endif 508#endif
509 unsigned int skip_clock_update;
510
498 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
499 struct load_weight load; 512 struct load_weight load;
500 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -536,15 +549,13 @@ struct rq {
536 int post_schedule; 549 int post_schedule;
537 int active_balance; 550 int active_balance;
538 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
539 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
540 int cpu; 554 int cpu;
541 int online; 555 int online;
542 556
543 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
544 558
545 struct task_struct *migration_thread;
546 struct list_head migration_queue;
547
548 u64 rt_avg; 559 u64 rt_avg;
549 u64 age_stamp; 560 u64 age_stamp;
550 u64 idle_stamp; 561 u64 idle_stamp;
@@ -592,6 +603,13 @@ static inline
592void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
593{ 604{
594 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
595} 613}
596 614
597static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -626,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
626 644
627inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
628{ 646{
629 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
630} 649}
631 650
632/* 651/*
@@ -904,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
904#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
905 924
906/* 925/*
907 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
908 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
909 *
910 * We need to make an exception for PF_STARTING tasks because the fork
911 * path might require task_rq_lock() to work, eg. it can call
912 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
913 */ 928 */
914static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
915{ 930{
916 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
917} 932}
918 933
919/* 934/*
@@ -926,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
926 struct rq *rq; 941 struct rq *rq;
927 942
928 for (;;) { 943 for (;;) {
929 while (task_is_waking(p))
930 cpu_relax();
931 rq = task_rq(p); 944 rq = task_rq(p);
932 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
933 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
934 return rq; 947 return rq;
935 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
936 } 949 }
@@ -947,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
947 struct rq *rq; 960 struct rq *rq;
948 961
949 for (;;) { 962 for (;;) {
950 while (task_is_waking(p))
951 cpu_relax();
952 local_irq_save(*flags); 963 local_irq_save(*flags);
953 rq = task_rq(p); 964 rq = task_rq(p);
954 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
955 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
956 return rq; 967 return rq;
957 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
958 } 969 }
@@ -1229,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
1229 if (!tsk_is_polling(rq->idle)) 1240 if (!tsk_is_polling(rq->idle))
1230 smp_send_reschedule(cpu); 1241 smp_send_reschedule(cpu);
1231} 1242}
1243
1244int nohz_ratelimit(int cpu)
1245{
1246 struct rq *rq = cpu_rq(cpu);
1247 u64 diff = rq->clock - rq->nohz_stamp;
1248
1249 rq->nohz_stamp = rq->clock;
1250
1251 return diff < (NSEC_PER_SEC / HZ) >> 1;
1252}
1253
1232#endif /* CONFIG_NO_HZ */ 1254#endif /* CONFIG_NO_HZ */
1233 1255
1234static u64 sched_avg_period(void) 1256static u64 sched_avg_period(void)
@@ -1771,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1771 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1793 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1772 } 1794 }
1773 } 1795 }
1774 update_rq_clock(rq1);
1775 update_rq_clock(rq2);
1776} 1796}
1777 1797
1778/* 1798/*
@@ -1803,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1803} 1823}
1804#endif 1824#endif
1805 1825
1806static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_idle(struct rq *this_rq);
1807static void update_sysctl(void); 1827static void update_sysctl(void);
1808static int get_update_sysctl_factor(void); 1828static int get_update_sysctl_factor(void);
1809 1829
@@ -1860,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
1860 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1861} 1881}
1862 1882
1863static void update_avg(u64 *avg, u64 sample) 1883static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1864{
1865 s64 diff = sample - *avg;
1866 *avg += diff >> 3;
1867}
1868
1869static void
1870enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1871{ 1884{
1872 if (wakeup) 1885 update_rq_clock(rq);
1873 p->se.start_runtime = p->se.sum_exec_runtime;
1874
1875 sched_info_queued(p); 1886 sched_info_queued(p);
1876 p->sched_class->enqueue_task(rq, p, wakeup, head); 1887 p->sched_class->enqueue_task(rq, p, flags);
1877 p->se.on_rq = 1; 1888 p->se.on_rq = 1;
1878} 1889}
1879 1890
1880static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1891static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1881{ 1892{
1882 if (sleep) { 1893 update_rq_clock(rq);
1883 if (p->se.last_wakeup) {
1884 update_avg(&p->se.avg_overlap,
1885 p->se.sum_exec_runtime - p->se.last_wakeup);
1886 p->se.last_wakeup = 0;
1887 } else {
1888 update_avg(&p->se.avg_wakeup,
1889 sysctl_sched_wakeup_granularity);
1890 }
1891 }
1892
1893 sched_info_dequeued(p); 1894 sched_info_dequeued(p);
1894 p->sched_class->dequeue_task(rq, p, sleep); 1895 p->sched_class->dequeue_task(rq, p, flags);
1895 p->se.on_rq = 0; 1896 p->se.on_rq = 0;
1896} 1897}
1897 1898
1898/* 1899/*
1899 * activate_task - move a task to the runqueue. 1900 * activate_task - move a task to the runqueue.
1900 */ 1901 */
1901static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1902static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1902{ 1903{
1903 if (task_contributes_to_load(p)) 1904 if (task_contributes_to_load(p))
1904 rq->nr_uninterruptible--; 1905 rq->nr_uninterruptible--;
1905 1906
1906 enqueue_task(rq, p, wakeup, false); 1907 enqueue_task(rq, p, flags);
1907 inc_nr_running(rq); 1908 inc_nr_running(rq);
1908} 1909}
1909 1910
1910/* 1911/*
1911 * deactivate_task - remove a task from the runqueue. 1912 * deactivate_task - remove a task from the runqueue.
1912 */ 1913 */
1913static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1914static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1914{ 1915{
1915 if (task_contributes_to_load(p)) 1916 if (task_contributes_to_load(p))
1916 rq->nr_uninterruptible++; 1917 rq->nr_uninterruptible++;
1917 1918
1918 dequeue_task(rq, p, sleep); 1919 dequeue_task(rq, p, flags);
1919 dec_nr_running(rq); 1920 dec_nr_running(rq);
1920} 1921}
1921 1922
@@ -2044,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2044 __set_task_cpu(p, new_cpu); 2045 __set_task_cpu(p, new_cpu);
2045} 2046}
2046 2047
2047struct migration_req { 2048struct migration_arg {
2048 struct list_head list;
2049
2050 struct task_struct *task; 2049 struct task_struct *task;
2051 int dest_cpu; 2050 int dest_cpu;
2052
2053 struct completion done;
2054}; 2051};
2055 2052
2053static int migration_cpu_stop(void *data);
2054
2056/* 2055/*
2057 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2058 * Returns true if you have to wait for migration thread. 2057 * Returns true if you have to wait for migration thread.
2059 */ 2058 */
2060static int 2059static bool migrate_task(struct task_struct *p, int dest_cpu)
2061migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2062{ 2060{
2063 struct rq *rq = task_rq(p); 2061 struct rq *rq = task_rq(p);
2064 2062
@@ -2066,58 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2066 * If the task is not on a runqueue (and not running), then 2064 * If the task is not on a runqueue (and not running), then
2067 * the next wake-up will properly place the task. 2065 * the next wake-up will properly place the task.
2068 */ 2066 */
2069 if (!p->se.on_rq && !task_running(rq, p)) 2067 return p->se.on_rq || task_running(rq, p);
2070 return 0;
2071
2072 init_completion(&req->done);
2073 req->task = p;
2074 req->dest_cpu = dest_cpu;
2075 list_add(&req->list, &rq->migration_queue);
2076
2077 return 1;
2078}
2079
2080/*
2081 * wait_task_context_switch - wait for a thread to complete at least one
2082 * context switch.
2083 *
2084 * @p must not be current.
2085 */
2086void wait_task_context_switch(struct task_struct *p)
2087{
2088 unsigned long nvcsw, nivcsw, flags;
2089 int running;
2090 struct rq *rq;
2091
2092 nvcsw = p->nvcsw;
2093 nivcsw = p->nivcsw;
2094 for (;;) {
2095 /*
2096 * The runqueue is assigned before the actual context
2097 * switch. We need to take the runqueue lock.
2098 *
2099 * We could check initially without the lock but it is
2100 * very likely that we need to take the lock in every
2101 * iteration.
2102 */
2103 rq = task_rq_lock(p, &flags);
2104 running = task_running(rq, p);
2105 task_rq_unlock(rq, &flags);
2106
2107 if (likely(!running))
2108 break;
2109 /*
2110 * The switch count is incremented before the actual
2111 * context switch. We thus wait for two switches to be
2112 * sure at least one completed.
2113 */
2114 if ((p->nvcsw - nvcsw) > 1)
2115 break;
2116 if ((p->nivcsw - nivcsw) > 1)
2117 break;
2118
2119 cpu_relax();
2120 }
2121} 2068}
2122 2069
2123/* 2070/*
@@ -2175,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2175 * just go back and repeat. 2122 * just go back and repeat.
2176 */ 2123 */
2177 rq = task_rq_lock(p, &flags); 2124 rq = task_rq_lock(p, &flags);
2178 trace_sched_wait_task(rq, p); 2125 trace_sched_wait_task(p);
2179 running = task_running(rq, p); 2126 running = task_running(rq, p);
2180 on_rq = p->se.on_rq; 2127 on_rq = p->se.on_rq;
2181 ncsw = 0; 2128 ncsw = 0;
@@ -2273,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
2273} 2220}
2274 2221
2275#ifdef CONFIG_SMP 2222#ifdef CONFIG_SMP
2223/*
2224 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2225 */
2276static int select_fallback_rq(int cpu, struct task_struct *p) 2226static int select_fallback_rq(int cpu, struct task_struct *p)
2277{ 2227{
2278 int dest_cpu; 2228 int dest_cpu;
@@ -2289,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2289 return dest_cpu; 2239 return dest_cpu;
2290 2240
2291 /* No more Mr. Nice Guy. */ 2241 /* No more Mr. Nice Guy. */
2292 if (dest_cpu >= nr_cpu_ids) { 2242 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2293 rcu_read_lock(); 2243 dest_cpu = cpuset_cpus_allowed_fallback(p);
2294 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2295 rcu_read_unlock();
2296 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2297
2298 /* 2244 /*
2299 * Don't tell them about moving exiting tasks or 2245 * Don't tell them about moving exiting tasks or
2300 * kernel threads (both mm NULL), since they never 2246 * kernel threads (both mm NULL), since they never
@@ -2311,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2311} 2257}
2312 2258
2313/* 2259/*
2314 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2260 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2315 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2316 * by:
2317 *
2318 * exec: is unstable, retry loop
2319 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2320 */ 2261 */
2321static inline 2262static inline
2322int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2263int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2323{ 2264{
2324 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2265 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2325 2266
2326 /* 2267 /*
2327 * In order not to call set_task_cpu() on a blocking task we need 2268 * In order not to call set_task_cpu() on a blocking task we need
@@ -2339,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2339 2280
2340 return cpu; 2281 return cpu;
2341} 2282}
2283
2284static void update_avg(u64 *avg, u64 sample)
2285{
2286 s64 diff = sample - *avg;
2287 *avg += diff >> 3;
2288}
2342#endif 2289#endif
2343 2290
2344/*** 2291/***
@@ -2360,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2360{ 2307{
2361 int cpu, orig_cpu, this_cpu, success = 0; 2308 int cpu, orig_cpu, this_cpu, success = 0;
2362 unsigned long flags; 2309 unsigned long flags;
2310 unsigned long en_flags = ENQUEUE_WAKEUP;
2363 struct rq *rq; 2311 struct rq *rq;
2364 2312
2365 if (!sched_feat(SYNC_WAKEUPS))
2366 wake_flags &= ~WF_SYNC;
2367
2368 this_cpu = get_cpu(); 2313 this_cpu = get_cpu();
2369 2314
2370 smp_wmb(); 2315 smp_wmb();
2371 rq = task_rq_lock(p, &flags); 2316 rq = task_rq_lock(p, &flags);
2372 update_rq_clock(rq);
2373 if (!(p->state & state)) 2317 if (!(p->state & state))
2374 goto out; 2318 goto out;
2375 2319
@@ -2389,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2389 * 2333 *
2390 * First fix up the nr_uninterruptible count: 2334 * First fix up the nr_uninterruptible count:
2391 */ 2335 */
2392 if (task_contributes_to_load(p)) 2336 if (task_contributes_to_load(p)) {
2393 rq->nr_uninterruptible--; 2337 if (likely(cpu_online(orig_cpu)))
2338 rq->nr_uninterruptible--;
2339 else
2340 this_rq()->nr_uninterruptible--;
2341 }
2394 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2395 2343
2396 if (p->sched_class->task_waking) 2344 if (p->sched_class->task_waking) {
2397 p->sched_class->task_waking(rq, p); 2345 p->sched_class->task_waking(rq, p);
2346 en_flags |= ENQUEUE_WAKING;
2347 }
2398 2348
2399 __task_rq_unlock(rq); 2349 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2400 2350 if (cpu != orig_cpu)
2401 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2402 if (cpu != orig_cpu) {
2403 /*
2404 * Since we migrate the task without holding any rq->lock,
2405 * we need to be careful with task_rq_lock(), since that
2406 * might end up locking an invalid rq.
2407 */
2408 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2409 } 2352 __task_rq_unlock(rq);
2410 2353
2411 rq = cpu_rq(cpu); 2354 rq = cpu_rq(cpu);
2412 raw_spin_lock(&rq->lock); 2355 raw_spin_lock(&rq->lock);
2413 update_rq_clock(rq);
2414 2356
2415 /* 2357 /*
2416 * We migrated the task without holding either rq->lock, however 2358 * We migrated the task without holding either rq->lock, however
@@ -2438,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2438 2380
2439out_activate: 2381out_activate:
2440#endif /* CONFIG_SMP */ 2382#endif /* CONFIG_SMP */
2441 schedstat_inc(p, se.nr_wakeups); 2383 schedstat_inc(p, se.statistics.nr_wakeups);
2442 if (wake_flags & WF_SYNC) 2384 if (wake_flags & WF_SYNC)
2443 schedstat_inc(p, se.nr_wakeups_sync); 2385 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2444 if (orig_cpu != cpu) 2386 if (orig_cpu != cpu)
2445 schedstat_inc(p, se.nr_wakeups_migrate); 2387 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2446 if (cpu == this_cpu) 2388 if (cpu == this_cpu)
2447 schedstat_inc(p, se.nr_wakeups_local); 2389 schedstat_inc(p, se.statistics.nr_wakeups_local);
2448 else 2390 else
2449 schedstat_inc(p, se.nr_wakeups_remote); 2391 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2450 activate_task(rq, p, 1); 2392 activate_task(rq, p, en_flags);
2451 success = 1; 2393 success = 1;
2452 2394
2453 /*
2454 * Only attribute actual wakeups done by this task.
2455 */
2456 if (!in_interrupt()) {
2457 struct sched_entity *se = &current->se;
2458 u64 sample = se->sum_exec_runtime;
2459
2460 if (se->last_wakeup)
2461 sample -= se->last_wakeup;
2462 else
2463 sample -= se->start_runtime;
2464 update_avg(&se->avg_wakeup, sample);
2465
2466 se->last_wakeup = se->sum_exec_runtime;
2467 }
2468
2469out_running: 2395out_running:
2470 trace_sched_wakeup(rq, p, success); 2396 trace_sched_wakeup(p, success);
2471 check_preempt_curr(rq, p, wake_flags); 2397 check_preempt_curr(rq, p, wake_flags);
2472 2398
2473 p->state = TASK_RUNNING; 2399 p->state = TASK_RUNNING;
@@ -2527,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
2527 p->se.sum_exec_runtime = 0; 2453 p->se.sum_exec_runtime = 0;
2528 p->se.prev_sum_exec_runtime = 0; 2454 p->se.prev_sum_exec_runtime = 0;
2529 p->se.nr_migrations = 0; 2455 p->se.nr_migrations = 0;
2530 p->se.last_wakeup = 0;
2531 p->se.avg_overlap = 0;
2532 p->se.start_runtime = 0;
2533 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2534 2456
2535#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2536 p->se.wait_start = 0; 2458 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2537 p->se.wait_max = 0;
2538 p->se.wait_count = 0;
2539 p->se.wait_sum = 0;
2540
2541 p->se.sleep_start = 0;
2542 p->se.sleep_max = 0;
2543 p->se.sum_sleep_runtime = 0;
2544
2545 p->se.block_start = 0;
2546 p->se.block_max = 0;
2547 p->se.exec_max = 0;
2548 p->se.slice_max = 0;
2549
2550 p->se.nr_migrations_cold = 0;
2551 p->se.nr_failed_migrations_affine = 0;
2552 p->se.nr_failed_migrations_running = 0;
2553 p->se.nr_failed_migrations_hot = 0;
2554 p->se.nr_forced_migrations = 0;
2555
2556 p->se.nr_wakeups = 0;
2557 p->se.nr_wakeups_sync = 0;
2558 p->se.nr_wakeups_migrate = 0;
2559 p->se.nr_wakeups_local = 0;
2560 p->se.nr_wakeups_remote = 0;
2561 p->se.nr_wakeups_affine = 0;
2562 p->se.nr_wakeups_affine_attempts = 0;
2563 p->se.nr_wakeups_passive = 0;
2564 p->se.nr_wakeups_idle = 0;
2565
2566#endif 2459#endif
2567 2460
2568 INIT_LIST_HEAD(&p->rt.run_list); 2461 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2583,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2583 2476
2584 __sched_fork(p); 2477 __sched_fork(p);
2585 /* 2478 /*
2586 * We mark the process as waking here. This guarantees that 2479 * We mark the process as running here. This guarantees that
2587 * nobody will actually run it, and a signal or other external 2480 * nobody will actually run it, and a signal or other external
2588 * event cannot wake it up and insert it on the runqueue either. 2481 * event cannot wake it up and insert it on the runqueue either.
2589 */ 2482 */
2590 p->state = TASK_WAKING; 2483 p->state = TASK_RUNNING;
2591 2484
2592 /* 2485 /*
2593 * Revert to default priority/policy on fork if requested. 2486 * Revert to default priority/policy on fork if requested.
@@ -2654,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2654 int cpu __maybe_unused = get_cpu(); 2547 int cpu __maybe_unused = get_cpu();
2655 2548
2656#ifdef CONFIG_SMP 2549#ifdef CONFIG_SMP
2550 rq = task_rq_lock(p, &flags);
2551 p->state = TASK_WAKING;
2552
2657 /* 2553 /*
2658 * Fork balancing, do it here and not earlier because: 2554 * Fork balancing, do it here and not earlier because:
2659 * - cpus_allowed can change in the fork path 2555 * - cpus_allowed can change in the fork path
2660 * - any previously selected cpu might disappear through hotplug 2556 * - any previously selected cpu might disappear through hotplug
2661 * 2557 *
2662 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2558 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2663 * ->cpus_allowed is stable, we have preemption disabled, meaning 2559 * without people poking at ->cpus_allowed.
2664 * cpu_online_mask is stable.
2665 */ 2560 */
2666 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2561 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2667 set_task_cpu(p, cpu); 2562 set_task_cpu(p, cpu);
2668#endif
2669 2563
2670 /*
2671 * Since the task is not on the rq and we still have TASK_WAKING set
2672 * nobody else will migrate this task.
2673 */
2674 rq = cpu_rq(cpu);
2675 raw_spin_lock_irqsave(&rq->lock, flags);
2676
2677 BUG_ON(p->state != TASK_WAKING);
2678 p->state = TASK_RUNNING; 2564 p->state = TASK_RUNNING;
2679 update_rq_clock(rq); 2565 task_rq_unlock(rq, &flags);
2566#endif
2567
2568 rq = task_rq_lock(p, &flags);
2680 activate_task(rq, p, 0); 2569 activate_task(rq, p, 0);
2681 trace_sched_wakeup_new(rq, p, 1); 2570 trace_sched_wakeup_new(p, 1);
2682 check_preempt_curr(rq, p, WF_FORK); 2571 check_preempt_curr(rq, p, WF_FORK);
2683#ifdef CONFIG_SMP 2572#ifdef CONFIG_SMP
2684 if (p->sched_class->task_woken) 2573 if (p->sched_class->task_woken)
@@ -2898,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2898 struct mm_struct *mm, *oldmm; 2787 struct mm_struct *mm, *oldmm;
2899 2788
2900 prepare_task_switch(rq, prev, next); 2789 prepare_task_switch(rq, prev, next);
2901 trace_sched_switch(rq, prev, next); 2790 trace_sched_switch(prev, next);
2902 mm = next->mm; 2791 mm = next->mm;
2903 oldmm = prev->active_mm; 2792 oldmm = prev->active_mm;
2904 /* 2793 /*
@@ -3015,6 +2904,61 @@ static unsigned long calc_load_update;
3015unsigned long avenrun[3]; 2904unsigned long avenrun[3];
3016EXPORT_SYMBOL(avenrun); 2905EXPORT_SYMBOL(avenrun);
3017 2906
2907static long calc_load_fold_active(struct rq *this_rq)
2908{
2909 long nr_active, delta = 0;
2910
2911 nr_active = this_rq->nr_running;
2912 nr_active += (long) this_rq->nr_uninterruptible;
2913
2914 if (nr_active != this_rq->calc_load_active) {
2915 delta = nr_active - this_rq->calc_load_active;
2916 this_rq->calc_load_active = nr_active;
2917 }
2918
2919 return delta;
2920}
2921
2922#ifdef CONFIG_NO_HZ
2923/*
2924 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2925 *
2926 * When making the ILB scale, we should try to pull this in as well.
2927 */
2928static atomic_long_t calc_load_tasks_idle;
2929
2930static void calc_load_account_idle(struct rq *this_rq)
2931{
2932 long delta;
2933
2934 delta = calc_load_fold_active(this_rq);
2935 if (delta)
2936 atomic_long_add(delta, &calc_load_tasks_idle);
2937}
2938
2939static long calc_load_fold_idle(void)
2940{
2941 long delta = 0;
2942
2943 /*
2944 * Its got a race, we don't care...
2945 */
2946 if (atomic_long_read(&calc_load_tasks_idle))
2947 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2948
2949 return delta;
2950}
2951#else
2952static void calc_load_account_idle(struct rq *this_rq)
2953{
2954}
2955
2956static inline long calc_load_fold_idle(void)
2957{
2958 return 0;
2959}
2960#endif
2961
3018/** 2962/**
3019 * get_avenrun - get the load average array 2963 * get_avenrun - get the load average array
3020 * @loads: pointer to dest load array 2964 * @loads: pointer to dest load array
@@ -3061,20 +3005,22 @@ void calc_global_load(void)
3061} 3005}
3062 3006
3063/* 3007/*
3064 * Either called from update_cpu_load() or from a cpu going idle 3008 * Called from update_cpu_load() to periodically update this CPU's
3009 * active count.
3065 */ 3010 */
3066static void calc_load_account_active(struct rq *this_rq) 3011static void calc_load_account_active(struct rq *this_rq)
3067{ 3012{
3068 long nr_active, delta; 3013 long delta;
3069 3014
3070 nr_active = this_rq->nr_running; 3015 if (time_before(jiffies, this_rq->calc_load_update))
3071 nr_active += (long) this_rq->nr_uninterruptible; 3016 return;
3072 3017
3073 if (nr_active != this_rq->calc_load_active) { 3018 delta = calc_load_fold_active(this_rq);
3074 delta = nr_active - this_rq->calc_load_active; 3019 delta += calc_load_fold_idle();
3075 this_rq->calc_load_active = nr_active; 3020 if (delta)
3076 atomic_long_add(delta, &calc_load_tasks); 3021 atomic_long_add(delta, &calc_load_tasks);
3077 } 3022
3023 this_rq->calc_load_update += LOAD_FREQ;
3078} 3024}
3079 3025
3080/* 3026/*
@@ -3106,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
3106 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3052 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3107 } 3053 }
3108 3054
3109 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3055 calc_load_account_active(this_rq);
3110 this_rq->calc_load_update += LOAD_FREQ;
3111 calc_load_account_active(this_rq);
3112 }
3113} 3056}
3114 3057
3115#ifdef CONFIG_SMP 3058#ifdef CONFIG_SMP
@@ -3121,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
3121void sched_exec(void) 3064void sched_exec(void)
3122{ 3065{
3123 struct task_struct *p = current; 3066 struct task_struct *p = current;
3124 struct migration_req req;
3125 int dest_cpu, this_cpu;
3126 unsigned long flags; 3067 unsigned long flags;
3127 struct rq *rq; 3068 struct rq *rq;
3128 3069 int dest_cpu;
3129again:
3130 this_cpu = get_cpu();
3131 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3132 if (dest_cpu == this_cpu) {
3133 put_cpu();
3134 return;
3135 }
3136 3070
3137 rq = task_rq_lock(p, &flags); 3071 rq = task_rq_lock(p, &flags);
3138 put_cpu(); 3072 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3073 if (dest_cpu == smp_processor_id())
3074 goto unlock;
3139 3075
3140 /* 3076 /*
3141 * select_task_rq() can race against ->cpus_allowed 3077 * select_task_rq() can race against ->cpus_allowed
3142 */ 3078 */
3143 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3079 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3144 || unlikely(!cpu_active(dest_cpu))) { 3080 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3145 task_rq_unlock(rq, &flags); 3081 struct migration_arg arg = { p, dest_cpu };
3146 goto again;
3147 }
3148
3149 /* force the process onto the specified CPU */
3150 if (migrate_task(p, dest_cpu, &req)) {
3151 /* Need to wait for migration thread (might exit: take ref). */
3152 struct task_struct *mt = rq->migration_thread;
3153 3082
3154 get_task_struct(mt);
3155 task_rq_unlock(rq, &flags); 3083 task_rq_unlock(rq, &flags);
3156 wake_up_process(mt); 3084 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3157 put_task_struct(mt);
3158 wait_for_completion(&req.done);
3159
3160 return; 3085 return;
3161 } 3086 }
3087unlock:
3162 task_rq_unlock(rq, &flags); 3088 task_rq_unlock(rq, &flags);
3163} 3089}
3164 3090
@@ -3630,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
3630 3556
3631static void put_prev_task(struct rq *rq, struct task_struct *prev) 3557static void put_prev_task(struct rq *rq, struct task_struct *prev)
3632{ 3558{
3633 if (prev->state == TASK_RUNNING) { 3559 if (prev->se.on_rq)
3634 u64 runtime = prev->se.sum_exec_runtime; 3560 update_rq_clock(rq);
3635 3561 rq->skip_clock_update = 0;
3636 runtime -= prev->se.prev_sum_exec_runtime;
3637 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3638
3639 /*
3640 * In order to avoid avg_overlap growing stale when we are
3641 * indeed overlapping and hence not getting put to sleep, grow
3642 * the avg_overlap on preemption.
3643 *
3644 * We use the average preemption runtime because that
3645 * correlates to the amount of cache footprint a task can
3646 * build up.
3647 */
3648 update_avg(&prev->se.avg_overlap, runtime);
3649 }
3650 prev->sched_class->put_prev_task(rq, prev); 3562 prev->sched_class->put_prev_task(rq, prev);
3651} 3563}
3652 3564
@@ -3696,7 +3608,7 @@ need_resched:
3696 preempt_disable(); 3608 preempt_disable();
3697 cpu = smp_processor_id(); 3609 cpu = smp_processor_id();
3698 rq = cpu_rq(cpu); 3610 rq = cpu_rq(cpu);
3699 rcu_sched_qs(cpu); 3611 rcu_note_context_switch(cpu);
3700 prev = rq->curr; 3612 prev = rq->curr;
3701 switch_count = &prev->nivcsw; 3613 switch_count = &prev->nivcsw;
3702 3614
@@ -3709,14 +3621,13 @@ need_resched_nonpreemptible:
3709 hrtick_clear(rq); 3621 hrtick_clear(rq);
3710 3622
3711 raw_spin_lock_irq(&rq->lock); 3623 raw_spin_lock_irq(&rq->lock);
3712 update_rq_clock(rq);
3713 clear_tsk_need_resched(prev); 3624 clear_tsk_need_resched(prev);
3714 3625
3715 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3626 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3716 if (unlikely(signal_pending_state(prev->state, prev))) 3627 if (unlikely(signal_pending_state(prev->state, prev)))
3717 prev->state = TASK_RUNNING; 3628 prev->state = TASK_RUNNING;
3718 else 3629 else
3719 deactivate_task(rq, prev, 1); 3630 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3720 switch_count = &prev->nvcsw; 3631 switch_count = &prev->nvcsw;
3721 } 3632 }
3722 3633
@@ -3780,7 +3691,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3780 * the mutex owner just released it and exited. 3691 * the mutex owner just released it and exited.
3781 */ 3692 */
3782 if (probe_kernel_address(&owner->cpu, cpu)) 3693 if (probe_kernel_address(&owner->cpu, cpu))
3783 goto out; 3694 return 0;
3784#else 3695#else
3785 cpu = owner->cpu; 3696 cpu = owner->cpu;
3786#endif 3697#endif
@@ -3790,14 +3701,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3790 * the cpu field may no longer be valid. 3701 * the cpu field may no longer be valid.
3791 */ 3702 */
3792 if (cpu >= nr_cpumask_bits) 3703 if (cpu >= nr_cpumask_bits)
3793 goto out; 3704 return 0;
3794 3705
3795 /* 3706 /*
3796 * We need to validate that we can do a 3707 * We need to validate that we can do a
3797 * get_cpu() and that we have the percpu area. 3708 * get_cpu() and that we have the percpu area.
3798 */ 3709 */
3799 if (!cpu_online(cpu)) 3710 if (!cpu_online(cpu))
3800 goto out; 3711 return 0;
3801 3712
3802 rq = cpu_rq(cpu); 3713 rq = cpu_rq(cpu);
3803 3714
@@ -3816,7 +3727,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3816 3727
3817 cpu_relax(); 3728 cpu_relax();
3818 } 3729 }
3819out: 3730
3820 return 1; 3731 return 1;
3821} 3732}
3822#endif 3733#endif
@@ -4039,8 +3950,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4039 if (!x->done) { 3950 if (!x->done) {
4040 DECLARE_WAITQUEUE(wait, current); 3951 DECLARE_WAITQUEUE(wait, current);
4041 3952
4042 wait.flags |= WQ_FLAG_EXCLUSIVE; 3953 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4043 __add_wait_queue_tail(&x->wait, &wait);
4044 do { 3954 do {
4045 if (signal_pending_state(state, current)) { 3955 if (signal_pending_state(state, current)) {
4046 timeout = -ERESTARTSYS; 3956 timeout = -ERESTARTSYS;
@@ -4266,7 +4176,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4266 BUG_ON(prio < 0 || prio > MAX_PRIO); 4176 BUG_ON(prio < 0 || prio > MAX_PRIO);
4267 4177
4268 rq = task_rq_lock(p, &flags); 4178 rq = task_rq_lock(p, &flags);
4269 update_rq_clock(rq);
4270 4179
4271 oldprio = p->prio; 4180 oldprio = p->prio;
4272 prev_class = p->sched_class; 4181 prev_class = p->sched_class;
@@ -4287,7 +4196,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4287 if (running) 4196 if (running)
4288 p->sched_class->set_curr_task(rq); 4197 p->sched_class->set_curr_task(rq);
4289 if (on_rq) { 4198 if (on_rq) {
4290 enqueue_task(rq, p, 0, oldprio < prio); 4199 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4291 4200
4292 check_class_changed(rq, p, prev_class, oldprio, running); 4201 check_class_changed(rq, p, prev_class, oldprio, running);
4293 } 4202 }
@@ -4309,7 +4218,6 @@ void set_user_nice(struct task_struct *p, long nice)
4309 * the task might be in the middle of scheduling on another CPU. 4218 * the task might be in the middle of scheduling on another CPU.
4310 */ 4219 */
4311 rq = task_rq_lock(p, &flags); 4220 rq = task_rq_lock(p, &flags);
4312 update_rq_clock(rq);
4313 /* 4221 /*
4314 * The RT priorities are set via sched_setscheduler(), but we still 4222 * The RT priorities are set via sched_setscheduler(), but we still
4315 * allow the 'normal' nice value to be set - but as expected 4223 * allow the 'normal' nice value to be set - but as expected
@@ -4331,7 +4239,7 @@ void set_user_nice(struct task_struct *p, long nice)
4331 delta = p->prio - old_prio; 4239 delta = p->prio - old_prio;
4332 4240
4333 if (on_rq) { 4241 if (on_rq) {
4334 enqueue_task(rq, p, 0, false); 4242 enqueue_task(rq, p, 0);
4335 /* 4243 /*
4336 * If the task increased its priority or is running and 4244 * If the task increased its priority or is running and
4337 * lowered its priority, then reschedule its CPU: 4245 * lowered its priority, then reschedule its CPU:
@@ -4592,7 +4500,6 @@ recheck:
4592 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4500 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4593 goto recheck; 4501 goto recheck;
4594 } 4502 }
4595 update_rq_clock(rq);
4596 on_rq = p->se.on_rq; 4503 on_rq = p->se.on_rq;
4597 running = task_current(rq, p); 4504 running = task_current(rq, p);
4598 if (on_rq) 4505 if (on_rq)
@@ -5329,17 +5236,15 @@ static inline void sched_init_granularity(void)
5329/* 5236/*
5330 * This is how migration works: 5237 * This is how migration works:
5331 * 5238 *
5332 * 1) we queue a struct migration_req structure in the source CPU's 5239 * 1) we invoke migration_cpu_stop() on the target CPU using
5333 * runqueue and wake up that CPU's migration thread. 5240 * stop_one_cpu().
5334 * 2) we down() the locked semaphore => thread blocks. 5241 * 2) stopper starts to run (implicitly forcing the migrated thread
5335 * 3) migration thread wakes up (implicitly it forces the migrated 5242 * off the CPU)
5336 * thread off the CPU) 5243 * 3) it checks whether the migrated task is still in the wrong runqueue.
5337 * 4) it gets the migration request and checks whether the migrated 5244 * 4) if it's in the wrong runqueue then the migration thread removes
5338 * task is still in the wrong runqueue.
5339 * 5) if it's in the wrong runqueue then the migration thread removes
5340 * it and puts it into the right queue. 5245 * it and puts it into the right queue.
5341 * 6) migration thread up()s the semaphore. 5246 * 5) stopper completes and stop_one_cpu() returns and the migration
5342 * 7) we wake up and the migration is done. 5247 * is done.
5343 */ 5248 */
5344 5249
5345/* 5250/*
@@ -5353,12 +5258,23 @@ static inline void sched_init_granularity(void)
5353 */ 5258 */
5354int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5259int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5355{ 5260{
5356 struct migration_req req;
5357 unsigned long flags; 5261 unsigned long flags;
5358 struct rq *rq; 5262 struct rq *rq;
5263 unsigned int dest_cpu;
5359 int ret = 0; 5264 int ret = 0;
5360 5265
5266 /*
5267 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5268 * drop the rq->lock and still rely on ->cpus_allowed.
5269 */
5270again:
5271 while (task_is_waking(p))
5272 cpu_relax();
5361 rq = task_rq_lock(p, &flags); 5273 rq = task_rq_lock(p, &flags);
5274 if (task_is_waking(p)) {
5275 task_rq_unlock(rq, &flags);
5276 goto again;
5277 }
5362 5278
5363 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5279 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5364 ret = -EINVAL; 5280 ret = -EINVAL;
@@ -5382,15 +5298,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5382 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5298 if (cpumask_test_cpu(task_cpu(p), new_mask))
5383 goto out; 5299 goto out;
5384 5300
5385 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5301 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5302 if (migrate_task(p, dest_cpu)) {
5303 struct migration_arg arg = { p, dest_cpu };
5386 /* Need help from migration thread: drop lock and wait. */ 5304 /* Need help from migration thread: drop lock and wait. */
5387 struct task_struct *mt = rq->migration_thread;
5388
5389 get_task_struct(mt);
5390 task_rq_unlock(rq, &flags); 5305 task_rq_unlock(rq, &flags);
5391 wake_up_process(mt); 5306 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5392 put_task_struct(mt);
5393 wait_for_completion(&req.done);
5394 tlb_migrate_finish(p->mm); 5307 tlb_migrate_finish(p->mm);
5395 return 0; 5308 return 0;
5396 } 5309 }
@@ -5448,98 +5361,49 @@ fail:
5448 return ret; 5361 return ret;
5449} 5362}
5450 5363
5451#define RCU_MIGRATION_IDLE 0
5452#define RCU_MIGRATION_NEED_QS 1
5453#define RCU_MIGRATION_GOT_QS 2
5454#define RCU_MIGRATION_MUST_SYNC 3
5455
5456/* 5364/*
5457 * migration_thread - this is a highprio system thread that performs 5365 * migration_cpu_stop - this will be executed by a highprio stopper thread
5458 * thread migration by bumping thread off CPU then 'pushing' onto 5366 * and performs thread migration by bumping thread off CPU then
5459 * another runqueue. 5367 * 'pushing' onto another runqueue.
5460 */ 5368 */
5461static int migration_thread(void *data) 5369static int migration_cpu_stop(void *data)
5462{
5463 int badcpu;
5464 int cpu = (long)data;
5465 struct rq *rq;
5466
5467 rq = cpu_rq(cpu);
5468 BUG_ON(rq->migration_thread != current);
5469
5470 set_current_state(TASK_INTERRUPTIBLE);
5471 while (!kthread_should_stop()) {
5472 struct migration_req *req;
5473 struct list_head *head;
5474
5475 raw_spin_lock_irq(&rq->lock);
5476
5477 if (cpu_is_offline(cpu)) {
5478 raw_spin_unlock_irq(&rq->lock);
5479 break;
5480 }
5481
5482 if (rq->active_balance) {
5483 active_load_balance(rq, cpu);
5484 rq->active_balance = 0;
5485 }
5486
5487 head = &rq->migration_queue;
5488
5489 if (list_empty(head)) {
5490 raw_spin_unlock_irq(&rq->lock);
5491 schedule();
5492 set_current_state(TASK_INTERRUPTIBLE);
5493 continue;
5494 }
5495 req = list_entry(head->next, struct migration_req, list);
5496 list_del_init(head->next);
5497
5498 if (req->task != NULL) {
5499 raw_spin_unlock(&rq->lock);
5500 __migrate_task(req->task, cpu, req->dest_cpu);
5501 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5502 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5503 raw_spin_unlock(&rq->lock);
5504 } else {
5505 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5506 raw_spin_unlock(&rq->lock);
5507 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5508 }
5509 local_irq_enable();
5510
5511 complete(&req->done);
5512 }
5513 __set_current_state(TASK_RUNNING);
5514
5515 return 0;
5516}
5517
5518#ifdef CONFIG_HOTPLUG_CPU
5519
5520static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5521{ 5370{
5522 int ret; 5371 struct migration_arg *arg = data;
5523 5372
5373 /*
5374 * The original target cpu might have gone down and we might
5375 * be on another cpu but it doesn't matter.
5376 */
5524 local_irq_disable(); 5377 local_irq_disable();
5525 ret = __migrate_task(p, src_cpu, dest_cpu); 5378 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5526 local_irq_enable(); 5379 local_irq_enable();
5527 return ret; 5380 return 0;
5528} 5381}
5529 5382
5383#ifdef CONFIG_HOTPLUG_CPU
5530/* 5384/*
5531 * Figure out where task on dead CPU should go, use force if necessary. 5385 * Figure out where task on dead CPU should go, use force if necessary.
5532 */ 5386 */
5533static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5387void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5534{ 5388{
5535 int dest_cpu; 5389 struct rq *rq = cpu_rq(dead_cpu);
5390 int needs_cpu, uninitialized_var(dest_cpu);
5391 unsigned long flags;
5536 5392
5537again: 5393 local_irq_save(flags);
5538 dest_cpu = select_fallback_rq(dead_cpu, p);
5539 5394
5540 /* It can have affinity changed while we were choosing. */ 5395 raw_spin_lock(&rq->lock);
5541 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5396 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5542 goto again; 5397 if (needs_cpu)
5398 dest_cpu = select_fallback_rq(dead_cpu, p);
5399 raw_spin_unlock(&rq->lock);
5400 /*
5401 * It can only fail if we race with set_cpus_allowed(),
5402 * in the racer should migrate the task anyway.
5403 */
5404 if (needs_cpu)
5405 __migrate_task(p, dead_cpu, dest_cpu);
5406 local_irq_restore(flags);
5543} 5407}
5544 5408
5545/* 5409/*
@@ -5603,7 +5467,6 @@ void sched_idle_next(void)
5603 5467
5604 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5468 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5605 5469
5606 update_rq_clock(rq);
5607 activate_task(rq, p, 0); 5470 activate_task(rq, p, 0);
5608 5471
5609 raw_spin_unlock_irqrestore(&rq->lock, flags); 5472 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5658,7 +5521,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5658 for ( ; ; ) { 5521 for ( ; ; ) {
5659 if (!rq->nr_running) 5522 if (!rq->nr_running)
5660 break; 5523 break;
5661 update_rq_clock(rq);
5662 next = pick_next_task(rq); 5524 next = pick_next_task(rq);
5663 if (!next) 5525 if (!next)
5664 break; 5526 break;
@@ -5881,35 +5743,20 @@ static void set_rq_offline(struct rq *rq)
5881static int __cpuinit 5743static int __cpuinit
5882migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5744migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5883{ 5745{
5884 struct task_struct *p;
5885 int cpu = (long)hcpu; 5746 int cpu = (long)hcpu;
5886 unsigned long flags; 5747 unsigned long flags;
5887 struct rq *rq; 5748 struct rq *rq = cpu_rq(cpu);
5888 5749
5889 switch (action) { 5750 switch (action) {
5890 5751
5891 case CPU_UP_PREPARE: 5752 case CPU_UP_PREPARE:
5892 case CPU_UP_PREPARE_FROZEN: 5753 case CPU_UP_PREPARE_FROZEN:
5893 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5894 if (IS_ERR(p))
5895 return NOTIFY_BAD;
5896 kthread_bind(p, cpu);
5897 /* Must be high prio: stop_machine expects to yield to it. */
5898 rq = task_rq_lock(p, &flags);
5899 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5900 task_rq_unlock(rq, &flags);
5901 get_task_struct(p);
5902 cpu_rq(cpu)->migration_thread = p;
5903 rq->calc_load_update = calc_load_update; 5754 rq->calc_load_update = calc_load_update;
5904 break; 5755 break;
5905 5756
5906 case CPU_ONLINE: 5757 case CPU_ONLINE:
5907 case CPU_ONLINE_FROZEN: 5758 case CPU_ONLINE_FROZEN:
5908 /* Strictly unnecessary, as first user will wake it. */
5909 wake_up_process(cpu_rq(cpu)->migration_thread);
5910
5911 /* Update our root-domain */ 5759 /* Update our root-domain */
5912 rq = cpu_rq(cpu);
5913 raw_spin_lock_irqsave(&rq->lock, flags); 5760 raw_spin_lock_irqsave(&rq->lock, flags);
5914 if (rq->rd) { 5761 if (rq->rd) {
5915 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5762 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5920,61 +5767,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5920 break; 5767 break;
5921 5768
5922#ifdef CONFIG_HOTPLUG_CPU 5769#ifdef CONFIG_HOTPLUG_CPU
5923 case CPU_UP_CANCELED:
5924 case CPU_UP_CANCELED_FROZEN:
5925 if (!cpu_rq(cpu)->migration_thread)
5926 break;
5927 /* Unbind it from offline cpu so it can run. Fall thru. */
5928 kthread_bind(cpu_rq(cpu)->migration_thread,
5929 cpumask_any(cpu_online_mask));
5930 kthread_stop(cpu_rq(cpu)->migration_thread);
5931 put_task_struct(cpu_rq(cpu)->migration_thread);
5932 cpu_rq(cpu)->migration_thread = NULL;
5933 break;
5934
5935 case CPU_DEAD: 5770 case CPU_DEAD:
5936 case CPU_DEAD_FROZEN: 5771 case CPU_DEAD_FROZEN:
5937 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5938 migrate_live_tasks(cpu); 5772 migrate_live_tasks(cpu);
5939 rq = cpu_rq(cpu);
5940 kthread_stop(rq->migration_thread);
5941 put_task_struct(rq->migration_thread);
5942 rq->migration_thread = NULL;
5943 /* Idle task back to normal (off runqueue, low prio) */ 5773 /* Idle task back to normal (off runqueue, low prio) */
5944 raw_spin_lock_irq(&rq->lock); 5774 raw_spin_lock_irq(&rq->lock);
5945 update_rq_clock(rq);
5946 deactivate_task(rq, rq->idle, 0); 5775 deactivate_task(rq, rq->idle, 0);
5947 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5776 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5948 rq->idle->sched_class = &idle_sched_class; 5777 rq->idle->sched_class = &idle_sched_class;
5949 migrate_dead_tasks(cpu); 5778 migrate_dead_tasks(cpu);
5950 raw_spin_unlock_irq(&rq->lock); 5779 raw_spin_unlock_irq(&rq->lock);
5951 cpuset_unlock();
5952 migrate_nr_uninterruptible(rq); 5780 migrate_nr_uninterruptible(rq);
5953 BUG_ON(rq->nr_running != 0); 5781 BUG_ON(rq->nr_running != 0);
5954 calc_global_load_remove(rq); 5782 calc_global_load_remove(rq);
5955 /*
5956 * No need to migrate the tasks: it was best-effort if
5957 * they didn't take sched_hotcpu_mutex. Just wake up
5958 * the requestors.
5959 */
5960 raw_spin_lock_irq(&rq->lock);
5961 while (!list_empty(&rq->migration_queue)) {
5962 struct migration_req *req;
5963
5964 req = list_entry(rq->migration_queue.next,
5965 struct migration_req, list);
5966 list_del_init(&req->list);
5967 raw_spin_unlock_irq(&rq->lock);
5968 complete(&req->done);
5969 raw_spin_lock_irq(&rq->lock);
5970 }
5971 raw_spin_unlock_irq(&rq->lock);
5972 break; 5783 break;
5973 5784
5974 case CPU_DYING: 5785 case CPU_DYING:
5975 case CPU_DYING_FROZEN: 5786 case CPU_DYING_FROZEN:
5976 /* Update our root-domain */ 5787 /* Update our root-domain */
5977 rq = cpu_rq(cpu);
5978 raw_spin_lock_irqsave(&rq->lock, flags); 5788 raw_spin_lock_irqsave(&rq->lock, flags);
5979 if (rq->rd) { 5789 if (rq->rd) {
5980 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5790 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6305,6 +6115,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6305 struct rq *rq = cpu_rq(cpu); 6115 struct rq *rq = cpu_rq(cpu);
6306 struct sched_domain *tmp; 6116 struct sched_domain *tmp;
6307 6117
6118 for (tmp = sd; tmp; tmp = tmp->parent)
6119 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6120
6308 /* Remove the sched domains which do not contribute to scheduling. */ 6121 /* Remove the sched domains which do not contribute to scheduling. */
6309 for (tmp = sd; tmp; ) { 6122 for (tmp = sd; tmp; ) {
6310 struct sched_domain *parent = tmp->parent; 6123 struct sched_domain *parent = tmp->parent;
@@ -7788,10 +7601,8 @@ void __init sched_init(void)
7788 rq->push_cpu = 0; 7601 rq->push_cpu = 0;
7789 rq->cpu = i; 7602 rq->cpu = i;
7790 rq->online = 0; 7603 rq->online = 0;
7791 rq->migration_thread = NULL;
7792 rq->idle_stamp = 0; 7604 rq->idle_stamp = 0;
7793 rq->avg_idle = 2*sysctl_sched_migration_cost; 7605 rq->avg_idle = 2*sysctl_sched_migration_cost;
7794 INIT_LIST_HEAD(&rq->migration_queue);
7795 rq_attach_root(rq, &def_root_domain); 7606 rq_attach_root(rq, &def_root_domain);
7796#endif 7607#endif
7797 init_rq_hrtick(rq); 7608 init_rq_hrtick(rq);
@@ -7892,7 +7703,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7892{ 7703{
7893 int on_rq; 7704 int on_rq;
7894 7705
7895 update_rq_clock(rq);
7896 on_rq = p->se.on_rq; 7706 on_rq = p->se.on_rq;
7897 if (on_rq) 7707 if (on_rq)
7898 deactivate_task(rq, p, 0); 7708 deactivate_task(rq, p, 0);
@@ -7919,9 +7729,9 @@ void normalize_rt_tasks(void)
7919 7729
7920 p->se.exec_start = 0; 7730 p->se.exec_start = 0;
7921#ifdef CONFIG_SCHEDSTATS 7731#ifdef CONFIG_SCHEDSTATS
7922 p->se.wait_start = 0; 7732 p->se.statistics.wait_start = 0;
7923 p->se.sleep_start = 0; 7733 p->se.statistics.sleep_start = 0;
7924 p->se.block_start = 0; 7734 p->se.statistics.block_start = 0;
7925#endif 7735#endif
7926 7736
7927 if (!rt_task(p)) { 7737 if (!rt_task(p)) {
@@ -8254,8 +8064,6 @@ void sched_move_task(struct task_struct *tsk)
8254 8064
8255 rq = task_rq_lock(tsk, &flags); 8065 rq = task_rq_lock(tsk, &flags);
8256 8066
8257 update_rq_clock(rq);
8258
8259 running = task_current(rq, tsk); 8067 running = task_current(rq, tsk);
8260 on_rq = tsk->se.on_rq; 8068 on_rq = tsk->se.on_rq;
8261 8069
@@ -8274,7 +8082,7 @@ void sched_move_task(struct task_struct *tsk)
8274 if (unlikely(running)) 8082 if (unlikely(running))
8275 tsk->sched_class->set_curr_task(rq); 8083 tsk->sched_class->set_curr_task(rq);
8276 if (on_rq) 8084 if (on_rq)
8277 enqueue_task(rq, tsk, 0, false); 8085 enqueue_task(rq, tsk, 0);
8278 8086
8279 task_rq_unlock(rq, &flags); 8087 task_rq_unlock(rq, &flags);
8280} 8088}
@@ -9088,43 +8896,32 @@ struct cgroup_subsys cpuacct_subsys = {
9088 8896
9089#ifndef CONFIG_SMP 8897#ifndef CONFIG_SMP
9090 8898
9091int rcu_expedited_torture_stats(char *page)
9092{
9093 return 0;
9094}
9095EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9096
9097void synchronize_sched_expedited(void) 8899void synchronize_sched_expedited(void)
9098{ 8900{
8901 barrier();
9099} 8902}
9100EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8903EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9101 8904
9102#else /* #ifndef CONFIG_SMP */ 8905#else /* #ifndef CONFIG_SMP */
9103 8906
9104static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8907static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9105static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9106 8908
9107#define RCU_EXPEDITED_STATE_POST -2 8909static int synchronize_sched_expedited_cpu_stop(void *data)
9108#define RCU_EXPEDITED_STATE_IDLE -1
9109
9110static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9111
9112int rcu_expedited_torture_stats(char *page)
9113{ 8910{
9114 int cnt = 0; 8911 /*
9115 int cpu; 8912 * There must be a full memory barrier on each affected CPU
9116 8913 * between the time that try_stop_cpus() is called and the
9117 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8914 * time that it returns.
9118 for_each_online_cpu(cpu) { 8915 *
9119 cnt += sprintf(&page[cnt], " %d:%d", 8916 * In the current initial implementation of cpu_stop, the
9120 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8917 * above condition is already met when the control reaches
9121 } 8918 * this point and the following smp_mb() is not strictly
9122 cnt += sprintf(&page[cnt], "\n"); 8919 * necessary. Do smp_mb() anyway for documentation and
9123 return cnt; 8920 * robustness against future implementation changes.
8921 */
8922 smp_mb(); /* See above comment block. */
8923 return 0;
9124} 8924}
9125EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9126
9127static long synchronize_sched_expedited_count;
9128 8925
9129/* 8926/*
9130 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8927 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9138,18 +8935,14 @@ static long synchronize_sched_expedited_count;
9138 */ 8935 */
9139void synchronize_sched_expedited(void) 8936void synchronize_sched_expedited(void)
9140{ 8937{
9141 int cpu; 8938 int snap, trycount = 0;
9142 unsigned long flags;
9143 bool need_full_sync = 0;
9144 struct rq *rq;
9145 struct migration_req *req;
9146 long snap;
9147 int trycount = 0;
9148 8939
9149 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8940 smp_mb(); /* ensure prior mod happens before capturing snap. */
9150 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8941 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9151 get_online_cpus(); 8942 get_online_cpus();
9152 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8943 while (try_stop_cpus(cpu_online_mask,
8944 synchronize_sched_expedited_cpu_stop,
8945 NULL) == -EAGAIN) {
9153 put_online_cpus(); 8946 put_online_cpus();
9154 if (trycount++ < 10) 8947 if (trycount++ < 10)
9155 udelay(trycount * num_online_cpus()); 8948 udelay(trycount * num_online_cpus());
@@ -9157,41 +8950,15 @@ void synchronize_sched_expedited(void)
9157 synchronize_sched(); 8950 synchronize_sched();
9158 return; 8951 return;
9159 } 8952 }
9160 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8953 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9161 smp_mb(); /* ensure test happens before caller kfree */ 8954 smp_mb(); /* ensure test happens before caller kfree */
9162 return; 8955 return;
9163 } 8956 }
9164 get_online_cpus(); 8957 get_online_cpus();
9165 } 8958 }
9166 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8959 atomic_inc(&synchronize_sched_expedited_count);
9167 for_each_online_cpu(cpu) { 8960 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9168 rq = cpu_rq(cpu);
9169 req = &per_cpu(rcu_migration_req, cpu);
9170 init_completion(&req->done);
9171 req->task = NULL;
9172 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9173 raw_spin_lock_irqsave(&rq->lock, flags);
9174 list_add(&req->list, &rq->migration_queue);
9175 raw_spin_unlock_irqrestore(&rq->lock, flags);
9176 wake_up_process(rq->migration_thread);
9177 }
9178 for_each_online_cpu(cpu) {
9179 rcu_expedited_state = cpu;
9180 req = &per_cpu(rcu_migration_req, cpu);
9181 rq = cpu_rq(cpu);
9182 wait_for_completion(&req->done);
9183 raw_spin_lock_irqsave(&rq->lock, flags);
9184 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9185 need_full_sync = 1;
9186 req->dest_cpu = RCU_MIGRATION_IDLE;
9187 raw_spin_unlock_irqrestore(&rq->lock, flags);
9188 }
9189 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9190 synchronize_sched_expedited_count++;
9191 mutex_unlock(&rcu_sched_expedited_mutex);
9192 put_online_cpus(); 8961 put_online_cpus();
9193 if (need_full_sync)
9194 synchronize_sched();
9195} 8962}
9196EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8963EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9197 8964
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db144037..87a330a7185f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -407,40 +404,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 404 PN(se.exec_start);
408 PN(se.vruntime); 405 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 406 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 407
413 nr_switches = p->nvcsw + p->nivcsw; 408 nr_switches = p->nvcsw + p->nivcsw;
414 409
415#ifdef CONFIG_SCHEDSTATS 410#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 411 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 412 PN(se.statistics.sleep_start);
418 PN(se.block_start); 413 PN(se.statistics.block_start);
419 PN(se.sleep_max); 414 PN(se.statistics.sleep_max);
420 PN(se.block_max); 415 PN(se.statistics.block_max);
421 PN(se.exec_max); 416 PN(se.statistics.exec_max);
422 PN(se.slice_max); 417 PN(se.statistics.slice_max);
423 PN(se.wait_max); 418 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 419 PN(se.statistics.wait_sum);
425 P(se.wait_count); 420 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 421 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 422 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 423 P(sched_info.bkl_count);
429 P(se.nr_migrations); 424 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 425 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 426 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 427 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 428 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 429 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 430 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 431 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 432 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 433 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 434 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 435 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 436 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 437 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 438 P(se.statistics.nr_wakeups_idle);
444 439
445 { 440 {
446 u64 avg_atom, avg_per_cpu; 441 u64 avg_atom, avg_per_cpu;
@@ -491,31 +486,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 486void proc_sched_set_task(struct task_struct *p)
492{ 487{
493#ifdef CONFIG_SCHEDSTATS 488#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 489 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 490#endif
521} 491}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr(). 765 * through callig update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
1394 * ran and if it is currently idle, then it the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
1425 * Otherwise, iterate the domain and find an elegible idle cpu. 1400 * Otherwise, iterate the domains and find an elegible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Lets stop looking for an idle sibling when we reached
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631 * Since its curr running now, convert the gran from real-time 1574 * Since its curr running now, convert the gran from real-time
1632 * to virtual-time in his units. 1575 * to virtual-time in his units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..0db913a5c60f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..b4e7431e7c78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
diff --git a/kernel/sys.c b/kernel/sys.c
index 6d1a7e0f9d5b..0d36d889c74d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
492 return -ENOMEM; 492 return -ENOMEM;
493 old = current_cred(); 493 old = current_cred();
494 494
495 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
496 if (retval)
497 goto error;
498
499 retval = -EPERM; 495 retval = -EPERM;
500 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
501 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
543 return -ENOMEM; 539 return -ENOMEM;
544 old = current_cred(); 540 old = current_cred();
545 541
546 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
547 if (retval)
548 goto error;
549
550 retval = -EPERM; 542 retval = -EPERM;
551 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
552 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
610 return -ENOMEM; 602 return -ENOMEM;
611 old = current_cred(); 603 old = current_cred();
612 604
613 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
614 if (retval)
615 goto error;
616
617 retval = -EPERM; 605 retval = -EPERM;
618 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
619 new->uid = ruid; 607 new->uid = ruid;
@@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
675 return -ENOMEM; 663 return -ENOMEM;
676 old = current_cred(); 664 old = current_cred();
677 665
678 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
679 if (retval)
680 goto error;
681
682 retval = -EPERM; 666 retval = -EPERM;
683 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
684 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
719 if (!new) 703 if (!new)
720 return -ENOMEM; 704 return -ENOMEM;
721 705
722 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
723 if (retval)
724 goto error;
725 old = current_cred(); 706 old = current_cred();
726 707
727 retval = -EPERM; 708 retval = -EPERM;
@@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
788 return -ENOMEM; 769 return -ENOMEM;
789 old = current_cred(); 770 old = current_cred();
790 771
791 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
792 if (retval)
793 goto error;
794
795 retval = -EPERM; 772 retval = -EPERM;
796 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
797 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
851 old = current_cred(); 828 old = current_cred();
852 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
853 830
854 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
855 goto error;
856
857 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
858 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
859 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
864 } 838 }
865 } 839 }
866 840
867error:
868 abort_creds(new); 841 abort_creds(new);
869 return old_fsuid; 842 return old_fsuid;
870 843
@@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
888 old = current_cred(); 861 old = current_cred();
889 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
890 863
891 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
892 goto error;
893
894 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
895 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
896 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
900 } 870 }
901 } 871 }
902 872
903error:
904 abort_creds(new); 873 abort_creds(new);
905 return old_fsgid; 874 return old_fsgid;
906 875
@@ -1118,7 +1087,7 @@ DECLARE_RWSEM(uts_sem);
1118 1087
1119#ifdef COMPAT_UTS_MACHINE 1088#ifdef COMPAT_UTS_MACHINE
1120#define override_architecture(name) \ 1089#define override_architecture(name) \
1121 (current->personality == PER_LINUX32 && \ 1090 (personality(current->personality) == PER_LINUX32 && \
1122 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1091 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1123 sizeof(COMPAT_UTS_MACHINE))) 1092 sizeof(COMPAT_UTS_MACHINE)))
1124#else 1093#else
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..90f536d84643 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -621,7 +621,7 @@ static struct ctl_table kern_table[] = {
621#endif 621#endif
622 { 622 {
623 .procname = "userprocess_debug", 623 .procname = "userprocess_debug",
624 .data = &sysctl_userprocess_debug, 624 .data = &show_unhandled_signals,
625 .maxlen = sizeof(int), 625 .maxlen = sizeof(int),
626 .mode = 0644, 626 .mode = 0644,
627 .proc_handler = proc_dointvec, 627 .proc_handler = proc_dointvec,
@@ -1431,7 +1431,8 @@ static struct ctl_table fs_table[] = {
1431}; 1431};
1432 1432
1433static struct ctl_table debug_table[] = { 1433static struct ctl_table debug_table[] = {
1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1435 defined(CONFIG_S390)
1435 { 1436 {
1436 .procname = "exception-trace", 1437 .procname = "exception-trace",
1437 .data = &show_unhandled_signals, 1438 .data = &show_unhandled_signals,
diff --git a/kernel/time.c b/kernel/time.c
index 656dccfe1cbb..50612faa9baf 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 write_seqlock_irq(&xtime_lock); 135 struct timespec delta, adjust;
136 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136 delta.tv_sec = sys_tz.tz_minuteswest * 60;
137 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 delta.tv_nsec = 0;
138 update_xtime_cache(0); 138 adjust = timespec_add_safe(current_kernel_time(), delta);
139 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
140 clock_was_set();
141} 140}
142 141
143/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
628/** 676/**
629 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cummulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cummulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 39f6177fafac..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -788,7 +777,6 @@ void update_wall_time(void)
788{ 777{
789 struct clocksource *clock; 778 struct clocksource *clock;
790 cycle_t offset; 779 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 780 int shift = 0, maxshift;
793 781
794 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -847,7 +835,9 @@ void update_wall_time(void)
847 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
848 } 836 }
849 837
850 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
851 * add the remainder to the error difference. 841 * add the remainder to the error difference.
852 */ 842 */
853 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -855,8 +845,15 @@ void update_wall_time(void)
855 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
856 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
857 847
858 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
859 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
850 * xtime.tv_nsec isn't larger then NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
860 857
861 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -896,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
896 893
897unsigned long get_seconds(void) 894unsigned long get_seconds(void)
898{ 895{
899 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
900} 897}
901EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
902 899
903struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
904{ 901{
905 return xtime_cache; 902 return xtime;
906} 903}
907 904
908struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -913,7 +910,7 @@ struct timespec current_kernel_time(void)
913 do { 910 do {
914 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
915 912
916 now = xtime_cache; 913 now = xtime;
917 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
918 915
919 return now; 916 return now;
@@ -928,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
928 do { 925 do {
929 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
930 927
931 now = xtime_cache; 928 now = xtime;
932 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
933 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
934 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..9199f3c52215 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -319,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
319} 319}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of time, in jiffies, that a certain timer has
327 * in terms of slack. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
322 340
323static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
324 struct timer_list *timer) 342 struct timer_list *timer)
@@ -550,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
550{ 568{
551 timer->entry.next = NULL; 569 timer->entry.next = NULL;
552 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
553#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
554 timer->start_site = NULL; 573 timer->start_site = NULL;
555 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -715,6 +734,41 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
715} 734}
716EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
717 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) calculate the highest bit where the expires and new max are different
743 * 3) use this bit to make a mask
744 * 4) use the bitmask to round down the maximum time, so that all last
745 * bits are zeros
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires + timer->slack;
754
755 if (timer->slack < 0) /* auto slack: use 0.4% */
756 expires_limit = expires + (expires - jiffies)/256;
757
758 mask = expires ^ expires_limit;
759
760 if (mask == 0)
761 return expires;
762
763 bit = find_last_bit(&mask, BITS_PER_LONG);
764
765 mask = (1 << bit) - 1;
766
767 expires_limit = expires_limit & ~(mask);
768
769 return expires_limit;
770}
771
718/** 772/**
719 * mod_timer - modify a timer's timeout 773 * mod_timer - modify a timer's timeout
720 * @timer: the timer to be modified 774 * @timer: the timer to be modified
@@ -745,6 +799,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
745 if (timer_pending(timer) && timer->expires == expires) 799 if (timer_pending(timer) && timer->expires == expires)
746 return 1; 800 return 1;
747 801
802 expires = apply_slack(timer, expires);
803
748 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 804 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
749} 805}
750EXPORT_SYMBOL(mod_timer); 806EXPORT_SYMBOL(mod_timer);
@@ -955,6 +1011,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
955 return index; 1011 return index;
956} 1012}
957 1013
1014static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1015 unsigned long data)
1016{
1017 int preempt_count = preempt_count();
1018
1019#ifdef CONFIG_LOCKDEP
1020 /*
1021 * It is permissible to free the timer from inside the
1022 * function that is called from it, this we need to take into
1023 * account for lockdep too. To avoid bogus "held lock freed"
1024 * warnings as well as problems when looking into
1025 * timer->lockdep_map, make a copy and use that here.
1026 */
1027 struct lockdep_map lockdep_map = timer->lockdep_map;
1028#endif
1029 /*
1030 * Couple the lock chain with the lock chain at
1031 * del_timer_sync() by acquiring the lock_map around the fn()
1032 * call here and in del_timer_sync().
1033 */
1034 lock_map_acquire(&lockdep_map);
1035
1036 trace_timer_expire_entry(timer);
1037 fn(data);
1038 trace_timer_expire_exit(timer);
1039
1040 lock_map_release(&lockdep_map);
1041
1042 if (preempt_count != preempt_count()) {
1043 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1044 fn, preempt_count, preempt_count());
1045 /*
1046 * Restore the preempt count. That gives us a decent
1047 * chance to survive and extract information. If the
1048 * callback kept a lock held, bad luck, but not worse
1049 * than the BUG() we had.
1050 */
1051 preempt_count() = preempt_count;
1052 }
1053}
1054
958#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1055#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
959 1056
960/** 1057/**
@@ -998,45 +1095,7 @@ static inline void __run_timers(struct tvec_base *base)
998 detach_timer(timer, 1); 1095 detach_timer(timer, 1);
999 1096
1000 spin_unlock_irq(&base->lock); 1097 spin_unlock_irq(&base->lock);
1001 { 1098 call_timer_fn(timer, fn, data);
1002 int preempt_count = preempt_count();
1003
1004#ifdef CONFIG_LOCKDEP
1005 /*
1006 * It is permissible to free the timer from
1007 * inside the function that is called from
1008 * it, this we need to take into account for
1009 * lockdep too. To avoid bogus "held lock
1010 * freed" warnings as well as problems when
1011 * looking into timer->lockdep_map, make a
1012 * copy and use that here.
1013 */
1014 struct lockdep_map lockdep_map =
1015 timer->lockdep_map;
1016#endif
1017 /*
1018 * Couple the lock chain with the lock chain at
1019 * del_timer_sync() by acquiring the lock_map
1020 * around the fn() call here and in
1021 * del_timer_sync().
1022 */
1023 lock_map_acquire(&lockdep_map);
1024
1025 trace_timer_expire_entry(timer);
1026 fn(data);
1027 trace_timer_expire_exit(timer);
1028
1029 lock_map_release(&lockdep_map);
1030
1031 if (preempt_count != preempt_count()) {
1032 printk(KERN_ERR "huh, entered %p "
1033 "with preempt_count %08x, exited"
1034 " with %08x?\n",
1035 fn, preempt_count,
1036 preempt_count());
1037 BUG();
1038 }
1039 }
1040 spin_lock_irq(&base->lock); 1099 spin_lock_irq(&base->lock);
1041 } 1100 }
1042 } 1101 }
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -374,14 +371,6 @@ config STACK_TRACER
374 371
375 Say N if unsure. 372 Say N if unsure.
376 373
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE 374config KMEMTRACE
386 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
387 select GENERIC_TRACER 376 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..32837e19e3bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -3212,8 +3234,7 @@ free:
3212} 3234}
3213 3235
3214static void 3236static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
3216 struct task_struct *next)
3217{ 3238{
3218 unsigned long long timestamp; 3239 unsigned long long timestamp;
3219 int index; 3240 int index;
@@ -3339,11 +3360,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3360 goto out;
3340 3361
3341 ftrace_graph_active--; 3362 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3363 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3364 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3365 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3366 unregister_pm_notifier(&ftrace_suspend_notifier);
3367 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3347 3368
3348 out: 3369 out:
3349 mutex_unlock(&ftrace_lock); 3370 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
443 local_t commit_overrun; 457 local_t commit_overrun;
444 local_t overrun; 458 local_t overrun;
445 local_t entries; 459 local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1763 1777
1764 /* 1778 /*
1779 * Save the original length to the meta data.
1780 * This will be used by the reader to add lost event
1781 * counter.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1765 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1980{ 2001{
1981 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2003 int ret;
1984 2004
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2010
1994 /* 2011 /*
1995 * The delta is too big, we to add a 2012 * The delta is too big, we to add a
@@ -2838,6 +2855,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2856{
2840 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2841 unsigned long flags; 2859 unsigned long flags;
2842 int nr_loops = 0; 2860 int nr_loops = 0;
2843 int ret; 2861 int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2882 2901
2883 spin: 2902 spin:
2884 /* 2903 /*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2919
2901 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2902 * Here's the tricky part. 2933 * Here's the tricky part.
2903 * 2934 *
2904 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2931 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2932 goto again; 2968 goto again;
2933 2969
2934 out: 2970 out:
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
3006} 3042}
3007 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
3008static struct ring_buffer_event * 3049static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
3010{ 3052{
3011 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3060 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3105 return event;
3062 3106
3063 default: 3107 default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
3169 * @cpu: The cpu to peak at 3213 * @cpu: The cpu to peak at
3170 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
3215 * @lost_events: a variable to store if events were lost (may be NULL)
3171 * 3216 *
3172 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3173 * not consume the data. 3218 * not consume the data.
3174 */ 3219 */
3175struct ring_buffer_event * 3220struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3177{ 3223{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3234 local_irq_save(flags);
3189 if (dolock) 3235 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3240 if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3276/**
3231 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
3281 * @lost_events: a variable to store if events were lost (may be NULL)
3233 * 3282 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3286 */
3238struct ring_buffer_event * 3287struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3240{ 3290{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3307 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3259 3309
3260 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3263 3315
3264 if (dolock) 3316 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3329
3278/** 3330/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3282 * 3334 *
3283 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3338 *
3288 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recordng prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3289 */ 3349 */
3290struct ring_buffer_iter * 3350struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3352{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3355
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3357 return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3307 3366
3308 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
3377 * iterators will be synchronized. Afterwards, read_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3309 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3310 3408
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3414}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3416
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3410 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3411 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3412} 3511}
3413 3512
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3686 unsigned long flags; 3786 unsigned long flags;
3687 unsigned int commit; 3787 unsigned int commit;
3688 unsigned int read; 3788 unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3819 read = reader->read;
3720 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3721 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3722 /* 3825 /*
3723 * If this page has been partially read or 3826 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3780 reader->read = 0; 3883 reader->read = 0;
3781 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
3887 * Use the real_end for the data size,
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3782 } 3893 }
3783 ret = read; 3894 ret = read;
3784 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3785 out_unlock: 3914 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3916
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..756d7283318b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1995,7 +2017,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
1996} 2018}
1997 2019
1998static int trace_empty(struct trace_iterator *iter) 2020int trace_empty(struct trace_iterator *iter)
1999{ 2021{
2000 int cpu; 2022 int cpu;
2001 2023
@@ -2030,6 +2052,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2052{
2031 enum print_line_t ret; 2053 enum print_line_t ret;
2032 2054
2055 if (iter->lost_events)
2056 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2057 iter->cpu, iter->lost_events);
2058
2033 if (iter->trace && iter->trace->print_line) { 2059 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2060 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2061 if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2084,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2084 return print_trace_fmt(iter);
2059} 2085}
2060 2086
2087void trace_default_header(struct seq_file *m)
2088{
2089 struct trace_iterator *iter = m->private;
2090
2091 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2092 /* print nothing if the buffers are empty */
2093 if (trace_empty(iter))
2094 return;
2095 print_trace_header(m, iter);
2096 if (!(trace_flags & TRACE_ITER_VERBOSE))
2097 print_lat_help_header(m);
2098 } else {
2099 if (!(trace_flags & TRACE_ITER_VERBOSE))
2100 print_func_help_header(m);
2101 }
2102}
2103
2061static int s_show(struct seq_file *m, void *v) 2104static int s_show(struct seq_file *m, void *v)
2062{ 2105{
2063 struct trace_iterator *iter = v; 2106 struct trace_iterator *iter = v;
@@ -2070,17 +2113,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2113 }
2071 if (iter->trace && iter->trace->print_header) 2114 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2115 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2116 else
2074 /* print nothing if the buffers are empty */ 2117 trace_default_header(m);
2075 if (trace_empty(iter)) 2118
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2119 } else if (iter->leftover) {
2085 /* 2120 /*
2086 * If we filled the seq_file buffer earlier, we 2121 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2201,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2201
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2202 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2203 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2204 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2205 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2206 }
2207 ring_buffer_read_prepare_sync();
2208 for_each_tracing_cpu(cpu) {
2209 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2210 tracing_iter_reset(iter, cpu);
2173 } 2211 }
2174 } else { 2212 } else {
2175 cpu = iter->cpu_file; 2213 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2214 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2215 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2216 ring_buffer_read_prepare_sync();
2217 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2218 tracing_iter_reset(iter, cpu);
2179 } 2219 }
2180 2220
@@ -4324,7 +4364,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4364 unsigned long event, void *unused)
4325{ 4365{
4326 if (ftrace_dump_on_oops) 4366 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4367 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4368 return NOTIFY_OK;
4329} 4369}
4330 4370
@@ -4341,7 +4381,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4381 switch (val) {
4342 case DIE_OOPS: 4382 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4383 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4384 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4385 break;
4346 default: 4386 default:
4347 break; 4387 break;
@@ -4382,7 +4422,8 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4422 trace_seq_init(s);
4383} 4423}
4384 4424
4385static void __ftrace_dump(bool disable_tracing) 4425static void
4426__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4427{
4387 static arch_spinlock_t ftrace_dump_lock = 4428 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4429 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4456,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4456 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4457 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4458
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4459 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4460 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4461 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4462
4463 switch (oops_dump_mode) {
4464 case DUMP_ALL:
4465 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4466 break;
4467 case DUMP_ORIG:
4468 iter.cpu_file = raw_smp_processor_id();
4469 break;
4470 case DUMP_NONE:
4471 goto out_enable;
4472 default:
4473 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4474 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4475 }
4476
4477 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4478
4425 /* 4479 /*
4426 * We need to stop all tracing on all CPUS to read the 4480 * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4513,7 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4513 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4514 printk(KERN_TRACE "---------------------------------\n");
4461 4515
4516 out_enable:
4462 /* Re-enable tracing if requested */ 4517 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4518 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4519 trace_flags |= old_userobj;
@@ -4475,9 +4530,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4530}
4476 4531
4477/* By default: disable tracing after the dump */ 4532/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4533void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4534{
4480 __ftrace_dump(true); 4535 __ftrace_dump(true, oops_dump_mode);
4481} 4536}
4482 4537
4483__init static int tracer_alloc_buffers(void) 4538__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..d1ce0bec1b3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -467,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
467 struct trace_array *tr); 456 struct trace_array *tr);
468extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
469 struct trace_array *tr); 458 struct trace_array *tr);
470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr); 460 struct trace_array *tr);
474#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -491,9 +478,29 @@ extern int trace_clock_id;
491 478
492/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
493#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
494extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
495extern enum print_line_t 493extern enum print_line_t
496trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
497 504
498#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
499/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -524,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
524#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
525#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
526static inline enum print_line_t 533static inline enum print_line_t
527print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
528{ 535{
529 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
530} 537}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7fe..58092d844a1f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1398 } 1398 }
1399 1399
1400 err = -EINVAL; 1400 err = -EINVAL;
1401 if (!call) 1401 if (&call->list == &ftrace_events)
1402 goto out_unlock; 1402 goto out_unlock;
1403 1403
1404 err = -EEXIST; 1404 err = -EEXIST;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..dd11c830eb84 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 527
527/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
528static int 529static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
530{ 532{
531 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 535 return 1;
534 536
535 /* Non nested entry or return */ 537 /* Non nested entry or return */
536 if (duration == -1) 538 if (duration == -1)
537 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
538 540
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 543 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 564
563static enum print_line_t 565static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 568{
567 int ret; 569 int ret;
568 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
573 575
574 /* Absolute time */ 576 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 579 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
579 } 581 }
580 582
581 /* Cpu */ 583 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
586 } 588 }
587 589
588 /* Proc */ 590 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 598 }
597 599
598 /* No overhead */ 600 /* No overhead */
599 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 602 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
602 604
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
610 612
611 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
615 617
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 681static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
683{ 686{
684 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 714 }
712 715
713 /* Overhead */ 716 /* Overhead */
714 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 718 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
717 720
718 /* Duration */ 721 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 742static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
743{ 746{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 762 }
760 763
761 /* No overhead */ 764 /* No overhead */
762 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 766 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
765 768
766 /* No time */ 769 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
769 if (!ret) 772 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 793
791static enum print_line_t 794static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
794{ 797{
795 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 806
804 if (type) { 807 if (type) {
805 /* Interrupt */ 808 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
809 } 812 }
810 813
811 /* Absolute time */ 814 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 817 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
816 } 819 }
817 820
818 /* Cpu */ 821 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
823 } 826 }
824 827
825 /* Proc */ 828 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 848
846static enum print_line_t 849static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
849{ 852{
850 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 856 static enum print_line_t ret;
854 int cpu = iter->cpu; 857 int cpu = iter->cpu;
855 858
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
858 861
859 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 863 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 865 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 867
865 if (data) { 868 if (data) {
866 /* 869 /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 882
880static enum print_line_t 883static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
883{ 887{
884 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 913 }
910 } 914 }
911 915
912 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
914 918
915 /* Overhead */ 919 /* Overhead */
916 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 921 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
919 923
920 /* Duration */ 924 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 952 }
949 953
950 /* Overrun */ 954 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 957 trace->overrun);
954 if (!ret) 958 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
956 } 960 }
957 961
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
961 966
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 968}
964 969
965static enum print_line_t 970static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
968{ 973{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 981 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 983
979 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
981 986
982 /* No overhead */ 987 /* No overhead */
983 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 989 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
986 991
987 /* No time */ 992 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
990 if (!ret) 995 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1045
1041 1046
1042enum print_line_t 1047enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1049{
1045 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1066 if (data && data->failed) {
1062 field = &data->ent; 1067 field = &data->ent;
1063 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,49 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1083 saved = *field; 1088 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1085 } 1090 }
1086 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* dont trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1091 default: 1101 default:
1092 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1093 } 1103 }
1094 1104
1095 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1096} 1106}
1097 1107
1098static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags)
1116{
1117 return print_graph_function(iter);
1118}
1119
1120static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1121{
1100 static const char spaces[] = " " /* 16 spaces */ 1122 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1123 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1124 " "; /* 17 spaces */
1103 int size = 0; 1125 int size = 0;
1104 1126
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1127 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1128 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1129 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1130 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1131 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1132 size += 17;
1111 1133
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1134 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1139,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1139 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1140}
1119 1141
1120static void print_graph_headers(struct seq_file *s) 1142void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1143{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1144 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1145
1124 if (lat) 1146 if (lat)
1125 print_lat_header(s); 1147 print_lat_header(s, flags);
1126 1148
1127 /* 1st line */ 1149 /* 1st line */
1128 seq_printf(s, "#"); 1150 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1151 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1152 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1153 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1154 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1155 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1156 seq_printf(s, " TASK/PID ");
1135 if (lat) 1157 if (lat)
1136 seq_printf(s, "|||||"); 1158 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1159 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1160 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1161 seq_printf(s, " FUNCTION CALLS\n");
1140 1162
1141 /* 2nd line */ 1163 /* 2nd line */
1142 seq_printf(s, "#"); 1164 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1165 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1166 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1167 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1168 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1169 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1170 seq_printf(s, " | | ");
1149 if (lat) 1171 if (lat)
1150 seq_printf(s, "|||||"); 1172 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1173 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1174 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1175 seq_printf(s, " | | | |\n");
1154} 1176}
1155 1177
1156static void graph_trace_open(struct trace_iterator *iter) 1178void print_graph_headers(struct seq_file *s)
1179{
1180 print_graph_headers_flags(s, tracer_flags.val);
1181}
1182
1183void graph_trace_open(struct trace_iterator *iter)
1157{ 1184{
1158 /* pid and depth on the last trace processed */ 1185 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1186 struct fgraph_data *data;
@@ -1188,7 +1215,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1215 pr_warning("function graph tracer: not enough memory\n");
1189} 1216}
1190 1217
1191static void graph_trace_close(struct trace_iterator *iter) 1218void graph_trace_close(struct trace_iterator *iter)
1192{ 1219{
1193 struct fgraph_data *data = iter->private; 1220 struct fgraph_data *data = iter->private;
1194 1221
@@ -1198,6 +1225,16 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1225 }
1199} 1226}
1200 1227
1228static struct trace_event graph_trace_entry_event = {
1229 .type = TRACE_GRAPH_ENT,
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_ret_event = {
1234 .type = TRACE_GRAPH_RET,
1235 .trace = print_graph_function_event,
1236};
1237
1201static struct tracer graph_trace __read_mostly = { 1238static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1239 .name = "function_graph",
1203 .open = graph_trace_open, 1240 .open = graph_trace_open,
@@ -1219,6 +1256,16 @@ static __init int init_graph_trace(void)
1219{ 1256{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1257 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1258
1259 if (!register_ftrace_event(&graph_trace_entry_event)) {
1260 pr_warning("Warning: could not register graph trace events\n");
1261 return 1;
1262 }
1263
1264 if (!register_ftrace_event(&graph_trace_ret_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1222 return register_tracer(&graph_trace); 1269 return register_tracer(&graph_trace);
1223} 1270}
1224 1271
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae9..a7514326052b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{ 93{
68 return f->func(regs, f->data); 94 return fprm->fn(regs, fprm->data, dest);
69} 95}
70 96
71/* fetch handlers */ 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 98/*
73 void *offset) 99 * Define macro for basic types - we don't need to define s* types, because
74{ 100 * we have to care only about bitwidth at recording time.
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 101 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
76} 121}
77 122DEFINE_BASIC_FETCH_FUNCS(reg)
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 123
79 void *num) 124#define DEFINE_FETCH_stack(type) \
80{ 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
81 return regs_get_kernel_stack_nth(regs, 126 void *offset, void *dest) \
82 (unsigned int)((unsigned long)num)); 127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
83} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
84 132
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 133#define DEFINE_FETCH_retval(type) \
86{ 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
87 unsigned long retval; 135 void *dummy, void *dest) \
88 136{ \
89 if (probe_kernel_address(addr, retval)) 137 *(type *)dest = (type)regs_return_value(regs); \
90 return 0;
91 return retval;
92} 138}
93 139DEFINE_BASIC_FETCH_FUNCS(retval)
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 140
95 void *dummy) 141#define DEFINE_FETCH_memory(type) \
96{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
97 return regs_return_value(regs); 143 void *addr, void *dest) \
98} 144{ \
99 145 type retval; \
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 146 if (probe_kernel_address(addr, retval)) \
101 void *dummy) 147 *(type *)dest = 0; \
102{ 148 else \
103 return kernel_stack_pointer(regs); 149 *(type *)dest = retval; \
104} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
105 152
106/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
107struct symbol_cache { 154struct symbol_cache {
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 192 return sc;
146} 193}
147 194
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
149{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 197 void *data, void *dest) \
151 198{ \
152 if (sc->addr) 199 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
154 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 202 else \
203 *(type *)dest = 0; \
156} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
157 206
158/* Special indirect memory access interface */ 207/* Dereference memory access function */
159struct indirect_fetch_data { 208struct deref_fetch_param {
160 struct fetch_func orig; 209 struct fetch_param orig;
161 long offset; 210 long offset;
162}; 211};
163 212
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
165{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
167 unsigned long addr; 216{ \
168 217 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
170 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 220 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
173 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 223 } else \
224 *(type *)dest = 0; \
175} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
176 227
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 229{
179 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
183 kfree(data); 234 kfree(data);
184} 235}
185 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
266 const char *fmt; /* Fromat string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
186/** 305/**
187 * Kprobe event core functions 306 * Kprobe event core functions
188 */ 307 */
189 308
190struct probe_arg { 309struct probe_arg {
191 struct fetch_func fetch; 310 struct fetch_param fetch;
192 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
193}; 315};
194 316
195/* Flags for trace_probe */ 317/* Flags for trace_probe */
@@ -204,6 +326,7 @@ struct trace_probe {
204 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
205 struct ftrace_event_call call; 327 struct ftrace_event_call call;
206 struct trace_event event; 328 struct trace_event event;
329 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 330 unsigned int nr_args;
208 struct probe_arg args[]; 331 struct probe_arg args[];
209}; 332};
@@ -212,6 +335,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
214 337
338
215static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 340{
217 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
223} 347}
224 348
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
270 351
@@ -347,11 +428,12 @@ error:
347 428
348static void free_probe_arg(struct probe_arg *arg) 429static void free_probe_arg(struct probe_arg *arg)
349{ 430{
350 if (arg->fetch.func == fetch_symbol) 431 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
432 free_deref_fetch_param(arg->fetch.data);
433 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 434 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 435 kfree(arg->name);
436 kfree(arg->comm);
355} 437}
356 438
357static void free_trace_probe(struct trace_probe *tp) 439static void free_trace_probe(struct trace_probe *tp)
@@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 539#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 540#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 541
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 542static int parse_probe_vars(char *arg, const struct fetch_type *t,
543 struct fetch_param *f, int is_return)
461{ 544{
462 int ret = 0; 545 int ret = 0;
463 unsigned long param; 546 unsigned long param;
464 547
465 if (strcmp(arg, "retval") == 0) { 548 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 549 if (is_return)
467 ff->func = fetch_retvalue; 550 f->fn = t->retval;
468 ff->data = NULL; 551 else
469 } else
470 ret = -EINVAL; 552 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 553 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 554 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 555 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 556 f->fn = fetch_stack_address;
557 else
558 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 559 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 560 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 561 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 562 ret = -EINVAL;
479 else { 563 else {
480 ff->func = fetch_stack; 564 f->fn = t->stack;
481 ff->data = (void *)param; 565 f->data = (void *)param;
482 } 566 }
483 } else 567 } else
484 ret = -EINVAL; 568 ret = -EINVAL;
@@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 572}
489 573
490/* Recursive argument parser */ 574/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 575static int __parse_probe_arg(char *arg, const struct fetch_type *t,
576 struct fetch_param *f, int is_return)
492{ 577{
493 int ret = 0; 578 int ret = 0;
494 unsigned long param; 579 unsigned long param;
@@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 582
498 switch (arg[0]) { 583 switch (arg[0]) {
499 case '$': 584 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 585 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 586 break;
502 case '%': /* named register */ 587 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 588 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 589 if (ret >= 0) {
505 ff->func = fetch_register; 590 f->fn = t->reg;
506 ff->data = (void *)(unsigned long)ret; 591 f->data = (void *)(unsigned long)ret;
507 ret = 0; 592 ret = 0;
508 } 593 }
509 break; 594 break;
@@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 597 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 598 if (ret)
514 break; 599 break;
515 ff->func = fetch_memory; 600 f->fn = t->memory;
516 ff->data = (void *)param; 601 f->data = (void *)param;
517 } else { 602 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 603 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 604 if (ret)
520 break; 605 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 606 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 607 if (f->data)
523 ff->func = fetch_symbol; 608 f->fn = t->symbol;
524 else
525 ret = -EINVAL;
526 } 609 }
527 break; 610 break;
528 case '+': /* indirect memory */ 611 case '+': /* deref memory */
529 case '-': 612 case '-':
530 tmp = strchr(arg, '('); 613 tmp = strchr(arg, '(');
531 if (!tmp) { 614 if (!tmp)
532 ret = -EINVAL;
533 break; 615 break;
534 }
535 *tmp = '\0'; 616 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 617 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 618 if (ret)
@@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 622 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 623 tmp = strrchr(arg, ')');
543 if (tmp) { 624 if (tmp) {
544 struct indirect_fetch_data *id; 625 struct deref_fetch_param *dprm;
626 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 627 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 628 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 629 GFP_KERNEL);
548 if (!id) 630 if (!dprm)
549 return -ENOMEM; 631 return -ENOMEM;
550 id->offset = offset; 632 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 633 ret = __parse_probe_arg(arg, t2, &dprm->orig,
634 is_return);
552 if (ret) 635 if (ret)
553 kfree(id); 636 kfree(dprm);
554 else { 637 else {
555 ff->func = fetch_indirect; 638 f->fn = t->deref;
556 ff->data = (void *)id; 639 f->data = (void *)dprm;
557 } 640 }
558 } else 641 }
559 ret = -EINVAL;
560 break; 642 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 } 643 }
644 if (!ret && !f->fn)
645 ret = -EINVAL;
565 return ret; 646 return ret;
566} 647}
567 648
568/* String length checking wrapper */ 649/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 650static int parse_probe_arg(char *arg, struct trace_probe *tp,
651 struct probe_arg *parg, int is_return)
570{ 652{
653 const char *t;
654
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 655 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long.: %s\n", arg); 656 pr_info("Argument is too long.: %s\n", arg);
573 return -ENOSPC; 657 return -ENOSPC;
574 } 658 }
575 return __parse_probe_arg(arg, ff, is_return); 659 parg->comm = kstrdup(arg, GFP_KERNEL);
660 if (!parg->comm) {
661 pr_info("Failed to allocate memory for command '%s'.\n", arg);
662 return -ENOMEM;
663 }
664 t = strchr(parg->comm, ':');
665 if (t) {
666 arg[t - parg->comm] = '\0';
667 t++;
668 }
669 parg->type = find_fetch_type(t);
670 if (!parg->type) {
671 pr_info("Unsupported type: %s\n", t);
672 return -EINVAL;
673 }
674 parg->offset = tp->size;
675 tp->size += parg->type->size;
676 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
576} 677}
577 678
578/* Return 1 if name is reserved or already used by another argument */ 679/* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 703 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 704 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 705 * %REG : fetch register REG
605 * Indirect memory fetch: 706 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 707 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 708 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 709 * NAME=FETCHARG : set NAME as alias of FETCHARG.
710 * Type of args:
711 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 712 */
610 struct trace_probe *tp; 713 struct trace_probe *tp;
611 int i, ret = 0; 714 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 715 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 716 char *symbol = NULL, *event = NULL, *group = NULL;
717 char *arg, *tmp;
614 unsigned long offset = 0; 718 unsigned long offset = 0;
615 void *addr = NULL; 719 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 720 char buf[MAX_EVENT_NAME_LEN];
@@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv)
723 else 827 else
724 arg = argv[i]; 828 arg = argv[i];
725 829
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 830 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) { 831 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n", 832 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv)
737 ret = -ENOMEM; 834 ret = -ENOMEM;
738 goto error; 835 goto error;
739 } 836 }
837 tmp = strchr(tp->args[i].name, ':');
838 if (tmp)
839 *tmp = '_'; /* convert : to _ */
840
841 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
842 pr_info("Argument%d name '%s' conflicts with "
843 "another field.\n", i, argv[i]);
844 ret = -EINVAL;
845 goto error;
846 }
740 847
741 /* Parse fetch argument */ 848 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 849 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 850 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 851 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name); 852 kfree(tp->args[i].name);
@@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 901static int probes_seq_show(struct seq_file *m, void *v)
795{ 902{
796 struct trace_probe *tp = v; 903 struct trace_probe *tp = v;
797 int i, ret; 904 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 905
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
@@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 913 else
808 seq_printf(m, " %s", probe_symbol(tp)); 914 seq_printf(m, " %s", probe_symbol(tp));
809 915
810 for (i = 0; i < tp->nr_args; i++) { 916 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 917 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 918 seq_printf(m, "\n");
919
819 return 0; 920 return 0;
820} 921}
821 922
@@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = {
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1046static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1047{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1048 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1049 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1050 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1051 struct ring_buffer *buffer;
1052 u8 *data;
951 int size, i, pc; 1053 int size, i, pc;
952 unsigned long irq_flags; 1054 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1055 struct ftrace_event_call *call = &tp->call;
@@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1059 local_save_flags(irq_flags);
958 pc = preempt_count(); 1060 pc = preempt_count();
959 1061
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1062 size = sizeof(*entry) + tp->size;
961 1063
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
963 irq_flags, pc); 1065 irq_flags, pc);
@@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
965 return; 1067 return;
966 1068
967 entry = ring_buffer_event_data(event); 1069 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1070 entry->ip = (unsigned long)kp->addr;
1071 data = (u8 *)&entry[1];
970 for (i = 0; i < tp->nr_args; i++) 1072 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1073 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
972 1074
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1075 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1076 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1081 struct pt_regs *regs)
980{ 1082{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1083 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1084 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1085 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1086 struct ring_buffer *buffer;
1087 u8 *data;
985 int size, i, pc; 1088 int size, i, pc;
986 unsigned long irq_flags; 1089 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1090 struct ftrace_event_call *call = &tp->call;
@@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
989 local_save_flags(irq_flags); 1092 local_save_flags(irq_flags);
990 pc = preempt_count(); 1093 pc = preempt_count();
991 1094
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1095 size = sizeof(*entry) + tp->size;
993 1096
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
995 irq_flags, pc); 1098 irq_flags, pc);
@@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
997 return; 1100 return;
998 1101
999 entry = ring_buffer_event_data(event); 1102 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1103 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1104 entry->ret_ip = (unsigned long)ri->ret_addr;
1105 data = (u8 *)&entry[1];
1003 for (i = 0; i < tp->nr_args; i++) 1106 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1107 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1005 1108
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1109 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1110 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1011enum print_line_t 1114enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1115print_kprobe_event(struct trace_iterator *iter, int flags)
1013{ 1116{
1014 struct kprobe_trace_entry *field; 1117 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1118 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event; 1119 struct trace_event *event;
1017 struct trace_probe *tp; 1120 struct trace_probe *tp;
1121 u8 *data;
1018 int i; 1122 int i;
1019 1123
1020 field = (struct kprobe_trace_entry *)iter->ent; 1124 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1125 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event); 1126 tp = container_of(event, struct trace_probe, event);
1023 1127
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1135 goto partial;
1032 1136
1033 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1036 goto partial; 1141 goto partial;
1037 1142
1038 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1046,13 +1151,14 @@ partial:
1046enum print_line_t 1151enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{ 1153{
1049 struct kretprobe_trace_entry *field; 1154 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1155 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event; 1156 struct trace_event *event;
1052 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1053 int i; 1159 int i;
1054 1160
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1162 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event); 1163 tp = container_of(event, struct trace_probe, event);
1058 1164
@@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1177 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1178 goto partial;
1073 1179
1074 for (i = 0; i < field->nargs; i++) 1180 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1181 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1182 if (!tp->args[i].type->print(s, tp->args[i].name,
1183 data + tp->args[i].offset))
1077 goto partial; 1184 goto partial;
1078 1185
1079 if (!trace_seq_puts(s, "\n")) 1186 if (!trace_seq_puts(s, "\n"))
@@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1236static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1237{
1131 int ret, i; 1238 int ret, i;
1132 struct kprobe_trace_entry field; 1239 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1240 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1241
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1242 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1243 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1244 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1245 ret = trace_define_field(event_call, tp->args[i].type->name,
1246 tp->args[i].name,
1247 sizeof(field) + tp->args[i].offset,
1248 tp->args[i].type->size,
1249 tp->args[i].type->is_signed,
1250 FILTER_OTHER);
1251 if (ret)
1252 return ret;
1253 }
1140 return 0; 1254 return 0;
1141} 1255}
1142 1256
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1257static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1258{
1145 int ret, i; 1259 int ret, i;
1146 struct kretprobe_trace_entry field; 1260 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1261 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1262
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1263 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1264 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1265 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1266 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1267 ret = trace_define_field(event_call, tp->args[i].type->name,
1268 tp->args[i].name,
1269 sizeof(field) + tp->args[i].offset,
1270 tp->args[i].type->size,
1271 tp->args[i].type->is_signed,
1272 FILTER_OTHER);
1273 if (ret)
1274 return ret;
1275 }
1155 return 0; 1276 return 0;
1156} 1277}
1157 1278
@@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1298
1178 for (i = 0; i < tp->nr_args; i++) { 1299 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1300 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1301 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1302 }
1182 1303
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1304 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1219{ 1340{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1341 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1342 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1343 struct kprobe_trace_entry_head *entry;
1344 u8 *data;
1223 int size, __size, i; 1345 int size, __size, i;
1224 unsigned long irq_flags; 1346 unsigned long irq_flags;
1225 int rctx; 1347 int rctx;
1226 1348
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1349 __size = sizeof(*entry) + tp->size;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1350 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1351 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1352 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1235 if (!entry) 1357 if (!entry)
1236 return; 1358 return;
1237 1359
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1360 entry->ip = (unsigned long)kp->addr;
1361 data = (u8 *)&entry[1];
1240 for (i = 0; i < tp->nr_args; i++) 1362 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1363 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1242 1364
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1365 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1244} 1366}
@@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1249{ 1371{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1372 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1373 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1374 struct kretprobe_trace_entry_head *entry;
1375 u8 *data;
1253 int size, __size, i; 1376 int size, __size, i;
1254 unsigned long irq_flags; 1377 unsigned long irq_flags;
1255 int rctx; 1378 int rctx;
1256 1379
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1380 __size = sizeof(*entry) + tp->size;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1381 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1382 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1383 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1265 if (!entry) 1388 if (!entry)
1266 return; 1389 return;
1267 1390
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1391 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1392 entry->ret_ip = (unsigned long)ri->ret_addr;
1393 data = (u8 *)&entry[1];
1271 for (i = 0; i < tp->nr_args; i++) 1394 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1395 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1273 1396
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1397 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs); 1398 irq_flags, regs);
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd6879477..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
34 34
35#include <asm/atomic.h> 35#include <asm/atomic.h>
36 36
37/*
38 * For now, let us restrict the no. of symbols traced simultaneously to number
39 * of available hardware breakpoint registers.
40 */
41#define KSYM_TRACER_MAX HBP_NUM
42
43#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
44 38
45struct trace_ksym { 39struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
53 47
54static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
55 49
56static unsigned int ksym_filter_entry_count;
57static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
58 51
59static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
181 struct trace_ksym *entry; 174 struct trace_ksym *entry;
182 int ret = -ENOMEM; 175 int ret = -ENOMEM;
183 176
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry) 178 if (!entry)
193 return -ENOMEM; 179 return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
203 189
204 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
207 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
208 goto err; 199 goto err;
209 } 200 }
210 201
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213 203
214 return 0; 204 return 0;
215 205
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
265 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
266 ksym_hlist) { 256 ksym_hlist) {
267 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
268 ksym_filter_entry_count--;
269 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
270 synchronize_rcu(); 259 synchronize_rcu();
271 kfree(entry); 260 kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
338 goto out_unlock; 327 goto out_unlock;
339 } 328 }
340 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
341 ksym_filter_entry_count--;
342 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
343 synchronize_rcu(); 331 synchronize_rcu();
344 kfree(entry); 332 kfree(entry);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..2404c129a8c9 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -253,7 +253,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 253 void *ret;
254 254
255 if (s->full) 255 if (s->full)
256 return 0; 256 return NULL;
257 257
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 258 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 259 s->full = 1;
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..a55fccfede5d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..8052446ceeaa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 107}
108 108
109static void notrace 109static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
111 struct task_struct *next)
112{ 111{
113 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 113 cycle_t T0, T1, delta;
@@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 199}
201 200
202static void 201static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 202probe_wakeup(struct task_struct *p, int success)
204{ 203{
205 struct trace_array_cpu *data; 204 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 205 int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 81003b4d617f..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_BRANCH: 17 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
20 case TRACE_HW_BRANCHES:
21 case TRACE_KSYM: 20 case TRACE_KSYM:
22 return 1; 21 return 1;
23 } 22 }
@@ -30,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
30 struct trace_entry *entry; 29 struct trace_entry *entry;
31 unsigned int loops = 0; 30 unsigned int loops = 0;
32 31
33 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
34 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
35 34
36 /* 35 /*
@@ -256,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
256/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
257#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
258 257
259static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
260static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
261 261
262/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -267,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
267 ftrace_graph_stop(); 267 ftrace_graph_stop();
268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
269 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
270 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
271 return 0; 271 return 0;
272 } 272 }
273 273
@@ -755,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 755}
756#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
757 757
758#ifdef CONFIG_HW_BRANCH_TRACER
759int
760trace_selftest_startup_hw_branches(struct tracer *trace,
761 struct trace_array *tr)
762{
763 struct trace_iterator *iter;
764 struct tracer tracer;
765 unsigned long count;
766 int ret;
767
768 if (!trace->open) {
769 printk(KERN_CONT "missing open function...");
770 return -1;
771 }
772
773 ret = tracer_init(trace, tr);
774 if (ret) {
775 warn_failed_init_tracer(trace, ret);
776 return ret;
777 }
778
779 /*
780 * The hw-branch tracer needs to collect the trace from the various
781 * cpu trace buffers - before tracing is stopped.
782 */
783 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
784 if (!iter)
785 return -ENOMEM;
786
787 memcpy(&tracer, trace, sizeof(tracer));
788
789 iter->trace = &tracer;
790 iter->tr = tr;
791 iter->pos = -1;
792 mutex_init(&iter->mutex);
793
794 trace->open(iter);
795
796 mutex_destroy(&iter->mutex);
797 kfree(iter);
798
799 tracing_stop();
800
801 ret = trace_test_buffer(tr, &count);
802 trace->reset(tr);
803 tracing_start();
804
805 if (!ret && !count) {
806 printk(KERN_CONT "no entries found..");
807 ret = -1;
808 }
809
810 return ret;
811}
812#endif /* CONFIG_HW_BRANCH_TRACER */
813
814#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
815static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
816 760
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..77dabbf64b8f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work); 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
@@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 784{
775 if (del_timer_sync(&dwork->timer)) { 785 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 786 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu()); 787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 788 __queue_work(cwq, &dwork->work);
779 put_cpu(); 789 put_cpu();
780 } 790 }
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);