author     Jens Axboe <jens.axboe@oracle.com>	2010-05-21 15:27:26 -0400
committer  Jens Axboe <jens.axboe@oracle.com>	2010-05-21 15:27:26 -0400
commit     ee9a3607fb03e804ddf624544105f4e34260c380 (patch)
tree       ce41b6e0fa10982a306f6c142a92dbf3c9961284 /kernel
parent     b492e95be0ae672922f4734acf3f5d35c30be948 (diff)
parent     d515e86e639890b33a09390d062b0831664f04a2 (diff)

Merge branch 'master' into for-2.6.35

Conflicts:
	fs/ext3/fsync.c

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 37
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 66
-rw-r--r--  kernel/cgroup_freezer.c | 26
-rw-r--r--  kernel/compat.c | 25
-rw-r--r--  kernel/cpu.c | 26
-rw-r--r--  kernel/cpuset.c | 67
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 5
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 983
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1017
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 564
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2849
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 300
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/exit.c | 1
-rw-r--r--  kernel/fork.c | 4
-rw-r--r--  kernel/groups.c | 6
-rw-r--r--  kernel/hrtimer.c | 67
-rw-r--r--  kernel/hw_breakpoint.c | 196
-rw-r--r--  kernel/irq/handle.c | 3
-rw-r--r--  kernel/irq/manage.c | 89
-rw-r--r--  kernel/irq/proc.c | 60
-rw-r--r--  kernel/kallsyms.c | 21
-rw-r--r--  kernel/kexec.c | 6
-rw-r--r--  kernel/kgdb.c | 1764
-rw-r--r--  kernel/kprobes.c | 132
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/lockdep.c | 93
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 28
-rw-r--r--  kernel/perf_event.c | 379
-rw-r--r--  kernel/pm_qos_params.c | 218
-rw-r--r--  kernel/posix-cpu-timers.c | 298
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/snapshot.c | 145
-rw-r--r--  kernel/power/swap.c | 333
-rw-r--r--  kernel/power/user.c | 37
-rw-r--r--  kernel/printk.c | 25
-rw-r--r--  kernel/profile.c | 4
-rw-r--r--  kernel/ptrace.c | 12
-rw-r--r--  kernel/rcupdate.c | 30
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree.c | 131
-rw-r--r--  kernel/rcutree.h | 2
-rw-r--r--  kernel/rcutree_plugin.h | 69
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/sched.c | 797
-rw-r--r--  kernel/sched_debug.c | 110
-rw-r--r--  kernel/sched_fair.c | 350
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 15
-rw-r--r--  kernel/signal.c | 40
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 31
-rw-r--r--  kernel/sysctl.c | 579
-rw-r--r--  kernel/sysctl_binary.c | 1
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/clocksource.c | 48
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 84
-rw-r--r--  kernel/time/timekeeping.c | 35
-rw-r--r--  kernel/time/timer_list.c | 1
-rw-r--r--  kernel/timer.c | 137
-rw-r--r--  kernel/trace/Kconfig | 11
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/ftrace.c | 33
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 127
-rw-r--r--  kernel/trace/trace.h | 47
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 2
-rw-r--r--  kernel/trace/trace_functions_graph.c | 169
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 535
-rw-r--r--  kernel/trace/trace_ksym.c | 26
-rw-r--r--  kernel/trace/trace_output.c | 18
-rw-r--r--  kernel/trace/trace_sched_switch.c | 5
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 5
-rw-r--r--  kernel/trace/trace_selftest.c | 64
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  kernel/workqueue.c | 38
102 files changed, 12638 insertions, 5064 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..057472fbc272 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48d..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
 {
 	struct file *file;
 	struct vfsmount *mnt;
-	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
 
@@ -244,13 +243,6 @@ static int acct_on(char *name)
 		}
 	}
 
-	error = security_acct(file);
-	if (error) {
-		kfree(acct);
-		filp_close(file, NULL);
-		return error;
-	}
-
 	spin_lock(&acct_lock);
 	if (ns->bacct == NULL) {
 		ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
  */
 SYSCALL_DEFINE1(acct, const char __user *, name)
 {
-	int error;
+	int error = 0;
 
 	if (!capable(CAP_SYS_PACCT))
 		return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		if (acct == NULL)
 			return 0;
 
-		error = security_acct(NULL);
-		if (!error) {
-			spin_lock(&acct_lock);
-			acct_file_reopen(acct, NULL, NULL);
-			spin_unlock(&acct_lock);
-		}
+		spin_lock(&acct_lock);
+		acct_file_reopen(acct, NULL, NULL);
+		spin_unlock(&acct_lock);
 	}
+
 	return error;
 }
 
@@ -353,17 +343,18 @@ restart:
 
 void acct_exit_ns(struct pid_namespace *ns)
 {
-	struct bsd_acct_struct *acct;
+	struct bsd_acct_struct *acct = ns->bacct;
 
-	spin_lock(&acct_lock);
-	acct = ns->bacct;
-	if (acct != NULL) {
-		if (acct->file != NULL)
-			acct_file_reopen(acct, NULL, NULL);
+	if (acct == NULL)
+		return;
 
-		kfree(acct);
-	}
+	del_timer_sync(&acct->timer);
+	spin_lock(&acct_lock);
+	if (acct->file != NULL)
+		acct_file_reopen(acct, NULL, NULL);
 	spin_unlock(&acct_lock);
+
+	kfree(acct);
 }
 
 /*
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e2769e13980c..291775021b2e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
 	char *start;
-	struct dentry *dentry = rcu_dereference(cgrp->dentry);
+	struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
+						      rcu_read_lock_held() ||
+						      cgroup_lock_is_held());
 
 	if (!dentry || cgrp == dummytop) {
 		/*
@@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 	*--start = '\0';
 	for (;;) {
 		int len = dentry->d_name.len;
+
 		if ((start -= len) < buf)
 			return -ENAMETOOLONG;
-		memcpy(start, cgrp->dentry->d_name.name, len);
+		memcpy(start, dentry->d_name.name, len);
 		cgrp = cgrp->parent;
 		if (!cgrp)
 			break;
-		dentry = rcu_dereference(cgrp->dentry);
+
+		dentry = rcu_dereference_check(cgrp->dentry,
+					       rcu_read_lock_held() ||
+					       cgroup_lock_is_held());
 		if (!cgrp->parent)
 			continue;
 		if (--start < buf)
@@ -3010,7 +3016,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	unsigned long flags = (unsigned long)key;
 
 	if (flags & POLLHUP) {
-		remove_wait_queue_locked(event->wqh, &event->wait);
+		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
 		list_del(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
@@ -3609,7 +3615,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  * @ss: the subsystem to load
  *
  * This function should be called in a modular subsystem's initcall. If the
- * subsytem is built as a module, it will be assigned a new subsys_id and set
+ * subsystem is built as a module, it will be assigned a new subsys_id and set
  * up for use. If the subsystem is built-in anyway, work is delegated to the
  * simpler cgroup_init_subsys.
  */
@@ -4429,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable);
  */
 unsigned short css_id(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	/*
+	 * This css_id() can return correct value when somone has refcnt
+	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
+	 * it's unchanged until freed.
+	 */
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->id;
@@ -4439,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id);
 
 unsigned short css_depth(struct cgroup_subsys_state *css)
 {
-	struct css_id *cssid = rcu_dereference(css->id);
+	struct css_id *cssid;
+
+	cssid = rcu_dereference_check(css->id,
+			rcu_read_lock_held() || atomic_read(&css->refcnt));
 
 	if (cssid)
 		return cssid->depth;
@@ -4447,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_depth);
 
+/**
+ * css_is_ancestor - test "root" css is an ancestor of "child"
+ * @child: the css to be tested.
+ * @root: the css supporsed to be an ancestor of the child.
+ *
+ * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
+ * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
+ * But, considering usual usage, the csses should be valid objects after test.
+ * Assuming that the caller will do some action to the child if this returns
+ * returns true, the caller must take "child";s reference count.
+ * If "child" is valid object and this returns true, "root" is valid, too.
+ */
+
 bool css_is_ancestor(struct cgroup_subsys_state *child,
 		    const struct cgroup_subsys_state *root)
 {
-	struct css_id *child_id = rcu_dereference(child->id);
-	struct css_id *root_id = rcu_dereference(root->id);
+	struct css_id *child_id;
+	struct css_id *root_id;
+	bool ret = true;
 
-	if (!child_id || !root_id || (child_id->depth < root_id->depth))
-		return false;
-	return child_id->stack[root_id->depth] == root_id->id;
+	rcu_read_lock();
+	child_id = rcu_dereference(child->id);
+	root_id = rcu_dereference(root->id);
+	if (!child_id
+	    || !root_id
+	    || (child_id->depth < root_id->depth)
+	    || (child_id->stack[root_id->depth] != root_id->id))
+		ret = false;
+	rcu_read_unlock();
+	return ret;
 }
 
 static void __free_css_id_cb(struct rcu_head *head)
@@ -4555,13 +4593,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
 {
 	int subsys_id, i, depth = 0;
 	struct cgroup_subsys_state *parent_css, *child_css;
-	struct css_id *child_id, *parent_id = NULL;
+	struct css_id *child_id, *parent_id;
 
 	subsys_id = ss->subsys_id;
 	parent_css = parent->subsys[subsys_id];
 	child_css = child->subsys[subsys_id];
-	depth = css_depth(parent_css) + 1;
 	parent_id = parent_css->id;
+	depth = parent_id->depth;
 
 	child_id = get_new_cssid(ss, depth);
 	if (IS_ERR(child_id))
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index da5e13975531..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
 
 /* Locks taken and their ordering
  * ------------------------------
- * css_set_lock
  * cgroup_mutex (AKA cgroup_lock)
- * task->alloc_lock (AKA task_lock)
  * freezer->lock
+ * css_set_lock
+ * task->alloc_lock (AKA task_lock)
  * task->sighand->siglock
  *
  * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
  * freezer_create(), freezer_destroy():
  * cgroup_mutex [ by cgroup core ]
  *
- * can_attach():
- * cgroup_mutex
+ * freezer_can_attach():
+ * cgroup_mutex (held by caller of can_attach)
  *
- * cgroup_frozen():
+ * cgroup_freezing_or_frozen():
  * task->alloc_lock (to get task's cgroup)
  *
  * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * task->alloc_lock (to get task's cgroup)
  * freezer->lock
  *  sighand->siglock (if the cgroup is freezing)
  *
  * freezer_read():
  * cgroup_mutex
  *  freezer->lock
+ *   write_lock css_set_lock (cgroup iterator start)
+ *    task->alloc_lock
  *   read_lock css_set_lock (cgroup iterator start)
  *
  * freezer_write() (freeze):
  * cgroup_mutex
  *  freezer->lock
+ *   write_lock css_set_lock (cgroup iterator start)
+ *    task->alloc_lock
  *   read_lock css_set_lock (cgroup iterator start)
- *   sighand->siglock
+ *    sighand->siglock (fake signal delivery inside freeze_task())
  *
  * freezer_write() (unfreeze):
  * cgroup_mutex
  *  freezer->lock
+ *   write_lock css_set_lock (cgroup iterator start)
+ *    task->alloc_lock
  *   read_lock css_set_lock (cgroup iterator start)
- *   task->alloc_lock (to prevent races with freeze_task())
+ *    task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
  *   sighand->siglock
  */
 static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -205,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
 	 * No lock is needed, since the task isn't on tasklist yet,
 	 * so it can't be moved to another cgroup, which means the
 	 * freezer won't be removed and will be valid during this
-	 * function call.
+	 * function call. Nevertheless, apply RCU read-side critical
+	 * section to suppress RCU lockdep false positives.
 	 */
+	rcu_read_lock();
 	freezer = task_freezer(task);
+	rcu_read_unlock();
 
 	/*
 	 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 {
 	int ret;
 	cpumask_var_t mask;
-	unsigned long *k;
-	unsigned int min_length = cpumask_size();
-
-	if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
-		min_length = sizeof(compat_ulong_t);
 
-	if (len < min_length)
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(compat_ulong_t)-1))
 		return -EINVAL;
 
 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 		return -ENOMEM;
 
 	ret = sched_getaffinity(pid, mask);
-	if (ret < 0)
-		goto out;
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
 
-	k = cpumask_bits(mask);
-	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
-	if (ret == 0)
-		ret = min_length;
-
-out:
+		if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
 	free_cpumask_var(mask);
+
 	return ret;
 }
 
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..545777574779 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -164,6 +164,7 @@ static inline void check_for_tasks(int cpu)
 }
 
 struct take_cpu_down_param {
+	struct task_struct *caller;
 	unsigned long mod;
 	void *hcpu;
 };
@@ -172,6 +173,7 @@ struct take_cpu_down_param {
 static int __ref take_cpu_down(void *_param)
 {
 	struct take_cpu_down_param *param = _param;
+	unsigned int cpu = (unsigned long)param->hcpu;
 	int err;
 
 	/* Ensure this CPU doesn't handle any more interrupts. */
@@ -182,6 +184,8 @@ static int __ref take_cpu_down(void *_param)
 	raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
 				param->hcpu);
 
+	if (task_cpu(param->caller) == cpu)
+		move_task_off_dead_cpu(cpu, param->caller);
 	/* Force idle task to run as soon as we yield: it should
 	   immediately notice cpu is offline and die quickly. */
 	sched_idle_next();
@@ -192,10 +196,10 @@ static int __ref take_cpu_down(void *_param)
 static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 {
 	int err, nr_calls = 0;
-	cpumask_var_t old_allowed;
 	void *hcpu = (void *)(long)cpu;
 	unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
 	struct take_cpu_down_param tcd_param = {
+		.caller = current,
 		.mod = mod,
 		.hcpu = hcpu,
 	};
@@ -206,9 +210,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (!cpu_online(cpu))
 		return -EINVAL;
 
-	if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
-		return -ENOMEM;
-
 	cpu_hotplug_begin();
 	set_cpu_active(cpu, false);
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
@@ -225,10 +226,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 		goto out_release;
 	}
 
-	/* Ensure that we are not runnable on dying cpu */
-	cpumask_copy(old_allowed, &current->cpus_allowed);
-	set_cpus_allowed_ptr(current, cpu_active_mask);
-
 	err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
 	if (err) {
 		set_cpu_active(cpu, true);
@@ -237,7 +234,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 
-		goto out_allowed;
+		goto out_release;
 	}
 	BUG_ON(cpu_online(cpu));
 
@@ -255,8 +252,6 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 
 	check_for_tasks(cpu);
 
-out_allowed:
-	set_cpus_allowed_ptr(current, old_allowed);
 out_release:
 	cpu_hotplug_done();
 	if (!err) {
@@ -264,7 +259,6 @@ out_release:
 					    hcpu) == NOTIFY_BAD)
 			BUG();
 	}
-	free_cpumask_var(old_allowed);
 	return err;
 }
 
@@ -272,9 +266,6 @@ int __ref cpu_down(unsigned int cpu)
 {
 	int err;
 
-	err = stop_machine_create();
-	if (err)
-		return err;
 	cpu_maps_update_begin();
 
 	if (cpu_hotplug_disabled) {
@@ -286,7 +277,6 @@ int __ref cpu_down(unsigned int cpu)
 
 out:
 	cpu_maps_update_done();
-	stop_machine_destroy();
 	return err;
 }
 EXPORT_SYMBOL(cpu_down);
@@ -367,9 +357,6 @@ int disable_nonboot_cpus(void)
 {
 	int cpu, first_cpu, error;
 
-	error = stop_machine_create();
-	if (error)
-		return error;
 	cpu_maps_update_begin();
 	first_cpu = cpumask_first(cpu_online_mask);
 	/*
@@ -400,7 +387,6 @@ int disable_nonboot_cpus(void)
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
 	cpu_maps_update_done();
-	stop_machine_destroy();
 	return error;
 }
 
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..9a50c5f6e727 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2182,19 +2182,52 @@ void __init cpuset_init_smp(void)
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
 	mutex_lock(&callback_mutex);
-	cpuset_cpus_allowed_locked(tsk, pmask);
+	task_lock(tsk);
+	guarantee_online_cpus(task_cs(tsk), pmask);
+	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 }
 
-/**
- * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
- * Must be called with callback_mutex held.
- **/
-void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
+int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
 {
-	task_lock(tsk);
-	guarantee_online_cpus(task_cs(tsk), pmask);
-	task_unlock(tsk);
+	const struct cpuset *cs;
+	int cpu;
+
+	rcu_read_lock();
+	cs = task_cs(tsk);
+	if (cs)
+		cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+	rcu_read_unlock();
+
+	/*
+	 * We own tsk->cpus_allowed, nobody can change it under us.
+	 *
+	 * But we used cs && cs->cpus_allowed lockless and thus can
+	 * race with cgroup_attach_task() or update_cpumask() and get
+	 * the wrong tsk->cpus_allowed. However, both cases imply the
+	 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
+	 * which takes task_rq_lock().
+	 *
+	 * If we are called after it dropped the lock we must see all
+	 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
+	 * set any mask even if it is not right from task_cs() pov,
+	 * the pending set_cpus_allowed_ptr() will fix things.
+	 */
+
+	cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
+	if (cpu >= nr_cpu_ids) {
+		/*
+		 * Either tsk->cpus_allowed is wrong (see above) or it
+		 * is actually empty. The latter case is only possible
+		 * if we are racing with remove_tasks_in_empty_cpuset().
+		 * Like above we can temporary set any mask and rely on
+		 * set_cpus_allowed_ptr() as synchronization point.
+		 */
+		cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+		cpu = cpumask_any(cpu_active_mask);
+	}
+
+	return cpu;
 }
 
 void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2416,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 }
 
 /**
- * cpuset_lock - lock out any changes to cpuset structures
- *
- * The out of memory (oom) code needs to mutex_lock cpusets
- * from being changed while it scans the tasklist looking for a
- * task in an overlapping cpuset. Expose callback_mutex via this
- * cpuset_lock() routine, so the oom code can lock it, before
- * locking the task list. The tasklist_lock is a spinlock, so
- * must be taken inside callback_mutex.
- */
-
-void cpuset_lock(void)
-{
-	mutex_lock(&callback_mutex);
-}
-
-/**
  * cpuset_unlock - release lock on cpuset changes
  *
  * Undo the lock taken in a previous cpuset_lock() call.
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
-	sched_move_task(p);
-#endif	/* CONFIG_USER_SCHED */
-}
-
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c235..2c24870c55d1 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
 #include <linux/init_task.h>
 #include <linux/security.h>
 #include <linux/cn_proc.h>
-#include "cred-internals.h"
 
 #if 0
 #define kdebug(FMT, ...) \
@@ -523,8 +522,6 @@ int commit_creds(struct cred *new)
 #endif
 	BUG_ON(atomic_read(&new->usage) < 1);
 
-	security_commit_creds(new, old);
-
 	get_cred(new); /* we will require a ref for the subj creds too */
 
 	/* dumpability changes */
@@ -560,8 +557,6 @@ int commit_creds(struct cred *new)
 	atomic_dec(&old->user->processes);
 	alter_cred_subscribers(old, -2);
 
-	sched_switch_user(task);
-
 	/* send notifications */
 	if (new->uid != old->uid ||
 	    new->euid != old->euid ||
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..5cb7cd1de10c
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,983 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
128/* to keep track of the CPU which is doing the single stepping*/
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
155 * can be overriden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
179 /* Validate setting the breakpoint and then removing it. In the
180 * remove fails, the kernel needs to emit a bad message because we
181 * are deep trouble not being able to put things back the way we
182 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument is only to be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
426 * If the break point removed ok at the place exception
427 * occurred, try to recover and print a warning to the end
428 * user because the user planted a breakpoint in a place that
429 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463 * holds the kgdb_active token. This must be done so that the
464 * that all the cpus wait in for the debug core will not enter
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
536 * that was single stepping. To gaurd against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 } else {
609 error = gdb_serial_stub(ks);
610 }
611
612 if (error == DBG_PASS_EVENT) {
613 dbg_kdb_mode = !dbg_kdb_mode;
614 kgdb_connected = 0;
615 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu);
617 goto cpu_loop;
618 } else {
619 kgdb_info[cpu].ret_state = error;
620 break;
621 }
622 }
623
624 /* Call the I/O driver's post_exception routine */
625 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception();
627
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--)
632 atomic_dec(&passive_cpu_wait[i]);
633 /*
634 * Wait till all the CPUs have quit from the debugger,
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 }
647
648kgdb_restore:
649 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
650 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
651 if (kgdb_info[sstep_cpu].task)
652 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
653 else
654 kgdb_sstep_pid = 0;
655 }
656 if (trace_on)
657 tracing_on();
658 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync();
661 clocksource_touch_watchdog();
662 local_irq_restore(flags);
663
664 return kgdb_info[cpu].ret_state;
665}
666
667/*
668 * kgdb_handle_exception() - main entry point from a kernel exception
669 *
670 * Locking hierarchy:
671 * interface locks, if any (begin_session)
672 * kgdb lock (kgdb_active)
673 */
674int
675kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{
677 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680
681 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector;
683 ks->signo = signo;
684 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs;
687
688 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
691 ret = kgdb_cpu_enter(ks, regs);
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
693 DCPU_IS_SLAVE);
694 return ret;
695}
696
697int kgdb_nmicallback(int cpu, void *regs)
698{
699#ifdef CONFIG_SMP
700 struct kgdb_state kgdb_var;
701 struct kgdb_state *ks = &kgdb_var;
702
703 memset(ks, 0, sizeof(struct kgdb_state));
704 ks->cpu = cpu;
705 ks->linux_regs = regs;
706
707 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
708 atomic_read(&kgdb_active) != -1 &&
709 atomic_read(&kgdb_active) != cpu) {
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0;
714 }
715#endif
716 return 1;
717}
718
719static void kgdb_console_write(struct console *co, const char *s,
720 unsigned count)
721{
722 unsigned long flags;
723
724 /* If we're debugging, or KGDB has not connected, don't try
725 * and print. */
726 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
727 return;
728
729 local_irq_save(flags);
730 gdbstub_msg_write(s, count);
731 local_irq_restore(flags);
732}
733
734static struct console kgdbcons = {
735 .name = "kgdb",
736 .write = kgdb_console_write,
737 .flags = CON_PRINTBUFFER | CON_ENABLED,
738 .index = -1,
739};
740
741#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty)
743{
744 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
746 return;
747 }
748 if (!kgdb_connected) {
749#ifdef CONFIG_KGDB_KDB
750 if (!dbg_kdb_mode)
751 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
752#else
753 printk(KERN_CRIT "Entering KGDB\n");
754#endif
755 }
756
757 kgdb_breakpoint();
758}
759
760static struct sysrq_key_op sysrq_dbg_op = {
761 .handler = sysrq_handle_dbg,
762 .help_msg = "debug(G)",
763 .action_msg = "DEBUG",
764};
765#endif
766
767static int kgdb_panic_event(struct notifier_block *self,
768 unsigned long val,
769 void *data)
770{
771 if (dbg_kdb_mode)
772 kdb_printf("PANIC: %s\n", (char *)data);
773 kgdb_breakpoint();
774 return NOTIFY_DONE;
775}
776
777static struct notifier_block kgdb_panic_event_nb = {
778 .notifier_call = kgdb_panic_event,
779 .priority = INT_MAX,
780};
781
782void __weak kgdb_arch_late(void)
783{
784}
785
786void __init dbg_late_init(void)
787{
788 dbg_is_early = false;
789 if (kgdb_io_module_registered)
790 kgdb_arch_late();
791 kdb_init(KDB_INIT_FULL);
792}
793
794static void kgdb_register_callbacks(void)
795{
796 if (!kgdb_io_module_registered) {
797 kgdb_io_module_registered = 1;
798 kgdb_arch_init();
799 if (!dbg_is_early)
800 kgdb_arch_late();
801 atomic_notifier_chain_register(&panic_notifier_list,
802 &kgdb_panic_event_nb);
803#ifdef CONFIG_MAGIC_SYSRQ
804 register_sysrq_key('g', &sysrq_dbg_op);
805#endif
806 if (kgdb_use_con && !kgdb_con_registered) {
807 register_console(&kgdbcons);
808 kgdb_con_registered = 1;
809 }
810 }
811}
812
813static void kgdb_unregister_callbacks(void)
814{
815 /*
816 * When this routine is called KGDB should unregister from the
817 * panic handler and clean up, making sure it is not handling any
818 * break exceptions at the time.
819 */
820 if (kgdb_io_module_registered) {
821 kgdb_io_module_registered = 0;
822 atomic_notifier_chain_unregister(&panic_notifier_list,
823 &kgdb_panic_event_nb);
824 kgdb_arch_exit();
825#ifdef CONFIG_MAGIC_SYSRQ
826 unregister_sysrq_key('g', &sysrq_dbg_op);
827#endif
828 if (kgdb_con_registered) {
829 unregister_console(&kgdbcons);
830 kgdb_con_registered = 0;
831 }
832 }
833}
834
835/*
836 * There are times a tasklet needs to be used vs a compiled in
837 * break point so as to cause an exception outside a kgdb I/O module,
838 * such as is the case with kgdboe, where calling a breakpoint in the
839 * I/O driver itself would be fatal.
840 */
841static void kgdb_tasklet_bpt(unsigned long ing)
842{
843 kgdb_breakpoint();
844 atomic_set(&kgdb_break_tasklet_var, 0);
845}
846
847static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
848
849void kgdb_schedule_breakpoint(void)
850{
851 if (atomic_read(&kgdb_break_tasklet_var) ||
852 atomic_read(&kgdb_active) != -1 ||
853 atomic_read(&kgdb_setting_breakpoint))
854 return;
855 atomic_inc(&kgdb_break_tasklet_var);
856 tasklet_schedule(&kgdb_tasklet_breakpoint);
857}
858EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
859
860static void kgdb_initial_breakpoint(void)
861{
862 kgdb_break_asap = 0;
863
864 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
865 kgdb_breakpoint();
866}
867
868/**
869 * kgdb_register_io_module - register KGDB IO module
870 * @new_dbg_io_ops: the io ops vector
871 *
872 * Register it with the KGDB core.
873 */
874int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
875{
876 int err;
877
878 spin_lock(&kgdb_registration_lock);
879
880 if (dbg_io_ops) {
881 spin_unlock(&kgdb_registration_lock);
882
883 printk(KERN_ERR "kgdb: Another I/O driver is already "
884 "registered with KGDB.\n");
885 return -EBUSY;
886 }
887
888 if (new_dbg_io_ops->init) {
889 err = new_dbg_io_ops->init();
890 if (err) {
891 spin_unlock(&kgdb_registration_lock);
892 return err;
893 }
894 }
895
896 dbg_io_ops = new_dbg_io_ops;
897
898 spin_unlock(&kgdb_registration_lock);
899
900 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
901 new_dbg_io_ops->name);
902
903 /* Arm KGDB now. */
904 kgdb_register_callbacks();
905
906 if (kgdb_break_asap)
907 kgdb_initial_breakpoint();
908
909 return 0;
910}
911EXPORT_SYMBOL_GPL(kgdb_register_io_module);
912
913/**
914 * kkgdb_unregister_io_module - unregister KGDB IO module
915 * @old_dbg_io_ops: the io ops vector
916 *
917 * Unregister it with the KGDB core.
918 */
919void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
920{
921 BUG_ON(kgdb_connected);
922
923 /*
924 * KGDB is no longer able to communicate out, so
925 * unregister our callbacks and reset state.
926 */
927 kgdb_unregister_callbacks();
928
929 spin_lock(&kgdb_registration_lock);
930
931 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
932 dbg_io_ops = NULL;
933
934 spin_unlock(&kgdb_registration_lock);
935
936 printk(KERN_INFO
937 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
938 old_dbg_io_ops->name);
939}
940EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
941
942int dbg_io_get_char(void)
943{
944 int ret = dbg_io_ops->read_char();
945 if (ret == NO_POLL_CHAR)
946 return -1;
947 if (!dbg_kdb_mode)
948 return ret;
949 if (ret == 127)
950 return 8;
951 return ret;
952}
953
954/**
955 * kgdb_breakpoint - generate breakpoint exception
956 *
957 * This function will generate a breakpoint exception. It is used at the
958 * beginning of a program to sync up with a debugger and can be used
959 * otherwise as a quick means to stop program execution and "break" into
960 * the debugger.
961 */
962void kgdb_breakpoint(void)
963{
964 atomic_inc(&kgdb_setting_breakpoint);
965 wmb(); /* Sync point before breakpoint */
966 arch_kgdb_breakpoint();
967 wmb(); /* Sync point after breakpoint */
968 atomic_dec(&kgdb_setting_breakpoint);
969}
970EXPORT_SYMBOL_GPL(kgdb_breakpoint);
971
972static int __init opt_kgdb_wait(char *str)
973{
974 kgdb_break_asap = 1;
975
976 kdb_init(KDB_INIT_EARLY);
977 if (kgdb_io_module_registered)
978 kgdb_initial_breakpoint();
979
980 return 0;
981}
982
983early_param("kgdbwait", opt_kgdb_wait);
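
As a usage note, kgdbwait only triggers the initial breakpoint once an I/O module has registered, so it is normally combined with an I/O driver parameter on the kernel command line, for example:

    kgdboc=ttyS0,115200 kgdbwait
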
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..4b17b3269525
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1017 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void)
68{
69 int ret = -1;
70 int i;
71
72 /* poll any additional I/O interfaces that are defined */
73 while (ret < 0)
74 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
75 ret = kdb_poll_funcs[i]();
76 if (ret > 0)
77 break;
78 }
79 return ret;
80}
81#else
82static int gdbstub_read_wait(void)
83{
84 int ret = dbg_io_ops->read_char();
85 while (ret == NO_POLL_CHAR)
86 ret = dbg_io_ops->read_char();
87 return ret;
88}
89#endif
90/* scan for the sequence $<data>#<checksum> */
91static void get_packet(char *buffer)
92{
93 unsigned char checksum;
94 unsigned char xmitcsum;
95 int count;
96 char ch;
97
98 do {
99 /*
100 * Spin and wait around for the start character, ignore all
101 * other characters:
102 */
103 while ((ch = (gdbstub_read_wait())) != '$')
104 /* nothing */;
105
106 kgdb_connected = 1;
107 checksum = 0;
108 xmitcsum = -1;
109
110 count = 0;
111
112 /*
113 * now, read until a # or end of buffer is found:
114 */
115 while (count < (BUFMAX - 1)) {
116 ch = gdbstub_read_wait();
117 if (ch == '#')
118 break;
119 checksum = checksum + ch;
120 buffer[count] = ch;
121 count = count + 1;
122 }
123 buffer[count] = 0;
124
125 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait());
128
129 if (checksum != xmitcsum)
130 /* failed checksum */
131 dbg_io_ops->write_char('-');
132 else
133 /* successful transfer */
134 dbg_io_ops->write_char('+');
135 if (dbg_io_ops->flush)
136 dbg_io_ops->flush();
137 }
138 } while (checksum != xmitcsum);
139}
140
141/*
142 * Send the packet in buffer.
143 * Check for gdb connection if asked for.
144 */
145static void put_packet(char *buffer)
146{
147 unsigned char checksum;
148 int count;
149 char ch;
150
151 /*
152 * $<packet info>#<checksum>.
153 */
154 while (1) {
155 dbg_io_ops->write_char('$');
156 checksum = 0;
157 count = 0;
158
159 while ((ch = buffer[count])) {
160 dbg_io_ops->write_char(ch);
161 checksum += ch;
162 count++;
163 }
164
165 dbg_io_ops->write_char('#');
166 dbg_io_ops->write_char(hex_asc_hi(checksum));
167 dbg_io_ops->write_char(hex_asc_lo(checksum));
168 if (dbg_io_ops->flush)
169 dbg_io_ops->flush();
170
171 /* Now see what we get in reply. */
172 ch = gdbstub_read_wait();
173
174 if (ch == 3)
175 ch = gdbstub_read_wait();
176
177 /* If we get an ACK, we are done. */
178 if (ch == '+')
179 return;
180
181 /*
182 * If we get the start of another packet, this means
183 * that GDB is attempting to reconnect. We will NAK
184 * the packet being sent, and stop trying to send this
185 * packet.
186 */
187 if (ch == '$') {
188 dbg_io_ops->write_char('-');
189 if (dbg_io_ops->flush)
190 dbg_io_ops->flush();
191 return;
192 }
193 }
194}
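
get_packet() and put_packet() implement the gdb remote serial framing $<payload>#<checksum>, where the checksum is the modulo-256 sum of the payload bytes emitted as two hex digits. A small self-contained sketch of the same rule (illustration only, not used by the stub):

/* Frame the payload "OK" the way put_packet() would. */
static void example_frame_ok(char *out)          /* out must hold >= 7 bytes */
{
        const char *payload = "OK";
        unsigned char csum = 0;
        int i;

        for (i = 0; payload[i]; i++)
                csum += payload[i];
        sprintf(out, "$%s#%02x", payload, csum); /* produces "$OK#9a" */
}
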
195
196static char gdbmsgbuf[BUFMAX + 1];
197
198void gdbstub_msg_write(const char *s, int len)
199{
200 char *bufptr;
201 int wcount;
202 int i;
203
204 if (len == 0)
205 len = strlen(s);
206
207 /* 'O'utput */
208 gdbmsgbuf[0] = 'O';
209
210 /* Fill and send buffers... */
211 while (len > 0) {
212 bufptr = gdbmsgbuf + 1;
213
214 /* Calculate how many this time */
215 if ((len << 1) > (BUFMAX - 2))
216 wcount = (BUFMAX - 2) >> 1;
217 else
218 wcount = len;
219
220 /* Pack in hex chars */
221 for (i = 0; i < wcount; i++)
222 bufptr = pack_hex_byte(bufptr, s[i]);
223 *bufptr = '\0';
224
225 /* Move up */
226 s += wcount;
227 len -= wcount;
228
229 /* Write packet */
230 put_packet(gdbmsgbuf);
231 }
232}
233
234/*
235 * Convert the memory pointed to by mem into hex, placing result in
236 * buf. Returns 0 on success, or a negative error code if the
237 * memory could not be read.
238 */
239int kgdb_mem2hex(char *mem, char *buf, int count)
240{
241 char *tmp;
242 int err;
243
244 /*
245 * We use the upper half of buf as an intermediate buffer for the
246 * raw memory copy. Hex conversion will work against this one.
247 */
248 tmp = buf + count;
249
250 err = probe_kernel_read(tmp, mem, count);
251 if (!err) {
252 while (count > 0) {
253 buf = pack_hex_byte(buf, *tmp);
254 tmp++;
255 count--;
256 }
257
258 *buf = 0;
259 }
260
261 return err;
262}
263
264/*
265 * Convert the hex array pointed to by buf into binary to be placed in
266 * mem. Returns 0 on success, or a negative error code if the
267 * memory could not be written.
268 */
269int kgdb_hex2mem(char *buf, char *mem, int count)
270{
271 char *tmp_raw;
272 char *tmp_hex;
273
274 /*
275 * We use the upper half of buf as an intermediate buffer for the
276 * raw memory that is converted from hex.
277 */
278 tmp_raw = buf + count * 2;
279
280 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) {
282 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4;
285 }
286
287 return probe_kernel_write(mem, tmp_raw, count);
288}
289
290/*
291 * While we find nice hex chars, build a long_val.
292 * Return number of chars processed.
293 */
294int kgdb_hex2long(char **ptr, unsigned long *long_val)
295{
296 int hex_val;
297 int num = 0;
298 int negate = 0;
299
300 *long_val = 0;
301
302 if (**ptr == '-') {
303 negate = 1;
304 (*ptr)++;
305 }
306 while (**ptr) {
307 hex_val = hex(**ptr);
308 if (hex_val < 0)
309 break;
310
311 *long_val = (*long_val << 4) | hex_val;
312 num++;
313 (*ptr)++;
314 }
315
316 if (negate)
317 *long_val = -*long_val;
318
319 return num;
320}
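
Taken together, these helpers do the hex packing and parsing for every memory and register packet: kgdb_mem2hex() expands count raw bytes into 2*count hex digits (using the upper half of the output buffer as scratch), kgdb_hex2mem() reverses that, and kgdb_hex2long() consumes the leading hex digits of an optionally negative number. A rough, hypothetical illustration of the expected behaviour:

static void example_hex_helpers(void)
{
        unsigned char raw[4] = { 0xde, 0xad, 0xbe, 0xef };
        char hexbuf[4 * 2 + 1];         /* hex output + NUL; scratch fits inside */
        char arg[] = "1f,40";           /* address,length as seen in an 'm' packet */
        char *p = arg;
        unsigned long val;

        kgdb_mem2hex((char *)raw, hexbuf, 4);   /* hexbuf becomes "deadbeef" */
        kgdb_hex2long(&p, &val);                /* val == 0x1f, p now points at ',' */
}
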
321
322/*
323 * Copy the binary array pointed to by buf into mem. Fix $, #, and
324 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
325 * The input buf is overwritten with the result to write to mem.
326 */
327static int kgdb_ebin2mem(char *buf, char *mem, int count)
328{
329 int size = 0;
330 char *c = buf;
331
332 while (count-- > 0) {
333 c[size] = *buf++;
334 if (c[size] == 0x7d)
335 c[size] = *buf++ ^ 0x20;
336 size++;
337 }
338
339 return probe_kernel_write(mem, c, size);
340}
341
342/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary)
344{
345 char *ptr = &remcom_in_buffer[1];
346 unsigned long addr;
347 unsigned long length;
348 int err;
349
350 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
351 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
352 if (binary)
353 err = kgdb_ebin2mem(ptr, (char *)addr, length);
354 else
355 err = kgdb_hex2mem(ptr, (char *)addr, length);
356 if (err)
357 return err;
358 if (CACHE_FLUSH_IS_SAFE)
359 flush_icache_range(addr, addr + length);
360 return 0;
361 }
362
363 return -EINVAL;
364}
365
366static void error_packet(char *pkt, int error)
367{
368 error = -error;
369 pkt[0] = 'E';
370 pkt[1] = hex_asc[(error / 10)];
371 pkt[2] = hex_asc[(error % 10)];
372 pkt[3] = '\0';
373}
374
375/*
376 * Thread ID accessors. We represent a flat TID space to GDB, where
377 * the per CPU idle threads (which under Linux all have PID 0) are
378 * remapped to negative TIDs.
379 */
380
381#define BUF_THREAD_ID_SIZE 16
382
383static char *pack_threadid(char *pkt, unsigned char *id)
384{
385 char *limit;
386
387 limit = pkt + BUF_THREAD_ID_SIZE;
388 while (pkt < limit)
389 pkt = pack_hex_byte(pkt, *id++);
390
391 return pkt;
392}
393
394static void int_to_threadref(unsigned char *id, int value)
395{
396 unsigned char *scan;
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403}
404
405static struct task_struct *getthread(struct pt_regs *regs, int tid)
406{
407 /*
408 * Non-positive TIDs are remapped to the cpu shadow information
409 */
410 if (tid == 0 || tid == -1)
411 tid = -atomic_read(&kgdb_active) - 2;
412 if (tid < -1 && tid > -NR_CPUS - 2) {
413 if (kgdb_info[-tid - 2].task)
414 return kgdb_info[-tid - 2].task;
415 else
416 return idle_task(-tid - 2);
417 }
418 if (tid <= 0) {
419 printk(KERN_ERR "KGDB: Internal thread select error\n");
420 dump_stack();
421 return NULL;
422 }
423
424 /*
425 * find_task_by_pid_ns() does not take the tasklist lock anymore
426 * but is nicely RCU locked - hence is a pretty resilient
427 * thing to use:
428 */
429 return find_task_by_pid_ns(tid, &init_pid_ns);
430}
431
432
433/*
434 * Remap normal tasks to their real PID,
435 * CPU shadow threads are mapped to -CPU - 2
436 */
437static inline int shadow_pid(int realpid)
438{
439 if (realpid)
440 return realpid;
441
442 return -raw_smp_processor_id() - 2;
443}
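
The net effect of shadow_pid() and getthread() is a flat TID space in which every real task keeps its PID and the per-cpu idle tasks (PID 0) appear as negative TIDs; the values below are illustrative:

/*   task with PID 4711   -> gdb TID 4711
 *   idle task on cpu 0   -> gdb TID -2
 *   idle task on cpu 3   -> gdb TID -5
 * TID 0 or -1 from gdb means "current thread" and is resolved to the
 * shadow TID of the active master debug cpu.
 */
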
444
445/*
446 * All the functions that start with gdb_cmd are the various
447 * operations to implement the handlers for the gdbserial protocol
448 * where KGDB is communicating with an external debugger
449 */
450
451/* Handle the '?' status packets */
452static void gdb_cmd_status(struct kgdb_state *ks)
453{
454 /*
455 * We know that this packet is only sent
456 * during initial connect. So to be safe,
457 * we clear out our breakpoints now in case
458 * GDB is reconnecting.
459 */
460 dbg_remove_all_break();
461
462 remcom_out_buffer[0] = 'S';
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464}
465
466/* Handle the 'g' get registers request */
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{
469 struct task_struct *thread;
470 void *local_debuggerinfo;
471 int i;
472
473 thread = kgdb_usethread;
474 if (!thread) {
475 thread = kgdb_info[ks->cpu].task;
476 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
477 } else {
478 local_debuggerinfo = NULL;
479 for_each_online_cpu(i) {
480 /*
481 * Try to find the task on some other
482 * cpu, or possibly this one. If we do
483 * not find the matching task, then we
484 * try to approximate the results.
485 */
486 if (thread == kgdb_info[i].task)
487 local_debuggerinfo = kgdb_info[i].debuggerinfo;
488 }
489 }
490
491 /*
492 * All threads that don't have debuggerinfo should be
493 * in schedule() sleeping, since all other CPUs
494 * are in kgdb_wait, and thus have debuggerinfo.
495 */
496 if (local_debuggerinfo) {
497 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
498 } else {
499 /*
500 * Pull stuff saved during switch_to; nothing
501 * else is accessible (or even particularly
502 * relevant).
503 *
504 * This should be enough for a stack trace.
505 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 }
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509}
510
511/* Handle the 'G' set registers request */
512static void gdb_cmd_setregs(struct kgdb_state *ks)
513{
514 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
515
516 if (kgdb_usethread && kgdb_usethread != current) {
517 error_packet(remcom_out_buffer, -EINVAL);
518 } else {
519 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
520 strcpy(remcom_out_buffer, "OK");
521 }
522}
523
524/* Handle the 'm' memory read bytes */
525static void gdb_cmd_memread(struct kgdb_state *ks)
526{
527 char *ptr = &remcom_in_buffer[1];
528 unsigned long length;
529 unsigned long addr;
530 int err;
531
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err)
536 error_packet(remcom_out_buffer, err);
537 } else {
538 error_packet(remcom_out_buffer, -EINVAL);
539 }
540}
541
542/* Handle the 'M' memory write bytes */
543static void gdb_cmd_memwrite(struct kgdb_state *ks)
544{
545 int err = write_mem_msg(0);
546
547 if (err)
548 error_packet(remcom_out_buffer, err);
549 else
550 strcpy(remcom_out_buffer, "OK");
551}
552
553/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{
556 int err = write_mem_msg(1);
557
558 if (err)
559 error_packet(remcom_out_buffer, err);
560 else
561 strcpy(remcom_out_buffer, "OK");
562}
563
564/* Handle the 'D' or 'k', detach or kill packets */
565static void gdb_cmd_detachkill(struct kgdb_state *ks)
566{
567 int error;
568
569 /* The detach case */
570 if (remcom_in_buffer[0] == 'D') {
571 error = dbg_remove_all_break();
572 if (error < 0) {
573 error_packet(remcom_out_buffer, error);
574 } else {
575 strcpy(remcom_out_buffer, "OK");
576 kgdb_connected = 0;
577 }
578 put_packet(remcom_out_buffer);
579 } else {
580 /*
581 * Assume the kill case, with no exit code checking,
582 * trying to force detach the debugger:
583 */
584 dbg_remove_all_break();
585 kgdb_connected = 0;
586 }
587}
588
589/* Handle the 'R' reboot packets */
590static int gdb_cmd_reboot(struct kgdb_state *ks)
591{
592 /* For now, only honor R0 */
593 if (strcmp(remcom_in_buffer, "R0") == 0) {
594 printk(KERN_CRIT "Executing emergency reboot\n");
595 strcpy(remcom_out_buffer, "OK");
596 put_packet(remcom_out_buffer);
597
598 /*
599 * Execution should not return from
600 * machine_emergency_restart()
601 */
602 machine_emergency_restart();
603 kgdb_connected = 0;
604
605 return 1;
606 }
607 return 0;
608}
609
610/* Handle the 'q' query packets */
611static void gdb_cmd_query(struct kgdb_state *ks)
612{
613 struct task_struct *g;
614 struct task_struct *p;
615 unsigned char thref[8];
616 char *ptr;
617 int i;
618 int cpu;
619 int finished = 0;
620
621 switch (remcom_in_buffer[1]) {
622 case 's':
623 case 'f':
624 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
625 error_packet(remcom_out_buffer, -EINVAL);
626 break;
627 }
628
629 i = 0;
630 remcom_out_buffer[0] = 'm';
631 ptr = remcom_out_buffer + 1;
632 if (remcom_in_buffer[1] == 'f') {
633 /* Each cpu is a shadow thread */
634 for_each_online_cpu(cpu) {
635 ks->thr_query = 0;
636 int_to_threadref(thref, -cpu - 2);
637 pack_threadid(ptr, thref);
638 ptr += BUF_THREAD_ID_SIZE;
639 *(ptr++) = ',';
640 i++;
641 }
642 }
643
644 do_each_thread(g, p) {
645 if (i >= ks->thr_query && !finished) {
646 int_to_threadref(thref, p->pid);
647 pack_threadid(ptr, thref);
648 ptr += BUF_THREAD_ID_SIZE;
649 *(ptr++) = ',';
650 ks->thr_query++;
651 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
652 finished = 1;
653 }
654 i++;
655 } while_each_thread(g, p);
656
657 *(--ptr) = '\0';
658 break;
659
660 case 'C':
661 /* Current thread id */
662 strcpy(remcom_out_buffer, "QC");
663 ks->threadid = shadow_pid(current->pid);
664 int_to_threadref(thref, ks->threadid);
665 pack_threadid(remcom_out_buffer + 2, thref);
666 break;
667 case 'T':
668 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
669 error_packet(remcom_out_buffer, -EINVAL);
670 break;
671 }
672 ks->threadid = 0;
673 ptr = remcom_in_buffer + 17;
674 kgdb_hex2long(&ptr, &ks->threadid);
675 if (!getthread(ks->linux_regs, ks->threadid)) {
676 error_packet(remcom_out_buffer, -EINVAL);
677 break;
678 }
679 if ((int)ks->threadid > 0) {
680 kgdb_mem2hex(getthread(ks->linux_regs,
681 ks->threadid)->comm,
682 remcom_out_buffer, 16);
683 } else {
684 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
685
686 sprintf(tmpstr, "shadowCPU%d",
687 (int)(-ks->threadid - 2));
688 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
689 }
690 break;
691#ifdef CONFIG_KGDB_KDB
692 case 'R':
693 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
694 int len = strlen(remcom_in_buffer + 6);
695
696 if ((len % 2) != 0) {
697 strcpy(remcom_out_buffer, "E01");
698 break;
699 }
700 kgdb_hex2mem(remcom_in_buffer + 6,
701 remcom_out_buffer, len);
702 len = len / 2;
703 remcom_out_buffer[len++] = 0;
704
705 kdb_parse(remcom_out_buffer);
706 strcpy(remcom_out_buffer, "OK");
707 }
708 break;
709#endif
710 }
711}
712
713/* Handle the 'H' task query packets */
714static void gdb_cmd_task(struct kgdb_state *ks)
715{
716 struct task_struct *thread;
717 char *ptr;
718
719 switch (remcom_in_buffer[1]) {
720 case 'g':
721 ptr = &remcom_in_buffer[2];
722 kgdb_hex2long(&ptr, &ks->threadid);
723 thread = getthread(ks->linux_regs, ks->threadid);
724 if (!thread && ks->threadid > 0) {
725 error_packet(remcom_out_buffer, -EINVAL);
726 break;
727 }
728 kgdb_usethread = thread;
729 ks->kgdb_usethreadid = ks->threadid;
730 strcpy(remcom_out_buffer, "OK");
731 break;
732 case 'c':
733 ptr = &remcom_in_buffer[2];
734 kgdb_hex2long(&ptr, &ks->threadid);
735 if (!ks->threadid) {
736 kgdb_contthread = NULL;
737 } else {
738 thread = getthread(ks->linux_regs, ks->threadid);
739 if (!thread && ks->threadid > 0) {
740 error_packet(remcom_out_buffer, -EINVAL);
741 break;
742 }
743 kgdb_contthread = thread;
744 }
745 strcpy(remcom_out_buffer, "OK");
746 break;
747 }
748}
749
750/* Handle the 'T' thread query packets */
751static void gdb_cmd_thread(struct kgdb_state *ks)
752{
753 char *ptr = &remcom_in_buffer[1];
754 struct task_struct *thread;
755
756 kgdb_hex2long(&ptr, &ks->threadid);
757 thread = getthread(ks->linux_regs, ks->threadid);
758 if (thread)
759 strcpy(remcom_out_buffer, "OK");
760 else
761 error_packet(remcom_out_buffer, -EINVAL);
762}
763
764/* Handle the 'z' or 'Z' breakpoint remove or set packets */
765static void gdb_cmd_break(struct kgdb_state *ks)
766{
767 /*
768 * Since GDB-5.3, it's been drafted that '0' is a software
769 * breakpoint, '1' is a hardware breakpoint, so let's do that.
770 */
771 char *bpt_type = &remcom_in_buffer[1];
772 char *ptr = &remcom_in_buffer[2];
773 unsigned long addr;
774 unsigned long length;
775 int error = 0;
776
777 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
778 /* Unsupported */
779 if (*bpt_type > '4')
780 return;
781 } else {
782 if (*bpt_type != '0' && *bpt_type != '1')
783 /* Unsupported. */
784 return;
785 }
786
787 /*
788 * Test if this is a hardware breakpoint, and
789 * if we support it:
790 */
791 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
792 /* Unsupported. */
793 return;
794
795 if (*(ptr++) != ',') {
796 error_packet(remcom_out_buffer, -EINVAL);
797 return;
798 }
799 if (!kgdb_hex2long(&ptr, &addr)) {
800 error_packet(remcom_out_buffer, -EINVAL);
801 return;
802 }
803 if (*(ptr++) != ',' ||
804 !kgdb_hex2long(&ptr, &length)) {
805 error_packet(remcom_out_buffer, -EINVAL);
806 return;
807 }
808
809 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
810 error = dbg_set_sw_break(addr);
811 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
812 error = dbg_remove_sw_break(addr);
813 else if (remcom_in_buffer[0] == 'Z')
814 error = arch_kgdb_ops.set_hw_breakpoint(addr,
815 (int)length, *bpt_type - '0');
816 else if (remcom_in_buffer[0] == 'z')
817 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
818 (int) length, *bpt_type - '0');
819
820 if (error == 0)
821 strcpy(remcom_out_buffer, "OK");
822 else
823 error_packet(remcom_out_buffer, error);
824}
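
For reference, the packets handled here have the form Z<type>,<addr>,<length> to set and z<type>,<addr>,<length> to clear, with type '0' for software breakpoints and '1'-'4' for the hardware breakpoint/watchpoint classes; the addresses below are invented examples:

    Z0,ffffffff8103c2a0,1      set a software breakpoint
    z0,ffffffff8103c2a0,1      remove it again
    Z2,ffff880001234560,8      set an 8-byte hardware write watchpoint
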
825
826/* Handle the 'C' signal / exception passing packets */
827static int gdb_cmd_exception_pass(struct kgdb_state *ks)
828{
829 /* C09 == pass exception
830 * C15 == detach kgdb, pass exception
831 */
832 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
833
834 ks->pass_exception = 1;
835 remcom_in_buffer[0] = 'c';
836
837 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
838
839 ks->pass_exception = 1;
840 remcom_in_buffer[0] = 'D';
841 dbg_remove_all_break();
842 kgdb_connected = 0;
843 return 1;
844
845 } else {
846 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
847 " and 15 (pass and disconnect)\n"
848 "Executing a continue without signal passing\n", 0);
849 remcom_in_buffer[0] = 'c';
850 }
851
852 /* Indicate fall through */
853 return -1;
854}
855
856/*
857 * This function performs all gdbserial command processing
858 */
859int gdb_serial_stub(struct kgdb_state *ks)
860{
861 int error = 0;
862 int tmp;
863
864 /* Clear the out buffer. */
865 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
866
867 if (kgdb_connected) {
868 unsigned char thref[8];
869 char *ptr;
870
871 /* Reply to host that an exception has occurred */
872 ptr = remcom_out_buffer;
873 *ptr++ = 'T';
874 ptr = pack_hex_byte(ptr, ks->signo);
875 ptr += strlen(strcpy(ptr, "thread:"));
876 int_to_threadref(thref, shadow_pid(current->pid));
877 ptr = pack_threadid(ptr, thref);
878 *ptr++ = ';';
879 put_packet(remcom_out_buffer);
880 }
881
882 kgdb_usethread = kgdb_info[ks->cpu].task;
883 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
884 ks->pass_exception = 0;
885
886 while (1) {
887 error = 0;
888
889 /* Clear the out buffer. */
890 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
891
892 get_packet(remcom_in_buffer);
893
894 switch (remcom_in_buffer[0]) {
895 case '?': /* gdbserial status */
896 gdb_cmd_status(ks);
897 break;
898 case 'g': /* return the value of the CPU registers */
899 gdb_cmd_getregs(ks);
900 break;
901 case 'G': /* set the value of the CPU registers - return OK */
902 gdb_cmd_setregs(ks);
903 break;
904 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
905 gdb_cmd_memread(ks);
906 break;
907 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_memwrite(ks);
909 break;
910 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
911 gdb_cmd_binwrite(ks);
912 break;
913 /* kill or detach. KGDB should treat this like a
914 * continue.
915 */
916 case 'D': /* Debugger detach */
917 case 'k': /* Debugger detach via kill */
918 gdb_cmd_detachkill(ks);
919 goto default_handle;
920 case 'R': /* Reboot */
921 if (gdb_cmd_reboot(ks))
922 goto default_handle;
923 break;
924 case 'q': /* query command */
925 gdb_cmd_query(ks);
926 break;
927 case 'H': /* task related */
928 gdb_cmd_task(ks);
929 break;
930 case 'T': /* Query thread status */
931 gdb_cmd_thread(ks);
932 break;
933 case 'z': /* Break point remove */
934 case 'Z': /* Break point set */
935 gdb_cmd_break(ks);
936 break;
937#ifdef CONFIG_KGDB_KDB
938 case '3': /* Escape back into kdb */
939 if (remcom_in_buffer[1] == '\0') {
940 gdb_cmd_detachkill(ks);
941 return DBG_PASS_EVENT;
942 }
943#endif
944 case 'C': /* Exception passing */
945 tmp = gdb_cmd_exception_pass(ks);
946 if (tmp > 0)
947 goto default_handle;
948 if (tmp == 0)
949 break;
950 /* Fall through on tmp < 0 */
951 case 'c': /* Continue packet */
952 case 's': /* Single step packet */
953 if (kgdb_contthread && kgdb_contthread != current) {
954 /* Can't switch threads in kgdb */
955 error_packet(remcom_out_buffer, -EINVAL);
956 break;
957 }
958 dbg_activate_sw_breakpoints();
959 /* Fall through to default processing */
960 default:
961default_handle:
962 error = kgdb_arch_handle_exception(ks->ex_vector,
963 ks->signo,
964 ks->err_code,
965 remcom_in_buffer,
966 remcom_out_buffer,
967 ks->linux_regs);
968 /*
969 * Leave cmd processing on error, detach,
970 * kill, continue, or single step.
971 */
972 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
973 remcom_in_buffer[0] == 'k') {
974 error = 0;
975 goto kgdb_exit;
976 }
977
978 }
979
980 /* reply to the request */
981 put_packet(remcom_out_buffer);
982 }
983
984kgdb_exit:
985 if (ks->pass_exception)
986 error = 1;
987 return error;
988}
989
990int gdbstub_state(struct kgdb_state *ks, char *cmd)
991{
992 int error;
993
994 switch (cmd[0]) {
995 case 'e':
996 error = kgdb_arch_handle_exception(ks->ex_vector,
997 ks->signo,
998 ks->err_code,
999 remcom_in_buffer,
1000 remcom_out_buffer,
1001 ks->linux_regs);
1002 return error;
1003 case 's':
1004 case 'c':
1005 strcpy(remcom_in_buffer, cmd);
1006 return 0;
1007 case '?':
1008 gdb_cmd_status(ks);
1009 break;
1010 case '\0':
1011 strcpy(remcom_out_buffer, "");
1012 break;
1013 }
1014 dbg_io_ops->write_char('+');
1015 put_packet(remcom_out_buffer);
1016 return 0;
1017}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
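
The awk rule above turns every non-comment line of kdb_cmds into an initializer string and collects the strings into a NULL-terminated kdb_cmds[] array. A rough sketch of what the generated gen-kdb_cmds.c might look like for a two-line input (contents assumed, not taken from this patch):

#include <linux/stddef.h>
#include <linux/init.h>
static __initdata char kdb_cmd0[] = "defcmd dumpcommon \"\" \"Common kdb debugging\"\n";
static __initdata char kdb_cmd1[] = "  set BTAPROMPT 0\n";
extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {
  kdb_cmd0,
  kdb_cmd1,
  NULL
};
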
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
165 * kernel debugger. This allows the kdb_breakpoints to be set
166 * upon functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
223 * None.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
268 * bp Set breakpoint on all cpus. Only use hardware assist if needed.
269 * bph Set breakpoint on all cpus. Force hardware register
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 int free;
278 char *symname = NULL;
279 long offset = 0ul;
280 int nextarg;
281 kdb_bp_t template = {0};
282
283 if (argc == 0) {
284 /*
285 * Display breakpoint table
286 */
287 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
288 bpno++, bp++) {
289 if (bp->bp_free)
290 continue;
291 kdb_printbp(bp, bpno);
292 }
293
294 return 0;
295 }
296
297 nextarg = 1;
298 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
299 &offset, &symname);
300 if (diag)
301 return diag;
302 if (!template.bp_addr)
303 return KDB_BADINT;
304
305 /*
306 * Find an empty bp structure to allocate
307 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free)
311 break;
312 }
313
314 if (bpno == KDB_MAXBPT)
315 return KDB_TOOMANYBPT;
316
317 if (strcmp(argv[0], "bph") == 0) {
318 template.bp_type = BP_HARDWARE_BREAKPOINT;
319 diag = kdb_parsebp(argc, argv, &nextarg, &template);
320 if (diag)
321 return diag;
322 } else {
323 template.bp_type = BP_BREAKPOINT;
324 }
325
326 /*
327 * Check for clashing breakpoints.
328 *
329 * Note, in this design we can't have hardware breakpoints
330 * enabled for both read and write on the same address.
331 */
332 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
333 i++, bp_check++) {
334 if (!bp_check->bp_free &&
335 bp_check->bp_addr == template.bp_addr) {
336 kdb_printf("You already have a breakpoint at "
337 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
338 return KDB_DUPBPT;
339 }
340 }
341
342 template.bp_enabled = 1;
343
344 /*
345 * Actually allocate the breakpoint found earlier
346 */
347 *bp = template;
348 bp->bp_free = 0;
349
350 kdb_printbp(bp, bpno);
351
352 return 0;
353}
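
A few hypothetical invocations of the syntax documented above, as typed at the kdb prompt (the symbol names are examples only):

    kdb> bp schedule               software breakpoint on schedule()
    kdb> bph sys_write dataw 4     4-byte hardware write watchpoint
    kdb> bp                        display the breakpoint table
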
354
355/*
356 * kdb_bc
357 *
358 * Handles the 'bc', 'be', and 'bd' commands
359 *
360 * [bd|bc|be] <breakpoint-number>
361 * [bd|bc|be] *
362 *
363 * Parameters:
364 * argc Count of arguments in argv
365 * argv Space delimited command line arguments
366 * Outputs:
367 * None.
368 * Returns:
369 * Zero for success, a kdb diagnostic for failure
370 * Locking:
371 * None.
372 * Remarks:
373 */
374static int kdb_bc(int argc, const char **argv)
375{
376 unsigned long addr;
377 kdb_bp_t *bp = NULL;
378 int lowbp = KDB_MAXBPT;
379 int highbp = 0;
380 int done = 0;
381 int i;
382 int diag = 0;
383
384 int cmd; /* KDBCMD_B? */
385#define KDBCMD_BC 0
386#define KDBCMD_BE 1
387#define KDBCMD_BD 2
388
389 if (strcmp(argv[0], "be") == 0)
390 cmd = KDBCMD_BE;
391 else if (strcmp(argv[0], "bd") == 0)
392 cmd = KDBCMD_BD;
393 else
394 cmd = KDBCMD_BC;
395
396 if (argc != 1)
397 return KDB_ARGCOUNT;
398
399 if (strcmp(argv[1], "*") == 0) {
400 lowbp = 0;
401 highbp = KDB_MAXBPT;
402 } else {
403 diag = kdbgetularg(argv[1], &addr);
404 if (diag)
405 return diag;
406
407 /*
408 * For addresses less than the maximum breakpoint number,
409 * assume that the breakpoint number is desired.
410 */
411 if (addr < KDB_MAXBPT) {
412 bp = &kdb_breakpoints[addr];
413 lowbp = highbp = addr;
414 highbp++;
415 } else {
416 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
417 i++, bp++) {
418 if (bp->bp_addr == addr) {
419 lowbp = highbp = i;
420 highbp++;
421 break;
422 }
423 }
424 }
425 }
426
427 /*
428 * Now operate on the set of breakpoints matching the input
429 * criteria (either '*' for all, or an individual breakpoint).
430 */
431 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
432 i < highbp;
433 i++, bp++) {
434 if (bp->bp_free)
435 continue;
436
437 done++;
438
439 switch (cmd) {
440 case KDBCMD_BC:
441 bp->bp_enabled = 0;
442
443 kdb_printf("Breakpoint %d at "
444 kdb_bfd_vma_fmt " cleared\n",
445 i, bp->bp_addr);
446
447 bp->bp_addr = 0;
448 bp->bp_free = 1;
449
450 break;
451 case KDBCMD_BE:
452 bp->bp_enabled = 1;
453
454 kdb_printf("Breakpoint %d at "
455 kdb_bfd_vma_fmt " enabled",
456 i, bp->bp_addr);
457
458 kdb_printf("\n");
459 break;
460 case KDBCMD_BD:
461 if (!bp->bp_enabled)
462 break;
463
464 bp->bp_enabled = 0;
465
466 kdb_printf("Breakpoint %d at "
467 kdb_bfd_vma_fmt " disabled\n",
468 i, bp->bp_addr);
469
470 break;
471 }
472 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
473 bp->bp_delay = 0;
474 KDB_STATE_CLEAR(SSBPT);
475 }
476 }
477
478 return (!done) ? KDB_BPTNOTFOUND : 0;
479}
480
481/*
482 * kdb_ss
483 *
484 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
485 * commands.
486 *
487 * ss
488 * ssb
489 *
490 * Parameters:
491 * argc Argument count
492 * argv Argument vector
493 * Outputs:
494 * None.
495 * Returns:
496 * KDB_CMD_SS[B] for success, a kdb error if failure.
497 * Locking:
498 * None.
499 * Remarks:
500 *
501 * Set the arch specific option to trigger a debug trap after the next
502 * instruction.
503 *
504 * For 'ssb', set the trace flag in the debug trap handler
505 * after printing the current insn and return directly without
506 * invoking the kdb command processor, until a branch instruction
507 * is encountered.
508 */
509
510static int kdb_ss(int argc, const char **argv)
511{
512 int ssb = 0;
513
514 ssb = (strcmp(argv[0], "ssb") == 0);
515 if (argc != 0)
516 return KDB_ARGCOUNT;
517 /*
518 * Set trace flag and go.
519 */
520 KDB_STATE_SET(DOING_SS);
521 if (ssb) {
522 KDB_STATE_SET(DOING_SSB);
523 return KDB_CMD_SSB;
524 }
525 return KDB_CMD_SS;
526}
527
528/* Initialize the breakpoint table and register breakpoint commands. */
529
530void __init kdb_initbptab(void)
531{
532 int i;
533 kdb_bp_t *bp;
534
535 /*
536 * First time initialization.
537 */
538 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
539
540 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
541 bp->bp_free = 1;
542
543 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
544 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
546 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
547 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
548 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
549 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
550 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
551 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("be", kdb_bc, "<bpnum>",
553 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
554 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
555 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
556
557 kdb_register_repeat("ss", kdb_ss, "",
558 "Single Step", 1, KDB_REPEAT_NO_ARGS);
559 kdb_register_repeat("ssb", kdb_ss, "",
560 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
561 /*
562 * Architecture dependent initialization.
563 */
564}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
73 * Backtrace works best when the code uses frame pointers. But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
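
A few hypothetical uses of the traceback commands documented above (pids and cpu numbers are examples only):

    kdb> bt            traceback for the current task
    kdb> btp 1         kernel stack for pid 1
    kdb> btc 2         the task currently running on cpu 2
    kdb> bta R         all tasks in state R, prompting between each
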
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type archkdb
8# or archkdbcpu or archkdbshort at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129 * This is the interface glue which allows kdb to transition into
130 * the gdb stub. In order to do this, the '?' or '' gdb serial
131 * packet response is processed here, and then control is
132 * passed to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* dropthrough */
123 case '3': /* dropthrough */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
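/*
 * Illustrative sketch (not part of the original file): a vt100 left
 * arrow arrives as the three bytes '\e', '[', 'D'.  The state machine
 * above buffers them in escape_data and collapses the sequence into a
 * single control code, so a caller sees something like:
 *
 *	int key = kdb_read_get_key(buf, sizeof(buf));
 *	if (key == 2)
 *		handle_left_arrow();
 *
 * where handle_left_arrow() is a hypothetical caller-side helper.  Up,
 * down and right collapse to 16, 14 and 6, and \e[1~, \e[3~, \e[4~
 * collapse to 1, 4 and 5 (home, del, end) - the same codes that
 * kdb_get_kbd_char() in kdb_keyboard.c returns for the PC keyboard.
 */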
165
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
181 * function. It is not reentrant - it relies on the fact
182 *	that kdb is running on only one "master debug" cpu.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
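/*
 * Usage sketch (illustrative only, not part of the original file):
 * fetching one command line with an explicit prompt; "buf" is a
 * hypothetical local buffer.
 *
 *	static char buf[CMD_BUFLEN];
 *	char *cmd = kdb_getstr(buf, sizeof(buf), "kdb> ");
 *
 * cmd points back into buf and the stored string is terminated by a
 * newline, as described above for kdb_read().
 */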
436
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 *	The number of characters written (0 if the output was held
484 *	back while accumulating a line for grep).
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
500
501/*
502 * search arg1 to see if it contains arg2
503 * (kdb_main.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
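/*
 * Behaviour sketch (illustrative only, not part of the original file),
 * assuming the searched buffer ends in a newline as vkdb_printf()
 * guarantees:
 *
 *	kdb_grep_leading = 1;
 *	kdb_grep_trailing = 0;
 *	kdb_search_string("foobar\n", "foo");	returns 1 (prefix match)
 *	kdb_search_string("xfoobar\n", "foo");	returns 0
 *
 * With both flags set the line (minus its newline) must equal the
 * pattern exactly; with neither set any substring match succeeds.
 */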
536
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that don't end with newlines
598 * but should be written without one:
599 * The "[nn]kdb> " prompt should
600 * appear at the front of the buffer.
601 *
602 * The "[nn]more " prompt should also be
603 * (MOREPROMPT -> moreprompt)
604 * written * but we print that ourselves,
605 * we set the suspend_grep flag to make
606 * it unconditional.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
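/*
 * Usage sketch (illustrative only, not part of the original file):
 * kdb_printf() takes printf-style arguments and returns the number of
 * characters sent to the kdb consoles.
 *
 *	kdb_printf("cpu %d, %d symbols\n", raw_smp_processor_id(), count);
 *
 * "count" is a hypothetical local variable.  kdb_trap_printk is the
 * flag kdb_dumpregs() in kdb_main.c raises around show_regs() so that
 * printk() output is captured through vkdb_printf() as well.
 */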
826
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
31 * Some parts (Enter Release, LED change) are still polled here in a
32 * blocking fashion, but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..b724c791b6d4
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2849 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
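/*
 * Usage sketch (illustrative only, not part of the original file):
 *
 *	char *prompt = kdbgetenv("PROMPT");
 *	if (!prompt)
 *		prompt = "kdb> ";
 *
 * With the initial __env[] below, "PROMPT" yields "[%d]kdb> " on SMP
 * builds; a variable present without an '=' yields the empty string.
 */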
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
224 *	bytes	The number of bytes to allocate
225 * Returns:
226 *	A pointer to the allocated space within the static environment
227 *	buffer on success, NULL if there is not enough space left
228 *	(see KDB_ENVBUFSIZE).
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never free'd, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
256 *	*value	the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
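/*
 * Usage sketch (illustrative only, not part of the original file),
 * mirroring the way vkdb_printf() reads LINES:
 *
 *	int linecount;
 *
 *	if (kdbgetintenv("LINES", &linecount))
 *		linecount = 24;
 *
 * The fallback runs whenever the variable is unset or has no value.
 */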
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
302 *	*value	the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
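/*
 * Usage sketch (illustrative only, not part of the original file):
 *
 *	unsigned long addr;
 *	int diag = kdbgetularg("c0123456", &addr);
 *
 * "c0123456" fails the base-0 parse (it does not start with a digit or
 * a 0x prefix) and is then accepted by the base-16 retry above; a
 * string with no hex digits at all returns KDB_BADINT.
 */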
327
328/*
329 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one.
331 */
332int kdb_set(int argc, const char **argv)
333{
334 int i;
335 char *ep;
336 size_t varlen, vallen;
337
338 /*
339 * we can be invoked two ways:
340 * set var=value argv[1]="var", argv[2]="value"
341 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
342 * - if the latter, shift 'em down.
343 */
344 if (argc == 3) {
345 argv[2] = argv[3];
346 argc--;
347 }
348
349 if (argc != 2)
350 return KDB_ARGCOUNT;
351
352 /*
353 * Check for internal variables
354 */
355 if (strcmp(argv[1], "KDBDEBUG") == 0) {
356 unsigned int debugflags;
357 char *cp;
358
359 debugflags = simple_strtoul(argv[2], &cp, 0);
360 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
361 kdb_printf("kdb: illegal debug flags '%s'\n",
362 argv[2]);
363 return 0;
364 }
365 kdb_flags = (kdb_flags &
366 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
367 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
368
369 return 0;
370 }
371
372 /*
373 * Tokenizer squashed the '=' sign. argv[1] is variable
374 * name, argv[2] = value.
375 */
376 varlen = strlen(argv[1]);
377 vallen = strlen(argv[2]);
378 ep = kdballocenv(varlen + vallen + 2);
379 if (ep == (char *)0)
380 return KDB_ENVBUFFULL;
381
382 sprintf(ep, "%s=%s", argv[1], argv[2]);
383
384 ep[varlen+vallen+1] = '\0';
385
386 for (i = 0; i < __nenv; i++) {
387 if (__env[i]
388 && ((strncmp(__env[i], argv[1], varlen) == 0)
389 && ((__env[i][varlen] == '\0')
390 || (__env[i][varlen] == '=')))) {
391 __env[i] = ep;
392 return 0;
393 }
394 }
395
396 /*
397	 * Not an existing variable; use the first free slot.
398 */
399 for (i = 0; i < __nenv-1; i++) {
400 if (__env[i] == (char *)0) {
401 __env[i] = ep;
402 return 0;
403 }
404 }
405
406 return KDB_ENVFULL;
407}
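/*
 * Usage sketch (illustrative only, not part of the original file);
 * both forms below reach kdb_set() with the same effect because the
 * tokenizer in kdb_parse() also splits on '=':
 *
 *	[0]kdb> set LINES=50
 *	[0]kdb> set LINES = 50
 *
 * "KDBDEBUG" is intercepted separately above and updates kdb_flags
 * instead of the environment.
 */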
408
409static int kdb_check_regs(void)
410{
411 if (!kdb_current_regs) {
412 kdb_printf("No current kdb registers."
413 " You may need to select another task\n");
414 return KDB_BADREG;
415 }
416 return 0;
417}
418
419/*
420 * kdbgetaddrarg - This function is responsible for parsing an
421 * address-expression and returning the value of the expression,
422 * symbol name, and offset to the caller.
423 *
424 * The argument may consist of a numeric value (decimal or
425 *	hexadecimal), a symbol name, a register name (preceded by the
426 *	percent sign), an environment variable with a numeric value
427 *	(preceded by a dollar sign) or a simple arithmetic expression
428 * consisting of a symbol name, +/-, and a numeric constant value
429 * (offset).
430 * Parameters:
431 * argc - count of arguments in argv
432 * argv - argument vector
433 * *nextarg - index to next unparsed argument in argv[]
434 * regs - Register state at time of KDB entry
435 * Outputs:
436 * *value - receives the value of the address-expression
437 * *offset - receives the offset specified, if any
438 * *name - receives the symbol name, if any
439 * *nextarg - index to next unparsed argument in argv[]
440 * Returns:
441 * zero is returned on success, a kdb diagnostic code is
442 * returned on error.
443 */
444int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
445 unsigned long *value, long *offset,
446 char **name)
447{
448 unsigned long addr;
449 unsigned long off = 0;
450 int positive;
451 int diag;
452 int found = 0;
453 char *symname;
454 char symbol = '\0';
455 char *cp;
456 kdb_symtab_t symtab;
457
458 /*
459 * Process arguments which follow the following syntax:
460 *
461 * symbol | numeric-address [+/- numeric-offset]
462 * %register
463 * $environment-variable
464 */
465
466 if (*nextarg > argc)
467 return KDB_ARGCOUNT;
468
469 symname = (char *)argv[*nextarg];
470
471 /*
472 * If there is no whitespace between the symbol
473 * or address and the '+' or '-' symbols, we
474 * remember the character and replace it with a
475 * null so the symbol/value can be properly parsed
476 */
477 cp = strpbrk(symname, "+-");
478 if (cp != NULL) {
479 symbol = *cp;
480 *cp++ = '\0';
481 }
482
483 if (symname[0] == '$') {
484 diag = kdbgetulenv(&symname[1], &addr);
485 if (diag)
486 return diag;
487 } else if (symname[0] == '%') {
488 diag = kdb_check_regs();
489 if (diag)
490 return diag;
491 /* Implement register values with % at a later time as it is
492 * arch optional.
493 */
494 return KDB_NOTIMP;
495 } else {
496 found = kdbgetsymval(symname, &symtab);
497 if (found) {
498 addr = symtab.sym_start;
499 } else {
500 diag = kdbgetularg(argv[*nextarg], &addr);
501 if (diag)
502 return diag;
503 }
504 }
505
506 if (!found)
507 found = kdbnearsym(addr, &symtab);
508
509 (*nextarg)++;
510
511 if (name)
512 *name = symname;
513 if (value)
514 *value = addr;
515 if (offset && name && *name)
516 *offset = addr - symtab.sym_start;
517
518 if ((*nextarg > argc)
519 && (symbol == '\0'))
520 return 0;
521
522 /*
523 * check for +/- and offset
524 */
525
526 if (symbol == '\0') {
527 if ((argv[*nextarg][0] != '+')
528 && (argv[*nextarg][0] != '-')) {
529 /*
530 * Not our argument. Return.
531 */
532 return 0;
533 } else {
534 positive = (argv[*nextarg][0] == '+');
535 (*nextarg)++;
536 }
537 } else
538 positive = (symbol == '+');
539
540 /*
541 * Now there must be an offset!
542 */
543 if ((*nextarg > argc)
544 && (symbol == '\0')) {
545 return KDB_INVADDRFMT;
546 }
547
548 if (!symbol) {
549 cp = (char *)argv[*nextarg];
550 (*nextarg)++;
551 }
552
553 diag = kdbgetularg(cp, &off);
554 if (diag)
555 return diag;
556
557 if (!positive)
558 off = -off;
559
560 if (offset)
561 *offset += off;
562
563 if (value)
564 *value += off;
565
566 return 0;
567}
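/*
 * Usage sketch (illustrative only, not part of the original file):
 * parsing "schedule+0x10" out of a command's argv[], where "schedule"
 * is just an example symbol name:
 *
 *	unsigned long addr;
 *	long off = 0;
 *	int nextarg = 1;
 *	int diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &off, NULL);
 *
 * On success addr holds the address of schedule plus 0x10, off has
 * accumulated the +0x10, and nextarg points past the consumed tokens.
 */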
568
569static void kdb_cmderror(int diag)
570{
571 int i;
572
573 if (diag >= 0) {
574 kdb_printf("no error detected (diagnostic is %d)\n", diag);
575 return;
576 }
577
578 for (i = 0; i < __nkdb_err; i++) {
579 if (kdbmsgs[i].km_diag == diag) {
580 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
581 return;
582 }
583 }
584
585 kdb_printf("Unknown diag %d\n", -diag);
586}
587
588/*
589 * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
590 * command which defines one command as a set of other commands,
591 * terminated by endefcmd. kdb_defcmd processes the initial
592 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
593 * the following commands until 'endefcmd'.
594 * Inputs:
595 * argc argument count
596 * argv argument vector
597 * Returns:
598 * zero for success, a kdb diagnostic if error
599 */
600struct defcmd_set {
601 int count;
602 int usable;
603 char *name;
604 char *usage;
605 char *help;
606 char **command;
607};
608static struct defcmd_set *defcmd_set;
609static int defcmd_set_count;
610static int defcmd_in_progress;
611
612/* Forward references */
613static int kdb_exec_defcmd(int argc, const char **argv);
614
615static int kdb_defcmd2(const char *cmdstr, const char *argv0)
616{
617 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
618 char **save_command = s->command;
619 if (strcmp(argv0, "endefcmd") == 0) {
620 defcmd_in_progress = 0;
621 if (!s->count)
622 s->usable = 0;
623 if (s->usable)
624 kdb_register(s->name, kdb_exec_defcmd,
625 s->usage, s->help, 0);
626 return 0;
627 }
628 if (!s->usable)
629 return KDB_NOTIMP;
630 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
631 if (!s->command) {
632 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
633 cmdstr);
634 s->usable = 0;
635 return KDB_NOTIMP;
636 }
637 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
638 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
639 kfree(save_command);
640 return 0;
641}
642
643static int kdb_defcmd(int argc, const char **argv)
644{
645 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
646 if (defcmd_in_progress) {
647 kdb_printf("kdb: nested defcmd detected, assuming missing "
648 "endefcmd\n");
649 kdb_defcmd2("endefcmd", "endefcmd");
650 }
651 if (argc == 0) {
652 int i;
653 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
654 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
655 s->usage, s->help);
656 for (i = 0; i < s->count; ++i)
657 kdb_printf("%s", s->command[i]);
658 kdb_printf("endefcmd\n");
659 }
660 return 0;
661 }
662 if (argc != 3)
663 return KDB_ARGCOUNT;
664 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
665 GFP_KDB);
666 if (!defcmd_set) {
667 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
668 argv[1]);
669 defcmd_set = save_defcmd_set;
670 return KDB_NOTIMP;
671 }
672 memcpy(defcmd_set, save_defcmd_set,
673 defcmd_set_count * sizeof(*defcmd_set));
674 kfree(save_defcmd_set);
675 s = defcmd_set + defcmd_set_count;
676 memset(s, 0, sizeof(*s));
677 s->usable = 1;
678 s->name = kdb_strdup(argv[1], GFP_KDB);
679 s->usage = kdb_strdup(argv[2], GFP_KDB);
680 s->help = kdb_strdup(argv[3], GFP_KDB);
681 if (s->usage[0] == '"') {
682 strcpy(s->usage, s->usage+1);
683 s->usage[strlen(s->usage)-1] = '\0';
684 }
685 if (s->help[0] == '"') {
686 strcpy(s->help, s->help+1);
687 s->help[strlen(s->help)-1] = '\0';
688 }
689 ++defcmd_set_count;
690 defcmd_in_progress = 1;
691 return 0;
692}
693
694/*
695 * kdb_exec_defcmd - Execute the set of commands associated with this
696 * defcmd name.
697 * Inputs:
698 * argc argument count
699 * argv argument vector
700 * Returns:
701 * zero for success, a kdb diagnostic if error
702 */
703static int kdb_exec_defcmd(int argc, const char **argv)
704{
705 int i, ret;
706 struct defcmd_set *s;
707 if (argc != 0)
708 return KDB_ARGCOUNT;
709 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
710 if (strcmp(s->name, argv[0]) == 0)
711 break;
712 }
713 if (i == defcmd_set_count) {
714 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
715 argv[0]);
716 return KDB_NOTIMP;
717 }
718 for (i = 0; i < s->count; ++i) {
719 /* Recursive use of kdb_parse, do not use argv after
720 * this point */
721 argv = NULL;
722 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
723 ret = kdb_parse(s->command[i]);
724 if (ret)
725 return ret;
726 }
727 return 0;
728}
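/*
 * Usage sketch (illustrative only, not part of the original file):
 * defining and running a composite command at the kdb prompt:
 *
 *	[0]kdb> defcmd pbt "" "Print a backtrace"
 *	[0]kdb> bt
 *	[0]kdb> endefcmd
 *	[0]kdb> pbt
 *
 * Each stored line is echoed with a "[pbt]kdb> " prefix and fed back
 * through kdb_parse() by kdb_exec_defcmd().
 */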
729
730/* Command history */
731#define KDB_CMD_HISTORY_COUNT 32
732#define CMD_BUFLEN 200 /* kdb_printf: max printline
733 * size == 256 */
734static unsigned int cmd_head, cmd_tail;
735static unsigned int cmdptr;
736static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
737static char cmd_cur[CMD_BUFLEN];
738
739/*
740 * The "str" argument may point to something like | grep xyz
741 */
742static void parse_grep(const char *str)
743{
744 int len;
745 char *cp = (char *)str, *cp2;
746
747	/* sanity check: we should have been called with the '|' first */
748 if (*cp != '|')
749 return;
750 cp++;
751 while (isspace(*cp))
752 cp++;
753 if (strncmp(cp, "grep ", 5)) {
754 kdb_printf("invalid 'pipe', see grephelp\n");
755 return;
756 }
757 cp += 5;
758 while (isspace(*cp))
759 cp++;
760 cp2 = strchr(cp, '\n');
761 if (cp2)
762 *cp2 = '\0'; /* remove the trailing newline */
763 len = strlen(cp);
764 if (len == 0) {
765 kdb_printf("invalid 'pipe', see grephelp\n");
766 return;
767 }
768 /* now cp points to a nonzero length search string */
769 if (*cp == '"') {
770		/* allow it to be "x y z" by removing the "'s - there must
771 be two of them */
772 cp++;
773 cp2 = strchr(cp, '"');
774 if (!cp2) {
775 kdb_printf("invalid quoted string, see grephelp\n");
776 return;
777 }
778 *cp2 = '\0'; /* end the string where the 2nd " was */
779 }
780 kdb_grep_leading = 0;
781 if (*cp == '^') {
782 kdb_grep_leading = 1;
783 cp++;
784 }
785 len = strlen(cp);
786 kdb_grep_trailing = 0;
787 if (*(cp+len-1) == '$') {
788 kdb_grep_trailing = 1;
789 *(cp+len-1) = '\0';
790 }
791 len = strlen(cp);
792 if (!len)
793 return;
794 if (len >= GREP_LEN) {
795 kdb_printf("search string too long\n");
796 return;
797 }
798 strcpy(kdb_grep_string, cp);
799 kdb_grepping_flag++;
800 return;
801}
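/*
 * Usage sketch (illustrative only, not part of the original file):
 * the pipe syntax accepted here, as typed at the kdb prompt:
 *
 *	[0]kdb> ps | grep kthreadd
 *	[0]kdb> ps | grep "^1 "
 *
 * The pattern is a literal substring (not a regular expression); a
 * leading '^' sets kdb_grep_leading, a trailing '$' sets
 * kdb_grep_trailing, and quotes allow a pattern containing spaces.
 */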
802
803/*
804 * kdb_parse - Parse the command line, search the command table for a
805 * matching command and invoke the command function. This
806 * function may be called recursively, if it is, the second call
807 * will overwrite argv and cbuf. It is the caller's
808 * responsibility to save their argv if they recursively call
809 * kdb_parse().
810 * Parameters:
811 * cmdstr The input command line to be parsed.
812 * regs The registers at the time kdb was entered.
813 * Returns:
814 * Zero for success, a kdb diagnostic if failure.
815 * Remarks:
816 * Limited to 20 tokens.
817 *
818 *	Really rudimentary tokenization. Basically only whitespace
819 *	is considered a token delimiter (but special consideration
820 * is taken of the '=' sign as used by the 'set' command).
821 *
822 * The algorithm used to tokenize the input string relies on
823 * there being at least one whitespace (or otherwise useless)
824 * character between tokens as the character immediately following
825 * the token is altered in-place to a null-byte to terminate the
826 * token string.
827 */
828
829#define MAXARGC 20
830
831int kdb_parse(const char *cmdstr)
832{
833 static char *argv[MAXARGC];
834 static int argc;
835 static char cbuf[CMD_BUFLEN+2];
836 char *cp;
837 char *cpp, quoted;
838 kdbtab_t *tp;
839 int i, escaped, ignore_errors = 0, check_grep;
840
841 /*
842 * First tokenize the command string.
843 */
844 cp = (char *)cmdstr;
845 kdb_grepping_flag = check_grep = 0;
846
847 if (KDB_FLAG(CMD_INTERRUPT)) {
848 /* Previous command was interrupted, newline must not
849 * repeat the command */
850 KDB_FLAG_CLEAR(CMD_INTERRUPT);
851 KDB_STATE_SET(PAGER);
852 argc = 0; /* no repeat */
853 }
854
855 if (*cp != '\n' && *cp != '\0') {
856 argc = 0;
857 cpp = cbuf;
858 while (*cp) {
859 /* skip whitespace */
860 while (isspace(*cp))
861 cp++;
862 if ((*cp == '\0') || (*cp == '\n') ||
863 (*cp == '#' && !defcmd_in_progress))
864 break;
865 /* special case: check for | grep pattern */
866 if (*cp == '|') {
867 check_grep++;
868 break;
869 }
870 if (cpp >= cbuf + CMD_BUFLEN) {
871 kdb_printf("kdb_parse: command buffer "
872 "overflow, command ignored\n%s\n",
873 cmdstr);
874 return KDB_NOTFOUND;
875 }
876 if (argc >= MAXARGC - 1) {
877 kdb_printf("kdb_parse: too many arguments, "
878 "command ignored\n%s\n", cmdstr);
879 return KDB_NOTFOUND;
880 }
881 argv[argc++] = cpp;
882 escaped = 0;
883 quoted = '\0';
884 /* Copy to next unquoted and unescaped
885 * whitespace or '=' */
886 while (*cp && *cp != '\n' &&
887 (escaped || quoted || !isspace(*cp))) {
888 if (cpp >= cbuf + CMD_BUFLEN)
889 break;
890 if (escaped) {
891 escaped = 0;
892 *cpp++ = *cp++;
893 continue;
894 }
895 if (*cp == '\\') {
896 escaped = 1;
897 ++cp;
898 continue;
899 }
900 if (*cp == quoted)
901 quoted = '\0';
902 else if (*cp == '\'' || *cp == '"')
903 quoted = *cp;
904 *cpp = *cp++;
905 if (*cpp == '=' && !quoted)
906 break;
907 ++cpp;
908 }
909 *cpp++ = '\0'; /* Squash a ws or '=' character */
910 }
911 }
912 if (!argc)
913 return 0;
914 if (check_grep)
915 parse_grep(cp);
916 if (defcmd_in_progress) {
917 int result = kdb_defcmd2(cmdstr, argv[0]);
918 if (!defcmd_in_progress) {
919 argc = 0; /* avoid repeat on endefcmd */
920 *(argv[0]) = '\0';
921 }
922 return result;
923 }
924 if (argv[0][0] == '-' && argv[0][1] &&
925 (argv[0][1] < '0' || argv[0][1] > '9')) {
926 ignore_errors = 1;
927 ++argv[0];
928 }
929
930 for_each_kdbcmd(tp, i) {
931 if (tp->cmd_name) {
932 /*
933 * If this command is allowed to be abbreviated,
934 * check to see if this is it.
935 */
936
937 if (tp->cmd_minlen
938 && (strlen(argv[0]) <= tp->cmd_minlen)) {
939 if (strncmp(argv[0],
940 tp->cmd_name,
941 tp->cmd_minlen) == 0) {
942 break;
943 }
944 }
945
946 if (strcmp(argv[0], tp->cmd_name) == 0)
947 break;
948 }
949 }
950
951 /*
952 * If we don't find a command by this name, see if the first
953 * few characters of this match any of the known commands.
954 * e.g., md1c20 should match md.
955 */
956 if (i == kdb_max_commands) {
957 for_each_kdbcmd(tp, i) {
958 if (tp->cmd_name) {
959 if (strncmp(argv[0],
960 tp->cmd_name,
961 strlen(tp->cmd_name)) == 0) {
962 break;
963 }
964 }
965 }
966 }
967
968 if (i < kdb_max_commands) {
969 int result;
970 KDB_STATE_SET(CMD);
971 result = (*tp->cmd_func)(argc-1, (const char **)argv);
972 if (result && ignore_errors && result > KDB_CMD_GO)
973 result = 0;
974 KDB_STATE_CLEAR(CMD);
975 switch (tp->cmd_repeat) {
976 case KDB_REPEAT_NONE:
977 argc = 0;
978 if (argv[0])
979 *(argv[0]) = '\0';
980 break;
981 case KDB_REPEAT_NO_ARGS:
982 argc = 1;
983 if (argv[1])
984 *(argv[1]) = '\0';
985 break;
986 case KDB_REPEAT_WITH_ARGS:
987 break;
988 }
989 return result;
990 }
991
992 /*
993 * If the input with which we were presented does not
994 * map to an existing command, attempt to parse it as an
995 * address argument and display the result. Useful for
996 * obtaining the address of a variable, or the nearest symbol
997 * to an address contained in a register.
998 */
999 {
1000 unsigned long value;
1001 char *name = NULL;
1002 long offset;
1003 int nextarg = 0;
1004
1005 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1006 &value, &offset, &name)) {
1007 return KDB_NOTFOUND;
1008 }
1009
1010 kdb_printf("%s = ", argv[0]);
1011 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1012 kdb_printf("\n");
1013 return 0;
1014 }
1015}
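/*
 * Behaviour sketch (illustrative only, not part of the original file):
 *
 *	kdb_parse("md1c20 jiffies");
 *	kdb_parse("jiffies");
 *
 * The first line prefix-matches the "md" command as described above;
 * the second matches no command, so it falls through to the address
 * parser and prints the value of the symbol.  Prefixing a command
 * with '-' (e.g. "-md") makes most error returns non-fatal.
 */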
1016
1017
1018static int handle_ctrl_cmd(char *cmd)
1019{
1020#define CTRL_P 16
1021#define CTRL_N 14
1022
1023 /* initial situation */
1024 if (cmd_head == cmd_tail)
1025 return 0;
1026 switch (*cmd) {
1027 case CTRL_P:
1028 if (cmdptr != cmd_tail)
1029 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1030 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1031 return 1;
1032 case CTRL_N:
1033 if (cmdptr != cmd_head)
1034 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1035 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1036 return 1;
1037 }
1038 return 0;
1039}
1040
1041/*
1042 * kdb_reboot - This function implements the 'reboot' command. Reboot
1043 *	the system immediately, or loop forever on failure.
1044 */
1045static int kdb_reboot(int argc, const char **argv)
1046{
1047 emergency_restart();
1048 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1049 while (1)
1050 cpu_relax();
1051 /* NOTREACHED */
1052 return 0;
1053}
1054
1055static void kdb_dumpregs(struct pt_regs *regs)
1056{
1057 int old_lvl = console_loglevel;
1058 console_loglevel = 15;
1059 kdb_trap_printk++;
1060 show_regs(regs);
1061 kdb_trap_printk--;
1062 kdb_printf("\n");
1063 console_loglevel = old_lvl;
1064}
1065
1066void kdb_set_current_task(struct task_struct *p)
1067{
1068 kdb_current_task = p;
1069
1070 if (kdb_task_has_cpu(p)) {
1071 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1072 return;
1073 }
1074 kdb_current_regs = NULL;
1075}
1076
1077/*
1078 * kdb_local - The main code for kdb. This routine is invoked on a
1079 * specific processor, it is not global. The main kdb() routine
1080 * ensures that only one processor at a time is in this routine.
1081 * This code is called with the real reason code on the first
1082 * entry to a kdb session, thereafter it is called with reason
1083 * SWITCH, even if the user goes back to the original cpu.
1084 * Inputs:
1085 * reason The reason KDB was invoked
1086 * error The hardware-defined error code
1087 * regs The exception frame at time of fault/breakpoint.
1088 * db_result Result code from the break or debug point.
1089 * Returns:
1090 *	0	KDB was invoked for an event for which it was not responsible
1091 * 1 KDB handled the event for which it was invoked.
1092 * KDB_CMD_GO User typed 'go'.
1093 * KDB_CMD_CPU User switched to another cpu.
1094 * KDB_CMD_SS Single step.
1095 * KDB_CMD_SSB Single step until branch.
1096 */
1097static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1098 kdb_dbtrap_t db_result)
1099{
1100 char *cmdbuf;
1101 int diag;
1102 struct task_struct *kdb_current =
1103 kdb_curr_task(raw_smp_processor_id());
1104
1105 KDB_DEBUG_STATE("kdb_local 1", reason);
1106 kdb_go_count = 0;
1107 if (reason == KDB_REASON_DEBUG) {
1108 /* special case below */
1109 } else {
1110 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1111 kdb_current, kdb_current->pid);
1112#if defined(CONFIG_SMP)
1113 kdb_printf("on processor %d ", raw_smp_processor_id());
1114#endif
1115 }
1116
1117 switch (reason) {
1118 case KDB_REASON_DEBUG:
1119 {
1120 /*
1121 * If re-entering kdb after a single step
1122 * command, don't print the message.
1123 */
1124 switch (db_result) {
1125 case KDB_DB_BPT:
1126 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1127 kdb_current, kdb_current->pid);
1128#if defined(CONFIG_SMP)
1129 kdb_printf("on processor %d ", raw_smp_processor_id());
1130#endif
1131 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1132 instruction_pointer(regs));
1133 break;
1134 case KDB_DB_SSB:
1135 /*
1136 * In the midst of ssb command. Just return.
1137 */
1138 KDB_DEBUG_STATE("kdb_local 3", reason);
1139 return KDB_CMD_SSB; /* Continue with SSB command */
1140
1141 break;
1142 case KDB_DB_SS:
1143 break;
1144 case KDB_DB_SSBPT:
1145 KDB_DEBUG_STATE("kdb_local 4", reason);
1146 return 1; /* kdba_db_trap did the work */
1147 default:
1148 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1149 db_result);
1150 break;
1151 }
1152
1153 }
1154 break;
1155 case KDB_REASON_ENTER:
1156 if (KDB_STATE(KEYBOARD))
1157 kdb_printf("due to Keyboard Entry\n");
1158 else
1159 kdb_printf("due to KDB_ENTER()\n");
1160 break;
1161 case KDB_REASON_KEYBOARD:
1162 KDB_STATE_SET(KEYBOARD);
1163 kdb_printf("due to Keyboard Entry\n");
1164 break;
1165 case KDB_REASON_ENTER_SLAVE:
1166 /* drop through, slaves only get released via cpu switch */
1167 case KDB_REASON_SWITCH:
1168 kdb_printf("due to cpu switch\n");
1169 break;
1170 case KDB_REASON_OOPS:
1171 kdb_printf("Oops: %s\n", kdb_diemsg);
1172 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1173 instruction_pointer(regs));
1174 kdb_dumpregs(regs);
1175 break;
1176 case KDB_REASON_NMI:
1177 kdb_printf("due to NonMaskable Interrupt @ "
1178 kdb_machreg_fmt "\n",
1179 instruction_pointer(regs));
1180 kdb_dumpregs(regs);
1181 break;
1182 case KDB_REASON_SSTEP:
1183 case KDB_REASON_BREAK:
1184 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1185 reason == KDB_REASON_BREAK ?
1186 "Breakpoint" : "SS trap", instruction_pointer(regs));
1187 /*
1188 * Determine if this breakpoint is one that we
1189 * are interested in.
1190 */
1191 if (db_result != KDB_DB_BPT) {
1192 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1193 db_result);
1194 KDB_DEBUG_STATE("kdb_local 6", reason);
1195 return 0; /* Not for us, dismiss it */
1196 }
1197 break;
1198 case KDB_REASON_RECURSE:
1199 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1200 instruction_pointer(regs));
1201 break;
1202 default:
1203 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1204 KDB_DEBUG_STATE("kdb_local 8", reason);
1205 return 0; /* Not for us, dismiss it */
1206 }
1207
1208 while (1) {
1209 /*
1210 * Initialize pager context.
1211 */
1212 kdb_nextline = 1;
1213 KDB_STATE_CLEAR(SUPPRESS);
1214
1215 cmdbuf = cmd_cur;
1216 *cmdbuf = '\0';
1217 *(cmd_hist[cmd_head]) = '\0';
1218
1219 if (KDB_FLAG(ONLY_DO_DUMP)) {
1220 /* kdb is off but a catastrophic error requires a dump.
1221 * Take the dump and reboot.
1222 * Turn on logging so the kdb output appears in the log
1223 * buffer in the dump.
1224 */
1225 const char *setargs[] = { "set", "LOGGING", "1" };
1226 kdb_set(2, setargs);
1227 kdb_reboot(0, NULL);
1228 /*NOTREACHED*/
1229 }
1230
1231do_full_getstr:
1232#if defined(CONFIG_SMP)
1233 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1234 raw_smp_processor_id());
1235#else
1236 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1237#endif
1238 if (defcmd_in_progress)
1239 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1240
1241 /*
1242 * Fetch command from keyboard
1243 */
1244 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1245 if (*cmdbuf != '\n') {
1246 if (*cmdbuf < 32) {
1247 if (cmdptr == cmd_head) {
1248 strncpy(cmd_hist[cmd_head], cmd_cur,
1249 CMD_BUFLEN);
1250 *(cmd_hist[cmd_head] +
1251 strlen(cmd_hist[cmd_head])-1) = '\0';
1252 }
1253 if (!handle_ctrl_cmd(cmdbuf))
1254 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1255 cmdbuf = cmd_cur;
1256 goto do_full_getstr;
1257 } else {
1258 strncpy(cmd_hist[cmd_head], cmd_cur,
1259 CMD_BUFLEN);
1260 }
1261
1262 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1263 if (cmd_head == cmd_tail)
1264 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1265 }
1266
1267 cmdptr = cmd_head;
1268 diag = kdb_parse(cmdbuf);
1269 if (diag == KDB_NOTFOUND) {
1270 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1271 diag = 0;
1272 }
1273 if (diag == KDB_CMD_GO
1274 || diag == KDB_CMD_CPU
1275 || diag == KDB_CMD_SS
1276 || diag == KDB_CMD_SSB
1277 || diag == KDB_CMD_KGDB)
1278 break;
1279
1280 if (diag)
1281 kdb_cmderror(diag);
1282 }
1283 KDB_DEBUG_STATE("kdb_local 9", diag);
1284 return diag;
1285}
1286
1287
1288/*
1289 * kdb_print_state - Print the state data for the current processor
1290 * for debugging.
1291 * Inputs:
1292 * text Identifies the debug point
1293 * value Any integer value to be printed, e.g. reason code.
1294 */
1295void kdb_print_state(const char *text, int value)
1296{
1297 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1298 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1299 kdb_state);
1300}
1301
1302/*
1303 * kdb_main_loop - After initial setup and assignment of the
1304 * controlling cpu, all cpus are in this loop. One cpu is in
 1305 * control and will issue the kdb prompt; the others will spin
1306 * until 'go' or cpu switch.
1307 *
1308 * To get a consistent view of the kernel stacks for all
1309 * processes, this routine is invoked from the main kdb code via
1310 * an architecture specific routine. kdba_main_loop is
1311 * responsible for making the kernel stacks consistent for all
 1312 * processes; there should be no difference between a blocked
1313 * process and a running process as far as kdb is concerned.
1314 * Inputs:
1315 * reason The reason KDB was invoked
1316 * error The hardware-defined error code
1317 * reason2 kdb's current reason code.
1318 * Initially error but can change
 1319 * according to kdb state.
1320 * db_result Result code from break or debug point.
1321 * regs The exception frame at time of fault/breakpoint.
 1322 * Should always be valid.
1323 * Returns:
 1324 * 0 KDB was invoked for an event for which it was not responsible
1325 * 1 KDB handled the event for which it was invoked.
1326 */
1327int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1328 kdb_dbtrap_t db_result, struct pt_regs *regs)
1329{
1330 int result = 1;
1331 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1332 while (1) {
1333 /*
1334 * All processors except the one that is in control
1335 * will spin here.
1336 */
1337 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1338 while (KDB_STATE(HOLD_CPU)) {
1339 /* state KDB is turned off by kdb_cpu to see if the
1340 * other cpus are still live, each cpu in this loop
1341 * turns it back on.
1342 */
1343 if (!KDB_STATE(KDB))
1344 KDB_STATE_SET(KDB);
1345 }
1346
1347 KDB_STATE_CLEAR(SUPPRESS);
1348 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1349 if (KDB_STATE(LEAVING))
1350 break; /* Another cpu said 'go' */
1351 /* Still using kdb, this processor is in control */
1352 result = kdb_local(reason2, error, regs, db_result);
1353 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1354
1355 if (result == KDB_CMD_CPU)
1356 break;
1357
1358 if (result == KDB_CMD_SS) {
1359 KDB_STATE_SET(DOING_SS);
1360 break;
1361 }
1362
1363 if (result == KDB_CMD_SSB) {
1364 KDB_STATE_SET(DOING_SS);
1365 KDB_STATE_SET(DOING_SSB);
1366 break;
1367 }
1368
1369 if (result == KDB_CMD_KGDB) {
1370 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
1371 kdb_printf("Entering please attach debugger "
1372 "or use $D#44+ or $3#33\n");
1373 break;
1374 }
1375 if (result && result != 1 && result != KDB_CMD_GO)
1376 kdb_printf("\nUnexpected kdb_local return code %d\n",
1377 result);
1378 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1379 break;
1380 }
1381 if (KDB_STATE(DOING_SS))
1382 KDB_STATE_CLEAR(SSBPT);
1383
1384 return result;
1385}
1386
1387/*
1388 * kdb_mdr - This function implements the guts of the 'mdr', memory
1389 * read command.
1390 * mdr <addr arg>,<byte count>
1391 * Inputs:
1392 * addr Start address
1393 * count Number of bytes
1394 * Returns:
1395 * Always 0. Any errors are detected and printed by kdb_getarea.
1396 */
1397static int kdb_mdr(unsigned long addr, unsigned int count)
1398{
1399 unsigned char c;
1400 while (count--) {
1401 if (kdb_getarea(c, addr))
1402 return 0;
1403 kdb_printf("%02x", c);
1404 addr++;
1405 }
1406 kdb_printf("\n");
1407 return 0;
1408}
1409
1410/*
1411 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
1412 * 'md8' 'mdr' and 'mds' commands.
1413 *
1414 * md|mds [<addr arg> [<line count> [<radix>]]]
1415 * mdWcN [<addr arg> [<line count> [<radix>]]]
 1416 * where W is the width (1, 2, 4 or 8) and N is the count.
 1417 * e.g., md1c20 reads 20 bytes, 1 at a time.
1418 * mdr <addr arg>,<byte count>
1419 */
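/*
 * Illustrative invocations (a sketch based on the syntax above; the
 * addresses are placeholders): 'md 0xffffffff81000000' dumps memory at
 * that address using the MDCOUNT/RADIX/BYTESPERWORD environment
 * defaults, 'md4c16 0xffffffff81000000' dumps 16 items of 4 bytes each,
 * and 'mdr 0xffffffff81000000 32' dumps 32 raw bytes with no formatting.
 */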
1420static void kdb_md_line(const char *fmtstr, unsigned long addr,
1421 int symbolic, int nosect, int bytesperword,
1422 int num, int repeat, int phys)
1423{
1424 /* print just one line of data */
1425 kdb_symtab_t symtab;
1426 char cbuf[32];
1427 char *c = cbuf;
1428 int i;
1429 unsigned long word;
1430
1431 memset(cbuf, '\0', sizeof(cbuf));
1432 if (phys)
1433 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1434 else
1435 kdb_printf(kdb_machreg_fmt0 " ", addr);
1436
1437 for (i = 0; i < num && repeat--; i++) {
1438 if (phys) {
1439 if (kdb_getphysword(&word, addr, bytesperword))
1440 break;
1441 } else if (kdb_getword(&word, addr, bytesperword))
1442 break;
1443 kdb_printf(fmtstr, word);
1444 if (symbolic)
1445 kdbnearsym(word, &symtab);
1446 else
1447 memset(&symtab, 0, sizeof(symtab));
1448 if (symtab.sym_name) {
1449 kdb_symbol_print(word, &symtab, 0);
1450 if (!nosect) {
1451 kdb_printf("\n");
1452 kdb_printf(" %s %s "
1453 kdb_machreg_fmt " "
1454 kdb_machreg_fmt " "
1455 kdb_machreg_fmt, symtab.mod_name,
1456 symtab.sec_name, symtab.sec_start,
1457 symtab.sym_start, symtab.sym_end);
1458 }
1459 addr += bytesperword;
1460 } else {
1461 union {
1462 u64 word;
1463 unsigned char c[8];
1464 } wc;
1465 unsigned char *cp;
1466#ifdef __BIG_ENDIAN
1467 cp = wc.c + 8 - bytesperword;
1468#else
1469 cp = wc.c;
1470#endif
1471 wc.word = word;
1472#define printable_char(c) \
1473 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1474 switch (bytesperword) {
1475 case 8:
1476 *c++ = printable_char(*cp++);
1477 *c++ = printable_char(*cp++);
1478 *c++ = printable_char(*cp++);
1479 *c++ = printable_char(*cp++);
1480 addr += 4;
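			/* fall through - narrower cases print the remaining bytes */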
1481 case 4:
1482 *c++ = printable_char(*cp++);
1483 *c++ = printable_char(*cp++);
1484 addr += 2;
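			/* fall through */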
1485 case 2:
1486 *c++ = printable_char(*cp++);
1487 addr++;
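			/* fall through */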
1488 case 1:
1489 *c++ = printable_char(*cp++);
1490 addr++;
1491 break;
1492 }
1493#undef printable_char
1494 }
1495 }
1496 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1497 " ", cbuf);
1498}
1499
1500static int kdb_md(int argc, const char **argv)
1501{
1502 static unsigned long last_addr;
1503 static int last_radix, last_bytesperword, last_repeat;
1504 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1505 int nosect = 0;
1506 char fmtchar, fmtstr[64];
1507 unsigned long addr;
1508 unsigned long word;
1509 long offset = 0;
1510 int symbolic = 0;
1511 int valid = 0;
1512 int phys = 0;
1513
1514 kdbgetintenv("MDCOUNT", &mdcount);
1515 kdbgetintenv("RADIX", &radix);
1516 kdbgetintenv("BYTESPERWORD", &bytesperword);
1517
1518 /* Assume 'md <addr>' and start with environment values */
1519 repeat = mdcount * 16 / bytesperword;
1520
1521 if (strcmp(argv[0], "mdr") == 0) {
1522 if (argc != 2)
1523 return KDB_ARGCOUNT;
1524 valid = 1;
1525 } else if (isdigit(argv[0][2])) {
1526 bytesperword = (int)(argv[0][2] - '0');
1527 if (bytesperword == 0) {
1528 bytesperword = last_bytesperword;
1529 if (bytesperword == 0)
1530 bytesperword = 4;
1531 }
1532 last_bytesperword = bytesperword;
1533 repeat = mdcount * 16 / bytesperword;
1534 if (!argv[0][3])
1535 valid = 1;
1536 else if (argv[0][3] == 'c' && argv[0][4]) {
1537 char *p;
1538 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1539 mdcount = ((repeat * bytesperword) + 15) / 16;
1540 valid = !*p;
1541 }
1542 last_repeat = repeat;
1543 } else if (strcmp(argv[0], "md") == 0)
1544 valid = 1;
1545 else if (strcmp(argv[0], "mds") == 0)
1546 valid = 1;
1547 else if (strcmp(argv[0], "mdp") == 0) {
1548 phys = valid = 1;
1549 }
1550 if (!valid)
1551 return KDB_NOTFOUND;
1552
1553 if (argc == 0) {
1554 if (last_addr == 0)
1555 return KDB_ARGCOUNT;
1556 addr = last_addr;
1557 radix = last_radix;
1558 bytesperword = last_bytesperword;
1559 repeat = last_repeat;
1560 mdcount = ((repeat * bytesperword) + 15) / 16;
1561 }
1562
1563 if (argc) {
1564 unsigned long val;
1565 int diag, nextarg = 1;
1566 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1567 &offset, NULL);
1568 if (diag)
1569 return diag;
1570 if (argc > nextarg+2)
1571 return KDB_ARGCOUNT;
1572
1573 if (argc >= nextarg) {
1574 diag = kdbgetularg(argv[nextarg], &val);
1575 if (!diag) {
1576 mdcount = (int) val;
1577 repeat = mdcount * 16 / bytesperword;
1578 }
1579 }
1580 if (argc >= nextarg+1) {
1581 diag = kdbgetularg(argv[nextarg+1], &val);
1582 if (!diag)
1583 radix = (int) val;
1584 }
1585 }
1586
1587 if (strcmp(argv[0], "mdr") == 0)
1588 return kdb_mdr(addr, mdcount);
1589
1590 switch (radix) {
1591 case 10:
1592 fmtchar = 'd';
1593 break;
1594 case 16:
1595 fmtchar = 'x';
1596 break;
1597 case 8:
1598 fmtchar = 'o';
1599 break;
1600 default:
1601 return KDB_BADRADIX;
1602 }
1603
1604 last_radix = radix;
1605
1606 if (bytesperword > KDB_WORD_SIZE)
1607 return KDB_BADWIDTH;
1608
1609 switch (bytesperword) {
1610 case 8:
1611 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1612 break;
1613 case 4:
1614 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1615 break;
1616 case 2:
1617 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1618 break;
1619 case 1:
1620 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1621 break;
1622 default:
1623 return KDB_BADWIDTH;
1624 }
1625
1626 last_repeat = repeat;
1627 last_bytesperword = bytesperword;
1628
1629 if (strcmp(argv[0], "mds") == 0) {
1630 symbolic = 1;
1631 /* Do not save these changes as last_*, they are temporary mds
1632 * overrides.
1633 */
1634 bytesperword = KDB_WORD_SIZE;
1635 repeat = mdcount;
1636 kdbgetintenv("NOSECT", &nosect);
1637 }
1638
1639 /* Round address down modulo BYTESPERWORD */
1640
1641 addr &= ~(bytesperword-1);
1642
1643 while (repeat > 0) {
1644 unsigned long a;
1645 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1646
1647 if (KDB_FLAG(CMD_INTERRUPT))
1648 return 0;
1649 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1650 if (phys) {
1651 if (kdb_getphysword(&word, a, bytesperword)
1652 || word)
1653 break;
1654 } else if (kdb_getword(&word, a, bytesperword) || word)
1655 break;
1656 }
1657 n = min(num, repeat);
1658 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1659 num, repeat, phys);
1660 addr += bytesperword * n;
1661 repeat -= n;
1662 z = (z + num - 1) / num;
1663 if (z > 2) {
1664 int s = num * (z-2);
1665 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1666 " zero suppressed\n",
1667 addr, addr + bytesperword * s - 1);
1668 addr += bytesperword * s;
1669 repeat -= s;
1670 }
1671 }
1672 last_addr = addr;
1673
1674 return 0;
1675}
1676
1677/*
1678 * kdb_mm - This function implements the 'mm' command.
1679 * mm address-expression new-value
1680 * Remarks:
 1681 * mm works on machine words; mmW (mm1, mm2, mm4) works on W bytes.
1682 */
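/*
 * For example (illustrative address and data only): 'mm 0xffffffff81234560 0x1'
 * writes a full machine word, while 'mm1 0xffffffff81234560 0x41' writes a
 * single byte at that address.
 */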
1683static int kdb_mm(int argc, const char **argv)
1684{
1685 int diag;
1686 unsigned long addr;
1687 long offset = 0;
1688 unsigned long contents;
1689 int nextarg;
1690 int width;
1691
1692 if (argv[0][2] && !isdigit(argv[0][2]))
1693 return KDB_NOTFOUND;
1694
1695 if (argc < 2)
1696 return KDB_ARGCOUNT;
1697
1698 nextarg = 1;
1699 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1700 if (diag)
1701 return diag;
1702
1703 if (nextarg > argc)
1704 return KDB_ARGCOUNT;
1705 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1706 if (diag)
1707 return diag;
1708
1709 if (nextarg != argc + 1)
1710 return KDB_ARGCOUNT;
1711
1712 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1713 diag = kdb_putword(addr, contents, width);
1714 if (diag)
1715 return diag;
1716
1717 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1718
1719 return 0;
1720}
1721
1722/*
1723 * kdb_go - This function implements the 'go' command.
1724 * go [address-expression]
1725 */
1726static int kdb_go(int argc, const char **argv)
1727{
1728 unsigned long addr;
1729 int diag;
1730 int nextarg;
1731 long offset;
1732
1733 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL);
1743 if (diag)
1744 return diag;
1745 } else if (argc) {
1746 return KDB_ARGCOUNT;
1747 }
1748
1749 diag = KDB_CMD_GO;
1750 if (KDB_FLAG(CATASTROPHIC)) {
1751 kdb_printf("Catastrophic error detected\n");
1752 kdb_printf("kdb_continue_catastrophic=%d, ",
1753 kdb_continue_catastrophic);
1754 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1755 kdb_printf("type go a second time if you really want "
1756 "to continue\n");
1757 return 0;
1758 }
1759 if (kdb_continue_catastrophic == 2) {
1760 kdb_printf("forcing reboot\n");
1761 kdb_reboot(0, NULL);
1762 }
1763 kdb_printf("attempting to continue\n");
1764 }
1765 return diag;
1766}
1767
1768/*
1769 * kdb_rd - This function implements the 'rd' command.
1770 */
1771static int kdb_rd(int argc, const char **argv)
1772{
1773 int diag = kdb_check_regs();
1774 if (diag)
1775 return diag;
1776
1777 kdb_dumpregs(kdb_current_regs);
1778 return 0;
1779}
1780
1781/*
1782 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents
1784 * Remarks:
1785 * Currently doesn't allow modification of control or
1786 * debug registers.
1787 */
1788static int kdb_rm(int argc, const char **argv)
1789{
1790 int diag;
1791 int ind = 0;
1792 unsigned long contents;
1793
1794 if (argc != 2)
1795 return KDB_ARGCOUNT;
1796 /*
1797 * Allow presence or absence of leading '%' symbol.
1798 */
1799 if (argv[1][0] == '%')
1800 ind = 1;
1801
1802 diag = kdbgetularg(argv[2], &contents);
1803 if (diag)
1804 return diag;
1805
1806 diag = kdb_check_regs();
1807 if (diag)
1808 return diag;
1809 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0;
1811}
1812
1813#if defined(CONFIG_MAGIC_SYSRQ)
1814/*
1815 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1816 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1817 * sr <magic-sysrq-code>
1818 */
1819static int kdb_sr(int argc, const char **argv)
1820{
1821 if (argc != 1)
1822 return KDB_ARGCOUNT;
1823 sysrq_toggle_support(1);
1824 kdb_trap_printk++;
1825 handle_sysrq(*argv[1], NULL);
1826 kdb_trap_printk--;
1827
1828 return 0;
1829}
1830#endif /* CONFIG_MAGIC_SYSRQ */
1831
1832/*
 1833 * kdb_ef - This function implements the 'ef' (display exception
1834 * frame) command. This command takes an address and expects to
1835 * find an exception frame at that address, formats and prints
1836 * it.
 1837 * ef address-expression
1838 * Remarks:
1839 * Not done yet.
1840 */
1841static int kdb_ef(int argc, const char **argv)
1842{
1843 int diag;
1844 unsigned long addr;
1845 long offset;
1846 int nextarg;
1847
1848 if (argc != 1)
1849 return KDB_ARGCOUNT;
1850
1851 nextarg = 1;
1852 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1853 if (diag)
1854 return diag;
1855 show_regs((struct pt_regs *)addr);
1856 return 0;
1857}
1858
1859#if defined(CONFIG_MODULES)
1860/* modules using other modules */
1861struct module_use {
1862 struct list_head list;
1863 struct module *module_which_uses;
1864};
1865
1866/*
1867 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1868 * currently loaded kernel modules.
1869 * Mostly taken from userland lsmod.
1870 */
1871static int kdb_lsmod(int argc, const char **argv)
1872{
1873 struct module *mod;
1874
1875 if (argc != 0)
1876 return KDB_ARGCOUNT;
1877
1878 kdb_printf("Module Size modstruct Used by\n");
1879 list_for_each_entry(mod, kdb_modules, list) {
1880
1881 kdb_printf("%-20s%8u 0x%p ", mod->name,
1882 mod->core_size, (void *)mod);
1883#ifdef CONFIG_MODULE_UNLOAD
1884 kdb_printf("%4d ", module_refcount(mod));
1885#endif
1886 if (mod->state == MODULE_STATE_GOING)
1887 kdb_printf(" (Unloading)");
1888 else if (mod->state == MODULE_STATE_COMING)
1889 kdb_printf(" (Loading)");
1890 else
1891 kdb_printf(" (Live)");
1892
1893#ifdef CONFIG_MODULE_UNLOAD
1894 {
1895 struct module_use *use;
1896 kdb_printf(" [ ");
1897 list_for_each_entry(use, &mod->modules_which_use_me,
1898 list)
1899 kdb_printf("%s ", use->module_which_uses->name);
1900 kdb_printf("]\n");
1901 }
1902#endif
1903 }
1904
1905 return 0;
1906}
1907
1908#endif /* CONFIG_MODULES */
1909
1910/*
1911 * kdb_env - This function implements the 'env' command. Display the
1912 * current environment variables.
1913 */
1914
1915static int kdb_env(int argc, const char **argv)
1916{
1917 int i;
1918
1919 for (i = 0; i < __nenv; i++) {
1920 if (__env[i])
1921 kdb_printf("%s\n", __env[i]);
1922 }
1923
1924 if (KDB_DEBUG(MASK))
1925 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
1926
1927 return 0;
1928}
1929
1930#ifdef CONFIG_PRINTK
1931/*
1932 * kdb_dmesg - This function implements the 'dmesg' command to display
1933 * the contents of the syslog buffer.
1934 * dmesg [lines] [adjust]
1935 */
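/*
 * For example (illustrative only): 'dmesg 20' prints the last 20 lines of
 * the log buffer, 'dmesg -20' prints the first 20, and the optional
 * [adjust] argument shifts the printed window by that many lines.
 */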
1936static int kdb_dmesg(int argc, const char **argv)
1937{
1938 char *syslog_data[4], *start, *end, c = '\0', *p;
1939 int diag, logging, logsize, lines = 0, adjust = 0, n;
1940
1941 if (argc > 2)
1942 return KDB_ARGCOUNT;
1943 if (argc) {
1944 char *cp;
1945 lines = simple_strtol(argv[1], &cp, 0);
1946 if (*cp)
1947 lines = 0;
1948 if (argc > 1) {
1949 adjust = simple_strtoul(argv[2], &cp, 0);
1950 if (*cp || adjust < 0)
1951 adjust = 0;
1952 }
1953 }
1954
1955 /* disable LOGGING if set */
1956 diag = kdbgetintenv("LOGGING", &logging);
1957 if (!diag && logging) {
1958 const char *setargs[] = { "set", "LOGGING", "0" };
1959 kdb_set(2, setargs);
1960 }
1961
1962 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
1963 * logical start, end+1. */
1964 kdb_syslog_data(syslog_data);
1965 if (syslog_data[2] == syslog_data[3])
1966 return 0;
1967 logsize = syslog_data[1] - syslog_data[0];
1968 start = syslog_data[2];
1969 end = syslog_data[3];
1970#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
1971 for (n = 0, p = start; p < end; ++p) {
1972 c = *KDB_WRAP(p);
1973 if (c == '\n')
1974 ++n;
1975 }
1976 if (c != '\n')
1977 ++n;
1978 if (lines < 0) {
1979 if (adjust >= n)
1980 kdb_printf("buffer only contains %d lines, nothing "
1981 "printed\n", n);
1982 else if (adjust - lines >= n)
1983 kdb_printf("buffer only contains %d lines, last %d "
1984 "lines printed\n", n, n - adjust);
1985 if (adjust) {
1986 for (; start < end && adjust; ++start) {
1987 if (*KDB_WRAP(start) == '\n')
1988 --adjust;
1989 }
1990 if (start < end)
1991 ++start;
1992 }
1993 for (p = start; p < end && lines; ++p) {
1994 if (*KDB_WRAP(p) == '\n')
1995 ++lines;
1996 }
1997 end = p;
1998 } else if (lines > 0) {
1999 int skip = n - (adjust + lines);
2000 if (adjust >= n) {
2001 kdb_printf("buffer only contains %d lines, "
2002 "nothing printed\n", n);
2003 skip = n;
2004 } else if (skip < 0) {
2005 lines += skip;
2006 skip = 0;
2007 kdb_printf("buffer only contains %d lines, first "
2008 "%d lines printed\n", n, lines);
2009 }
2010 for (; start < end && skip; ++start) {
2011 if (*KDB_WRAP(start) == '\n')
2012 --skip;
2013 }
2014 for (p = start; p < end && lines; ++p) {
2015 if (*KDB_WRAP(p) == '\n')
2016 --lines;
2017 }
2018 end = p;
2019 }
2020 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2021 c = '\n';
2022 while (start != end) {
2023 char buf[201];
2024 p = buf;
2025 if (KDB_FLAG(CMD_INTERRUPT))
2026 return 0;
2027 while (start < end && (c = *KDB_WRAP(start)) &&
2028 (p - buf) < sizeof(buf)-1) {
2029 ++start;
2030 *p++ = c;
2031 if (c == '\n')
2032 break;
2033 }
2034 *p = '\0';
2035 kdb_printf("%s", buf);
2036 }
2037 if (c != '\n')
2038 kdb_printf("\n");
2039
2040 return 0;
2041}
2042#endif /* CONFIG_PRINTK */
2043/*
2044 * kdb_cpu - This function implements the 'cpu' command.
2045 * cpu [<cpunum>]
2046 * Returns:
2047 * KDB_CMD_CPU for success, a kdb diagnostic if error
2048 */
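/*
 * For example, 'cpu' with no argument lists the state of each cpu, and
 * 'cpu 2' hands kdb control to cpu 2 (assuming cpu 2 is online).
 */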
2049static void kdb_cpu_status(void)
2050{
2051 int i, start_cpu, first_print = 1;
2052 char state, prev_state = '?';
2053
2054 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2055 kdb_printf("Available cpus: ");
2056 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2057 if (!cpu_online(i)) {
2058 state = 'F'; /* cpu is offline */
2059 } else {
2060 state = ' '; /* cpu is responding to kdb */
2061 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2062 state = 'I'; /* idle task */
2063 }
2064 if (state != prev_state) {
2065 if (prev_state != '?') {
2066 if (!first_print)
2067 kdb_printf(", ");
2068 first_print = 0;
2069 kdb_printf("%d", start_cpu);
2070 if (start_cpu < i-1)
2071 kdb_printf("-%d", i-1);
2072 if (prev_state != ' ')
2073 kdb_printf("(%c)", prev_state);
2074 }
2075 prev_state = state;
2076 start_cpu = i;
2077 }
2078 }
2079 /* print the trailing cpus, ignoring them if they are all offline */
2080 if (prev_state != 'F') {
2081 if (!first_print)
2082 kdb_printf(", ");
2083 kdb_printf("%d", start_cpu);
2084 if (start_cpu < i-1)
2085 kdb_printf("-%d", i-1);
2086 if (prev_state != ' ')
2087 kdb_printf("(%c)", prev_state);
2088 }
2089 kdb_printf("\n");
2090}
2091
2092static int kdb_cpu(int argc, const char **argv)
2093{
2094 unsigned long cpunum;
2095 int diag;
2096
2097 if (argc == 0) {
2098 kdb_cpu_status();
2099 return 0;
2100 }
2101
2102 if (argc != 1)
2103 return KDB_ARGCOUNT;
2104
2105 diag = kdbgetularg(argv[1], &cpunum);
2106 if (diag)
2107 return diag;
2108
2109 /*
2110 * Validate cpunum
2111 */
 2112 if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2113 return KDB_BADCPUNUM;
2114
2115 dbg_switch_cpu = cpunum;
2116
2117 /*
2118 * Switch to other cpu
2119 */
2120 return KDB_CMD_CPU;
2121}
2122
2123/* The user may not realize that ps/bta with no parameters does not print idle
2124 * or sleeping system daemon processes, so tell them how many were suppressed.
2125 */
2126void kdb_ps_suppressed(void)
2127{
2128 int idle = 0, daemon = 0;
2129 unsigned long mask_I = kdb_task_state_string("I"),
2130 mask_M = kdb_task_state_string("M");
2131 unsigned long cpu;
2132 const struct task_struct *p, *g;
2133 for_each_online_cpu(cpu) {
2134 p = kdb_curr_task(cpu);
2135 if (kdb_task_state(p, mask_I))
2136 ++idle;
2137 }
2138 kdb_do_each_thread(g, p) {
2139 if (kdb_task_state(p, mask_M))
2140 ++daemon;
2141 } kdb_while_each_thread(g, p);
2142 if (idle || daemon) {
2143 if (idle)
2144 kdb_printf("%d idle process%s (state I)%s\n",
2145 idle, idle == 1 ? "" : "es",
2146 daemon ? " and " : "");
2147 if (daemon)
2148 kdb_printf("%d sleeping system daemon (state M) "
2149 "process%s", daemon,
2150 daemon == 1 ? "" : "es");
2151 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2152 }
2153}
2154
2155/*
2156 * kdb_ps - This function implements the 'ps' command which shows a
2157 * list of the active processes.
2158 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2159 */
2160void kdb_ps1(const struct task_struct *p)
2161{
2162 int cpu;
2163 unsigned long tmp;
2164
2165 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2166 return;
2167
2168 cpu = kdb_process_cpu(p);
2169 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2170 (void *)p, p->pid, p->parent->pid,
2171 kdb_task_has_cpu(p), kdb_process_cpu(p),
2172 kdb_task_state_char(p),
2173 (void *)(&p->thread),
2174 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2175 p->comm);
2176 if (kdb_task_has_cpu(p)) {
2177 if (!KDB_TSK(cpu)) {
2178 kdb_printf(" Error: no saved data for this cpu\n");
2179 } else {
2180 if (KDB_TSK(cpu) != p)
2181 kdb_printf(" Error: does not match running "
2182 "process table (0x%p)\n", KDB_TSK(cpu));
2183 }
2184 }
2185}
2186
2187static int kdb_ps(int argc, const char **argv)
2188{
2189 struct task_struct *g, *p;
2190 unsigned long mask, cpu;
2191
2192 if (argc == 0)
2193 kdb_ps_suppressed();
2194 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2195 (int)(2*sizeof(void *))+2, "Task Addr",
2196 (int)(2*sizeof(void *))+2, "Thread");
2197 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2198 /* Run the active tasks first */
2199 for_each_online_cpu(cpu) {
2200 if (KDB_FLAG(CMD_INTERRUPT))
2201 return 0;
2202 p = kdb_curr_task(cpu);
2203 if (kdb_task_state(p, mask))
2204 kdb_ps1(p);
2205 }
2206 kdb_printf("\n");
2207 /* Now the real tasks */
2208 kdb_do_each_thread(g, p) {
2209 if (KDB_FLAG(CMD_INTERRUPT))
2210 return 0;
2211 if (kdb_task_state(p, mask))
2212 kdb_ps1(p);
2213 } kdb_while_each_thread(g, p);
2214
2215 return 0;
2216}
2217
2218/*
2219 * kdb_pid - This function implements the 'pid' command which switches
2220 * the currently active process.
2221 * pid [<pid> | R]
2222 */
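/*
 * For example, 'pid 1' makes the init task the current kdb task, and
 * 'pid R' switches back to the task that was running on the initial
 * kdb cpu.
 */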
2223static int kdb_pid(int argc, const char **argv)
2224{
2225 struct task_struct *p;
2226 unsigned long val;
2227 int diag;
2228
2229 if (argc > 1)
2230 return KDB_ARGCOUNT;
2231
2232 if (argc) {
2233 if (strcmp(argv[1], "R") == 0) {
2234 p = KDB_TSK(kdb_initial_cpu);
2235 } else {
2236 diag = kdbgetularg(argv[1], &val);
2237 if (diag)
2238 return KDB_BADINT;
2239
2240 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2241 if (!p) {
2242 kdb_printf("No task with pid=%d\n", (pid_t)val);
2243 return 0;
2244 }
2245 }
2246 kdb_set_current_task(p);
2247 }
2248 kdb_printf("KDB current process is %s(pid=%d)\n",
2249 kdb_current_task->comm,
2250 kdb_current_task->pid);
2251
2252 return 0;
2253}
2254
2255/*
2256 * kdb_ll - This function implements the 'll' command which follows a
2257 * linked list and executes an arbitrary command for each
2258 * element.
2259 */
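/*
 * A sketch of the usage (the address is illustrative):
 *	ll <first-element> <linkoffset> <cmd>
 * e.g. 'll 0xffffffff81a00000 0 md4c4' runs 'md4c4 <element>' for each
 * node, following the pointer stored at <linkoffset> bytes into each
 * element until it reads as NULL.
 */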
2260static int kdb_ll(int argc, const char **argv)
2261{
2262 int diag;
2263 unsigned long addr;
2264 long offset = 0;
2265 unsigned long va;
2266 unsigned long linkoffset;
2267 int nextarg;
2268 const char *command;
2269
2270 if (argc != 3)
2271 return KDB_ARGCOUNT;
2272
2273 nextarg = 1;
2274 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2275 if (diag)
2276 return diag;
2277
2278 diag = kdbgetularg(argv[2], &linkoffset);
2279 if (diag)
2280 return diag;
2281
2282 /*
2283 * Using the starting address as
2284 * the first element in the list, and assuming that
2285 * the list ends with a null pointer.
2286 */
2287
2288 va = addr;
2289 command = kdb_strdup(argv[3], GFP_KDB);
2290 if (!command) {
2291 kdb_printf("%s: cannot duplicate command\n", __func__);
2292 return 0;
2293 }
2294 /* Recursive use of kdb_parse, do not use argv after this point */
2295 argv = NULL;
2296
2297 while (va) {
2298 char buf[80];
2299
2300 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
 2301 diag = kdb_parse(buf);
 2302 if (diag) {
 kfree(command);
 2303 return diag;
 }
 2304
 2305 addr = va + linkoffset;
 2306 if (kdb_getword(&va, addr, sizeof(va))) {
 kfree(command);
 2307 return 0;
 }
2308 }
2309 kfree(command);
2310
2311 return 0;
2312}
2313
2314static int kdb_kgdb(int argc, const char **argv)
2315{
2316 return KDB_CMD_KGDB;
2317}
2318
2319/*
2320 * kdb_help - This function implements the 'help' and '?' commands.
2321 */
2322static int kdb_help(int argc, const char **argv)
2323{
2324 kdbtab_t *kt;
2325 int i;
2326
2327 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2328 kdb_printf("-----------------------------"
2329 "-----------------------------\n");
2330 for_each_kdbcmd(kt, i) {
2331 if (kt->cmd_name)
2332 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2333 kt->cmd_usage, kt->cmd_help);
2334 if (KDB_FLAG(CMD_INTERRUPT))
2335 return 0;
2336 }
2337 return 0;
2338}
2339
2340/*
2341 * kdb_kill - This function implements the 'kill' commands.
2342 */
2343static int kdb_kill(int argc, const char **argv)
2344{
2345 long sig, pid;
2346 char *endp;
2347 struct task_struct *p;
2348 struct siginfo info;
2349
2350 if (argc != 2)
2351 return KDB_ARGCOUNT;
2352
2353 sig = simple_strtol(argv[1], &endp, 0);
2354 if (*endp)
2355 return KDB_BADINT;
2356 if (sig >= 0) {
2357 kdb_printf("Invalid signal parameter.<-signal>\n");
2358 return 0;
2359 }
2360 sig = -sig;
2361
2362 pid = simple_strtol(argv[2], &endp, 0);
2363 if (*endp)
2364 return KDB_BADINT;
2365 if (pid <= 0) {
2366 kdb_printf("Process ID must be large than 0.\n");
2367 return 0;
2368 }
2369
2370 /* Find the process. */
2371 p = find_task_by_pid_ns(pid, &init_pid_ns);
2372 if (!p) {
2373 kdb_printf("The specified process isn't found.\n");
2374 return 0;
2375 }
2376 p = p->group_leader;
2377 info.si_signo = sig;
2378 info.si_errno = 0;
2379 info.si_code = SI_USER;
2380 info.si_pid = pid; /* same capabilities as process being signalled */
2381 info.si_uid = 0; /* kdb has root authority */
2382 kdb_send_sig_info(p, &info);
2383 return 0;
2384}
2385
2386struct kdb_tm {
2387 int tm_sec; /* seconds */
2388 int tm_min; /* minutes */
2389 int tm_hour; /* hours */
2390 int tm_mday; /* day of the month */
2391 int tm_mon; /* month */
2392 int tm_year; /* year */
2393};
2394
2395static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2396{
 2397 /* This will work from 1970-2099; 2100 is not a leap year */
2398 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2399 31, 30, 31, 30, 31 };
2400 memset(tm, 0, sizeof(*tm));
2401 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2402 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2403 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2404 tm->tm_min = tm->tm_sec / 60 % 60;
2405 tm->tm_hour = tm->tm_sec / 60 / 60;
2406 tm->tm_sec = tm->tm_sec % 60;
2407 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2408 tm->tm_mday %= (4*365+1);
2409 mon_day[1] = 29;
2410 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2411 tm->tm_mday -= mon_day[tm->tm_mon];
2412 if (++tm->tm_mon == 12) {
2413 tm->tm_mon = 0;
2414 ++tm->tm_year;
2415 mon_day[1] = 28;
2416 }
2417 }
2418 ++tm->tm_mday;
2419}
2420
2421/*
2422 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
2423 * I cannot call that code directly from kdb, it has an unconditional
2424 * cli()/sti() and calls routines that take locks which can stop the debugger.
2425 */
2426static void kdb_sysinfo(struct sysinfo *val)
2427{
2428 struct timespec uptime;
2429 do_posix_clock_monotonic_gettime(&uptime);
2430 memset(val, 0, sizeof(*val));
2431 val->uptime = uptime.tv_sec;
2432 val->loads[0] = avenrun[0];
2433 val->loads[1] = avenrun[1];
2434 val->loads[2] = avenrun[2];
2435 val->procs = nr_threads-1;
2436 si_meminfo(val);
2437
2438 return;
2439}
2440
2441/*
2442 * kdb_summary - This function implements the 'summary' command.
2443 */
2444static int kdb_summary(int argc, const char **argv)
2445{
2446 struct kdb_tm tm;
2447 struct sysinfo val;
2448
2449 if (argc)
2450 return KDB_ARGCOUNT;
2451
2452 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2453 kdb_printf("release %s\n", init_uts_ns.name.release);
2454 kdb_printf("version %s\n", init_uts_ns.name.version);
2455 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2456 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2457 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2458 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2459
2460 kdb_gmtime(&xtime, &tm);
2461 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2462 "tz_minuteswest %d\n",
2463 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2464 tm.tm_hour, tm.tm_min, tm.tm_sec,
2465 sys_tz.tz_minuteswest);
2466
2467 kdb_sysinfo(&val);
2468 kdb_printf("uptime ");
2469 if (val.uptime > (24*60*60)) {
2470 int days = val.uptime / (24*60*60);
2471 val.uptime %= (24*60*60);
2472 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2473 }
2474 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2475
2476 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2477
2478#define LOAD_INT(x) ((x) >> FSHIFT)
2479#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2480 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2481 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2482 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2483 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2484#undef LOAD_INT
2485#undef LOAD_FRAC
2486 /* Display in kilobytes */
2487#define K(x) ((x) << (PAGE_SHIFT - 10))
2488 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2489 "Buffers: %8lu kB\n",
 2490 K(val.totalram), K(val.freeram), K(val.bufferram));
2491 return 0;
2492}
2493
2494/*
 2495 * kdb_per_cpu - This function implements the 'per_cpu' command.
 * per_cpu <symbol> [<bytes>] [<cpu>]
2496 */
2497static int kdb_per_cpu(int argc, const char **argv)
2498{
2499 char buf[256], fmtstr[64];
2500 kdb_symtab_t symtab;
2501 cpumask_t suppress = CPU_MASK_NONE;
2502 int cpu, diag;
2503 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2504
2505 if (argc < 1 || argc > 3)
2506 return KDB_ARGCOUNT;
2507
2508 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2509 if (!kdbgetsymval(buf, &symtab)) {
2510 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2511 return KDB_BADADDR;
2512 }
2513 if (argc >= 2) {
2514 diag = kdbgetularg(argv[2], &bytesperword);
2515 if (diag)
2516 return diag;
2517 }
2518 if (!bytesperword)
2519 bytesperword = KDB_WORD_SIZE;
2520 else if (bytesperword > KDB_WORD_SIZE)
2521 return KDB_BADWIDTH;
2522 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2523 if (argc >= 3) {
2524 diag = kdbgetularg(argv[3], &whichcpu);
2525 if (diag)
2526 return diag;
2527 if (!cpu_online(whichcpu)) {
2528 kdb_printf("cpu %ld is not online\n", whichcpu);
2529 return KDB_BADCPUNUM;
2530 }
2531 }
2532
2533 /* Most architectures use __per_cpu_offset[cpu], some use
 2534 * __per_cpu_offset(cpu), non-SMP builds have no __per_cpu_offset.
2535 */
2536#ifdef __per_cpu_offset
2537#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2538#else
2539#ifdef CONFIG_SMP
2540#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2541#else
2542#define KDB_PCU(cpu) 0
2543#endif
2544#endif
2545
2546 for_each_online_cpu(cpu) {
2547 if (whichcpu != ~0UL && whichcpu != cpu)
2548 continue;
2549 addr = symtab.sym_start + KDB_PCU(cpu);
2550 diag = kdb_getword(&val, addr, bytesperword);
2551 if (diag) {
2552 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2553 "read, diag=%d\n", cpu, addr, diag);
2554 continue;
2555 }
2556#ifdef CONFIG_SMP
2557 if (!val) {
2558 cpu_set(cpu, suppress);
2559 continue;
2560 }
2561#endif /* CONFIG_SMP */
2562 kdb_printf("%5d ", cpu);
2563 kdb_md_line(fmtstr, addr,
2564 bytesperword == KDB_WORD_SIZE,
2565 1, bytesperword, 1, 1, 0);
2566 }
2567 if (cpus_weight(suppress) == 0)
2568 return 0;
2569 kdb_printf("Zero suppressed cpu(s):");
2570 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2571 cpu = next_cpu(cpu, suppress)) {
2572 kdb_printf(" %d", cpu);
2573 if (cpu == num_possible_cpus() - 1 ||
2574 next_cpu(cpu, suppress) != cpu + 1)
2575 continue;
2576 while (cpu < num_possible_cpus() &&
2577 next_cpu(cpu, suppress) == cpu + 1)
2578 ++cpu;
2579 kdb_printf("-%d", cpu);
2580 }
2581 kdb_printf("\n");
2582
2583#undef KDB_PCU
2584
2585 return 0;
2586}
2587
2588/*
2589 * display help for the use of cmd | grep pattern
2590 */
2591static int kdb_grep_help(int argc, const char **argv)
2592{
2593 kdb_printf("Usage of cmd args | grep pattern:\n");
2594 kdb_printf(" Any command's output may be filtered through an ");
2595 kdb_printf("emulated 'pipe'.\n");
2596 kdb_printf(" 'grep' is just a key word.\n");
2597 kdb_printf(" The pattern may include a very limited set of "
2598 "metacharacters:\n");
2599 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2600 kdb_printf(" And if there are spaces in the pattern, you may "
2601 "quote it:\n");
2602 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2603 " or \"^pat tern$\"\n");
2604 return 0;
2605}
2606
2607/*
2608 * kdb_register_repeat - This function is used to register a kernel
2609 * debugger command.
2610 * Inputs:
2611 * cmd Command name
2612 * func Function to execute the command
2613 * usage A simple usage string showing arguments
2614 * help A simple help string describing command
2615 * repeat Does the command auto repeat on enter?
2616 * Returns:
2617 * zero for success, one if a duplicate command.
2618 */
2619#define kdb_command_extend 50 /* arbitrary */
2620int kdb_register_repeat(char *cmd,
2621 kdb_func_t func,
2622 char *usage,
2623 char *help,
2624 short minlen,
2625 kdb_repeat_t repeat)
2626{
2627 int i;
2628 kdbtab_t *kp;
2629
2630 /*
2631 * Brute force method to determine duplicates
2632 */
2633 for_each_kdbcmd(kp, i) {
2634 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2635 kdb_printf("Duplicate kdb command registered: "
2636 "%s, func %p help %s\n", cmd, func, help);
2637 return 1;
2638 }
2639 }
2640
2641 /*
2642 * Insert command into first available location in table
2643 */
2644 for_each_kdbcmd(kp, i) {
2645 if (kp->cmd_name == NULL)
2646 break;
2647 }
2648
2649 if (i >= kdb_max_commands) {
2650 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2651 kdb_command_extend) * sizeof(*new), GFP_KDB);
2652 if (!new) {
2653 kdb_printf("Could not allocate new kdb_command "
2654 "table\n");
2655 return 1;
2656 }
2657 if (kdb_commands) {
2658 memcpy(new, kdb_commands,
2659 kdb_max_commands * sizeof(*new));
2660 kfree(kdb_commands);
2661 }
2662 memset(new + kdb_max_commands, 0,
2663 kdb_command_extend * sizeof(*new));
2664 kdb_commands = new;
2665 kp = kdb_commands + kdb_max_commands;
2666 kdb_max_commands += kdb_command_extend;
2667 }
2668
2669 kp->cmd_name = cmd;
2670 kp->cmd_func = func;
2671 kp->cmd_usage = usage;
2672 kp->cmd_help = help;
2673 kp->cmd_flags = 0;
2674 kp->cmd_minlen = minlen;
2675 kp->cmd_repeat = repeat;
2676
2677 return 0;
2678}
2679
2680/*
2681 * kdb_register - Compatibility register function for commands that do
2682 * not need to specify a repeat state. Equivalent to
2683 * kdb_register_repeat with KDB_REPEAT_NONE.
2684 * Inputs:
2685 * cmd Command name
2686 * func Function to execute the command
2687 * usage A simple usage string showing arguments
2688 * help A simple help string describing command
2689 * Returns:
2690 * zero for success, one if a duplicate command.
2691 */
2692int kdb_register(char *cmd,
2693 kdb_func_t func,
2694 char *usage,
2695 char *help,
2696 short minlen)
2697{
2698 return kdb_register_repeat(cmd, func, usage, help, minlen,
2699 KDB_REPEAT_NONE);
2700}
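/*
 * As a sketch (the command name and handler are illustrative), a module
 * that provides an extra kdb command would typically do:
 *
 *	static int my_cmd(int argc, const char **argv)
 *	{
 *		kdb_printf("my_cmd called with %d argument(s)\n", argc);
 *		return 0;
 *	}
 *
 *	kdb_register_repeat("mycmd", my_cmd, "<arg>",
 *			    "Describe mycmd", 0, KDB_REPEAT_NONE);
 *
 * and remove the command again with kdb_unregister("mycmd") when the
 * module is unloaded.
 */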
2701
2702/*
2703 * kdb_unregister - This function is used to unregister a kernel
2704 * debugger command. It is generally called when a module which
2705 * implements kdb commands is unloaded.
2706 * Inputs:
2707 * cmd Command name
2708 * Returns:
 2709 * zero for success, one if the command was not registered.
2710 */
2711int kdb_unregister(char *cmd)
2712{
2713 int i;
2714 kdbtab_t *kp;
2715
2716 /*
2717 * find the command.
2718 */
2719 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2720 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2721 kp->cmd_name = NULL;
2722 return 0;
2723 }
2724 }
2725
2726 /* Couldn't find it. */
2727 return 1;
2728}
2729
2730/* Initialize the kdb command table. */
2731static void __init kdb_inittab(void)
2732{
2733 int i;
2734 kdbtab_t *kp;
2735
2736 for_each_kdbcmd(kp, i)
2737 kp->cmd_name = NULL;
2738
2739 kdb_register_repeat("md", kdb_md, "<vaddr>",
2740 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2741 KDB_REPEAT_NO_ARGS);
2742 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2743 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2744 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2745 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2746 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2747 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2748 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2749 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2750 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2751 "Continue Execution", 1, KDB_REPEAT_NONE);
2752 kdb_register_repeat("rd", kdb_rd, "",
2753 "Display Registers", 0, KDB_REPEAT_NONE);
2754 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2755 "Modify Registers", 0, KDB_REPEAT_NONE);
2756 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2757 "Display exception frame", 0, KDB_REPEAT_NONE);
2758 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2759 "Stack traceback", 1, KDB_REPEAT_NONE);
2760 kdb_register_repeat("btp", kdb_bt, "<pid>",
2761 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2762 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2763 "Display stack all processes", 0, KDB_REPEAT_NONE);
2764 kdb_register_repeat("btc", kdb_bt, "",
2765 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2766 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2767 "Backtrace process given its struct task address", 0,
2768 KDB_REPEAT_NONE);
2769 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2770 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2771 kdb_register_repeat("env", kdb_env, "",
2772 "Show environment variables", 0, KDB_REPEAT_NONE);
2773 kdb_register_repeat("set", kdb_set, "",
2774 "Set environment variables", 0, KDB_REPEAT_NONE);
2775 kdb_register_repeat("help", kdb_help, "",
2776 "Display Help Message", 1, KDB_REPEAT_NONE);
2777 kdb_register_repeat("?", kdb_help, "",
2778 "Display Help Message", 0, KDB_REPEAT_NONE);
2779 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2780 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2781 kdb_register_repeat("kgdb", kdb_kgdb, "",
2782 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2783 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2784 "Display active task list", 0, KDB_REPEAT_NONE);
2785 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2786 "Switch to another task", 0, KDB_REPEAT_NONE);
2787 kdb_register_repeat("reboot", kdb_reboot, "",
2788 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2789#if defined(CONFIG_MODULES)
2790 kdb_register_repeat("lsmod", kdb_lsmod, "",
2791 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2792#endif
2793#if defined(CONFIG_MAGIC_SYSRQ)
2794 kdb_register_repeat("sr", kdb_sr, "<key>",
2795 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2796#endif
2797#if defined(CONFIG_PRINTK)
2798 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2799 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2800#endif
2801 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2802 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2803 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2804 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2805 kdb_register_repeat("summary", kdb_summary, "",
2806 "Summarize the system", 4, KDB_REPEAT_NONE);
2807 kdb_register_repeat("per_cpu", kdb_per_cpu, "",
2808 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2809 kdb_register_repeat("grephelp", kdb_grep_help, "",
2810 "Display help on | grep", 0, KDB_REPEAT_NONE);
2811}
2812
2813/* Execute any commands defined in kdb_cmds. */
2814static void __init kdb_cmd_init(void)
2815{
2816 int i, diag;
2817 for (i = 0; kdb_cmds[i]; ++i) {
2818 diag = kdb_parse(kdb_cmds[i]);
2819 if (diag)
2820 kdb_printf("kdb command %s failed, kdb diag %d\n",
2821 kdb_cmds[i], diag);
2822 }
2823 if (defcmd_in_progress) {
2824 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2825 kdb_parse("endefcmd");
2826 }
2827}
2828
 2829/* Initialize kdb_printf, breakpoint tables and kdb state */
2830void __init kdb_init(int lvl)
2831{
2832 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2833 int i;
2834
2835 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2836 return;
2837 for (i = kdb_init_lvl; i < lvl; i++) {
2838 switch (i) {
2839 case KDB_NOT_INITIALIZED:
2840 kdb_inittab(); /* Initialize Command Table */
2841 kdb_initbptab(); /* Initialize Breakpoints */
2842 break;
2843 case KDB_INIT_EARLY:
2844 kdb_cmd_init(); /* Build kdb_cmds tables */
2845 break;
2846 }
2847 }
2848 kdb_init_lvl = lvl;
2849}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..97d3ba69775d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,300 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
 92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
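/*
 * For example (a minimal sketch): to fetch a single byte and bail out
 * on a fault, a caller can do
 *	unsigned char c;
 *	if (kdb_getarea(c, addr))
 *		return 0;
 * which mirrors how kdb_mdr() reads memory one byte at a time.
 */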
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *);
153extern int kdbnearsym(unsigned long, kdb_symtab_t *);
154extern void kdbnearsym_cleanup(void);
155extern char *kdb_strdup(const char *str, gfp_t type);
156extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
157
158/* Routine for debugging the debugger state. */
159extern void kdb_print_state(const char *, int);
160
161extern int kdb_state;
162#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
163#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
164#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
165#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
166 * kdb control */
167#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
168#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
169#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
170 * DOING_SS is also set */
171#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
172 * after one ss, independent of
173 * DOING_SS */
174#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
175#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
176#define KDB_STATE_PAGER 0x00000400 /* pager is available */
177#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
178 * back to initial cpu */
179#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
180#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
181#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
182#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
183 * adjusted */
184#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
185#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
186 * keyboard on this cpu */
187#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
188#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
189#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
190#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
191#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
192 * specific use */
193
194#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
195#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
196#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
197
198extern int kdb_nextline; /* Current number of lines displayed */
199
200typedef struct _kdb_bp {
201 unsigned long bp_addr; /* Address breakpoint is present at */
202 unsigned int bp_free:1; /* This entry is available */
203 unsigned int bp_enabled:1; /* Breakpoint is active in register */
204 unsigned int bp_type:4; /* Uses hardware register */
205 unsigned int bp_installed:1; /* Breakpoint is installed */
206 unsigned int bp_delay:1; /* Do delayed bp handling */
207 unsigned int bp_delayed:1; /* Delayed breakpoint */
208 unsigned int bph_length; /* HW break length */
209} kdb_bp_t;
210
211#ifdef CONFIG_KGDB_KDB
212extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
213
214/* The KDB shell command table */
215typedef struct _kdbtab {
216 char *cmd_name; /* Command name */
217 kdb_func_t cmd_func; /* Function to execute command */
218 char *cmd_usage; /* Usage String for this command */
219 char *cmd_help; /* Help message for this command */
220 short cmd_flags; /* Parsing flags */
221 short cmd_minlen; /* Minimum legal # command
222 * chars required */
223 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
224} kdbtab_t;
225
226extern int kdb_bt(int, const char **); /* KDB display back trace */
227
228/* KDB breakpoint management functions */
229extern void kdb_initbptab(void);
230extern void kdb_bp_install(struct pt_regs *);
231extern void kdb_bp_remove(void);
232
233typedef enum {
234 KDB_DB_BPT, /* Breakpoint */
235 KDB_DB_SS, /* Single-step trap */
236 KDB_DB_SSB, /* Single step to branch */
237 KDB_DB_SSBPT, /* Single step over breakpoint */
238 KDB_DB_NOBPT /* Spurious breakpoint */
239} kdb_dbtrap_t;
240
241extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
242 int, kdb_dbtrap_t, struct pt_regs *);
243
244/* Miscellaneous functions and data areas */
245extern int kdb_grepping_flag;
246extern char kdb_grep_string[];
247extern int kdb_grep_leading;
248extern int kdb_grep_trailing;
249extern char *kdb_cmds[];
250extern void kdb_syslog_data(char *syslog_data[]);
251extern unsigned long kdb_task_state_string(const char *);
252extern char kdb_task_state_char (const struct task_struct *);
253extern unsigned long kdb_task_state(const struct task_struct *p,
254 unsigned long mask);
255extern void kdb_ps_suppressed(void);
256extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *);
262
263/* Defines for kdb_symbol_print */
264#define KDB_SP_SPACEB 0x0001 /* Space before string */
265#define KDB_SP_SPACEA 0x0002 /* Space after string */
266#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
267#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
268#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
269#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
270#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
271
272#define KDB_TSK(cpu) kgdb_info[cpu].task
273#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
274
275extern struct task_struct *kdb_curr_task(int);
276
277#define kdb_task_has_cpu(p) (task_curr(p))
278
279/* Simplify coexistence with NPTL */
280#define kdb_do_each_thread(g, p) do_each_thread(g, p)
281#define kdb_while_each_thread(g, p) while_each_thread(g, p)
282
283#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
284
285extern void *debug_kmalloc(size_t size, gfp_t flags);
286extern void debug_kfree(void *);
287extern void debug_kusage(void);
288
289extern void kdb_set_current_task(struct task_struct *);
290extern struct task_struct *kdb_current_task;
291#ifdef CONFIG_MODULES
292extern struct list_head *kdb_modules;
293#endif /* CONFIG_MODULES */
294
295extern char kdb_prompt_str[];
296
297#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
298
299#endif /* CONFIG_KGDB_KDB */
300#endif /* !_KDBPRIVATE_H */
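The KDB_SP_* bits above are or'ed together by callers of kdb_symbol_print(); a minimal hedged sketch of the common pattern follows (the address constant is purely illustrative, not from the patch):

	/* Hedged example: prints "0x<addr> (symbol+0xoff)" and a newline.
	 * Passing a NULL symtab makes kdb_symbol_print() do its own lookup.
	 */
	kdb_symbol_print(0xffffffff81000000UL, NULL,
			 KDB_SP_DEFAULT | KDB_SP_NEWLINE);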
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
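A hedged sketch of a typical caller of kdbgetsymval(), following the contract documented above (the symbol name is only an example):

	static void example_lookup(void)
	{
		kdb_symtab_t symtab;

		if (kdbgetsymval("schedule", &symtab))
			kdb_printf("schedule is at 0x%lx\n", symtab.sym_start);
		else
			kdb_printf("schedule: symbol not found\n");
	}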
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
79 * hold active strings, no kdb caller of kdbnearsym makes more
80 * than ~20 later calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize;
86 unsigned long offset;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
102	symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132			memmove(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138			memmove(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
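For illustration, the usual consumer pattern for kdbnearsym() looks roughly like this (a sketch, not code from the patch; the address is whatever the caller wants decoded):

	static void example_report(unsigned long addr)
	{
		kdb_symtab_t symtab;

		if (kdbnearsym(addr, &symtab))
			kdb_printf("0x%lx is %s+0x%lx [%s]\n", addr,
				   symtab.sym_name, addr - symtab.sym_start,
				   symtab.mod_name);
		else
			kdb_printf("0x%lx: no nearby symbol\n", addr);
	}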
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
217
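A hedged sketch of how kdb's tab-completion path might drive this helper; the buffer contents are illustrative:

	static int example_complete(void)
	{
		static char buf[KSYM_NAME_LEN];
		int matches;

		strcpy(buf, "schedule_time");	/* what the user typed so far */
		matches = kallsyms_symbol_complete(buf, sizeof(buf));
		/* buf now holds the longest prefix shared by every match,
		 * e.g. "schedule_timeout" if only that family matched.
		 */
		return matches;
	}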
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353	int ret = probe_kernel_write((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
496 *	addr	Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
535
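Taken together, kdb_getword() and kdb_putword() give kdb a checked read-modify-write primitive; a hedged sketch (the address and the bit being set are illustrative):

	static int example_set_bit0(unsigned long addr)
	{
		unsigned long word;
		int diag;

		diag = kdb_getword(&word, addr, 4);
		if (diag)
			return diag;
		word |= 0x1;
		return kdb_putword(addr, word, 4);
	}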
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
548 *	be careful not to overlap the bits.  TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
613
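As a worked example of the mask described above (illustrative only): kdb_task_state_string("RZ") sets the artificial RUNNING bit plus EXIT_ZOMBIE shifted left by 16, and the default used when no PS environment variable is set is equivalent to:

	static unsigned long example_default_ps_mask(void)
	{
		/* Same result as passing NULL with no PS variable defined. */
		return kdb_task_state_string("DRSTCZEU");
	}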
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed. One smallish memory pool, the free
688 * chain is always in ascending address order to allow coalescing,
689 * allocations are done in brute force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment, explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context. Try to get the debug allocator lock, if it cannot
715 * be obtained after a second then give up. If the lock could not be
716 * previously obtained on this cpu then only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
802
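A hedged sketch of the intended usage pattern for this last-ditch pool (the buffer size is illustrative):

	static void example_scratch(void)
	{
		char *buf = debug_kmalloc(256, GFP_KDB);

		if (!buf)
			return;		/* the pool itself can be exhausted */
		/* ... use buf as temporary debugger scratch space ... */
		debug_kfree(buf);
	}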
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
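A hedged sketch of how the small flags stack above is meant to be paired around a nested kdb operation:

	static void example_nested(void)
	{
		kdb_save_flags();
		/* ... temporarily adjust kdb_flags for the nested pass ... */
		kdb_restore_flags();
	}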
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..eabca5a73a85 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,7 +55,6 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
diff --git a/kernel/fork.c b/kernel/fork.c
index 44b0791b0a2e..4d57d9e3a6e9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1112,10 +1112,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1112 p->memcg_batch.memcg = NULL; 1112 p->memcg_batch.memcg = NULL;
1113#endif 1113#endif
1114 1114
1115 p->bts = NULL;
1116
1117 p->stack_start = stack_start;
1118
1119 /* Perform scheduler related setup. Assign this task to a CPU. */ 1115 /* Perform scheduler related setup. Assign this task to a CPU. */
1120 sched_fork(p, clone_flags); 1116 sched_fork(p, clone_flags);
1121 1117
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..b9b134b35088 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel gives the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
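For context, a hedged sketch of the usual driver-side call into the (unchanged) schedule_hrtimeout_range() wrapper; the 10 ms timeout and 1 ms slack are illustrative:

	static int example_wait(void)
	{
		ktime_t timeout = ktime_set(0, 10 * NSEC_PER_MSEC);

		set_current_state(TASK_INTERRUPTIBLE);
		/* returns 0 on expiry, -EINTR if a signal arrived first */
		return schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC,
						HRTIMER_MODE_REL);
	}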
1822 1837
1823/** 1838/**
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a6..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * has in this cpu 91 * has in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
180 * on the same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
288 enum bp_type_idx type;
289 int weight;
249 290
250 fetch_bp_busy_slots(&slots, bp); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
294
295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
251 305
252 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 308 return -ENOSPC;
255 309
256 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
257 311
258 return 0; 312 return 0;
259} 313}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
273 327
274static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp)
275{ 329{
276 toggle_bp_slot(bp, false); 330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
277} 336}
278 337
279void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 367 return 0;
309} 368}
310 369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
391
311int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 393{
313 int ret; 394 int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 397 if (ret)
317 return ret; 398 return ret;
318 399
319 /* 400 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 401
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 403 if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 444 if (attr->disabled)
374 goto end; 445 goto end;
375 446
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
377 if (!err) 448 if (!err)
378 perf_event_enable(bp); 449 perf_event_enable(bp);
379 450
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 551
481static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
482{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
484} 584}
485core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
486 586
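The reworked accounting depends on two per-architecture hooks; a hedged sketch of what an architecture with four slots of each class might provide (the numbers are illustrative and not taken from any real port):

	/* Illustrative only: four slots per breakpoint class. */
	int hw_breakpoint_slots(int type)
	{
		return 4;
	}

	/* Optional override of the __weak default above; each breakpoint
	 * still consumes a single slot here.
	 */
	int hw_breakpoint_weight(struct perf_event *bp)
	{
		return 1;
	}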
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488730a5..3164ba7ce151 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
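A hedged sketch of how a multi-queue driver might publish a hint for one of its vectors; the CPU choice and the static mask are illustrative:

	static cpumask_var_t example_mask;

	static int example_hint(unsigned int irq, int cpu)
	{
		if (!zalloc_cpumask_var(&example_mask, GFP_KERNEL))
			return -ENOMEM;
		cpumask_set_cpu(cpu, example_mask);
		/* the driver must clear the hint (pass NULL) before it
		 * later frees example_mask
		 */
		return irq_set_affinity_hint(irq, example_mask);
	}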
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -757,16 +773,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
757 if (new->flags & IRQF_ONESHOT) 773 if (new->flags & IRQF_ONESHOT)
758 desc->status |= IRQ_ONESHOT; 774 desc->status |= IRQ_ONESHOT;
759 775
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
770 if (!(desc->status & IRQ_NOAUTOEN)) { 776 if (!(desc->status & IRQ_NOAUTOEN)) {
771 desc->depth = 0; 777 desc->depth = 0;
772 desc->status &= ~IRQ_DISABLED; 778 desc->status &= ~IRQ_DISABLED;
@@ -916,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
916 desc->chip->disable(irq); 922 desc->chip->disable(irq);
917 } 923 }
918 924
925#ifdef CONFIG_SMP
926 /* make sure affinity_hint is cleaned up */
927 if (WARN_ON_ONCE(desc->affinity_hint))
928 desc->affinity_hint = NULL;
929#endif
930
919 raw_spin_unlock_irqrestore(&desc->lock, flags); 931 raw_spin_unlock_irqrestore(&desc->lock, flags);
920 932
921 unregister_handler_proc(irq, action); 933 unregister_handler_proc(irq, action);
@@ -1027,7 +1039,6 @@ EXPORT_SYMBOL(free_irq);
1027 * Flags: 1039 * Flags:
1028 * 1040 *
1029 * IRQF_SHARED Interrupt is shared 1041 * IRQF_SHARED Interrupt is shared
1030 * IRQF_DISABLED Disable local interrupts while processing
1031 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1042 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1032 * IRQF_TRIGGER_* Specify active edge(s) or level 1043 * IRQF_TRIGGER_* Specify active edge(s) or level
1033 * 1044 *
@@ -1041,25 +1052,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1041 int retval; 1052 int retval;
1042 1053
1043 /* 1054 /*
1044 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1045 * the _first_ irqaction (sigh). That can cause oopsing, but
1046 * the behavior is classified as "will not fix" so we need to
1047 * start nudging drivers away from using that idiom.
1048 */
1049 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1050 (IRQF_SHARED|IRQF_DISABLED)) {
1051 pr_warning(
1052 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1053 irq, devname);
1054 }
1055
1056#ifdef CONFIG_LOCKDEP
1057 /*
1058 * Lockdep wants atomic interrupt handlers:
1059 */
1060 irqflags |= IRQF_DISABLED;
1061#endif
1062 /*
1063 * Sanity-check: shared interrupts must pass in a real dev-ID, 1055 * Sanity-check: shared interrupts must pass in a real dev-ID,
1064 * otherwise we'll have trouble later trying to figure out 1056 * otherwise we'll have trouble later trying to figure out
1065 * which interrupt is which (messes up the interrupt freeing 1057 * which interrupt is which (messes up the interrupt freeing
@@ -1120,3 +1112,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1120 return retval; 1112 return retval;
1121} 1113}
1122EXPORT_SYMBOL(request_threaded_irq); 1114EXPORT_SYMBOL(request_threaded_irq);
1115
1116/**
1117 * request_any_context_irq - allocate an interrupt line
1118 * @irq: Interrupt line to allocate
1119 * @handler: Function to be called when the IRQ occurs.
1120 * Threaded handler for threaded interrupts.
1121 * @flags: Interrupt type flags
1122 * @name: An ascii name for the claiming device
1123 * @dev_id: A cookie passed back to the handler function
1124 *
1125 * This call allocates interrupt resources and enables the
1126 * interrupt line and IRQ handling. It selects either a
1127 * hardirq or threaded handling method depending on the
1128 * context.
1129 *
1130 * On failure, it returns a negative value. On success,
1131 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1132 */
1133int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1134 unsigned long flags, const char *name, void *dev_id)
1135{
1136 struct irq_desc *desc = irq_to_desc(irq);
1137 int ret;
1138
1139 if (!desc)
1140 return -EINVAL;
1141
1142 if (desc->status & IRQ_NESTED_THREAD) {
1143 ret = request_threaded_irq(irq, NULL, handler,
1144 flags, name, dev_id);
1145 return !ret ? IRQC_IS_NESTED : ret;
1146 }
1147
1148 ret = request_irq(irq, handler, flags, name, dev_id);
1149 return !ret ? IRQC_IS_HARDIRQ : ret;
1150}
1151EXPORT_SYMBOL_GPL(request_any_context_irq);
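A hedged sketch of a driver probe path using the new helper; the handler, name and cookie are placeholders:

	static irqreturn_t example_handler(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int example_request(unsigned int irq, void *dev)
	{
		int ret = request_any_context_irq(irq, example_handler, 0,
						  "example-dev", dev);
		if (ret < 0)
			return ret;
		/* ret is IRQC_IS_HARDIRQ or IRQC_IS_NESTED, both >= 0 */
		return 0;
	}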
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04ef6b5..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
32 return 0; 32 return 0;
33} 33}
34 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
35#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
36#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
37#endif 58#endif
@@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
84 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
85} 106}
86 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
87static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
88 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
89 .read = seq_read, 115 .read = seq_read,
@@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
92 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
93}; 119};
94 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
95static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
96{ 129{
97 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
147 .release = single_release, 180 .release = single_release,
148 .write = default_affinity_write, 181 .write = default_affinity_write,
149}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
150#endif 203#endif
151 204
152static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
231 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
232 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
233 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
234#endif 294#endif
235 295
236 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13aff293f4de..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
@@ -516,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
516 return ret; 517 return ret;
517} 518}
518 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
539
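kdb_walk_kallsyms() above keeps its cursor in *pos, skips nameless symbols, and returns NULL once the table is exhausted, so a kdb command would drive it roughly as in this sketch (the caller itself is illustrative, not taken from the patch):

	/* Illustrative caller, assuming CONFIG_KGDB_KDB. */
	loff_t pos = 0;
	const char *name;

	while ((name = kdb_walk_kallsyms(&pos)) != NULL)
		printk(KERN_INFO "sym: %s\n", name);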
519static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
520 .open = kallsyms_open, 541 .open = kallsyms_open,
521 .read = seq_read, 542 .read = seq_read,
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc474..474a84715eac 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1134
1135 free_reserved_phys_range(end, crashk_res.end); 1135 free_reserved_phys_range(end, crashk_res.end);
1136 1136
1137 if (start == end) { 1137 if (start == end)
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1138 release_resource(&crashk_res);
1140 } else 1139 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1140
1143unlock: 1141unlock:
1144 mutex_unlock(&kexec_mutex); 1142 mutex_unlock(&kexec_mutex);
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 11f3515ca83f..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1764 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
78static struct debuggerinfo_struct {
79 void *debuggerinfo;
80 struct task_struct *task;
81 int exception_state;
82} kgdb_info[NR_CPUS];
83
84/**
85 * kgdb_connected - Is a host GDB connected to us?
86 */
87int kgdb_connected;
88EXPORT_SYMBOL_GPL(kgdb_connected);
89
90/* All the KGDB handlers are installed */
91static int kgdb_io_module_registered;
92
93/* Guard for recursive entry */
94static int exception_level;
95
96static struct kgdb_io *kgdb_io_ops;
97static DEFINE_SPINLOCK(kgdb_registration_lock);
98
99/* kgdb console driver is loaded */
100static int kgdb_con_registered;
101/* determine if kgdb console output should be used */
102static int kgdb_use_con;
103
104static int __init opt_kgdb_con(char *str)
105{
106 kgdb_use_con = 1;
107 return 0;
108}
109
110early_param("kgdbcon", opt_kgdb_con);
111
112module_param(kgdb_use_con, int, 0644);
113
114/*
115 * Holds information about breakpoints in a kernel. These breakpoints are
116 * added and removed by gdb.
117 */
118static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
119 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
120};
121
122/*
123 * The CPU# of the active CPU, or -1 if none:
124 */
125atomic_t kgdb_active = ATOMIC_INIT(-1);
126
127/*
 128 * We use NR_CPUS not PERCPU, in case kgdb is used to debug early
129 * bootup code (which might not have percpu set up yet):
130 */
131static atomic_t passive_cpu_wait[NR_CPUS];
132static atomic_t cpu_in_kgdb[NR_CPUS];
133atomic_t kgdb_setting_breakpoint;
134
135struct task_struct *kgdb_usethread;
136struct task_struct *kgdb_contthread;
137
138int kgdb_single_step;
139pid_t kgdb_sstep_pid;
140
141/* Our I/O buffers. */
142static char remcom_in_buffer[BUFMAX];
143static char remcom_out_buffer[BUFMAX];
144
145/* Storage for the registers, in GDB format. */
146static unsigned long gdb_regs[(NUMREGBYTES +
147 sizeof(unsigned long) - 1) /
148 sizeof(unsigned long)];
149
150/* to keep track of the CPU which is doing the single stepping*/
151atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
152
153/*
154 * If you are debugging a problem where roundup (the collection of
155 * all other CPUs) is a problem [this should be extremely rare],
156 * then use the nokgdbroundup option to avoid roundup. In that case
157 * the other CPUs might interfere with your debugging context, so
158 * use this with care:
159 */
160static int kgdb_do_roundup = 1;
161
162static int __init opt_nokgdbroundup(char *str)
163{
164 kgdb_do_roundup = 0;
165
166 return 0;
167}
168
169early_param("nokgdbroundup", opt_nokgdbroundup);
170
171/*
172 * Finally, some KGDB code :-)
173 */
174
175/*
176 * Weak aliases for breakpoint management,
 177 * can be overridden by architectures when needed:
178 */
179int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
180{
181 int err;
182
183 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
184 if (err)
185 return err;
186
187 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
188 BREAK_INSTR_SIZE);
189}
190
191int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
192{
193 return probe_kernel_write((char *)addr,
194 (char *)bundle, BREAK_INSTR_SIZE);
195}
196
197int __weak kgdb_validate_break_address(unsigned long addr)
198{
199 char tmp_variable[BREAK_INSTR_SIZE];
200 int err;
 201 /* Validate setting the breakpoint and then removing it. If the
202 * remove fails, the kernel needs to emit a bad message because we
 203 * are in deep trouble not being able to put things back the way we
204 * found them.
205 */
206 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
207 if (err)
208 return err;
209 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
210 if (err)
211 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
212 "memory destroyed at: %lx", addr);
213 return err;
214}
215
216unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
217{
218 return instruction_pointer(regs);
219}
220
221int __weak kgdb_arch_init(void)
222{
223 return 0;
224}
225
226int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
227{
228 return 0;
229}
230
231void __weak
232kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
233{
234 return;
235}
236
237/**
 238 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
239 * @regs: Current &struct pt_regs.
240 *
241 * This function will be called if the particular architecture must
242 * disable hardware debugging while it is processing gdb packets or
 243 * handling an exception.
244 */
245void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
246{
247}
248
249/*
250 * GDB remote protocol parser:
251 */
252
253static int hex(char ch)
254{
255 if ((ch >= 'a') && (ch <= 'f'))
256 return ch - 'a' + 10;
257 if ((ch >= '0') && (ch <= '9'))
258 return ch - '0';
259 if ((ch >= 'A') && (ch <= 'F'))
260 return ch - 'A' + 10;
261 return -1;
262}
263
264/* scan for the sequence $<data>#<checksum> */
265static void get_packet(char *buffer)
266{
267 unsigned char checksum;
268 unsigned char xmitcsum;
269 int count;
270 char ch;
271
272 do {
273 /*
274 * Spin and wait around for the start character, ignore all
275 * other characters:
276 */
277 while ((ch = (kgdb_io_ops->read_char())) != '$')
278 /* nothing */;
279
280 kgdb_connected = 1;
281 checksum = 0;
282 xmitcsum = -1;
283
284 count = 0;
285
286 /*
287 * now, read until a # or end of buffer is found:
288 */
289 while (count < (BUFMAX - 1)) {
290 ch = kgdb_io_ops->read_char();
291 if (ch == '#')
292 break;
293 checksum = checksum + ch;
294 buffer[count] = ch;
295 count = count + 1;
296 }
297 buffer[count] = 0;
298
299 if (ch == '#') {
300 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
301 xmitcsum += hex(kgdb_io_ops->read_char());
302
303 if (checksum != xmitcsum)
304 /* failed checksum */
305 kgdb_io_ops->write_char('-');
306 else
307 /* successful transfer */
308 kgdb_io_ops->write_char('+');
309 if (kgdb_io_ops->flush)
310 kgdb_io_ops->flush();
311 }
312 } while (checksum != xmitcsum);
313}
314
315/*
316 * Send the packet in buffer.
317 * Check for gdb connection if asked for.
318 */
319static void put_packet(char *buffer)
320{
321 unsigned char checksum;
322 int count;
323 char ch;
324
325 /*
326 * $<packet info>#<checksum>.
327 */
328 while (1) {
329 kgdb_io_ops->write_char('$');
330 checksum = 0;
331 count = 0;
332
333 while ((ch = buffer[count])) {
334 kgdb_io_ops->write_char(ch);
335 checksum += ch;
336 count++;
337 }
338
339 kgdb_io_ops->write_char('#');
340 kgdb_io_ops->write_char(hex_asc_hi(checksum));
341 kgdb_io_ops->write_char(hex_asc_lo(checksum));
342 if (kgdb_io_ops->flush)
343 kgdb_io_ops->flush();
344
345 /* Now see what we get in reply. */
346 ch = kgdb_io_ops->read_char();
347
348 if (ch == 3)
349 ch = kgdb_io_ops->read_char();
350
351 /* If we get an ACK, we are done. */
352 if (ch == '+')
353 return;
354
355 /*
356 * If we get the start of another packet, this means
357 * that GDB is attempting to reconnect. We will NAK
358 * the packet being sent, and stop trying to send this
359 * packet.
360 */
361 if (ch == '$') {
362 kgdb_io_ops->write_char('-');
363 if (kgdb_io_ops->flush)
364 kgdb_io_ops->flush();
365 return;
366 }
367 }
368}
369
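get_packet() and put_packet() above implement the gdb remote serial protocol framing: every payload travels as $<payload>#<checksum>, where the checksum is the byte-wise sum of the payload modulo 256 printed as two lowercase hex digits, and the peer answers '+' (ACK) or '-' (NAK). A minimal sketch of the same framing, not part of the original file:

/* Illustrative only: frame_packet("OK", out) produces "$OK#9a". */
static void frame_packet(const char *payload, char *out)
{
	unsigned char csum = 0;
	const char *p;
	int n = sprintf(out, "$%s#", payload);

	for (p = payload; *p; p++)
		csum += *p;
	sprintf(out + n, "%02x", csum);
}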
370/*
371 * Convert the memory pointed to by mem into hex, placing result in buf.
 372 * Return 0 on success or a negative error from probe_kernel_read().
373 */
374int kgdb_mem2hex(char *mem, char *buf, int count)
375{
376 char *tmp;
377 int err;
378
379 /*
380 * We use the upper half of buf as an intermediate buffer for the
381 * raw memory copy. Hex conversion will work against this one.
382 */
383 tmp = buf + count;
384
385 err = probe_kernel_read(tmp, mem, count);
386 if (!err) {
387 while (count > 0) {
388 buf = pack_hex_byte(buf, *tmp);
389 tmp++;
390 count--;
391 }
392
393 *buf = 0;
394 }
395
396 return err;
397}
398
399/*
 400 * Copy the binary array pointed to by buf into mem, undoing the 0x7d
 401 * escaping of $, #, and 0x7d. Return -EFAULT on failure or 0 on success.
 402 * The input buf is overwritten with the result to write to mem.
403 */
404static int kgdb_ebin2mem(char *buf, char *mem, int count)
405{
406 int size = 0;
407 char *c = buf;
408
409 while (count-- > 0) {
410 c[size] = *buf++;
411 if (c[size] == 0x7d)
412 c[size] = *buf++ ^ 0x20;
413 size++;
414 }
415
416 return probe_kernel_write(mem, c, size);
417}
418
419/*
420 * Convert the hex array pointed to by buf into binary to be placed in mem.
 421 * Return 0 on success or a negative error from
 422 * probe_kernel_write().
423 */
424int kgdb_hex2mem(char *buf, char *mem, int count)
425{
426 char *tmp_raw;
427 char *tmp_hex;
428
429 /*
430 * We use the upper half of buf as an intermediate buffer for the
431 * raw memory that is converted from hex.
432 */
433 tmp_raw = buf + count * 2;
434
435 tmp_hex = tmp_raw - 1;
436 while (tmp_hex >= buf) {
437 tmp_raw--;
438 *tmp_raw = hex(*tmp_hex--);
439 *tmp_raw |= hex(*tmp_hex--) << 4;
440 }
441
442 return probe_kernel_write(mem, tmp_raw, count);
443}
444
445/*
446 * While we find nice hex chars, build a long_val.
447 * Return number of chars processed.
448 */
449int kgdb_hex2long(char **ptr, unsigned long *long_val)
450{
451 int hex_val;
452 int num = 0;
453 int negate = 0;
454
455 *long_val = 0;
456
457 if (**ptr == '-') {
458 negate = 1;
459 (*ptr)++;
460 }
461 while (**ptr) {
462 hex_val = hex(**ptr);
463 if (hex_val < 0)
464 break;
465
466 *long_val = (*long_val << 4) | hex_val;
467 num++;
468 (*ptr)++;
469 }
470
471 if (negate)
472 *long_val = -*long_val;
473
474 return num;
475}
476
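For example, the address/length prefix of an 'm' packet would be parsed like this (buffer contents illustrative):

	char buf[] = "c0ffee,10";	/* "<addr>,<length>" as sent by gdb */
	char *p = buf;
	unsigned long addr, len;

	kgdb_hex2long(&p, &addr);	/* addr == 0xc0ffee, *p is now ',' */
	p++;
	kgdb_hex2long(&p, &len);	/* len == 0x10 */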
477/* Write memory due to an 'M' or 'X' packet. */
478static int write_mem_msg(int binary)
479{
480 char *ptr = &remcom_in_buffer[1];
481 unsigned long addr;
482 unsigned long length;
483 int err;
484
485 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
486 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
487 if (binary)
488 err = kgdb_ebin2mem(ptr, (char *)addr, length);
489 else
490 err = kgdb_hex2mem(ptr, (char *)addr, length);
491 if (err)
492 return err;
493 if (CACHE_FLUSH_IS_SAFE)
494 flush_icache_range(addr, addr + length);
495 return 0;
496 }
497
498 return -EINVAL;
499}
500
501static void error_packet(char *pkt, int error)
502{
503 error = -error;
504 pkt[0] = 'E';
505 pkt[1] = hex_asc[(error / 10)];
506 pkt[2] = hex_asc[(error % 10)];
507 pkt[3] = '\0';
508}
509
510/*
511 * Thread ID accessors. We represent a flat TID space to GDB, where
512 * the per CPU idle threads (which under Linux all have PID 0) are
513 * remapped to negative TIDs.
514 */
515
516#define BUF_THREAD_ID_SIZE 16
517
518static char *pack_threadid(char *pkt, unsigned char *id)
519{
520 char *limit;
521
522 limit = pkt + BUF_THREAD_ID_SIZE;
523 while (pkt < limit)
524 pkt = pack_hex_byte(pkt, *id++);
525
526 return pkt;
527}
528
529static void int_to_threadref(unsigned char *id, int value)
530{
531 unsigned char *scan;
532 int i = 4;
533
534 scan = (unsigned char *)id;
535 while (i--)
536 *scan++ = 0;
537 put_unaligned_be32(value, scan);
538}
539
540static struct task_struct *getthread(struct pt_regs *regs, int tid)
541{
542 /*
543 * Non-positive TIDs are remapped to the cpu shadow information
544 */
545 if (tid == 0 || tid == -1)
546 tid = -atomic_read(&kgdb_active) - 2;
547 if (tid < -1 && tid > -NR_CPUS - 2) {
548 if (kgdb_info[-tid - 2].task)
549 return kgdb_info[-tid - 2].task;
550 else
551 return idle_task(-tid - 2);
552 }
553 if (tid <= 0) {
554 printk(KERN_ERR "KGDB: Internal thread select error\n");
555 dump_stack();
556 return NULL;
557 }
558
559 /*
560 * find_task_by_pid_ns() does not take the tasklist lock anymore
561 * but is nicely RCU locked - hence is a pretty resilient
562 * thing to use:
563 */
564 return find_task_by_pid_ns(tid, &init_pid_ns);
565}
566
567/*
568 * Some architectures need cache flushes when we set/clear a
569 * breakpoint:
570 */
571static void kgdb_flush_swbreak_addr(unsigned long addr)
572{
573 if (!CACHE_FLUSH_IS_SAFE)
574 return;
575
576 if (current->mm && current->mm->mmap_cache) {
577 flush_cache_range(current->mm->mmap_cache,
578 addr, addr + BREAK_INSTR_SIZE);
579 }
580 /* Force flush instruction cache if it was outside the mm */
581 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
582}
583
584/*
585 * SW breakpoint management:
586 */
587static int kgdb_activate_sw_breakpoints(void)
588{
589 unsigned long addr;
590 int error;
591 int ret = 0;
592 int i;
593
594 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
595 if (kgdb_break[i].state != BP_SET)
596 continue;
597
598 addr = kgdb_break[i].bpt_addr;
599 error = kgdb_arch_set_breakpoint(addr,
600 kgdb_break[i].saved_instr);
601 if (error) {
602 ret = error;
603 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
604 continue;
605 }
606
607 kgdb_flush_swbreak_addr(addr);
608 kgdb_break[i].state = BP_ACTIVE;
609 }
610 return ret;
611}
612
613static int kgdb_set_sw_break(unsigned long addr)
614{
615 int err = kgdb_validate_break_address(addr);
616 int breakno = -1;
617 int i;
618
619 if (err)
620 return err;
621
622 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
623 if ((kgdb_break[i].state == BP_SET) &&
624 (kgdb_break[i].bpt_addr == addr))
625 return -EEXIST;
626 }
627 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
628 if (kgdb_break[i].state == BP_REMOVED &&
629 kgdb_break[i].bpt_addr == addr) {
630 breakno = i;
631 break;
632 }
633 }
634
635 if (breakno == -1) {
636 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
637 if (kgdb_break[i].state == BP_UNDEFINED) {
638 breakno = i;
639 break;
640 }
641 }
642 }
643
644 if (breakno == -1)
645 return -E2BIG;
646
647 kgdb_break[breakno].state = BP_SET;
648 kgdb_break[breakno].type = BP_BREAKPOINT;
649 kgdb_break[breakno].bpt_addr = addr;
650
651 return 0;
652}
653
654static int kgdb_deactivate_sw_breakpoints(void)
655{
656 unsigned long addr;
657 int error;
658 int ret = 0;
659 int i;
660
661 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
662 if (kgdb_break[i].state != BP_ACTIVE)
663 continue;
664 addr = kgdb_break[i].bpt_addr;
665 error = kgdb_arch_remove_breakpoint(addr,
666 kgdb_break[i].saved_instr);
667 if (error) {
668 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
669 ret = error;
670 }
671
672 kgdb_flush_swbreak_addr(addr);
673 kgdb_break[i].state = BP_SET;
674 }
675 return ret;
676}
677
678static int kgdb_remove_sw_break(unsigned long addr)
679{
680 int i;
681
682 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
683 if ((kgdb_break[i].state == BP_SET) &&
684 (kgdb_break[i].bpt_addr == addr)) {
685 kgdb_break[i].state = BP_REMOVED;
686 return 0;
687 }
688 }
689 return -ENOENT;
690}
691
692int kgdb_isremovedbreak(unsigned long addr)
693{
694 int i;
695
696 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
697 if ((kgdb_break[i].state == BP_REMOVED) &&
698 (kgdb_break[i].bpt_addr == addr))
699 return 1;
700 }
701 return 0;
702}
703
704static int remove_all_break(void)
705{
706 unsigned long addr;
707 int error;
708 int i;
709
710 /* Clear memory breakpoints. */
711 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
712 if (kgdb_break[i].state != BP_ACTIVE)
713 goto setundefined;
714 addr = kgdb_break[i].bpt_addr;
715 error = kgdb_arch_remove_breakpoint(addr,
716 kgdb_break[i].saved_instr);
717 if (error)
718 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
719 addr);
720setundefined:
721 kgdb_break[i].state = BP_UNDEFINED;
722 }
723
724 /* Clear hardware breakpoints. */
725 if (arch_kgdb_ops.remove_all_hw_break)
726 arch_kgdb_ops.remove_all_hw_break();
727
728 return 0;
729}
730
731/*
732 * Remap normal tasks to their real PID,
733 * CPU shadow threads are mapped to -CPU - 2
734 */
735static inline int shadow_pid(int realpid)
736{
737 if (realpid)
738 return realpid;
739
740 return -raw_smp_processor_id() - 2;
741}
742
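Concretely, the TID space that gdb sees works out as follows (values illustrative):

/*
 *   shadow_pid(4211)           ->  4211  (ordinary task keeps its pid)
 *   shadow_pid(0) on CPU 3     ->    -5  (idle/shadow thread of CPU 3)
 *   getthread() given TID -5   ->  kgdb_info[-(-5) - 2], i.e. CPU 3's task
 */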
743static char gdbmsgbuf[BUFMAX + 1];
744
745static void kgdb_msg_write(const char *s, int len)
746{
747 char *bufptr;
748 int wcount;
749 int i;
750
751 /* 'O'utput */
752 gdbmsgbuf[0] = 'O';
753
754 /* Fill and send buffers... */
755 while (len > 0) {
756 bufptr = gdbmsgbuf + 1;
757
758 /* Calculate how many this time */
759 if ((len << 1) > (BUFMAX - 2))
760 wcount = (BUFMAX - 2) >> 1;
761 else
762 wcount = len;
763
764 /* Pack in hex chars */
765 for (i = 0; i < wcount; i++)
766 bufptr = pack_hex_byte(bufptr, s[i]);
767 *bufptr = '\0';
768
769 /* Move up */
770 s += wcount;
771 len -= wcount;
772
773 /* Write packet */
774 put_packet(gdbmsgbuf);
775 }
776}
777
778/*
779 * Return true if there is a valid kgdb I/O module. Also if no
780 * debugger is attached a message can be printed to the console about
781 * waiting for the debugger to attach.
782 *
783 * The print_wait argument is only to be true when called from inside
784 * the core kgdb_handle_exception, because it will wait for the
785 * debugger to attach.
786 */
787static int kgdb_io_ready(int print_wait)
788{
789 if (!kgdb_io_ops)
790 return 0;
791 if (kgdb_connected)
792 return 1;
793 if (atomic_read(&kgdb_setting_breakpoint))
794 return 1;
795 if (print_wait)
796 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
797 return 1;
798}
799
800/*
801 * All the functions that start with gdb_cmd are the various
802 * operations to implement the handlers for the gdbserial protocol
803 * where KGDB is communicating with an external debugger
804 */
805
806/* Handle the '?' status packets */
807static void gdb_cmd_status(struct kgdb_state *ks)
808{
809 /*
810 * We know that this packet is only sent
811 * during initial connect. So to be safe,
812 * we clear out our breakpoints now in case
813 * GDB is reconnecting.
814 */
815 remove_all_break();
816
817 remcom_out_buffer[0] = 'S';
818 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
819}
820
821/* Handle the 'g' get registers request */
822static void gdb_cmd_getregs(struct kgdb_state *ks)
823{
824 struct task_struct *thread;
825 void *local_debuggerinfo;
826 int i;
827
828 thread = kgdb_usethread;
829 if (!thread) {
830 thread = kgdb_info[ks->cpu].task;
831 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
832 } else {
833 local_debuggerinfo = NULL;
834 for_each_online_cpu(i) {
835 /*
836 * Try to find the task on some other
837 * or possibly this node if we do not
838 * find the matching task then we try
839 * to approximate the results.
840 */
841 if (thread == kgdb_info[i].task)
842 local_debuggerinfo = kgdb_info[i].debuggerinfo;
843 }
844 }
845
846 /*
847 * All threads that don't have debuggerinfo should be
848 * in schedule() sleeping, since all other CPUs
849 * are in kgdb_wait, and thus have debuggerinfo.
850 */
851 if (local_debuggerinfo) {
852 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
853 } else {
854 /*
855 * Pull stuff saved during switch_to; nothing
856 * else is accessible (or even particularly
857 * relevant).
858 *
859 * This should be enough for a stack trace.
860 */
861 sleeping_thread_to_gdb_regs(gdb_regs, thread);
862 }
863 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
864}
865
866/* Handle the 'G' set registers request */
867static void gdb_cmd_setregs(struct kgdb_state *ks)
868{
869 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
870
871 if (kgdb_usethread && kgdb_usethread != current) {
872 error_packet(remcom_out_buffer, -EINVAL);
873 } else {
874 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
875 strcpy(remcom_out_buffer, "OK");
876 }
877}
878
879/* Handle the 'm' memory read bytes */
880static void gdb_cmd_memread(struct kgdb_state *ks)
881{
882 char *ptr = &remcom_in_buffer[1];
883 unsigned long length;
884 unsigned long addr;
885 int err;
886
887 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
888 kgdb_hex2long(&ptr, &length) > 0) {
889 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
890 if (err)
891 error_packet(remcom_out_buffer, err);
892 } else {
893 error_packet(remcom_out_buffer, -EINVAL);
894 }
895}
896
897/* Handle the 'M' memory write bytes */
898static void gdb_cmd_memwrite(struct kgdb_state *ks)
899{
900 int err = write_mem_msg(0);
901
902 if (err)
903 error_packet(remcom_out_buffer, err);
904 else
905 strcpy(remcom_out_buffer, "OK");
906}
907
908/* Handle the 'X' memory binary write bytes */
909static void gdb_cmd_binwrite(struct kgdb_state *ks)
910{
911 int err = write_mem_msg(1);
912
913 if (err)
914 error_packet(remcom_out_buffer, err);
915 else
916 strcpy(remcom_out_buffer, "OK");
917}
918
919/* Handle the 'D' or 'k', detach or kill packets */
920static void gdb_cmd_detachkill(struct kgdb_state *ks)
921{
922 int error;
923
924 /* The detach case */
925 if (remcom_in_buffer[0] == 'D') {
926 error = remove_all_break();
927 if (error < 0) {
928 error_packet(remcom_out_buffer, error);
929 } else {
930 strcpy(remcom_out_buffer, "OK");
931 kgdb_connected = 0;
932 }
933 put_packet(remcom_out_buffer);
934 } else {
935 /*
936 * Assume the kill case, with no exit code checking,
937 * trying to force detach the debugger:
938 */
939 remove_all_break();
940 kgdb_connected = 0;
941 }
942}
943
944/* Handle the 'R' reboot packets */
945static int gdb_cmd_reboot(struct kgdb_state *ks)
946{
947 /* For now, only honor R0 */
948 if (strcmp(remcom_in_buffer, "R0") == 0) {
949 printk(KERN_CRIT "Executing emergency reboot\n");
950 strcpy(remcom_out_buffer, "OK");
951 put_packet(remcom_out_buffer);
952
953 /*
954 * Execution should not return from
955 * machine_emergency_restart()
956 */
957 machine_emergency_restart();
958 kgdb_connected = 0;
959
960 return 1;
961 }
962 return 0;
963}
964
965/* Handle the 'q' query packets */
966static void gdb_cmd_query(struct kgdb_state *ks)
967{
968 struct task_struct *g;
969 struct task_struct *p;
970 unsigned char thref[8];
971 char *ptr;
972 int i;
973 int cpu;
974 int finished = 0;
975
976 switch (remcom_in_buffer[1]) {
977 case 's':
978 case 'f':
979 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
980 error_packet(remcom_out_buffer, -EINVAL);
981 break;
982 }
983
984 i = 0;
985 remcom_out_buffer[0] = 'm';
986 ptr = remcom_out_buffer + 1;
987 if (remcom_in_buffer[1] == 'f') {
988 /* Each cpu is a shadow thread */
989 for_each_online_cpu(cpu) {
990 ks->thr_query = 0;
991 int_to_threadref(thref, -cpu - 2);
992 pack_threadid(ptr, thref);
993 ptr += BUF_THREAD_ID_SIZE;
994 *(ptr++) = ',';
995 i++;
996 }
997 }
998
999 do_each_thread(g, p) {
1000 if (i >= ks->thr_query && !finished) {
1001 int_to_threadref(thref, p->pid);
1002 pack_threadid(ptr, thref);
1003 ptr += BUF_THREAD_ID_SIZE;
1004 *(ptr++) = ',';
1005 ks->thr_query++;
1006 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1007 finished = 1;
1008 }
1009 i++;
1010 } while_each_thread(g, p);
1011
1012 *(--ptr) = '\0';
1013 break;
1014
1015 case 'C':
1016 /* Current thread id */
1017 strcpy(remcom_out_buffer, "QC");
1018 ks->threadid = shadow_pid(current->pid);
1019 int_to_threadref(thref, ks->threadid);
1020 pack_threadid(remcom_out_buffer + 2, thref);
1021 break;
1022 case 'T':
1023 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1024 error_packet(remcom_out_buffer, -EINVAL);
1025 break;
1026 }
1027 ks->threadid = 0;
1028 ptr = remcom_in_buffer + 17;
1029 kgdb_hex2long(&ptr, &ks->threadid);
1030 if (!getthread(ks->linux_regs, ks->threadid)) {
1031 error_packet(remcom_out_buffer, -EINVAL);
1032 break;
1033 }
1034 if ((int)ks->threadid > 0) {
1035 kgdb_mem2hex(getthread(ks->linux_regs,
1036 ks->threadid)->comm,
1037 remcom_out_buffer, 16);
1038 } else {
1039 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1040
1041 sprintf(tmpstr, "shadowCPU%d",
1042 (int)(-ks->threadid - 2));
1043 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1044 }
1045 break;
1046 }
1047}
1048
1049/* Handle the 'H' task query packets */
1050static void gdb_cmd_task(struct kgdb_state *ks)
1051{
1052 struct task_struct *thread;
1053 char *ptr;
1054
1055 switch (remcom_in_buffer[1]) {
1056 case 'g':
1057 ptr = &remcom_in_buffer[2];
1058 kgdb_hex2long(&ptr, &ks->threadid);
1059 thread = getthread(ks->linux_regs, ks->threadid);
1060 if (!thread && ks->threadid > 0) {
1061 error_packet(remcom_out_buffer, -EINVAL);
1062 break;
1063 }
1064 kgdb_usethread = thread;
1065 ks->kgdb_usethreadid = ks->threadid;
1066 strcpy(remcom_out_buffer, "OK");
1067 break;
1068 case 'c':
1069 ptr = &remcom_in_buffer[2];
1070 kgdb_hex2long(&ptr, &ks->threadid);
1071 if (!ks->threadid) {
1072 kgdb_contthread = NULL;
1073 } else {
1074 thread = getthread(ks->linux_regs, ks->threadid);
1075 if (!thread && ks->threadid > 0) {
1076 error_packet(remcom_out_buffer, -EINVAL);
1077 break;
1078 }
1079 kgdb_contthread = thread;
1080 }
1081 strcpy(remcom_out_buffer, "OK");
1082 break;
1083 }
1084}
1085
1086/* Handle the 'T' thread query packets */
1087static void gdb_cmd_thread(struct kgdb_state *ks)
1088{
1089 char *ptr = &remcom_in_buffer[1];
1090 struct task_struct *thread;
1091
1092 kgdb_hex2long(&ptr, &ks->threadid);
1093 thread = getthread(ks->linux_regs, ks->threadid);
1094 if (thread)
1095 strcpy(remcom_out_buffer, "OK");
1096 else
1097 error_packet(remcom_out_buffer, -EINVAL);
1098}
1099
1100/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1101static void gdb_cmd_break(struct kgdb_state *ks)
1102{
1103 /*
1104 * Since GDB-5.3, it's been drafted that '0' is a software
1105 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1106 */
1107 char *bpt_type = &remcom_in_buffer[1];
1108 char *ptr = &remcom_in_buffer[2];
1109 unsigned long addr;
1110 unsigned long length;
1111 int error = 0;
1112
1113 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1114 /* Unsupported */
1115 if (*bpt_type > '4')
1116 return;
1117 } else {
1118 if (*bpt_type != '0' && *bpt_type != '1')
1119 /* Unsupported. */
1120 return;
1121 }
1122
1123 /*
1124 * Test if this is a hardware breakpoint, and
1125 * if we support it:
1126 */
1127 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1128 /* Unsupported. */
1129 return;
1130
1131 if (*(ptr++) != ',') {
1132 error_packet(remcom_out_buffer, -EINVAL);
1133 return;
1134 }
1135 if (!kgdb_hex2long(&ptr, &addr)) {
1136 error_packet(remcom_out_buffer, -EINVAL);
1137 return;
1138 }
1139 if (*(ptr++) != ',' ||
1140 !kgdb_hex2long(&ptr, &length)) {
1141 error_packet(remcom_out_buffer, -EINVAL);
1142 return;
1143 }
1144
1145 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1146 error = kgdb_set_sw_break(addr);
1147 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1148 error = kgdb_remove_sw_break(addr);
1149 else if (remcom_in_buffer[0] == 'Z')
1150 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1151 (int)length, *bpt_type - '0');
1152 else if (remcom_in_buffer[0] == 'z')
1153 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1154 (int) length, *bpt_type - '0');
1155
1156 if (error == 0)
1157 strcpy(remcom_out_buffer, "OK");
1158 else
1159 error_packet(remcom_out_buffer, error);
1160}
1161
1162/* Handle the 'C' signal / exception passing packets */
1163static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1164{
1165 /* C09 == pass exception
1166 * C15 == detach kgdb, pass exception
1167 */
1168 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1169
1170 ks->pass_exception = 1;
1171 remcom_in_buffer[0] = 'c';
1172
1173 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1174
1175 ks->pass_exception = 1;
1176 remcom_in_buffer[0] = 'D';
1177 remove_all_break();
1178 kgdb_connected = 0;
1179 return 1;
1180
1181 } else {
1182 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1183 " and 15 (pass and disconnect)\n"
1184 "Executing a continue without signal passing\n", 0);
1185 remcom_in_buffer[0] = 'c';
1186 }
1187
1188 /* Indicate fall through */
1189 return -1;
1190}
1191
1192/*
1193 * This function performs all gdbserial command processing
1194 */
1195static int gdb_serial_stub(struct kgdb_state *ks)
1196{
1197 int error = 0;
1198 int tmp;
1199
1200 /* Clear the out buffer. */
1201 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1202
1203 if (kgdb_connected) {
1204 unsigned char thref[8];
1205 char *ptr;
1206
1207 /* Reply to host that an exception has occurred */
1208 ptr = remcom_out_buffer;
1209 *ptr++ = 'T';
1210 ptr = pack_hex_byte(ptr, ks->signo);
1211 ptr += strlen(strcpy(ptr, "thread:"));
1212 int_to_threadref(thref, shadow_pid(current->pid));
1213 ptr = pack_threadid(ptr, thref);
1214 *ptr++ = ';';
1215 put_packet(remcom_out_buffer);
1216 }
1217
1218 kgdb_usethread = kgdb_info[ks->cpu].task;
1219 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1220 ks->pass_exception = 0;
1221
1222 while (1) {
1223 error = 0;
1224
1225 /* Clear the out buffer. */
1226 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1227
1228 get_packet(remcom_in_buffer);
1229
1230 switch (remcom_in_buffer[0]) {
1231 case '?': /* gdbserial status */
1232 gdb_cmd_status(ks);
1233 break;
1234 case 'g': /* return the value of the CPU registers */
1235 gdb_cmd_getregs(ks);
1236 break;
1237 case 'G': /* set the value of the CPU registers - return OK */
1238 gdb_cmd_setregs(ks);
1239 break;
1240 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1241 gdb_cmd_memread(ks);
1242 break;
1243 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1244 gdb_cmd_memwrite(ks);
1245 break;
1246 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1247 gdb_cmd_binwrite(ks);
1248 break;
1249 /* kill or detach. KGDB should treat this like a
1250 * continue.
1251 */
1252 case 'D': /* Debugger detach */
1253 case 'k': /* Debugger detach via kill */
1254 gdb_cmd_detachkill(ks);
1255 goto default_handle;
1256 case 'R': /* Reboot */
1257 if (gdb_cmd_reboot(ks))
1258 goto default_handle;
1259 break;
1260 case 'q': /* query command */
1261 gdb_cmd_query(ks);
1262 break;
1263 case 'H': /* task related */
1264 gdb_cmd_task(ks);
1265 break;
1266 case 'T': /* Query thread status */
1267 gdb_cmd_thread(ks);
1268 break;
1269 case 'z': /* Break point remove */
1270 case 'Z': /* Break point set */
1271 gdb_cmd_break(ks);
1272 break;
1273 case 'C': /* Exception passing */
1274 tmp = gdb_cmd_exception_pass(ks);
1275 if (tmp > 0)
1276 goto default_handle;
1277 if (tmp == 0)
1278 break;
1279 /* Fall through on tmp < 0 */
1280 case 'c': /* Continue packet */
1281 case 's': /* Single step packet */
1282 if (kgdb_contthread && kgdb_contthread != current) {
1283 /* Can't switch threads in kgdb */
1284 error_packet(remcom_out_buffer, -EINVAL);
1285 break;
1286 }
1287 kgdb_activate_sw_breakpoints();
1288 /* Fall through to default processing */
1289 default:
1290default_handle:
1291 error = kgdb_arch_handle_exception(ks->ex_vector,
1292 ks->signo,
1293 ks->err_code,
1294 remcom_in_buffer,
1295 remcom_out_buffer,
1296 ks->linux_regs);
1297 /*
1298 * Leave cmd processing on error, detach,
1299 * kill, continue, or single step.
1300 */
1301 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1302 remcom_in_buffer[0] == 'k') {
1303 error = 0;
1304 goto kgdb_exit;
1305 }
1306
1307 }
1308
1309 /* reply to the request */
1310 put_packet(remcom_out_buffer);
1311 }
1312
1313kgdb_exit:
1314 if (ks->pass_exception)
1315 error = 1;
1316 return error;
1317}
1318
1319static int kgdb_reenter_check(struct kgdb_state *ks)
1320{
1321 unsigned long addr;
1322
1323 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1324 return 0;
1325
1326 /* Panic on recursive debugger calls: */
1327 exception_level++;
1328 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1329 kgdb_deactivate_sw_breakpoints();
1330
1331 /*
1332 * If the breakpoint was removed ok at the place the exception
1333 * occurred, try to recover and print a warning to the end
1334 * user because the user planted a breakpoint in a place that
1335 * KGDB needs in order to function.
1336 */
1337 if (kgdb_remove_sw_break(addr) == 0) {
1338 exception_level = 0;
1339 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1340 kgdb_activate_sw_breakpoints();
1341 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1342 addr);
1343 WARN_ON_ONCE(1);
1344
1345 return 1;
1346 }
1347 remove_all_break();
1348 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1349
1350 if (exception_level > 1) {
1351 dump_stack();
1352 panic("Recursive entry to debugger");
1353 }
1354
1355 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1356 dump_stack();
1357 panic("Recursive entry to debugger");
1358
1359 return 1;
1360}
1361
1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1363{
1364 unsigned long flags;
1365 int sstep_tries = 100;
1366 int error = 0;
1367 int i, cpu;
1368 int trace_on = 0;
1369acquirelock:
1370 /*
1371 * Interrupts will be restored by the 'trap return' code, except when
1372 * single stepping.
1373 */
1374 local_irq_save(flags);
1375
1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1384
1385 /*
1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1388 */
1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1411 cpu_relax();
1412 }
1413
1414 /*
1415 * For single stepping, try to only enter on the processor
1416 * that was single stepping. To guard against a deadlock, the
1417 * kernel will only try for the value of sstep_tries before
1418 * giving up and continuing on.
1419 */
1420 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1421 (kgdb_info[cpu].task &&
1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1423 atomic_set(&kgdb_active, -1);
1424 touch_softlockup_watchdog_sync();
1425 clocksource_touch_watchdog();
1426 local_irq_restore(flags);
1427
1428 goto acquirelock;
1429 }
1430
1431 if (!kgdb_io_ready(1)) {
1432 error = 1;
1433 goto kgdb_restore; /* No I/O connection, so resume the system */
1434 }
1435
1436 /*
1437 * Don't enter if we have hit a removed breakpoint.
1438 */
1439 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1440 goto kgdb_restore;
1441
1442 /* Call the I/O driver's pre_exception routine */
1443 if (kgdb_io_ops->pre_exception)
1444 kgdb_io_ops->pre_exception();
1445
1446 kgdb_disable_hw_debug(ks->linux_regs);
1447
1448 /*
1449 * Get the passive CPU lock which will hold all the non-primary
1450 * CPU in a spin state while the debugger is active
1451 */
1452 if (!kgdb_single_step) {
1453 for (i = 0; i < NR_CPUS; i++)
1454 atomic_inc(&passive_cpu_wait[i]);
1455 }
1456
1457#ifdef CONFIG_SMP
1458 /* Signal the other CPUs to enter kgdb_wait() */
1459 if ((!kgdb_single_step) && kgdb_do_roundup)
1460 kgdb_roundup_cpus(flags);
1461#endif
1462
1463 /*
1464 * Wait for the other CPUs to be notified and be waiting for us:
1465 */
1466 for_each_online_cpu(i) {
1467 while (!atomic_read(&cpu_in_kgdb[i]))
1468 cpu_relax();
1469 }
1470
1471 /*
1472 * At this point the primary processor is completely
1473 * in the debugger and all secondary CPUs are quiescent
1474 */
1475 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1476 kgdb_deactivate_sw_breakpoints();
1477 kgdb_single_step = 0;
1478 kgdb_contthread = current;
1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1483
1484 /* Talk to debugger with gdbserial protocol */
1485 error = gdb_serial_stub(ks);
1486
1487 /* Call the I/O driver's post_exception routine */
1488 if (kgdb_io_ops->post_exception)
1489 kgdb_io_ops->post_exception();
1490
1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1492
1493 if (!kgdb_single_step) {
1494 for (i = NR_CPUS-1; i >= 0; i--)
1495 atomic_dec(&passive_cpu_wait[i]);
1496 /*
1497 * Wait till all the CPUs have quit
1498 * from the debugger.
1499 */
1500 for_each_online_cpu(i) {
1501 while (atomic_read(&cpu_in_kgdb[i]))
1502 cpu_relax();
1503 }
1504 }
1505
1506kgdb_restore:
1507 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1508 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1509 if (kgdb_info[sstep_cpu].task)
1510 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1511 else
1512 kgdb_sstep_pid = 0;
1513 }
1514 if (trace_on)
1515 tracing_on();
1516 /* Free kgdb_active */
1517 atomic_set(&kgdb_active, -1);
1518 touch_softlockup_watchdog_sync();
1519 clocksource_touch_watchdog();
1520 local_irq_restore(flags);
1521
1522 return error;
1523}
1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
1555int kgdb_nmicallback(int cpu, void *regs)
1556{
1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1566 atomic_read(&kgdb_active) != -1 &&
1567 atomic_read(&kgdb_active) != cpu) {
1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1571 return 0;
1572 }
1573#endif
1574 return 1;
1575}
1576
1577static void kgdb_console_write(struct console *co, const char *s,
1578 unsigned count)
1579{
1580 unsigned long flags;
1581
1582 /* If we're debugging, or KGDB has not connected, don't try
1583 * and print. */
1584 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1585 return;
1586
1587 local_irq_save(flags);
1588 kgdb_msg_write(s, count);
1589 local_irq_restore(flags);
1590}
1591
1592static struct console kgdbcons = {
1593 .name = "kgdb",
1594 .write = kgdb_console_write,
1595 .flags = CON_PRINTBUFFER | CON_ENABLED,
1596 .index = -1,
1597};
1598
1599#ifdef CONFIG_MAGIC_SYSRQ
1600static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1601{
1602 if (!kgdb_io_ops) {
1603 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1604 return;
1605 }
1606 if (!kgdb_connected)
1607 printk(KERN_CRIT "Entering KGDB\n");
1608
1609 kgdb_breakpoint();
1610}
1611
1612static struct sysrq_key_op sysrq_gdb_op = {
1613 .handler = sysrq_handle_gdb,
1614 .help_msg = "debug(G)",
1615 .action_msg = "DEBUG",
1616};
1617#endif
1618
1619static void kgdb_register_callbacks(void)
1620{
1621 if (!kgdb_io_module_registered) {
1622 kgdb_io_module_registered = 1;
1623 kgdb_arch_init();
1624#ifdef CONFIG_MAGIC_SYSRQ
1625 register_sysrq_key('g', &sysrq_gdb_op);
1626#endif
1627 if (kgdb_use_con && !kgdb_con_registered) {
1628 register_console(&kgdbcons);
1629 kgdb_con_registered = 1;
1630 }
1631 }
1632}
1633
1634static void kgdb_unregister_callbacks(void)
1635{
1636 /*
1637 * When this routine is called KGDB should unregister from the
1638 * panic handler and clean up, making sure it is not handling any
1639 * break exceptions at the time.
1640 */
1641 if (kgdb_io_module_registered) {
1642 kgdb_io_module_registered = 0;
1643 kgdb_arch_exit();
1644#ifdef CONFIG_MAGIC_SYSRQ
1645 unregister_sysrq_key('g', &sysrq_gdb_op);
1646#endif
1647 if (kgdb_con_registered) {
1648 unregister_console(&kgdbcons);
1649 kgdb_con_registered = 0;
1650 }
1651 }
1652}
1653
1654static void kgdb_initial_breakpoint(void)
1655{
1656 kgdb_break_asap = 0;
1657
1658 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1659 kgdb_breakpoint();
1660}
1661
1662/**
1663 * kgdb_register_io_module - register KGDB IO module
1664 * @new_kgdb_io_ops: the io ops vector
1665 *
1666 * Register it with the KGDB core.
1667 */
1668int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1669{
1670 int err;
1671
1672 spin_lock(&kgdb_registration_lock);
1673
1674 if (kgdb_io_ops) {
1675 spin_unlock(&kgdb_registration_lock);
1676
1677 printk(KERN_ERR "kgdb: Another I/O driver is already "
1678 "registered with KGDB.\n");
1679 return -EBUSY;
1680 }
1681
1682 if (new_kgdb_io_ops->init) {
1683 err = new_kgdb_io_ops->init();
1684 if (err) {
1685 spin_unlock(&kgdb_registration_lock);
1686 return err;
1687 }
1688 }
1689
1690 kgdb_io_ops = new_kgdb_io_ops;
1691
1692 spin_unlock(&kgdb_registration_lock);
1693
1694 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1695 new_kgdb_io_ops->name);
1696
1697 /* Arm KGDB now. */
1698 kgdb_register_callbacks();
1699
1700 if (kgdb_break_asap)
1701 kgdb_initial_breakpoint();
1702
1703 return 0;
1704}
1705EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1706
1707/**
1708 * kgdb_unregister_io_module - unregister KGDB IO module
1709 * @old_kgdb_io_ops: the io ops vector
1710 *
1711 * Unregister it with the KGDB core.
1712 */
1713void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1714{
1715 BUG_ON(kgdb_connected);
1716
1717 /*
1718 * KGDB is no longer able to communicate out, so
1719 * unregister our callbacks and reset state.
1720 */
1721 kgdb_unregister_callbacks();
1722
1723 spin_lock(&kgdb_registration_lock);
1724
1725 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1726 kgdb_io_ops = NULL;
1727
1728 spin_unlock(&kgdb_registration_lock);
1729
1730 printk(KERN_INFO
1731 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1732 old_kgdb_io_ops->name);
1733}
1734EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1735
1736/**
1737 * kgdb_breakpoint - generate breakpoint exception
1738 *
1739 * This function will generate a breakpoint exception. It is used at the
1740 * beginning of a program to sync up with a debugger and can be used
1741 * otherwise as a quick means to stop program execution and "break" into
1742 * the debugger.
1743 */
1744void kgdb_breakpoint(void)
1745{
1746 atomic_inc(&kgdb_setting_breakpoint);
1747 wmb(); /* Sync point before breakpoint */
1748 arch_kgdb_breakpoint();
1749 wmb(); /* Sync point after breakpoint */
1750 atomic_dec(&kgdb_setting_breakpoint);
1751}
1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1753
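As the kerneldoc above notes, a caller simply drops kgdb_breakpoint() wherever it wants execution to stop under the attached debugger; a hedged sketch (the condition is illustrative):

	if (unlikely(something_went_wrong))	/* illustrative condition */
		kgdb_breakpoint();		/* trap into the attached gdb here */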
1754static int __init opt_kgdb_wait(char *str)
1755{
1756 kgdb_break_asap = 1;
1757
1758 if (kgdb_io_module_registered)
1759 kgdb_initial_breakpoint();
1760
1761 return 0;
1762}
1763
1764early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e9..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1588 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1589} 1589}
1590 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone, we couldn't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1591void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1592{ 1658{
1593 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1805 .release = seq_release, 1871 .release = seq_release,
1806}; 1872};
1807 1873
1808/* Disable one kprobe */
1809int __kprobes disable_kprobe(struct kprobe *kp)
1810{
1811 int ret = 0;
1812 struct kprobe *p;
1813
1814 mutex_lock(&kprobe_mutex);
1815
1816 /* Check whether specified probe is valid. */
1817 p = __get_valid_kprobe(kp);
1818 if (unlikely(p == NULL)) {
1819 ret = -EINVAL;
1820 goto out;
1821 }
1822
1823 /* If the probe is already disabled (or gone), just return */
1824 if (kprobe_disabled(kp))
1825 goto out;
1826
1827 kp->flags |= KPROBE_FLAG_DISABLED;
1828 if (p != kp)
1829 /* When kp != p, p is always enabled. */
1830 try_to_disable_aggr_kprobe(p);
1831
1832 if (!kprobes_all_disarmed && kprobe_disabled(p))
1833 disarm_kprobe(p);
1834out:
1835 mutex_unlock(&kprobe_mutex);
1836 return ret;
1837}
1838EXPORT_SYMBOL_GPL(disable_kprobe);
1839
1840/* Enable one kprobe */
1841int __kprobes enable_kprobe(struct kprobe *kp)
1842{
1843 int ret = 0;
1844 struct kprobe *p;
1845
1846 mutex_lock(&kprobe_mutex);
1847
1848 /* Check whether specified probe is valid. */
1849 p = __get_valid_kprobe(kp);
1850 if (unlikely(p == NULL)) {
1851 ret = -EINVAL;
1852 goto out;
1853 }
1854
1855 if (kprobe_gone(kp)) {
1856 /* This kprobe has gone, we couldn't enable it. */
1857 ret = -EINVAL;
1858 goto out;
1859 }
1860
1861 if (p != kp)
1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1868out:
1869 mutex_unlock(&kprobe_mutex);
1870 return ret;
1871}
1872EXPORT_SYMBOL_GPL(enable_kprobe);
1873
1874static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1875{ 1875{
1876 struct hlist_head *head; 1876 struct hlist_head *head;
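The two kprobes.c hunks above relocate disable_kprobe()/enable_kprobe() toward the top of the file, away from the debugfs file_operations they used to sit next to, so the exported toggles appear usable independently of that interface. A module might use them roughly like this sketch; the probed symbol and call sites are illustrative, not mandated by the patch:

#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;			/* a real handler would count or trace here */
}

static struct kprobe my_kp = {
	.symbol_name	= "do_fork",	/* illustrative target */
	.pre_handler	= my_pre,
};

	/* somewhere in module code: */
	register_kprobe(&my_kp);	/* the probe starts life armed */
	disable_kprobe(&my_kp);		/* stop firing without unregistering */
	enable_kprobe(&my_kp);		/* re-arm it later */
	unregister_kprobe(&my_kp);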
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41cb..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
431/* 431/*
432 * Various lockdep statistics: 432 * Various lockdep statistics:
433 */ 433 */
434atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
435atomic_t chain_lookup_misses;
436atomic_t hardirqs_on_events;
437atomic_t hardirqs_off_events;
438atomic_t redundant_hardirqs_on;
439atomic_t redundant_hardirqs_off;
440atomic_t softirqs_on_events;
441atomic_t softirqs_off_events;
442atomic_t redundant_softirqs_on;
443atomic_t redundant_softirqs_off;
444atomic_t nr_unused_locks;
445atomic_t nr_cyclic_checks;
446atomic_t nr_find_usage_forwards_checks;
447atomic_t nr_find_usage_backwards_checks;
448#endif 435#endif
449 436
450/* 437/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
748 return NULL; 735 return NULL;
749 } 736 }
750 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
751 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
752 class->key = key; 739 class->key = key;
753 class->name = lock->name; 740 class->name = lock->name;
754 class->subclass = subclass; 741 class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
818 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
819 */ 806 */
820static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
821 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
822{ 810{
823 struct lock_list *entry; 811 struct lock_list *entry;
824 /* 812 /*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
829 if (!entry) 817 if (!entry)
830 return 0; 818 return 0;
831 819
832 if (!save_trace(&entry->trace))
833 return 0;
834
835 entry->class = this; 820 entry->class = this;
836 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
837 /* 823 /*
838 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
839 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1205{ 1191{
1206 int result; 1192 int result;
1207 1193
1208 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1209 1195
1210 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1211 1197
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1242{ 1228{
1243 int result; 1229 int result;
1244 1230
1245 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1246 1232
1247 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1248 1234
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1265{ 1251{
1266 int result; 1252 int result;
1267 1253
1268 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1269 1255
1270 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1271 1257
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1635 */ 1621 */
1636static int 1622static int
1637check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1638 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1639{ 1625{
1640 struct lock_list *entry; 1626 struct lock_list *entry;
1641 int ret; 1627 int ret;
1642 struct lock_list this; 1628 struct lock_list this;
1643 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1644 1638
1645 /* 1639 /*
1646 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1688 } 1682 }
1689 } 1683 }
1690 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1691 /* 1688 /*
1692 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1693 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1694 */ 1691 */
1695 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1696 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1697 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1698 1695
1699 if (!ret) 1696 if (!ret)
1700 return 0; 1697 return 0;
1701 1698
1702 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1703 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1704 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1705 if (!ret) 1702 if (!ret)
1706 return 0; 1703 return 0;
1707 1704
@@ -1731,6 +1728,7 @@ static int
1731check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1732{ 1729{
1733 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1734 struct held_lock *hlock; 1732 struct held_lock *hlock;
1735 1733
1736 /* 1734 /*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1756 * added: 1754 * added:
1757 */ 1755 */
1758 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1759 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1760 return 0; 1759 return 0;
1761 /* 1760 /*
1762 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1779 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1780 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1781 break; 1780 break;
1781 trylock_loop = 1;
1782 } 1782 }
1783 return 1; 1783 return 1;
1784out_bug: 1784out_bug:
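
Read together, the hunks above implement a capture-once scheme for dependency stack traces: check_prevs_add() may invoke check_prev_add() several times when trylocks sit on the held-lock stack, but only the first pass calls save_trace(); every lock_list entry added afterwards just copies the saved trace. A condensed reading aid assembled from the fragments above (no new code, identifiers as in the patch):

	static struct stack_trace trace;	/* serialized by graph_lock() */

	/* only the first, non-trylock iteration pays for the stack unwind */
	if (!trylock_loop && !save_trace(&trace))
		return 0;

	/* both directions of the new dependency reuse the same trace;
	 * add_lock_to_list() simply copies it: entry->trace = *trace; */
	ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
			       &hlock_class(prev)->locks_after,
			       next->acquire_ip, distance, &trace);
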
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1825 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1826 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1827cache_hit: 1827cache_hit:
1828 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1829 if (very_verbose(class)) 1829 if (very_verbose(class))
1830 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1831 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
1890 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1891 } 1891 }
1892 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1893 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1894 inc_chains(); 1894 inc_chains();
1895 1895
1896 return 1; 1896 return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2311 return; 2311 return;
2312 2312
2313 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2314 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
 2315 * Neither irq nor preemption is disabled here
 2316 * so this is racy by nature, but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2315 return; 2320 return;
2316 } 2321 }
2317 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2338 2343
2339 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2340 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2341 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2342} 2347}
2343EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2344 2349
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2370 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2371 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2372 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2373 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2374 } else 2379 } else
2375 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2376} 2381}
2377EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2378 2383
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2396 return; 2401 return;
2397 2402
2398 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2399 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2400 return; 2405 return;
2401 } 2406 }
2402 2407
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2406 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2407 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2408 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2409 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2410 /* 2415 /*
2411 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2412 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2436 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2437 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2438 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2439 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2440 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2441 } else 2446 } else
2442 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2443} 2448}
2444 2449
2445static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2644 return 0; 2649 return 0;
2645 break; 2650 break;
2646 case LOCK_USED: 2651 case LOCK_USED:
2647 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2648 break; 2653 break;
2649 default: 2654 default:
2650 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2706,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2706} 2711}
2707EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2708 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2709/* 2716/*
2710 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2711 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2740,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2740 return 0; 2747 return 0;
2741 } 2748 }
2742 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2743 if (!subclass) 2753 if (!subclass)
2744 class = lock->class_cache; 2754 class = lock->class_cache;
2745 /* 2755 /*
@@ -2750,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 if (!class) 2760 if (!class)
2751 return 0; 2761 return 0;
2752 } 2762 }
2753 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2754 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2755 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2756 if (class->name_version > 1) 2766 if (class->name_version > 1)
@@ -3227,7 +3237,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3227 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3228 check_flags(flags); 3238 check_flags(flags);
3229 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip); 3240 trace_lock_release(lock, ip);
3231 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3232 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3233 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3380,7 +3390,7 @@ found_it:
3380 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3381 } 3391 }
3382 3392
3383 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3384 3394
3385 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3386 if (waittime) { 3396 if (waittime) {
@@ -3801,8 +3811,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3801{ 3811{
3802 struct task_struct *curr = current; 3812 struct task_struct *curr = current;
3803 3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3804 if (!debug_locks_off()) 3815 if (!debug_locks_off())
3805 return; 3816 return;
3817#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3806 printk("\n===================================================\n"); 3819 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n"); 3821 printk( "---------------------------------------------------\n");
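
The new __lockdep_no_validate__ class key, together with the check = 1 downgrade in __lock_acquire() above, lets a subsystem opt an individual lock out of full dependency-chain validation. A hedged usage sketch, assuming the key is declared in <linux/lockdep.h> by the same series (a lockdep_set_novalidate_class() convenience wrapper around this call is believed to live there as well; treat that name as an assumption). demo_lock is made up for illustration:

	#include <linux/mutex.h>
	#include <linux/lockdep.h>

	static DEFINE_MUTEX(demo_lock);

	static void demo_opt_out(void)
	{
		/* keep basic usage checks, skip full chain validation */
		lockdep_set_class(&demo_lock, &__lockdep_no_validate__);
	}
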
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
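
Taken as a whole, the header change above swaps the global atomics for a per-cpu struct lockdep_stats: debug_atomic_inc()/debug_atomic_dec() now take the bare field name and expect IRQs to be off, __debug_atomic_inc() is the tolerant variant for paths that may race, and debug_atomic_read() folds all CPUs into an unsigned long long (hence the %llu changes in lockdep_proc.c below). A short usage sketch, with field names taken from the struct above:

	/* fast path -- caller already runs with IRQs disabled */
	debug_atomic_inc(nr_cyclic_checks);

	/* hardirq tracing path -- may race, a lost increment is acceptable */
	__debug_atomic_inc(redundant_hardirqs_on);

	/* slow path (procfs) -- sums every possible CPU, hence %llu */
	seq_printf(m, " cyclic checks:                 %11llu\n",
		   debug_atomic_read(nr_cyclic_checks));
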
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..3c4fc4bb4b82 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -79,6 +77,10 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
79DEFINE_MUTEX(module_mutex); 77DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 78EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 79static LIST_HEAD(modules);
80#ifdef CONFIG_KGDB_KDB
81struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
82#endif /* CONFIG_KGDB_KDB */
83
82 84
83/* Block module loading/unloading? */ 85/* Block module loading/unloading? */
84int modules_disabled = 0; 86int modules_disabled = 0;
@@ -515,6 +517,9 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 517static char last_unloaded_module[MODULE_NAME_LEN+1];
516 518
517#ifdef CONFIG_MODULE_UNLOAD 519#ifdef CONFIG_MODULE_UNLOAD
520
521EXPORT_TRACEPOINT_SYMBOL(module_get);
522
518/* Init the unload section of the module. */ 523/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 524static void module_unload_init(struct module *mod)
520{ 525{
@@ -723,16 +728,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 728 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 729 name[MODULE_NAME_LEN-1] = '\0';
725 730
726 /* Create stop_machine threads since free_module relies on 731 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 732 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 733
737 mod = find_module(name); 734 mod = find_module(name);
738 if (!mod) { 735 if (!mod) {
@@ -792,8 +789,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 789
793 out: 790 out:
794 mutex_unlock(&module_mutex); 791 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 792 return ret;
798} 793}
799 794
@@ -867,8 +862,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 862 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 863 __this_cpu_inc(module->refptr->decs);
869 864
870 trace_module_put(module, _RET_IP_, 865 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 866 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 867 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 868 wake_up_process(module->waiter);
@@ -1192,7 +1186,7 @@ struct module_notes_attrs {
1192 struct bin_attribute attrs[0]; 1186 struct bin_attribute attrs[0];
1193}; 1187};
1194 1188
1195static ssize_t module_notes_read(struct kobject *kobj, 1189static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1196 struct bin_attribute *bin_attr, 1190 struct bin_attribute *bin_attr,
1197 char *buf, loff_t pos, size_t count) 1191 char *buf, loff_t pos, size_t count)
1198{ 1192{
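
The final hunk above is part of the tree-wide change giving sysfs binary-attribute callbacks access to the opening struct file. A hedged sketch of a ->read handler using the updated prototype; the handler name is made up, and the body mirrors what module_notes_read() itself does (copy out of bin_attr->private, which sysfs has already range-checked against bin_attr->size):

	#include <linux/sysfs.h>
	#include <linux/string.h>

	static ssize_t demo_notes_read(struct file *filp, struct kobject *kobj,
				       struct bin_attribute *bin_attr,
				       char *buf, loff_t pos, size_t count)
	{
		/* pos/count are already clamped by the sysfs bin layer */
		memcpy(buf, bin_attr->private + pos, count);
		return count;
	}
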
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2f3fbf84215a..a4fa381db3c2 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/hash.h>
19#include <linux/sysfs.h> 20#include <linux/sysfs.h>
20#include <linux/dcache.h> 21#include <linux/dcache.h>
21#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
82void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
83void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
84 85
85int __weak
86hw_perf_group_sched_in(struct perf_event *group_leader,
87 struct perf_cpu_context *cpuctx,
88 struct perf_event_context *ctx)
89{
90 return 0;
91}
92
93void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
94 87
95static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
262 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
263} 256}
264 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
265static struct list_head * 270static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{ 272{
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
315static void 320static void
316list_del_event(struct perf_event *event, struct perf_event_context *ctx) 321list_del_event(struct perf_event *event, struct perf_event_context *ctx)
317{ 322{
318 struct perf_event *sibling, *tmp;
319
320 if (list_empty(&event->group_entry)) 323 if (list_empty(&event->group_entry))
321 return; 324 return;
322 ctx->nr_events--; 325 ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
329 if (event->group_leader != event) 332 if (event->group_leader != event)
330 event->group_leader->nr_siblings--; 333 event->group_leader->nr_siblings--;
331 334
332 update_event_times(event); 335 update_group_times(event);
333 336
334 /* 337 /*
335 * If event was in error state, then keep it 338 * If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 */ 343 */
341 if (event->state > PERF_EVENT_STATE_OFF) 344 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF; 345 event->state = PERF_EVENT_STATE_OFF;
346}
347
348static void
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{
351 struct perf_event *sibling, *tmp;
343 352
344 /* 353 /*
345 * If this was a group event with sibling events then 354 * If this was a group event with sibling events then
@@ -505,18 +514,6 @@ retry:
505} 514}
506 515
507/* 516/*
508 * Update total_time_enabled and total_time_running for all events in a group.
509 */
510static void update_group_times(struct perf_event *leader)
511{
512 struct perf_event *event;
513
514 update_event_times(leader);
515 list_for_each_entry(event, &leader->sibling_list, group_entry)
516 update_event_times(event);
517}
518
519/*
520 * Cross CPU call to disable a performance event 517 * Cross CPU call to disable a performance event
521 */ 518 */
522static void __perf_event_disable(void *info) 519static void __perf_event_disable(void *info)
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
640 struct perf_cpu_context *cpuctx, 637 struct perf_cpu_context *cpuctx,
641 struct perf_event_context *ctx) 638 struct perf_event_context *ctx)
642{ 639{
643 struct perf_event *event, *partial_group; 640 struct perf_event *event, *partial_group = NULL;
641 const struct pmu *pmu = group_event->pmu;
642 bool txn = false;
644 int ret; 643 int ret;
645 644
646 if (group_event->state == PERF_EVENT_STATE_OFF) 645 if (group_event->state == PERF_EVENT_STATE_OFF)
647 return 0; 646 return 0;
648 647
 649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 648 /* Check if group transaction available */
650 if (ret) 649 if (pmu->start_txn)
651 return ret < 0 ? ret : 0; 650 txn = true;
651
652 if (txn)
653 pmu->start_txn(pmu);
652 654
653 if (event_sched_in(group_event, cpuctx, ctx)) 655 if (event_sched_in(group_event, cpuctx, ctx))
654 return -EAGAIN; 656 return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
663 } 665 }
664 } 666 }
665 667
666 return 0; 668 if (!txn)
669 return 0;
670
671 ret = pmu->commit_txn(pmu);
672 if (!ret) {
673 pmu->cancel_txn(pmu);
674 return 0;
675 }
667 676
668group_error: 677group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
669 /* 681 /*
670 * Groups can be scheduled in as one unit only, so undo any 682 * Groups can be scheduled in as one unit only, so undo any
671 * partial group before returning: 683 * partial group before returning:
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1367 if (cpuctx->task_ctx == ctx) 1379 if (cpuctx->task_ctx == ctx)
1368 return; 1380 return;
1369 1381
1382 perf_disable();
1383
1370 /* 1384 /*
1371 * We want to keep the following priority order: 1385 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned, 1386 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1393 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380 1394
1381 cpuctx->task_ctx = ctx; 1395 cpuctx->task_ctx = ctx;
1396
1397 perf_enable();
1382} 1398}
1383 1399
1384#define MAX_INTERRUPTS (~0ULL) 1400#define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
1856{ 1872{
1857 struct perf_event_context *ctx = event->ctx; 1873 struct perf_event_context *ctx = event->ctx;
1858 1874
1875 /*
1876 * Remove from the PMU, can't get re-enabled since we got
1877 * here because the last ref went.
1878 */
1879 perf_event_disable(event);
1880
1859 WARN_ON_ONCE(ctx->parent_ctx); 1881 WARN_ON_ONCE(ctx->parent_ctx);
1860 mutex_lock(&ctx->mutex); 1882 /*
1861 perf_event_remove_from_context(event); 1883 * There are two ways this annotation is useful:
1884 *
1885 * 1) there is a lock recursion from perf_event_exit_task
1886 * see the comment there.
1887 *
1888 * 2) there is a lock-inversion with mmap_sem through
1889 * perf_event_read_group(), which takes faults while
1890 * holding ctx->mutex, however this is called after
1891 * the last filedesc died, so there is no possibility
1892 * to trigger the AB-BA case.
1893 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock);
1896 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock);
1862 mutex_unlock(&ctx->mutex); 1899 mutex_unlock(&ctx->mutex);
1863 1900
1864 mutex_lock(&event->owner->perf_event_mutex); 1901 mutex_lock(&event->owner->perf_event_mutex);
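
SINGLE_DEPTH_NESTING above is the stock lockdep annotation for taking the same lock class twice on purpose. A generic, hedged illustration of the pattern (parent/child are placeholders, not perf structures):

	mutex_lock(&parent->lock);
	/* same lock class, legitimately nested one level below the outer one */
	mutex_lock_nested(&child->lock, SINGLE_DEPTH_NESTING);

	/* ... move state from child to parent ... */

	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
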
@@ -2642,6 +2679,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2642} 2679}
2643 2680
2644static const struct file_operations perf_fops = { 2681static const struct file_operations perf_fops = {
2682 .llseek = no_llseek,
2645 .release = perf_release, 2683 .release = perf_release,
2646 .read = perf_read, 2684 .read = perf_read,
2647 .poll = perf_poll, 2685 .poll = perf_poll,
@@ -2792,6 +2830,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
2792 2830
2793 2831
2794/* 2832/*
2833 * We assume there is only KVM supporting the callbacks.
2834 * Later on, we might change it to a list if there is
2835 * another virtualization implementation supporting the callbacks.
2836 */
2837struct perf_guest_info_callbacks *perf_guest_cbs;
2838
2839int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2840{
2841 perf_guest_cbs = cbs;
2842 return 0;
2843}
2844EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2845
2846int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2847{
2848 perf_guest_cbs = NULL;
2849 return 0;
2850}
2851EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2852
2853/*
2795 * Output 2854 * Output
2796 */ 2855 */
2797static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2856static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
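
perf_register_guest_info_callbacks()/perf_unregister_guest_info_callbacks() above are the hooks a hypervisor module uses to let perf attribute samples to guest context. A heavily hedged registration sketch: the three callback field names match the KVM user of this interface but should be treated as assumptions here, and every demo_* name is made up:

	#include <linux/module.h>
	#include <linux/perf_event.h>

	static int demo_is_in_guest(void)             { return 0; }
	static int demo_is_user_mode(void)            { return 0; }
	static unsigned long demo_get_guest_ip(void)  { return 0; }

	static struct perf_guest_info_callbacks demo_guest_cbs = {
		.is_in_guest	= demo_is_in_guest,
		.is_user_mode	= demo_is_user_mode,
		.get_guest_ip	= demo_get_guest_ip,
	};

	static int __init demo_init(void)
	{
		return perf_register_guest_info_callbacks(&demo_guest_cbs);
	}

	static void __exit demo_exit(void)
	{
		perf_unregister_guest_info_callbacks(&demo_guest_cbs);
	}
	module_init(demo_init);
	module_exit(demo_exit);
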
@@ -3743,7 +3802,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3743 .event_id = { 3802 .event_id = {
3744 .header = { 3803 .header = {
3745 .type = PERF_RECORD_MMAP, 3804 .type = PERF_RECORD_MMAP,
3746 .misc = 0, 3805 .misc = PERF_RECORD_MISC_USER,
3747 /* .size */ 3806 /* .size */
3748 }, 3807 },
3749 /* .pid */ 3808 /* .pid */
@@ -3961,36 +4020,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3961 perf_swevent_overflow(event, 0, nmi, data, regs); 4020 perf_swevent_overflow(event, 0, nmi, data, regs);
3962} 4021}
3963 4022
3964static int perf_swevent_is_counting(struct perf_event *event)
3965{
3966 /*
3967 * The event is active, we're good!
3968 */
3969 if (event->state == PERF_EVENT_STATE_ACTIVE)
3970 return 1;
3971
3972 /*
3973 * The event is off/error, not counting.
3974 */
3975 if (event->state != PERF_EVENT_STATE_INACTIVE)
3976 return 0;
3977
3978 /*
3979 * The event is inactive, if the context is active
3980 * we're part of a group that didn't make it on the 'pmu',
3981 * not counting.
3982 */
3983 if (event->ctx->is_active)
3984 return 0;
3985
3986 /*
3987 * We're inactive and the context is too, this means the
3988 * task is scheduled out, we're counting events that happen
3989 * to us, like migration events.
3990 */
3991 return 1;
3992}
3993
3994static int perf_tp_event_match(struct perf_event *event, 4023static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data); 4024 struct perf_sample_data *data);
3996 4025
@@ -4014,12 +4043,6 @@ static int perf_swevent_match(struct perf_event *event,
4014 struct perf_sample_data *data, 4043 struct perf_sample_data *data,
4015 struct pt_regs *regs) 4044 struct pt_regs *regs)
4016{ 4045{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
4020 if (!perf_swevent_is_counting(event))
4021 return 0;
4022
4023 if (event->attr.type != type) 4046 if (event->attr.type != type)
4024 return 0; 4047 return 0;
4025 4048
@@ -4036,18 +4059,53 @@ static int perf_swevent_match(struct perf_event *event,
4036 return 1; 4059 return 1;
4037} 4060}
4038 4061
4039static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4062static inline u64 swevent_hash(u64 type, u32 event_id)
4040 enum perf_type_id type, 4063{
4041 u32 event_id, u64 nr, int nmi, 4064 u64 val = event_id | (type << 32);
4042 struct perf_sample_data *data, 4065
4043 struct pt_regs *regs) 4066 return hash_64(val, SWEVENT_HLIST_BITS);
4067}
4068
4069static struct hlist_head *
4070find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4071{
4072 u64 hash;
4073 struct swevent_hlist *hlist;
4074
4075 hash = swevent_hash(type, event_id);
4076
4077 hlist = rcu_dereference(ctx->swevent_hlist);
4078 if (!hlist)
4079 return NULL;
4080
4081 return &hlist->heads[hash];
4082}
4083
4084static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4085 u64 nr, int nmi,
4086 struct perf_sample_data *data,
4087 struct pt_regs *regs)
4044{ 4088{
4089 struct perf_cpu_context *cpuctx;
4045 struct perf_event *event; 4090 struct perf_event *event;
4091 struct hlist_node *node;
4092 struct hlist_head *head;
4046 4093
4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4094 cpuctx = &__get_cpu_var(perf_cpu_context);
4095
4096 rcu_read_lock();
4097
4098 head = find_swevent_head(cpuctx, type, event_id);
4099
4100 if (!head)
4101 goto end;
4102
4103 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4048 if (perf_swevent_match(event, type, event_id, data, regs)) 4104 if (perf_swevent_match(event, type, event_id, data, regs))
4049 perf_swevent_add(event, nr, nmi, data, regs); 4105 perf_swevent_add(event, nr, nmi, data, regs);
4050 } 4106 }
4107end:
4108 rcu_read_unlock();
4051} 4109}
4052 4110
4053int perf_swevent_get_recursion_context(void) 4111int perf_swevent_get_recursion_context(void)
@@ -4085,27 +4143,6 @@ void perf_swevent_put_recursion_context(int rctx)
4085} 4143}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4144EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4087 4145
4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4089 u64 nr, int nmi,
4090 struct perf_sample_data *data,
4091 struct pt_regs *regs)
4092{
4093 struct perf_cpu_context *cpuctx;
4094 struct perf_event_context *ctx;
4095
4096 cpuctx = &__get_cpu_var(perf_cpu_context);
4097 rcu_read_lock();
4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4099 nr, nmi, data, regs);
4100 /*
4101 * doesn't really matter which of the child contexts the
4102 * events ends up in.
4103 */
4104 ctx = rcu_dereference(current->perf_event_ctxp);
4105 if (ctx)
4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4107 rcu_read_unlock();
4108}
4109 4146
4110void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4147void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4111 struct pt_regs *regs, u64 addr) 4148 struct pt_regs *regs, u64 addr)
@@ -4131,16 +4168,28 @@ static void perf_swevent_read(struct perf_event *event)
4131static int perf_swevent_enable(struct perf_event *event) 4168static int perf_swevent_enable(struct perf_event *event)
4132{ 4169{
4133 struct hw_perf_event *hwc = &event->hw; 4170 struct hw_perf_event *hwc = &event->hw;
4171 struct perf_cpu_context *cpuctx;
4172 struct hlist_head *head;
4173
4174 cpuctx = &__get_cpu_var(perf_cpu_context);
4134 4175
4135 if (hwc->sample_period) { 4176 if (hwc->sample_period) {
4136 hwc->last_period = hwc->sample_period; 4177 hwc->last_period = hwc->sample_period;
4137 perf_swevent_set_period(event); 4178 perf_swevent_set_period(event);
4138 } 4179 }
4180
4181 head = find_swevent_head(cpuctx, event->attr.type, event->attr.config);
4182 if (WARN_ON_ONCE(!head))
4183 return -EINVAL;
4184
4185 hlist_add_head_rcu(&event->hlist_entry, head);
4186
4139 return 0; 4187 return 0;
4140} 4188}
4141 4189
4142static void perf_swevent_disable(struct perf_event *event) 4190static void perf_swevent_disable(struct perf_event *event)
4143{ 4191{
4192 hlist_del_rcu(&event->hlist_entry);
4144} 4193}
4145 4194
4146static const struct pmu perf_ops_generic = { 4195static const struct pmu perf_ops_generic = {
@@ -4168,15 +4217,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4168 perf_sample_data_init(&data, 0); 4217 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period; 4218 data.period = event->hw.last_period;
4170 regs = get_irq_regs(); 4219 regs = get_irq_regs();
4171 /*
4172 * In case we exclude kernel IPs or are somehow not in interrupt
4173 * context, provide the next best thing, the user IP.
4174 */
4175 if ((event->attr.exclude_kernel || !regs) &&
4176 !event->attr.exclude_user)
4177 regs = task_pt_regs(current);
4178 4220
4179 if (regs) { 4221 if (regs && !perf_exclude_event(event, regs)) {
4180 if (!(event->attr.exclude_idle && current->pid == 0)) 4222 if (!(event->attr.exclude_idle && current->pid == 0))
4181 if (perf_event_overflow(event, 0, &data, regs)) 4223 if (perf_event_overflow(event, 0, &data, regs))
4182 ret = HRTIMER_NORESTART; 4224 ret = HRTIMER_NORESTART;
@@ -4324,6 +4366,105 @@ static const struct pmu perf_ops_task_clock = {
4324 .read = task_clock_perf_event_read, 4366 .read = task_clock_perf_event_read,
4325}; 4367};
4326 4368
4369static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4370{
4371 struct swevent_hlist *hlist;
4372
4373 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4374 kfree(hlist);
4375}
4376
4377static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4378{
4379 struct swevent_hlist *hlist;
4380
4381 if (!cpuctx->swevent_hlist)
4382 return;
4383
4384 hlist = cpuctx->swevent_hlist;
4385 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4386 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4387}
4388
4389static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4390{
4391 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4392
4393 mutex_lock(&cpuctx->hlist_mutex);
4394
4395 if (!--cpuctx->hlist_refcount)
4396 swevent_hlist_release(cpuctx);
4397
4398 mutex_unlock(&cpuctx->hlist_mutex);
4399}
4400
4401static void swevent_hlist_put(struct perf_event *event)
4402{
4403 int cpu;
4404
4405 if (event->cpu != -1) {
4406 swevent_hlist_put_cpu(event, event->cpu);
4407 return;
4408 }
4409
4410 for_each_possible_cpu(cpu)
4411 swevent_hlist_put_cpu(event, cpu);
4412}
4413
4414static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4415{
4416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4417 int err = 0;
4418
4419 mutex_lock(&cpuctx->hlist_mutex);
4420
4421 if (!cpuctx->swevent_hlist && cpu_online(cpu)) {
4422 struct swevent_hlist *hlist;
4423
4424 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4425 if (!hlist) {
4426 err = -ENOMEM;
4427 goto exit;
4428 }
4429 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4430 }
4431 cpuctx->hlist_refcount++;
4432 exit:
4433 mutex_unlock(&cpuctx->hlist_mutex);
4434
4435 return err;
4436}
4437
4438static int swevent_hlist_get(struct perf_event *event)
4439{
4440 int err;
4441 int cpu, failed_cpu;
4442
4443 if (event->cpu != -1)
4444 return swevent_hlist_get_cpu(event, event->cpu);
4445
4446 get_online_cpus();
4447 for_each_possible_cpu(cpu) {
4448 err = swevent_hlist_get_cpu(event, cpu);
4449 if (err) {
4450 failed_cpu = cpu;
4451 goto fail;
4452 }
4453 }
4454 put_online_cpus();
4455
4456 return 0;
4457 fail:
4458 for_each_possible_cpu(cpu) {
4459 if (cpu == failed_cpu)
4460 break;
4461 swevent_hlist_put_cpu(event, cpu);
4462 }
4463
4464 put_online_cpus();
4465 return err;
4466}
4467
4327#ifdef CONFIG_EVENT_TRACING 4468#ifdef CONFIG_EVENT_TRACING
4328 4469
4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4470void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
@@ -4357,10 +4498,13 @@ static int perf_tp_event_match(struct perf_event *event,
4357static void tp_perf_event_destroy(struct perf_event *event) 4498static void tp_perf_event_destroy(struct perf_event *event)
4358{ 4499{
4359 perf_trace_disable(event->attr.config); 4500 perf_trace_disable(event->attr.config);
4501 swevent_hlist_put(event);
4360} 4502}
4361 4503
4362static const struct pmu *tp_perf_event_init(struct perf_event *event) 4504static const struct pmu *tp_perf_event_init(struct perf_event *event)
4363{ 4505{
4506 int err;
4507
4364 /* 4508 /*
4365 * Raw tracepoint data is a severe data leak, only allow root to 4509 * Raw tracepoint data is a severe data leak, only allow root to
4366 * have these. 4510 * have these.
@@ -4374,6 +4518,11 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4374 return NULL; 4518 return NULL;
4375 4519
4376 event->destroy = tp_perf_event_destroy; 4520 event->destroy = tp_perf_event_destroy;
4521 err = swevent_hlist_get(event);
4522 if (err) {
4523 perf_trace_disable(event->attr.config);
4524 return ERR_PTR(err);
4525 }
4377 4526
4378 return &perf_ops_generic; 4527 return &perf_ops_generic;
4379} 4528}
@@ -4474,6 +4623,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4474 WARN_ON(event->parent); 4623 WARN_ON(event->parent);
4475 4624
4476 atomic_dec(&perf_swevent_enabled[event_id]); 4625 atomic_dec(&perf_swevent_enabled[event_id]);
4626 swevent_hlist_put(event);
4477} 4627}
4478 4628
4479static const struct pmu *sw_perf_event_init(struct perf_event *event) 4629static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4662,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4662 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS: 4663 case PERF_COUNT_SW_EMULATION_FAULTS:
4514 if (!event->parent) { 4664 if (!event->parent) {
4665 int err;
4666
4667 err = swevent_hlist_get(event);
4668 if (err)
4669 return ERR_PTR(err);
4670
4515 atomic_inc(&perf_swevent_enabled[event_id]); 4671 atomic_inc(&perf_swevent_enabled[event_id]);
4516 event->destroy = sw_perf_event_destroy; 4672 event->destroy = sw_perf_event_destroy;
4517 } 4673 }
@@ -4897,7 +5053,7 @@ err_fput_free_put_context:
4897 5053
4898err_free_put_context: 5054err_free_put_context:
4899 if (err < 0) 5055 if (err < 0)
4900 kfree(event); 5056 free_event(event);
4901 5057
4902err_put_context: 5058err_put_context:
4903 if (err < 0) 5059 if (err < 0)
@@ -5176,7 +5332,7 @@ void perf_event_exit_task(struct task_struct *child)
5176 * 5332 *
5177 * But since its the parent context it won't be the same instance. 5333 * But since its the parent context it won't be the same instance.
5178 */ 5334 */
5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5335 mutex_lock(&child_ctx->mutex);
5180 5336
5181again: 5337again:
5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5338 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5540,7 @@ static void __init perf_event_init_all_cpus(void)
5384 5540
5385 for_each_possible_cpu(cpu) { 5541 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu); 5542 cpuctx = &per_cpu(perf_cpu_context, cpu);
5543 mutex_init(&cpuctx->hlist_mutex);
5387 __perf_event_init_context(&cpuctx->ctx, NULL); 5544 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 } 5545 }
5389} 5546}
@@ -5397,6 +5554,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5397 spin_lock(&perf_resource_lock); 5554 spin_lock(&perf_resource_lock);
5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5555 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5399 spin_unlock(&perf_resource_lock); 5556 spin_unlock(&perf_resource_lock);
5557
5558 mutex_lock(&cpuctx->hlist_mutex);
5559 if (cpuctx->hlist_refcount > 0) {
5560 struct swevent_hlist *hlist;
5561
5562 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5563 WARN_ON_ONCE(!hlist);
5564 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5565 }
5566 mutex_unlock(&cpuctx->hlist_mutex);
5400} 5567}
5401 5568
5402#ifdef CONFIG_HOTPLUG_CPU 5569#ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5583,10 @@ static void perf_event_exit_cpu(int cpu)
5416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5583 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5417 struct perf_event_context *ctx = &cpuctx->ctx; 5584 struct perf_event_context *ctx = &cpuctx->ctx;
5418 5585
5586 mutex_lock(&cpuctx->hlist_mutex);
5587 swevent_hlist_release(cpuctx);
5588 mutex_unlock(&cpuctx->hlist_mutex);
5589
5419 mutex_lock(&ctx->mutex); 5590 mutex_lock(&ctx->mutex);
5420 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5591 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5421 mutex_unlock(&ctx->mutex); 5592 mutex_unlock(&ctx->mutex);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing their request to 20 * subsystem by opening the device node /dev/... and writing their request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to use 205 * @pm_qos_class: identifies which list of qos request to use
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /* guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
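The three exports above replace the old name-keyed interface with a handle-based one. A minimal usage sketch, assuming a hypothetical driver (the my_* names and the 20 us figure are illustrative only, not from this commit):

#include <linux/pm_qos_params.h>

static struct pm_qos_request_list *my_latency_req;	/* hypothetical handle */

static int my_driver_start(void)
{
	/* Ask for at most 20 us of CPU/DMA exit latency; save the handle. */
	my_latency_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY, 20);
	if (!my_latency_req)
		return -ENOMEM;
	return 0;
}

static void my_driver_idle(void)
{
	/* Relax the constraint back to the class default. */
	pm_qos_update_request(my_latency_req, PM_QOS_DEFAULT_VALUE);
}

static void my_driver_stop(void)
{
	/* Drop the request and free the handle. */
	pm_qos_remove_request(my_latency_req);
	my_latency_req = NULL;
}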
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
 374 } else if (count == 11) { /* len('0x12345678\0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
 380 pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
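The rewritten write handler accepts either a raw 4-byte s32 or an 11-byte ASCII hex string, and the request now lives for as long as the file descriptor stays open. A hedged userspace sketch; /dev/cpu_dma_latency is the node usually created for the CPU/DMA latency class:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t latency_us = 20;	/* illustrative value */
	int fd = open("/dev/cpu_dma_latency", O_RDWR);

	if (fd < 0)
		return 1;
	/* Binary form: write exactly sizeof(s32) bytes. */
	if (write(fd, &latency_us, sizeof(latency_us)) != sizeof(latency_us))
		return 1;
	pause();	/* the request is dropped when the fd is closed */
	close(fd);
	return 0;
}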
395 389
396 390
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bc7704b3a443..00bb252f29a2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
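For context, a sketch of the kind of call site this helper is written for, after a new RLIMIT_CPU soft limit has been stored; this is a fragment under stated assumptions, not lifted verbatim from the setrlimit path:

	/* new_rlim.rlim_cur holds the new soft limit in seconds. */
	if (new_rlim.rlim_cur != RLIM_INFINITY)
		update_rlimit_cpu(new_rlim.rlim_cur);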
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User don't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -1066,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig)
1066 struct thread_group_cputimer *cputimer = &sig->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1003 unsigned long flags;
1068 1004
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1006 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1079} 1008}
1080 1009
1081static u32 onecputick; 1010static u32 onecputick;
@@ -1112,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1112 } 1041 }
1113} 1042}
1114 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1115/* 1061/*
1116 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1117 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1129,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk,
1129 unsigned long soft; 1075 unsigned long soft;
1130 1076
1131 /* 1077 /*
1132 * Don't sample the current process CPU clocks if there are no timers.
1133 */
1134 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1135 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1136 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1140 stop_process_timers(sig);
1141 return;
1142 }
1143
1144 /*
1145 * Collect the current process totals. 1078 * Collect the current process totals.
1146 */ 1079 */
1147 thread_group_cputimer(tsk, &cputime); 1080 thread_group_cputimer(tsk, &cputime);
@@ -1230,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk,
1230 } 1163 }
1231 } 1164 }
1232 1165
1233 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1234 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1235 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1236 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1237 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1238 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1239 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1240 sig->cputime_expires.virt_exp = virt_expires;
1241 if (sched_expires != 0 &&
1242 (sig->cputime_expires.sched_exp == 0 ||
1243 sig->cputime_expires.sched_exp > sched_expires))
1244 sig->cputime_expires.sched_exp = sched_expires;
1245} 1171}
1246 1172
1247/* 1173/*
@@ -1270,6 +1196,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 goto out; 1196 goto out;
1271 } 1197 }
1272 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1273 } else { 1200 } else {
1274 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1275 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->signal == NULL)) {
@@ -1290,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1291 goto out_unlock; 1218 goto out_unlock;
1292 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1293 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1294 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1295 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1298,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1298 /* 1226 /*
1299 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1300 */ 1228 */
1301 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1302 1232
1303out_unlock: 1233out_unlock:
1304 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1310,23 +1240,6 @@ out:
1310} 1240}
1311 1241
1312/** 1242/**
1313 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1314 *
1315 * @cputime: The struct to compare.
1316 *
1317 * Checks @cputime to see if all fields are zero. Returns true if all fields
1318 * are zero, false if any field is nonzero.
1319 */
1320static inline int task_cputime_zero(const struct task_cputime *cputime)
1321{
1322 if (cputime_eq(cputime->utime, cputime_zero) &&
1323 cputime_eq(cputime->stime, cputime_zero) &&
1324 cputime->sum_exec_runtime == 0)
1325 return 1;
1326 return 0;
1327}
1328
1329/**
1330 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1331 * 1244 *
1332 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1382,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 } 1295 }
1383 1296
1384 sig = tsk->signal; 1297 sig = tsk->signal;
1385 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1386 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1387 1300
1388 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1390,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1390 return 1; 1303 return 1;
1391 } 1304 }
1392 1305
1393 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1394} 1307}
1395 1308
1396/* 1309/*
@@ -1419,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1419 * put them on the firing list. 1332 * put them on the firing list.
1420 */ 1333 */
1421 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1422 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1423 1341
1424 /* 1342 /*
1425 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1456,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1456} 1374}
1457 1375
1458/* 1376/*
1459 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1460 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1461 * The *newval argument is relative and we update it to be absolute, *oldval
1462 * is absolute and we update it to be relative.
1463 */ 1379 */
1464void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1465 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1466{ 1382{
1467 union cpu_time_count now; 1383 union cpu_time_count now;
1468 struct list_head *head;
1469 1384
1470 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1471 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1472 1387
1473 if (oldval) { 1388 if (oldval) {
1389 /*
1390 * We are setting itimer. The *oldval is absolute and we update
1391 * it to be relative, *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1474 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1475 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1476 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1483,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1483 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1484 return; 1404 return;
1485 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1486
1487 /*
1488 * If the RLIMIT_CPU timer will expire before the
1489 * ITIMER_PROF timer, we have nothing else to do.
1490 */
1491 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1492 < cputime_to_secs(*newval))
1493 return;
1494 } 1406 }
1495 1407
1496 /* 1408 /*
 1497 * Check whether there are any process timers already set to fire 1409 * Update the expiration cache if we are the earliest timer, or if the
 1498 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit expires earlier than the cached prof_exp cpu timer.
1499 */ 1411 */
1500 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1501 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1502 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1503 struct cpu_timer_list, entry)->expires.cpu,
1504 *newval)) {
1505 switch (clock_idx) {
1506 case CPUCLOCK_PROF:
1507 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1508 break; 1416 break;
1509 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1510 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1511 break; 1420 break;
1512 }
1513 } 1421 }
1514} 1422}
1515 1423
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..524e058dcf06 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
 20 * @sector: physical sector offset of the page on @bdev.
 21 * @page: page we're reading or writing.
 22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
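A brief sketch of how the three exported helpers combine for asynchronous I/O; page_offs[], bufs[] and nr_pages are hypothetical caller-owned storage, with each buffer a page-aligned lowmem address as required by virt_to_page():

	struct bio *bio_chain = NULL;
	int i, error = 0;

	/* Queue all reads on one chain, then collect the completions. */
	for (i = 0; i < nr_pages && !error; i++)
		error = hib_bio_read_page(page_offs[i], bufs[i], &bio_chain);
	if (!error)
		error = hib_wait_on_bio_chain(&bio_chain);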
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1606 * 1606 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1610 *
1616 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1614 * any more.
1620 */ 1615 */
1621 1616
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1618{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1620 return 0;
@@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1625 if (!buffer)
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 if (!handle->offset) { 1628 if (!handle->cur) {
1634 int error; 1629 int error;
1635 1630
1636 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1634 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1642 } 1637 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1646 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1647 } else {
1648 struct page *page;
1649 1642
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1655 */ 1648 */
1656 void *kaddr; 1649 void *kaddr;
1657 1650
1658 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1654 handle->buffer = buffer;
1662 } else { 1655 } else {
1663 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1664 }
1665 } 1657 }
1666 handle->prev = handle->cur;
1667 }
1668 handle->buf_offset = handle->cur_offset;
1669 if (handle->cur_offset + count >= PAGE_SIZE) {
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 } 1658 }
1676 handle->offset += count; 1659 handle->cur++;
1677 return count; 1660 return PAGE_SIZE;
1678} 1661}
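With the count parameter gone, the read side becomes a strict page-at-a-time loop. A hedged sketch of the new calling convention; consume_page() is a placeholder for whatever the caller does with each page:

	struct snapshot_handle snapshot;
	int ret;

	memset(&snapshot, 0, sizeof(snapshot));
	for (;;) {
		ret = snapshot_read_next(&snapshot);
		if (ret <= 0)
			break;			/* 0: end of image, <0: error */
		consume_page(data_of(snapshot));	/* one PAGE_SIZE chunk */
	}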
1679 1662
1680/** 1663/**
@@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2135 * 2118 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2122 *
2145 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2126 * any more.
2149 */ 2127 */
2150 2128
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2130{
2153 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2154 int error = 0; 2132 int error = 0;
2155 2133
2156 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2136 return 0;
2159 2137
2160 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2161 if (!buffer) 2141 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2146 return -ENOMEM;
2167 2147
2168 handle->buffer = buffer; 2148 handle->buffer = buffer;
2169 } 2149 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2150 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2151 if (error)
2172 if (handle->prev == 0) { 2152 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2153
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2155 if (error)
2179 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2180 2162
2181 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2165 if (error)
2184 return error; 2166 return error;
2185 2167
2186 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2188 if (error) 2170 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2175 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2176 } else {
2215 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2216 } 2183 }
2217 handle->offset += count; 2184 handle->cur++;
2218 return count; 2185 return PAGE_SIZE;
2219} 2186}
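The restore side mirrors this: each successful call hands back a buffer to be filled with the next page of image data before the following call, and snapshot_write_finalize() closes the sequence. A hedged sketch, with produce_page() as a placeholder data source:

	struct snapshot_handle snapshot;
	int ret;

	memset(&snapshot, 0, sizeof(snapshot));
	for (;;) {
		ret = snapshot_write_next(&snapshot);
		if (ret <= 0)
			break;
		/* Fill the returned buffer with the next PAGE_SIZE bytes. */
		produce_page(data_of(snapshot));
	}
	snapshot_write_finalize(&snapshot);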
2220 2187
2221/** 2188/**
@@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2197{
2231 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2202 free_highmem_data();
2236 } 2203 }
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
 35 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
56 * a file-alike way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
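A quick worked number for the structures above, assuming 4 KiB pages and an 8-byte sector_t (typical, not guaranteed):

/*
 *	MAP_PAGE_ENTRIES = PAGE_SIZE / sizeof(sector_t) - 1
 *			 = 4096 / 8 - 1 = 511
 *
 * so each swap_map_page indexes 511 image pages, and its final
 * sector_t (.next_swap) links to the next map page on swap.
 */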
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
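After this restructuring, swsusp_write() is a straight line: get the writer, check free swap against snapshot_get_image_size(), write the header page, stream the image, and let swap_writer_finish() handle cleanup on every exit path. A userspace sketch of that shape, with stub names standing in for the real helpers (none of these stubs are kernel functions):

    #include <stdio.h>

    /* Stub names mirroring the call sequence in swsusp_write(); bodies are
     * placeholders, not the real implementations. */
    static int get_writer(void)          { return 0; }
    static int enough_space(long pages)  { return pages < 1000; }
    static int write_header(void)        { return 0; }
    static int save_pages(long pages)    { (void)pages; return 0; }
    static int writer_finish(int error)  { /* flush, mark, release, close */ return error; }

    static int write_image(long pages)
    {
        int error = get_writer();

        if (error)
            return error;
        if (!enough_space(pages)) {
            error = -1;
            goto out_finish;
        }
        error = write_header();
        if (!error)
            error = save_pages(pages - 1);
    out_finish:
        /* the finish step runs exactly once, on success and on every
         * failure past this point, so cleanup lives in one place */
        return writer_finish(error);
    }

    int main(void)
    {
        return write_image(128) ? 1 : 0;
    }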
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
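The reworked snapshot_read()/snapshot_write() above allow transfers that are not page-aligned: the low bits of *offp give the position inside the page currently held by the snapshot handle, a new page is pulled from (or pushed to) the snapshot only when that offset is zero, and simple_read_from_buffer()/simple_write_to_buffer() perform the bounded copy. A small userspace sketch of the offset arithmetic; read_chunk and its buffer are invented for illustration:

    #include <stdio.h>
    #include <string.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    /* Copy at most 'count' bytes out of one page, starting at *offp. */
    static size_t read_chunk(const char *page, size_t count,
                             unsigned long *offp, char *dst)
    {
        unsigned long pg_offp = *offp & ~PAGE_MASK;   /* offset inside the page */
        size_t avail = PAGE_SIZE - pg_offp;           /* bytes left in this page */
        size_t n = count < avail ? count : avail;

        memcpy(dst, page + pg_offp, n);
        *offp += n;                    /* the next call may start a new page */
        return n;
    }

    int main(void)
    {
        char page[PAGE_SIZE] = "snapshot data";
        char buf[64];
        unsigned long off = 0;

        size_t n = read_chunk(page, 8, &off, buf);    /* partial, unaligned read */
        printf("read %zu bytes, new offset %lu\n", n, off);
        return 0;
    }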
diff --git a/kernel/printk.c b/kernel/printk.c
index 75077ad0b537..444b770c9595 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
38#include <linux/syslog.h> 39#include <linux/syslog.h>
@@ -413,6 +414,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 414 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
414} 415}
415 416
417#ifdef CONFIG_KGDB_KDB
418/* kdb dmesg command needs access to the syslog buffer. do_syslog()
419 * uses locks so it cannot be used during debugging. Just tell kdb
420 * where the start and end of the physical and logical logs are. This
421 * is equivalent to do_syslog(3).
422 */
423void kdb_syslog_data(char *syslog_data[4])
424{
425 syslog_data[0] = log_buf;
426 syslog_data[1] = log_buf + log_buf_len;
427 syslog_data[2] = log_buf + log_end -
428 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
429 syslog_data[3] = log_buf + log_end;
430}
431#endif /* CONFIG_KGDB_KDB */
432
416/* 433/*
417 * Call the console drivers on a range of log_buf 434 * Call the console drivers on a range of log_buf
418 */ 435 */
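The four pointers exported above describe the printk buffer without taking locks: [0] and [1] bound the physical buffer, while [2] and [3] bound the logical log, whose length is logged_chars clamped to log_buf_len. A hedged sketch of how a consumer might turn those pointers into lengths; wrap-around handling inside the ring is deliberately omitted and all values below are fake:

    #include <stdio.h>

    /* syslog_data[] as filled in by kdb_syslog_data(); sample values only. */
    static void show_log_extent(char *syslog_data[4])
    {
        long phys_len = syslog_data[1] - syslog_data[0]; /* whole buffer */
        long log_len  = syslog_data[3] - syslog_data[2]; /* logged characters, clamped */

        printf("buffer %ld bytes, %ld bytes of log text\n", phys_len, log_len);
    }

    int main(void)
    {
        static char buf[1 << 14];
        char *data[4] = { buf, buf + sizeof(buf), buf + 100, buf + 612 };

        show_log_extent(data);
        return 0;
    }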
@@ -586,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...)
586 va_list args; 603 va_list args;
587 int r; 604 int r;
588 605
606#ifdef CONFIG_KGDB_KDB
607 if (unlikely(kdb_trap_printk)) {
608 va_start(args, fmt);
609 r = vkdb_printf(fmt, args);
610 va_end(args);
611 return r;
612 }
613#endif
589 va_start(args, fmt); 614 va_start(args, fmt);
590 r = vprintk(fmt, args); 615 r = vprintk(fmt, args);
591 va_end(args); 616 va_end(args);
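The printk() hook above is a straight diversion: while kdb_trap_printk is set, the varargs go to vkdb_printf() instead of vprintk(), so output generated while the debugger owns the console stays with kdb. The same flag-controlled redirect of a printf-style sink, sketched in plain C with invented names:

    #include <stdarg.h>
    #include <stdio.h>

    static int debugger_active;                 /* stands in for kdb_trap_printk */

    static int debugger_vprintf(const char *fmt, va_list ap)
    {
        fputs("[dbg] ", stderr);                /* alternate sink */
        return vfprintf(stderr, fmt, ap);
    }

    static int log_printf(const char *fmt, ...)
    {
        va_list ap;
        int r;

        va_start(ap, fmt);
        if (debugger_active)
            r = debugger_vprintf(fmt, ap);      /* divert while debugging */
        else
            r = vprintf(fmt, ap);               /* normal path */
        va_end(ap);
        return r;
    }

    int main(void)
    {
        log_printf("normal message %d\n", 1);
        debugger_active = 1;
        log_printf("debugger message %d\n", 2);
        return 0;
    }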
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..dfadc5b729f1 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..6af9cdd558b7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -666,10 +664,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 664 struct task_struct *child;
667 long ret; 665 long ret;
668 666
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 667 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 668 ret = ptrace_traceme();
675 if (!ret) 669 if (!ret)
@@ -703,7 +697,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 697 out_put_task_struct:
704 put_task_struct(child); 698 put_task_struct(child);
705 out: 699 out:
706 unlock_kernel();
707 return ret; 700 return ret;
708} 701}
709 702
@@ -813,10 +806,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 806 struct task_struct *child;
814 long ret; 807 long ret;
815 808
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 809 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 810 ret = ptrace_traceme();
822 goto out; 811 goto out;
@@ -846,7 +835,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 835 out_put_task_struct:
847 put_task_struct(child); 836 put_task_struct(child);
848 out: 837 out:
849 unlock_kernel();
850 return ret; 838 return ret;
851} 839}
852#endif /* CONFIG_COMPAT */ 840#endif /* CONFIG_COMPAT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 03a7ea1579f6..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h> 47#include <linux/hardirq.h>
49 48
50#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
65#endif 64#endif
66 65
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 67
72int debug_lockdep_rcu_enabled(void) 68int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98 94
99/* 95/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed. 97 * grace period has elapsed.
117 */ 98 */
@@ -122,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
122 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
123 complete(&rcu->completion); 104 complete(&rcu->completion);
124} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
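rcu_my_thread_group_empty() appears to exist only so that code which cannot pull in the scheduler internals can still test thread_group_empty(current) inside lockdep conditions. A purely hypothetical usage sketch; struct foo, task_foo and the whole condition are assumptions, not taken from this patch, and this is a kernel-style fragment rather than standalone code:

    /* Hypothetical per-task pointer that may be read either under
     * rcu_read_lock() or by a single-threaded task that owns it. */
    struct foo __rcu *task_foo;

    static struct foo *get_task_foo(void)
    {
        return rcu_dereference_check(task_foo,
                                     rcu_read_lock_held() ||
                                     rcu_my_thread_group_empty());
    }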
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
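The init_rcu_head_on_stack()/destroy_rcu_head_on_stack() pairs added above tell the debug-objects machinery that an rcu_head lives on the stack, so it is announced before call_rcu() queues it and retired before the frame disappears. The pattern they bracket is simply "queue a callback that completes a completion, then sleep on it"; a userspace analogue with a worker thread standing in for the grace-period machinery, all names invented:

    #include <pthread.h>
    #include <stdio.h>

    struct waiter {
        pthread_mutex_t lock;
        pthread_cond_t  cond;
        int             done;
    };

    static void *grace_period(void *arg)
    {
        struct waiter *w = arg;

        /* the real code waits for all pre-existing readers here */
        pthread_mutex_lock(&w->lock);
        w->done = 1;                          /* plays the role of wakeme_after_rcu() */
        pthread_cond_signal(&w->cond);
        pthread_mutex_unlock(&w->lock);
        return NULL;
    }

    int main(void)
    {
        struct waiter w = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0 };
        pthread_t t;

        pthread_create(&t, NULL, grace_period, &w);  /* like call_rcu(&rcu.head, ...) */

        pthread_mutex_lock(&w.lock);
        while (!w.done)                              /* like wait_for_completion() */
            pthread_cond_wait(&w.cond, &w.lock);
        pthread_mutex_unlock(&w.lock);

        pthread_join(t, NULL);
        puts("grace period elapsed");
        return 0;
    }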
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 672 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 673 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 674 .stats = NULL,
673 .irq_capable = 1, 675 .irq_capable = 1,
674 .name = "sched_expedited" 676 .name = "sched_expedited"
675}; 677};
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
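Renaming the macro argument to structname and adding .name = #structname gives every rcu_state a printable identity: the preprocessor's stringize operator turns the variable name into the string later used by the stall-warning messages. The mechanism in isolation, with throwaway names:

    #include <stdio.h>

    struct thing {
        int id;
        const char *name;
    };

    /* '#' stringizes the macro argument, so the stored name can never
     * drift out of sync with the variable name. */
    #define THING_INITIALIZER(structname) { .id = 0, .name = #structname }

    struct thing alpha_thing = THING_INITIALIZER(alpha_thing);
    struct thing beta_thing  = THING_INITIALIZER(beta_thing);

    int main(void)
    {
        printf("%s %s\n", alpha_thing.name, beta_thing.name);
        return 0;
    }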
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
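rcu_sched_qs() no longer calls into preemptible RCU itself; the new rcu_note_context_switch() combines both notifications so the scheduler has a single hook to call on every context switch. A standalone sketch of that relationship; the stubs and the caller are illustrative only, and the matching sched.c change is not part of this excerpt:

    #include <stdio.h>

    /* Stubs standing in for the real RCU hooks, just to show call ordering. */
    static void rcu_sched_qs(int cpu)                    { printf("sched qs on %d\n", cpu); }
    static void rcu_preempt_note_context_switch(int cpu) { (void)cpu; /* preemptible-RCU bookkeeping */ }

    /* Combined hook, shaped like the one added in the hunk above. */
    static void rcu_note_context_switch(int cpu)
    {
        rcu_sched_qs(cpu);
        rcu_preempt_note_context_switch(cpu);
    }

    int main(void)
    {
        rcu_note_context_switch(0);   /* a context-switch path calls one hook */
        return 0;
    }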
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
 1543 * If force_quiescent_state() is coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
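The new branch above separates "this CPU still owes a quiescent state" from "a quiescent state was passed and only needs reporting", and prods non-preemptible flavours with set_need_resched() when force_quiescent_state() is nearly due. ULONG_CMP_LT() is the usual wrap-safe comparison for free-running counters such as jiffies; the macro below illustrates the idiom and is not a copy of the kernel's definition:

    #include <limits.h>
    #include <stdio.h>

    /* Wrap-safe "a happened before b" for free-running unsigned counters. */
    #define TIME_BEFORE(a, b) ((long)((b) - (a)) > 0)

    int main(void)
    {
        unsigned long a = ULONG_MAX - 1;   /* just before wrap */
        unsigned long b = 2;               /* just after wrap */

        printf("%d\n", TIME_BEFORE(a, b)); /* prints 1: b is later despite being smaller */
        return 0;
    }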
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
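Instead of trusting nohz_cpu_mask, the loop above samples every other CPU's dynticks counters directly; the counters are bumped on each idle transition, so an odd value means that CPU is currently outside dyntick-idle and this CPU is therefore not the last one running. The parity test on its own, with arbitrary sample values:

    #include <stdio.h>

    /* A counter incremented on every idle<->non-idle transition:
     * odd while the CPU is busy. */
    static int cpu_busy(int dynticks, int dynticks_nmi)
    {
        return ((dynticks & 0x1) != 0) || ((dynticks_nmi & 0x1) != 0);
    }

    int main(void)
    {
        printf("%d %d\n", cpu_busy(42, 0), cpu_busy(43, 0)); /* 0 then 1 */
        return 0;
    }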
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/sched.c b/kernel/sched.c
index 6af210a7de70..054a6012de99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p)
323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 323/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
324static inline void set_task_rq(struct task_struct *p, unsigned int cpu) 324static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
325{ 325{
326 /*
327 * Strictly speaking this rcu_read_lock() is not needed since the
328 * task_group is tied to the cgroup, which in turn can never go away
329 * as long as there are tasks attached to it.
330 *
331 * However since task_group() uses task_subsys_state() which is an
332 * rcu_dereference() user, this quiets CONFIG_PROVE_RCU.
333 */
334 rcu_read_lock();
326#ifdef CONFIG_FAIR_GROUP_SCHED 335#ifdef CONFIG_FAIR_GROUP_SCHED
327 p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; 336 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
328 p->se.parent = task_group(p)->se[cpu]; 337 p->se.parent = task_group(p)->se[cpu];
@@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
332 p->rt.rt_rq = task_group(p)->rt_rq[cpu]; 341 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
333 p->rt.parent = task_group(p)->rt_se[cpu]; 342 p->rt.parent = task_group(p)->rt_se[cpu];
334#endif 343#endif
344 rcu_read_unlock();
335} 345}
336 346
337#else 347#else
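The rcu_read_lock()/rcu_read_unlock() pair added above exists only to satisfy CONFIG_PROVE_RCU: task_group() bottoms out in an rcu_dereference(), and lockdep complains about any such dereference it cannot prove is protected, even when the object's lifetime is pinned by other means. The shape of that kind of fix, as a kernel-style fragment with invented names (my_cfg, cfg_slot, read_cfg_value):

    /* 'my_cfg' is RCU-protected, but this caller also holds a reference
     * that pins it.  The read-side critical section is only there so
     * PROVE_RCU sees a legitimate protection context. */
    struct my_cfg {
        int value;
    };

    static struct my_cfg __rcu *cfg_slot;

    static int read_cfg_value(void)
    {
        int val;

        rcu_read_lock();
        val = rcu_dereference(cfg_slot)->value;
        rcu_read_unlock();

        return val;
    }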
@@ -493,8 +503,11 @@ struct rq {
493 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
494 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
495#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
496 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
497#endif 508#endif
509 unsigned int skip_clock_update;
510
498 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
499 struct load_weight load; 512 struct load_weight load;
500 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -536,15 +549,13 @@ struct rq {
536 int post_schedule; 549 int post_schedule;
537 int active_balance; 550 int active_balance;
538 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
539 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
540 int cpu; 554 int cpu;
541 int online; 555 int online;
542 556
543 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
544 558
545 struct task_struct *migration_thread;
546 struct list_head migration_queue;
547
548 u64 rt_avg; 559 u64 rt_avg;
549 u64 age_stamp; 560 u64 age_stamp;
550 u64 idle_stamp; 561 u64 idle_stamp;
@@ -592,6 +603,13 @@ static inline
592void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
593{ 604{
594 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
595} 613}
596 614
597static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -626,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
626 644
627inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
628{ 646{
629 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
630} 649}
631 650
632/* 651/*
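rq->skip_clock_update is a small optimisation: when check_preempt_curr() sees the running task already marked for rescheduling, a schedule is imminent and the clock was refreshed by the enqueue path moments ago, so update_rq_clock() may skip the very next read. Where the flag is cleared again is not visible in this excerpt. A toy model of the pattern; rq_model and read_hw_clock are invented:

    #include <stdio.h>

    /* Toy model: a clock that can suppress exactly the next refresh when
     * the caller knows it would read essentially the same value again. */
    struct rq_model {
        unsigned long long clock;
        int skip_clock_update;
    };

    static unsigned long long read_hw_clock(void)
    {
        static unsigned long long t;
        return t += 1000;                    /* pretend time source */
    }

    static void update_clock(struct rq_model *rq)
    {
        if (!rq->skip_clock_update)
            rq->clock = read_hw_clock();
    }

    int main(void)
    {
        struct rq_model rq = { 0, 0 };

        update_clock(&rq);
        rq.skip_clock_update = 1;            /* "a resched is already pending" */
        update_clock(&rq);                   /* back-to-back update elided */
        rq.skip_clock_update = 0;
        printf("%llu\n", rq.clock);
        return 0;
    }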
@@ -904,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
904#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
905 924
906/* 925/*
907 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
908 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
909 *
910 * We need to make an exception for PF_STARTING tasks because the fork
911 * path might require task_rq_lock() to work, eg. it can call
912 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
913 */ 928 */
914static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
915{ 930{
916 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
917} 932}
918 933
919/* 934/*
@@ -926,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
926 struct rq *rq; 941 struct rq *rq;
927 942
928 for (;;) { 943 for (;;) {
929 while (task_is_waking(p))
930 cpu_relax();
931 rq = task_rq(p); 944 rq = task_rq(p);
932 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
933 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
934 return rq; 947 return rq;
935 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
936 } 949 }
@@ -947,12 +960,10 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
947 struct rq *rq; 960 struct rq *rq;
948 961
949 for (;;) { 962 for (;;) {
950 while (task_is_waking(p))
951 cpu_relax();
952 local_irq_save(*flags); 963 local_irq_save(*flags);
953 rq = task_rq(p); 964 rq = task_rq(p);
954 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
955 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
956 return rq; 967 return rq;
957 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
958 } 969 }
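With the PF_STARTING exception gone, __task_rq_lock() and task_rq_lock() reduce to the classic lock-and-revalidate loop: read which runqueue the task appears to be on, lock it, and retry if the task moved in the meantime. A self-contained sketch of the same pattern, using hypothetical "shards" in place of runqueues; it is exercised single-threaded here, and a concurrent version would also want an atomic read of the field:

/* Lock-and-revalidate loop, modelled in userspace; names are illustrative. */
#include <pthread.h>
#include <stdio.h>

#define NR_SHARDS 4

static pthread_mutex_t shard_lock[NR_SHARDS];

struct item {
    int shard;                  /* may be changed by whoever holds its shard lock */
};

/* Lock the shard an item currently belongs to, retrying if it moved. */
static int lock_item_shard(struct item *it)
{
    for (;;) {
        int shard = it->shard;                  /* snapshot the current home */

        pthread_mutex_lock(&shard_lock[shard]);
        if (shard == it->shard)                 /* still valid under the lock */
            return shard;
        pthread_mutex_unlock(&shard_lock[shard]);
        /* the item migrated while we were taking the lock; try again */
    }
}

int main(void)
{
    struct item it = { .shard = 2 };
    int i, s;

    for (i = 0; i < NR_SHARDS; i++)
        pthread_mutex_init(&shard_lock[i], NULL);

    s = lock_item_shard(&it);
    printf("locked shard %d\n", s);
    pthread_mutex_unlock(&shard_lock[s]);
    return 0;
}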
@@ -1229,6 +1240,17 @@ void wake_up_idle_cpu(int cpu)
1229 if (!tsk_is_polling(rq->idle)) 1240 if (!tsk_is_polling(rq->idle))
1230 smp_send_reschedule(cpu); 1241 smp_send_reschedule(cpu);
1231} 1242}
1243
1244int nohz_ratelimit(int cpu)
1245{
1246 struct rq *rq = cpu_rq(cpu);
1247 u64 diff = rq->clock - rq->nohz_stamp;
1248
1249 rq->nohz_stamp = rq->clock;
1250
1251 return diff < (NSEC_PER_SEC / HZ) >> 1;
1252}
1253
1232#endif /* CONFIG_NO_HZ */ 1254#endif /* CONFIG_NO_HZ */
1233 1255
1234static u64 sched_avg_period(void) 1256static u64 sched_avg_period(void)
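The nohz_ratelimit() helper added above answers "did this CPU try this less than half a tick ago?" by stamping rq->clock on every call and comparing the delta against (NSEC_PER_SEC / HZ) >> 1. A standalone sketch of the same check, with clock_gettime() standing in for the runqueue clock and an assumed 250 Hz tick; both are illustrative, not taken from the patch:

/* Userspace sketch of the half-a-tick rate limit; not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define TICK_HZ       250ULL
#define NSEC_PER_SEC  1000000000ULL

static uint64_t now_ns(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
}

/* Returns true when the caller should back off (it came back too soon). */
static bool ratelimited(uint64_t *stamp)
{
    uint64_t now = now_ns();
    uint64_t diff = now - *stamp;

    *stamp = now;
    return diff < (NSEC_PER_SEC / TICK_HZ) / 2;
}

int main(void)
{
    uint64_t stamp = 0;

    printf("first call limited?  %d\n", ratelimited(&stamp));  /* 0: long ago */
    printf("second call limited? %d\n", ratelimited(&stamp));  /* 1: immediate */
    return 0;
}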
@@ -1771,8 +1793,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1771 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1793 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1772 } 1794 }
1773 } 1795 }
1774 update_rq_clock(rq1);
1775 update_rq_clock(rq2);
1776} 1796}
1777 1797
1778/* 1798/*
@@ -1803,7 +1823,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1803} 1823}
1804#endif 1824#endif
1805 1825
1806static void calc_load_account_active(struct rq *this_rq); 1826static void calc_load_account_idle(struct rq *this_rq);
1807static void update_sysctl(void); 1827static void update_sysctl(void);
1808static int get_update_sysctl_factor(void); 1828static int get_update_sysctl_factor(void);
1809 1829
@@ -1860,62 +1880,43 @@ static void set_load_weight(struct task_struct *p)
1860 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1880 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1861} 1881}
1862 1882
1863static void update_avg(u64 *avg, u64 sample) 1883static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1864{
1865 s64 diff = sample - *avg;
1866 *avg += diff >> 3;
1867}
1868
1869static void
1870enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1871{ 1884{
1872 if (wakeup) 1885 update_rq_clock(rq);
1873 p->se.start_runtime = p->se.sum_exec_runtime;
1874
1875 sched_info_queued(p); 1886 sched_info_queued(p);
1876 p->sched_class->enqueue_task(rq, p, wakeup, head); 1887 p->sched_class->enqueue_task(rq, p, flags);
1877 p->se.on_rq = 1; 1888 p->se.on_rq = 1;
1878} 1889}
1879 1890
1880static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1891static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1881{ 1892{
1882 if (sleep) { 1893 update_rq_clock(rq);
1883 if (p->se.last_wakeup) {
1884 update_avg(&p->se.avg_overlap,
1885 p->se.sum_exec_runtime - p->se.last_wakeup);
1886 p->se.last_wakeup = 0;
1887 } else {
1888 update_avg(&p->se.avg_wakeup,
1889 sysctl_sched_wakeup_granularity);
1890 }
1891 }
1892
1893 sched_info_dequeued(p); 1894 sched_info_dequeued(p);
1894 p->sched_class->dequeue_task(rq, p, sleep); 1895 p->sched_class->dequeue_task(rq, p, flags);
1895 p->se.on_rq = 0; 1896 p->se.on_rq = 0;
1896} 1897}
1897 1898
1898/* 1899/*
1899 * activate_task - move a task to the runqueue. 1900 * activate_task - move a task to the runqueue.
1900 */ 1901 */
1901static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1902static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1902{ 1903{
1903 if (task_contributes_to_load(p)) 1904 if (task_contributes_to_load(p))
1904 rq->nr_uninterruptible--; 1905 rq->nr_uninterruptible--;
1905 1906
1906 enqueue_task(rq, p, wakeup, false); 1907 enqueue_task(rq, p, flags);
1907 inc_nr_running(rq); 1908 inc_nr_running(rq);
1908} 1909}
1909 1910
1910/* 1911/*
1911 * deactivate_task - remove a task from the runqueue. 1912 * deactivate_task - remove a task from the runqueue.
1912 */ 1913 */
1913static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1914static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1914{ 1915{
1915 if (task_contributes_to_load(p)) 1916 if (task_contributes_to_load(p))
1916 rq->nr_uninterruptible++; 1917 rq->nr_uninterruptible++;
1917 1918
1918 dequeue_task(rq, p, sleep); 1919 dequeue_task(rq, p, flags);
1919 dec_nr_running(rq); 1920 dec_nr_running(rq);
1920} 1921}
1921 1922
@@ -2044,21 +2045,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2044 __set_task_cpu(p, new_cpu); 2045 __set_task_cpu(p, new_cpu);
2045} 2046}
2046 2047
2047struct migration_req { 2048struct migration_arg {
2048 struct list_head list;
2049
2050 struct task_struct *task; 2049 struct task_struct *task;
2051 int dest_cpu; 2050 int dest_cpu;
2052
2053 struct completion done;
2054}; 2051};
2055 2052
2053static int migration_cpu_stop(void *data);
2054
2056/* 2055/*
2057 * The task's runqueue lock must be held. 2056 * The task's runqueue lock must be held.
2058 * Returns true if you have to wait for migration thread. 2057 * Returns true if you have to wait for migration thread.
2059 */ 2058 */
2060static int 2059static bool migrate_task(struct task_struct *p, int dest_cpu)
2061migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2062{ 2060{
2063 struct rq *rq = task_rq(p); 2061 struct rq *rq = task_rq(p);
2064 2062
@@ -2066,58 +2064,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2066 * If the task is not on a runqueue (and not running), then 2064 * If the task is not on a runqueue (and not running), then
2067 * the next wake-up will properly place the task. 2065 * the next wake-up will properly place the task.
2068 */ 2066 */
2069 if (!p->se.on_rq && !task_running(rq, p)) 2067 return p->se.on_rq || task_running(rq, p);
2070 return 0;
2071
2072 init_completion(&req->done);
2073 req->task = p;
2074 req->dest_cpu = dest_cpu;
2075 list_add(&req->list, &rq->migration_queue);
2076
2077 return 1;
2078}
2079
2080/*
2081 * wait_task_context_switch - wait for a thread to complete at least one
2082 * context switch.
2083 *
2084 * @p must not be current.
2085 */
2086void wait_task_context_switch(struct task_struct *p)
2087{
2088 unsigned long nvcsw, nivcsw, flags;
2089 int running;
2090 struct rq *rq;
2091
2092 nvcsw = p->nvcsw;
2093 nivcsw = p->nivcsw;
2094 for (;;) {
2095 /*
2096 * The runqueue is assigned before the actual context
2097 * switch. We need to take the runqueue lock.
2098 *
2099 * We could check initially without the lock but it is
2100 * very likely that we need to take the lock in every
2101 * iteration.
2102 */
2103 rq = task_rq_lock(p, &flags);
2104 running = task_running(rq, p);
2105 task_rq_unlock(rq, &flags);
2106
2107 if (likely(!running))
2108 break;
2109 /*
2110 * The switch count is incremented before the actual
2111 * context switch. We thus wait for two switches to be
2112 * sure at least one completed.
2113 */
2114 if ((p->nvcsw - nvcsw) > 1)
2115 break;
2116 if ((p->nivcsw - nivcsw) > 1)
2117 break;
2118
2119 cpu_relax();
2120 }
2121} 2068}
2122 2069
2123/* 2070/*
@@ -2175,7 +2122,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2175 * just go back and repeat. 2122 * just go back and repeat.
2176 */ 2123 */
2177 rq = task_rq_lock(p, &flags); 2124 rq = task_rq_lock(p, &flags);
2178 trace_sched_wait_task(rq, p); 2125 trace_sched_wait_task(p);
2179 running = task_running(rq, p); 2126 running = task_running(rq, p);
2180 on_rq = p->se.on_rq; 2127 on_rq = p->se.on_rq;
2181 ncsw = 0; 2128 ncsw = 0;
@@ -2273,6 +2220,9 @@ void task_oncpu_function_call(struct task_struct *p,
2273} 2220}
2274 2221
2275#ifdef CONFIG_SMP 2222#ifdef CONFIG_SMP
2223/*
2224 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2225 */
2276static int select_fallback_rq(int cpu, struct task_struct *p) 2226static int select_fallback_rq(int cpu, struct task_struct *p)
2277{ 2227{
2278 int dest_cpu; 2228 int dest_cpu;
@@ -2289,12 +2239,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2289 return dest_cpu; 2239 return dest_cpu;
2290 2240
2291 /* No more Mr. Nice Guy. */ 2241 /* No more Mr. Nice Guy. */
2292 if (dest_cpu >= nr_cpu_ids) { 2242 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2293 rcu_read_lock(); 2243 dest_cpu = cpuset_cpus_allowed_fallback(p);
2294 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2295 rcu_read_unlock();
2296 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2297
2298 /* 2244 /*
2299 * Don't tell them about moving exiting tasks or 2245 * Don't tell them about moving exiting tasks or
2300 * kernel threads (both mm NULL), since they never 2246 * kernel threads (both mm NULL), since they never
@@ -2311,17 +2257,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2311} 2257}
2312 2258
2313/* 2259/*
2314 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2260 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2315 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2316 * by:
2317 *
2318 * exec: is unstable, retry loop
2319 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2320 */ 2261 */
2321static inline 2262static inline
2322int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2263int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2323{ 2264{
2324 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2265 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2325 2266
2326 /* 2267 /*
2327 * In order not to call set_task_cpu() on a blocking task we need 2268 * In order not to call set_task_cpu() on a blocking task we need
@@ -2339,6 +2280,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2339 2280
2340 return cpu; 2281 return cpu;
2341} 2282}
2283
2284static void update_avg(u64 *avg, u64 sample)
2285{
2286 s64 diff = sample - *avg;
2287 *avg += diff >> 3;
2288}
2342#endif 2289#endif
2343 2290
2344/*** 2291/***
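The re-added update_avg() keeps a running estimate that moves 1/8 of the way toward each new sample on every update. The same helper, lifted into a standalone program so the decay toward a new level is easy to watch:

/* Standalone copy of the 1/8-weight moving average used above. */
#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
    int64_t diff = (int64_t)(sample - *avg);

    *avg += diff >> 3;          /* avg moves an eighth of the way to sample */
}

int main(void)
{
    uint64_t avg = 0;
    uint64_t samples[] = { 800, 800, 800, 100, 100, 100 };

    for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        update_avg(&avg, samples[i]);
        printf("sample=%llu avg=%llu\n",
               (unsigned long long)samples[i], (unsigned long long)avg);
    }
    return 0;
}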
@@ -2360,16 +2307,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2360{ 2307{
2361 int cpu, orig_cpu, this_cpu, success = 0; 2308 int cpu, orig_cpu, this_cpu, success = 0;
2362 unsigned long flags; 2309 unsigned long flags;
2310 unsigned long en_flags = ENQUEUE_WAKEUP;
2363 struct rq *rq; 2311 struct rq *rq;
2364 2312
2365 if (!sched_feat(SYNC_WAKEUPS))
2366 wake_flags &= ~WF_SYNC;
2367
2368 this_cpu = get_cpu(); 2313 this_cpu = get_cpu();
2369 2314
2370 smp_wmb(); 2315 smp_wmb();
2371 rq = task_rq_lock(p, &flags); 2316 rq = task_rq_lock(p, &flags);
2372 update_rq_clock(rq);
2373 if (!(p->state & state)) 2317 if (!(p->state & state))
2374 goto out; 2318 goto out;
2375 2319
@@ -2389,28 +2333,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2389 * 2333 *
2390 * First fix up the nr_uninterruptible count: 2334 * First fix up the nr_uninterruptible count:
2391 */ 2335 */
2392 if (task_contributes_to_load(p)) 2336 if (task_contributes_to_load(p)) {
2393 rq->nr_uninterruptible--; 2337 if (likely(cpu_online(orig_cpu)))
2338 rq->nr_uninterruptible--;
2339 else
2340 this_rq()->nr_uninterruptible--;
2341 }
2394 p->state = TASK_WAKING; 2342 p->state = TASK_WAKING;
2395 2343
2396 if (p->sched_class->task_waking) 2344 if (p->sched_class->task_waking) {
2397 p->sched_class->task_waking(rq, p); 2345 p->sched_class->task_waking(rq, p);
2346 en_flags |= ENQUEUE_WAKING;
2347 }
2398 2348
2399 __task_rq_unlock(rq); 2349 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2400 2350 if (cpu != orig_cpu)
2401 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2402 if (cpu != orig_cpu) {
2403 /*
2404 * Since we migrate the task without holding any rq->lock,
2405 * we need to be careful with task_rq_lock(), since that
2406 * might end up locking an invalid rq.
2407 */
2408 set_task_cpu(p, cpu); 2351 set_task_cpu(p, cpu);
2409 } 2352 __task_rq_unlock(rq);
2410 2353
2411 rq = cpu_rq(cpu); 2354 rq = cpu_rq(cpu);
2412 raw_spin_lock(&rq->lock); 2355 raw_spin_lock(&rq->lock);
2413 update_rq_clock(rq);
2414 2356
2415 /* 2357 /*
2416 * We migrated the task without holding either rq->lock, however 2358 * We migrated the task without holding either rq->lock, however
@@ -2438,36 +2380,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2438 2380
2439out_activate: 2381out_activate:
2440#endif /* CONFIG_SMP */ 2382#endif /* CONFIG_SMP */
2441 schedstat_inc(p, se.nr_wakeups); 2383 schedstat_inc(p, se.statistics.nr_wakeups);
2442 if (wake_flags & WF_SYNC) 2384 if (wake_flags & WF_SYNC)
2443 schedstat_inc(p, se.nr_wakeups_sync); 2385 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2444 if (orig_cpu != cpu) 2386 if (orig_cpu != cpu)
2445 schedstat_inc(p, se.nr_wakeups_migrate); 2387 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2446 if (cpu == this_cpu) 2388 if (cpu == this_cpu)
2447 schedstat_inc(p, se.nr_wakeups_local); 2389 schedstat_inc(p, se.statistics.nr_wakeups_local);
2448 else 2390 else
2449 schedstat_inc(p, se.nr_wakeups_remote); 2391 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2450 activate_task(rq, p, 1); 2392 activate_task(rq, p, en_flags);
2451 success = 1; 2393 success = 1;
2452 2394
2453 /*
2454 * Only attribute actual wakeups done by this task.
2455 */
2456 if (!in_interrupt()) {
2457 struct sched_entity *se = &current->se;
2458 u64 sample = se->sum_exec_runtime;
2459
2460 if (se->last_wakeup)
2461 sample -= se->last_wakeup;
2462 else
2463 sample -= se->start_runtime;
2464 update_avg(&se->avg_wakeup, sample);
2465
2466 se->last_wakeup = se->sum_exec_runtime;
2467 }
2468
2469out_running: 2395out_running:
2470 trace_sched_wakeup(rq, p, success); 2396 trace_sched_wakeup(p, success);
2471 check_preempt_curr(rq, p, wake_flags); 2397 check_preempt_curr(rq, p, wake_flags);
2472 2398
2473 p->state = TASK_RUNNING; 2399 p->state = TASK_RUNNING;
@@ -2527,42 +2453,9 @@ static void __sched_fork(struct task_struct *p)
2527 p->se.sum_exec_runtime = 0; 2453 p->se.sum_exec_runtime = 0;
2528 p->se.prev_sum_exec_runtime = 0; 2454 p->se.prev_sum_exec_runtime = 0;
2529 p->se.nr_migrations = 0; 2455 p->se.nr_migrations = 0;
2530 p->se.last_wakeup = 0;
2531 p->se.avg_overlap = 0;
2532 p->se.start_runtime = 0;
2533 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2534 2456
2535#ifdef CONFIG_SCHEDSTATS 2457#ifdef CONFIG_SCHEDSTATS
2536 p->se.wait_start = 0; 2458 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2537 p->se.wait_max = 0;
2538 p->se.wait_count = 0;
2539 p->se.wait_sum = 0;
2540
2541 p->se.sleep_start = 0;
2542 p->se.sleep_max = 0;
2543 p->se.sum_sleep_runtime = 0;
2544
2545 p->se.block_start = 0;
2546 p->se.block_max = 0;
2547 p->se.exec_max = 0;
2548 p->se.slice_max = 0;
2549
2550 p->se.nr_migrations_cold = 0;
2551 p->se.nr_failed_migrations_affine = 0;
2552 p->se.nr_failed_migrations_running = 0;
2553 p->se.nr_failed_migrations_hot = 0;
2554 p->se.nr_forced_migrations = 0;
2555
2556 p->se.nr_wakeups = 0;
2557 p->se.nr_wakeups_sync = 0;
2558 p->se.nr_wakeups_migrate = 0;
2559 p->se.nr_wakeups_local = 0;
2560 p->se.nr_wakeups_remote = 0;
2561 p->se.nr_wakeups_affine = 0;
2562 p->se.nr_wakeups_affine_attempts = 0;
2563 p->se.nr_wakeups_passive = 0;
2564 p->se.nr_wakeups_idle = 0;
2565
2566#endif 2459#endif
2567 2460
2568 INIT_LIST_HEAD(&p->rt.run_list); 2461 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2583,11 +2476,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2583 2476
2584 __sched_fork(p); 2477 __sched_fork(p);
2585 /* 2478 /*
2586 * We mark the process as waking here. This guarantees that 2479 * We mark the process as running here. This guarantees that
2587 * nobody will actually run it, and a signal or other external 2480 * nobody will actually run it, and a signal or other external
2588 * event cannot wake it up and insert it on the runqueue either. 2481 * event cannot wake it up and insert it on the runqueue either.
2589 */ 2482 */
2590 p->state = TASK_WAKING; 2483 p->state = TASK_RUNNING;
2591 2484
2592 /* 2485 /*
2593 * Revert to default priority/policy on fork if requested. 2486 * Revert to default priority/policy on fork if requested.
@@ -2654,31 +2547,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2654 int cpu __maybe_unused = get_cpu(); 2547 int cpu __maybe_unused = get_cpu();
2655 2548
2656#ifdef CONFIG_SMP 2549#ifdef CONFIG_SMP
2550 rq = task_rq_lock(p, &flags);
2551 p->state = TASK_WAKING;
2552
2657 /* 2553 /*
2658 * Fork balancing, do it here and not earlier because: 2554 * Fork balancing, do it here and not earlier because:
2659 * - cpus_allowed can change in the fork path 2555 * - cpus_allowed can change in the fork path
2660 * - any previously selected cpu might disappear through hotplug 2556 * - any previously selected cpu might disappear through hotplug
2661 * 2557 *
2662 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2558 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2663 * ->cpus_allowed is stable, we have preemption disabled, meaning 2559 * without people poking at ->cpus_allowed.
2664 * cpu_online_mask is stable.
2665 */ 2560 */
2666 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2561 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2667 set_task_cpu(p, cpu); 2562 set_task_cpu(p, cpu);
2668#endif
2669 2563
2670 /*
2671 * Since the task is not on the rq and we still have TASK_WAKING set
2672 * nobody else will migrate this task.
2673 */
2674 rq = cpu_rq(cpu);
2675 raw_spin_lock_irqsave(&rq->lock, flags);
2676
2677 BUG_ON(p->state != TASK_WAKING);
2678 p->state = TASK_RUNNING; 2564 p->state = TASK_RUNNING;
2679 update_rq_clock(rq); 2565 task_rq_unlock(rq, &flags);
2566#endif
2567
2568 rq = task_rq_lock(p, &flags);
2680 activate_task(rq, p, 0); 2569 activate_task(rq, p, 0);
2681 trace_sched_wakeup_new(rq, p, 1); 2570 trace_sched_wakeup_new(p, 1);
2682 check_preempt_curr(rq, p, WF_FORK); 2571 check_preempt_curr(rq, p, WF_FORK);
2683#ifdef CONFIG_SMP 2572#ifdef CONFIG_SMP
2684 if (p->sched_class->task_woken) 2573 if (p->sched_class->task_woken)
@@ -2898,7 +2787,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2898 struct mm_struct *mm, *oldmm; 2787 struct mm_struct *mm, *oldmm;
2899 2788
2900 prepare_task_switch(rq, prev, next); 2789 prepare_task_switch(rq, prev, next);
2901 trace_sched_switch(rq, prev, next); 2790 trace_sched_switch(prev, next);
2902 mm = next->mm; 2791 mm = next->mm;
2903 oldmm = prev->active_mm; 2792 oldmm = prev->active_mm;
2904 /* 2793 /*
@@ -3015,6 +2904,61 @@ static unsigned long calc_load_update;
3015unsigned long avenrun[3]; 2904unsigned long avenrun[3];
3016EXPORT_SYMBOL(avenrun); 2905EXPORT_SYMBOL(avenrun);
3017 2906
2907static long calc_load_fold_active(struct rq *this_rq)
2908{
2909 long nr_active, delta = 0;
2910
2911 nr_active = this_rq->nr_running;
2912 nr_active += (long) this_rq->nr_uninterruptible;
2913
2914 if (nr_active != this_rq->calc_load_active) {
2915 delta = nr_active - this_rq->calc_load_active;
2916 this_rq->calc_load_active = nr_active;
2917 }
2918
2919 return delta;
2920}
2921
2922#ifdef CONFIG_NO_HZ
2923/*
2924 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2925 *
2926 * When making the ILB scale, we should try to pull this in as well.
2927 */
2928static atomic_long_t calc_load_tasks_idle;
2929
2930static void calc_load_account_idle(struct rq *this_rq)
2931{
2932 long delta;
2933
2934 delta = calc_load_fold_active(this_rq);
2935 if (delta)
2936 atomic_long_add(delta, &calc_load_tasks_idle);
2937}
2938
2939static long calc_load_fold_idle(void)
2940{
2941 long delta = 0;
2942
2943 /*
 2944 * It's got a race; we don't care...
2945 */
2946 if (atomic_long_read(&calc_load_tasks_idle))
2947 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2948
2949 return delta;
2950}
2951#else
2952static void calc_load_account_idle(struct rq *this_rq)
2953{
2954}
2955
2956static inline long calc_load_fold_idle(void)
2957{
2958 return 0;
2959}
2960#endif
2961
3018/** 2962/**
3019 * get_avenrun - get the load average array 2963 * get_avenrun - get the load average array
3020 * @loads: pointer to dest load array 2964 * @loads: pointer to dest load array
@@ -3061,20 +3005,22 @@ void calc_global_load(void)
3061} 3005}
3062 3006
3063/* 3007/*
3064 * Either called from update_cpu_load() or from a cpu going idle 3008 * Called from update_cpu_load() to periodically update this CPU's
3009 * active count.
3065 */ 3010 */
3066static void calc_load_account_active(struct rq *this_rq) 3011static void calc_load_account_active(struct rq *this_rq)
3067{ 3012{
3068 long nr_active, delta; 3013 long delta;
3069 3014
3070 nr_active = this_rq->nr_running; 3015 if (time_before(jiffies, this_rq->calc_load_update))
3071 nr_active += (long) this_rq->nr_uninterruptible; 3016 return;
3072 3017
3073 if (nr_active != this_rq->calc_load_active) { 3018 delta = calc_load_fold_active(this_rq);
3074 delta = nr_active - this_rq->calc_load_active; 3019 delta += calc_load_fold_idle();
3075 this_rq->calc_load_active = nr_active; 3020 if (delta)
3076 atomic_long_add(delta, &calc_load_tasks); 3021 atomic_long_add(delta, &calc_load_tasks);
3077 } 3022
3023 this_rq->calc_load_update += LOAD_FREQ;
3078} 3024}
3079 3025
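The calc_load rework above splits the accounting in two: a CPU going idle parks its delta in calc_load_tasks_idle via calc_load_account_idle(), and the next periodic calc_load_account_active() folds both its own delta and anything parked into calc_load_tasks. A userspace model of that park-then-fold scheme using C11 atomics, with one mock CPU and illustrative names:

/* Park-the-delta-then-fold, modelled with C11 atomics; not kernel code. */
#include <stdatomic.h>
#include <stdio.h>

static atomic_long global_tasks;        /* analogue of calc_load_tasks */
static atomic_long idle_parked;         /* analogue of calc_load_tasks_idle */

struct cpu_model {
    long nr_active;                     /* runnable + uninterruptible */
    long last_reported;                 /* analogue of rq->calc_load_active */
};

static long fold_active(struct cpu_model *cpu)
{
    long delta = cpu->nr_active - cpu->last_reported;

    cpu->last_reported = cpu->nr_active;
    return delta;
}

/* Going idle between updates: stash the delta instead of touching the global. */
static void account_idle(struct cpu_model *cpu)
{
    long delta = fold_active(cpu);

    if (delta)
        atomic_fetch_add(&idle_parked, delta);
}

/* Periodic update: fold our own delta plus anything parked by idle CPUs. */
static void account_active(struct cpu_model *cpu)
{
    long delta = fold_active(cpu);

    delta += atomic_exchange(&idle_parked, 0);
    if (delta)
        atomic_fetch_add(&global_tasks, delta);
}

int main(void)
{
    struct cpu_model cpu = { .nr_active = 3, .last_reported = 0 };

    account_active(&cpu);               /* reports +3 */
    cpu.nr_active = 0;
    account_idle(&cpu);                 /* parks -3 for later */
    cpu.nr_active = 1;
    account_active(&cpu);               /* folds parked -3 plus its own +1 */
    printf("global = %ld\n", atomic_load(&global_tasks));   /* prints 1 */
    return 0;
}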
3080/* 3026/*
@@ -3106,10 +3052,7 @@ static void update_cpu_load(struct rq *this_rq)
3106 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3052 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3107 } 3053 }
3108 3054
3109 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3055 calc_load_account_active(this_rq);
3110 this_rq->calc_load_update += LOAD_FREQ;
3111 calc_load_account_active(this_rq);
3112 }
3113} 3056}
3114 3057
3115#ifdef CONFIG_SMP 3058#ifdef CONFIG_SMP
@@ -3121,44 +3064,27 @@ static void update_cpu_load(struct rq *this_rq)
3121void sched_exec(void) 3064void sched_exec(void)
3122{ 3065{
3123 struct task_struct *p = current; 3066 struct task_struct *p = current;
3124 struct migration_req req;
3125 int dest_cpu, this_cpu;
3126 unsigned long flags; 3067 unsigned long flags;
3127 struct rq *rq; 3068 struct rq *rq;
3128 3069 int dest_cpu;
3129again:
3130 this_cpu = get_cpu();
3131 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3132 if (dest_cpu == this_cpu) {
3133 put_cpu();
3134 return;
3135 }
3136 3070
3137 rq = task_rq_lock(p, &flags); 3071 rq = task_rq_lock(p, &flags);
3138 put_cpu(); 3072 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3073 if (dest_cpu == smp_processor_id())
3074 goto unlock;
3139 3075
3140 /* 3076 /*
3141 * select_task_rq() can race against ->cpus_allowed 3077 * select_task_rq() can race against ->cpus_allowed
3142 */ 3078 */
3143 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3079 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3144 || unlikely(!cpu_active(dest_cpu))) { 3080 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3145 task_rq_unlock(rq, &flags); 3081 struct migration_arg arg = { p, dest_cpu };
3146 goto again;
3147 }
3148 3082
3149 /* force the process onto the specified CPU */
3150 if (migrate_task(p, dest_cpu, &req)) {
3151 /* Need to wait for migration thread (might exit: take ref). */
3152 struct task_struct *mt = rq->migration_thread;
3153
3154 get_task_struct(mt);
3155 task_rq_unlock(rq, &flags); 3083 task_rq_unlock(rq, &flags);
3156 wake_up_process(mt); 3084 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3157 put_task_struct(mt);
3158 wait_for_completion(&req.done);
3159
3160 return; 3085 return;
3161 } 3086 }
3087unlock:
3162 task_rq_unlock(rq, &flags); 3088 task_rq_unlock(rq, &flags);
3163} 3089}
3164 3090
@@ -3630,23 +3556,9 @@ static inline void schedule_debug(struct task_struct *prev)
3630 3556
3631static void put_prev_task(struct rq *rq, struct task_struct *prev) 3557static void put_prev_task(struct rq *rq, struct task_struct *prev)
3632{ 3558{
3633 if (prev->state == TASK_RUNNING) { 3559 if (prev->se.on_rq)
3634 u64 runtime = prev->se.sum_exec_runtime; 3560 update_rq_clock(rq);
3635 3561 rq->skip_clock_update = 0;
3636 runtime -= prev->se.prev_sum_exec_runtime;
3637 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3638
3639 /*
3640 * In order to avoid avg_overlap growing stale when we are
3641 * indeed overlapping and hence not getting put to sleep, grow
3642 * the avg_overlap on preemption.
3643 *
3644 * We use the average preemption runtime because that
3645 * correlates to the amount of cache footprint a task can
3646 * build up.
3647 */
3648 update_avg(&prev->se.avg_overlap, runtime);
3649 }
3650 prev->sched_class->put_prev_task(rq, prev); 3562 prev->sched_class->put_prev_task(rq, prev);
3651} 3563}
3652 3564
@@ -3696,7 +3608,7 @@ need_resched:
3696 preempt_disable(); 3608 preempt_disable();
3697 cpu = smp_processor_id(); 3609 cpu = smp_processor_id();
3698 rq = cpu_rq(cpu); 3610 rq = cpu_rq(cpu);
3699 rcu_sched_qs(cpu); 3611 rcu_note_context_switch(cpu);
3700 prev = rq->curr; 3612 prev = rq->curr;
3701 switch_count = &prev->nivcsw; 3613 switch_count = &prev->nivcsw;
3702 3614
@@ -3709,14 +3621,13 @@ need_resched_nonpreemptible:
3709 hrtick_clear(rq); 3621 hrtick_clear(rq);
3710 3622
3711 raw_spin_lock_irq(&rq->lock); 3623 raw_spin_lock_irq(&rq->lock);
3712 update_rq_clock(rq);
3713 clear_tsk_need_resched(prev); 3624 clear_tsk_need_resched(prev);
3714 3625
3715 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3626 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3716 if (unlikely(signal_pending_state(prev->state, prev))) 3627 if (unlikely(signal_pending_state(prev->state, prev)))
3717 prev->state = TASK_RUNNING; 3628 prev->state = TASK_RUNNING;
3718 else 3629 else
3719 deactivate_task(rq, prev, 1); 3630 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3720 switch_count = &prev->nvcsw; 3631 switch_count = &prev->nvcsw;
3721 } 3632 }
3722 3633
@@ -3780,7 +3691,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3780 * the mutex owner just released it and exited. 3691 * the mutex owner just released it and exited.
3781 */ 3692 */
3782 if (probe_kernel_address(&owner->cpu, cpu)) 3693 if (probe_kernel_address(&owner->cpu, cpu))
3783 goto out; 3694 return 0;
3784#else 3695#else
3785 cpu = owner->cpu; 3696 cpu = owner->cpu;
3786#endif 3697#endif
@@ -3790,14 +3701,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3790 * the cpu field may no longer be valid. 3701 * the cpu field may no longer be valid.
3791 */ 3702 */
3792 if (cpu >= nr_cpumask_bits) 3703 if (cpu >= nr_cpumask_bits)
3793 goto out; 3704 return 0;
3794 3705
3795 /* 3706 /*
3796 * We need to validate that we can do a 3707 * We need to validate that we can do a
3797 * get_cpu() and that we have the percpu area. 3708 * get_cpu() and that we have the percpu area.
3798 */ 3709 */
3799 if (!cpu_online(cpu)) 3710 if (!cpu_online(cpu))
3800 goto out; 3711 return 0;
3801 3712
3802 rq = cpu_rq(cpu); 3713 rq = cpu_rq(cpu);
3803 3714
@@ -3816,7 +3727,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3816 3727
3817 cpu_relax(); 3728 cpu_relax();
3818 } 3729 }
3819out: 3730
3820 return 1; 3731 return 1;
3821} 3732}
3822#endif 3733#endif
@@ -3940,6 +3851,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3940{ 3851{
3941 __wake_up_common(q, mode, 1, 0, NULL); 3852 __wake_up_common(q, mode, 1, 0, NULL);
3942} 3853}
3854EXPORT_SYMBOL_GPL(__wake_up_locked);
3943 3855
3944void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3856void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3945{ 3857{
@@ -4039,8 +3951,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4039 if (!x->done) { 3951 if (!x->done) {
4040 DECLARE_WAITQUEUE(wait, current); 3952 DECLARE_WAITQUEUE(wait, current);
4041 3953
4042 wait.flags |= WQ_FLAG_EXCLUSIVE; 3954 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4043 __add_wait_queue_tail(&x->wait, &wait);
4044 do { 3955 do {
4045 if (signal_pending_state(state, current)) { 3956 if (signal_pending_state(state, current)) {
4046 timeout = -ERESTARTSYS; 3957 timeout = -ERESTARTSYS;
@@ -4266,7 +4177,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4266 BUG_ON(prio < 0 || prio > MAX_PRIO); 4177 BUG_ON(prio < 0 || prio > MAX_PRIO);
4267 4178
4268 rq = task_rq_lock(p, &flags); 4179 rq = task_rq_lock(p, &flags);
4269 update_rq_clock(rq);
4270 4180
4271 oldprio = p->prio; 4181 oldprio = p->prio;
4272 prev_class = p->sched_class; 4182 prev_class = p->sched_class;
@@ -4287,7 +4197,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4287 if (running) 4197 if (running)
4288 p->sched_class->set_curr_task(rq); 4198 p->sched_class->set_curr_task(rq);
4289 if (on_rq) { 4199 if (on_rq) {
4290 enqueue_task(rq, p, 0, oldprio < prio); 4200 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4291 4201
4292 check_class_changed(rq, p, prev_class, oldprio, running); 4202 check_class_changed(rq, p, prev_class, oldprio, running);
4293 } 4203 }
@@ -4309,7 +4219,6 @@ void set_user_nice(struct task_struct *p, long nice)
4309 * the task might be in the middle of scheduling on another CPU. 4219 * the task might be in the middle of scheduling on another CPU.
4310 */ 4220 */
4311 rq = task_rq_lock(p, &flags); 4221 rq = task_rq_lock(p, &flags);
4312 update_rq_clock(rq);
4313 /* 4222 /*
4314 * The RT priorities are set via sched_setscheduler(), but we still 4223 * The RT priorities are set via sched_setscheduler(), but we still
4315 * allow the 'normal' nice value to be set - but as expected 4224 * allow the 'normal' nice value to be set - but as expected
@@ -4331,7 +4240,7 @@ void set_user_nice(struct task_struct *p, long nice)
4331 delta = p->prio - old_prio; 4240 delta = p->prio - old_prio;
4332 4241
4333 if (on_rq) { 4242 if (on_rq) {
4334 enqueue_task(rq, p, 0, false); 4243 enqueue_task(rq, p, 0);
4335 /* 4244 /*
4336 * If the task increased its priority or is running and 4245 * If the task increased its priority or is running and
4337 * lowered its priority, then reschedule its CPU: 4246 * lowered its priority, then reschedule its CPU:
@@ -4592,7 +4501,6 @@ recheck:
4592 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4501 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4593 goto recheck; 4502 goto recheck;
4594 } 4503 }
4595 update_rq_clock(rq);
4596 on_rq = p->se.on_rq; 4504 on_rq = p->se.on_rq;
4597 running = task_current(rq, p); 4505 running = task_current(rq, p);
4598 if (on_rq) 4506 if (on_rq)
@@ -5329,17 +5237,15 @@ static inline void sched_init_granularity(void)
5329/* 5237/*
5330 * This is how migration works: 5238 * This is how migration works:
5331 * 5239 *
5332 * 1) we queue a struct migration_req structure in the source CPU's 5240 * 1) we invoke migration_cpu_stop() on the target CPU using
5333 * runqueue and wake up that CPU's migration thread. 5241 * stop_one_cpu().
5334 * 2) we down() the locked semaphore => thread blocks. 5242 * 2) stopper starts to run (implicitly forcing the migrated thread
5335 * 3) migration thread wakes up (implicitly it forces the migrated 5243 * off the CPU)
5336 * thread off the CPU) 5244 * 3) it checks whether the migrated task is still in the wrong runqueue.
5337 * 4) it gets the migration request and checks whether the migrated 5245 * 4) if it's in the wrong runqueue then the migration thread removes
5338 * task is still in the wrong runqueue.
5339 * 5) if it's in the wrong runqueue then the migration thread removes
5340 * it and puts it into the right queue. 5246 * it and puts it into the right queue.
5341 * 6) migration thread up()s the semaphore. 5247 * 5) stopper completes and stop_one_cpu() returns and the migration
5342 * 7) we wake up and the migration is done. 5248 * is done.
5343 */ 5249 */
5344 5250
5345/* 5251/*
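The rewritten comment above describes the new protocol: migration becomes a synchronous stop_one_cpu(..., migration_cpu_stop, &arg) call instead of queueing a migration_req for a per-CPU migration thread. A pthread-based model of that hand-off, sketching a single-slot "stopper" that runs a callback and signals completion; this is purely illustrative and is not the cpu_stop implementation:

/* Hand a callback to a dedicated thread and wait for it; illustrative only. */
#include <pthread.h>
#include <stdio.h>

struct stop_work {
    int (*fn)(void *);                  /* callback, like migration_cpu_stop() */
    void *arg;
    int ret;
    int done;
    pthread_mutex_t lock;
    pthread_cond_t cond;
};

/* Single-slot queue: one outstanding work item is enough for this demo. */
static struct stop_work *pending;
static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t queue_cond = PTHREAD_COND_INITIALIZER;

static void *stopper_thread(void *unused)
{
    (void)unused;
    for (;;) {
        struct stop_work *work;

        pthread_mutex_lock(&queue_lock);
        while (!pending)
            pthread_cond_wait(&queue_cond, &queue_lock);
        work = pending;
        pending = NULL;
        pthread_mutex_unlock(&queue_lock);

        work->ret = work->fn(work->arg);    /* do the requested work */

        pthread_mutex_lock(&work->lock);
        work->done = 1;
        pthread_cond_signal(&work->cond);
        pthread_mutex_unlock(&work->lock);
    }
    return NULL;
}

/* Analogue of stop_one_cpu(): synchronous, so the arg can live on the stack. */
static int stop_one(int (*fn)(void *), void *arg)
{
    struct stop_work work = { .fn = fn, .arg = arg, .done = 0 };

    pthread_mutex_init(&work.lock, NULL);
    pthread_cond_init(&work.cond, NULL);

    pthread_mutex_lock(&queue_lock);
    pending = &work;
    pthread_cond_signal(&queue_cond);
    pthread_mutex_unlock(&queue_lock);

    pthread_mutex_lock(&work.lock);
    while (!work.done)
        pthread_cond_wait(&work.cond, &work.lock);
    pthread_mutex_unlock(&work.lock);

    return work.ret;
}

static int do_migrate(void *arg)
{
    printf("migrating \"%s\" on the stopper thread\n", (const char *)arg);
    return 0;
}

int main(void)
{
    pthread_t tid;

    pthread_create(&tid, NULL, stopper_thread, NULL);
    stop_one(do_migrate, "some task");
    return 0;
}

Because the call is synchronous, the argument struct can live on the caller's stack, which is why struct migration_arg no longer needs a list head or a completion.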
@@ -5353,12 +5259,23 @@ static inline void sched_init_granularity(void)
5353 */ 5259 */
5354int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5260int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5355{ 5261{
5356 struct migration_req req;
5357 unsigned long flags; 5262 unsigned long flags;
5358 struct rq *rq; 5263 struct rq *rq;
5264 unsigned int dest_cpu;
5359 int ret = 0; 5265 int ret = 0;
5360 5266
5267 /*
 5268 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5269 * drop the rq->lock and still rely on ->cpus_allowed.
5270 */
5271again:
5272 while (task_is_waking(p))
5273 cpu_relax();
5361 rq = task_rq_lock(p, &flags); 5274 rq = task_rq_lock(p, &flags);
5275 if (task_is_waking(p)) {
5276 task_rq_unlock(rq, &flags);
5277 goto again;
5278 }
5362 5279
5363 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5280 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5364 ret = -EINVAL; 5281 ret = -EINVAL;
@@ -5382,15 +5299,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5382 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5299 if (cpumask_test_cpu(task_cpu(p), new_mask))
5383 goto out; 5300 goto out;
5384 5301
5385 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5302 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5303 if (migrate_task(p, dest_cpu)) {
5304 struct migration_arg arg = { p, dest_cpu };
5386 /* Need help from migration thread: drop lock and wait. */ 5305 /* Need help from migration thread: drop lock and wait. */
5387 struct task_struct *mt = rq->migration_thread;
5388
5389 get_task_struct(mt);
5390 task_rq_unlock(rq, &flags); 5306 task_rq_unlock(rq, &flags);
5391 wake_up_process(mt); 5307 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5392 put_task_struct(mt);
5393 wait_for_completion(&req.done);
5394 tlb_migrate_finish(p->mm); 5308 tlb_migrate_finish(p->mm);
5395 return 0; 5309 return 0;
5396 } 5310 }
@@ -5448,98 +5362,49 @@ fail:
5448 return ret; 5362 return ret;
5449} 5363}
5450 5364
5451#define RCU_MIGRATION_IDLE 0
5452#define RCU_MIGRATION_NEED_QS 1
5453#define RCU_MIGRATION_GOT_QS 2
5454#define RCU_MIGRATION_MUST_SYNC 3
5455
5456/* 5365/*
5457 * migration_thread - this is a highprio system thread that performs 5366 * migration_cpu_stop - this will be executed by a highprio stopper thread
5458 * thread migration by bumping thread off CPU then 'pushing' onto 5367 * and performs thread migration by bumping thread off CPU then
5459 * another runqueue. 5368 * 'pushing' onto another runqueue.
5460 */ 5369 */
5461static int migration_thread(void *data) 5370static int migration_cpu_stop(void *data)
5462{
5463 int badcpu;
5464 int cpu = (long)data;
5465 struct rq *rq;
5466
5467 rq = cpu_rq(cpu);
5468 BUG_ON(rq->migration_thread != current);
5469
5470 set_current_state(TASK_INTERRUPTIBLE);
5471 while (!kthread_should_stop()) {
5472 struct migration_req *req;
5473 struct list_head *head;
5474
5475 raw_spin_lock_irq(&rq->lock);
5476
5477 if (cpu_is_offline(cpu)) {
5478 raw_spin_unlock_irq(&rq->lock);
5479 break;
5480 }
5481
5482 if (rq->active_balance) {
5483 active_load_balance(rq, cpu);
5484 rq->active_balance = 0;
5485 }
5486
5487 head = &rq->migration_queue;
5488
5489 if (list_empty(head)) {
5490 raw_spin_unlock_irq(&rq->lock);
5491 schedule();
5492 set_current_state(TASK_INTERRUPTIBLE);
5493 continue;
5494 }
5495 req = list_entry(head->next, struct migration_req, list);
5496 list_del_init(head->next);
5497
5498 if (req->task != NULL) {
5499 raw_spin_unlock(&rq->lock);
5500 __migrate_task(req->task, cpu, req->dest_cpu);
5501 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5502 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5503 raw_spin_unlock(&rq->lock);
5504 } else {
5505 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5506 raw_spin_unlock(&rq->lock);
5507 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5508 }
5509 local_irq_enable();
5510
5511 complete(&req->done);
5512 }
5513 __set_current_state(TASK_RUNNING);
5514
5515 return 0;
5516}
5517
5518#ifdef CONFIG_HOTPLUG_CPU
5519
5520static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5521{ 5371{
5522 int ret; 5372 struct migration_arg *arg = data;
5523 5373
5374 /*
5375 * The original target cpu might have gone down and we might
5376 * be on another cpu but it doesn't matter.
5377 */
5524 local_irq_disable(); 5378 local_irq_disable();
5525 ret = __migrate_task(p, src_cpu, dest_cpu); 5379 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5526 local_irq_enable(); 5380 local_irq_enable();
5527 return ret; 5381 return 0;
5528} 5382}
5529 5383
5384#ifdef CONFIG_HOTPLUG_CPU
5530/* 5385/*
5531 * Figure out where task on dead CPU should go, use force if necessary. 5386 * Figure out where task on dead CPU should go, use force if necessary.
5532 */ 5387 */
5533static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5388void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5534{ 5389{
5535 int dest_cpu; 5390 struct rq *rq = cpu_rq(dead_cpu);
5391 int needs_cpu, uninitialized_var(dest_cpu);
5392 unsigned long flags;
5536 5393
5537again: 5394 local_irq_save(flags);
5538 dest_cpu = select_fallback_rq(dead_cpu, p);
5539 5395
5540 /* It can have affinity changed while we were choosing. */ 5396 raw_spin_lock(&rq->lock);
5541 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5397 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5542 goto again; 5398 if (needs_cpu)
5399 dest_cpu = select_fallback_rq(dead_cpu, p);
5400 raw_spin_unlock(&rq->lock);
5401 /*
5402 * It can only fail if we race with set_cpus_allowed(),
 5403 * in which case the racer should migrate the task anyway.
5404 */
5405 if (needs_cpu)
5406 __migrate_task(p, dead_cpu, dest_cpu);
5407 local_irq_restore(flags);
5543} 5408}
5544 5409
5545/* 5410/*
@@ -5603,7 +5468,6 @@ void sched_idle_next(void)
5603 5468
5604 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5469 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5605 5470
5606 update_rq_clock(rq);
5607 activate_task(rq, p, 0); 5471 activate_task(rq, p, 0);
5608 5472
5609 raw_spin_unlock_irqrestore(&rq->lock, flags); 5473 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5658,7 +5522,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5658 for ( ; ; ) { 5522 for ( ; ; ) {
5659 if (!rq->nr_running) 5523 if (!rq->nr_running)
5660 break; 5524 break;
5661 update_rq_clock(rq);
5662 next = pick_next_task(rq); 5525 next = pick_next_task(rq);
5663 if (!next) 5526 if (!next)
5664 break; 5527 break;
@@ -5881,35 +5744,20 @@ static void set_rq_offline(struct rq *rq)
5881static int __cpuinit 5744static int __cpuinit
5882migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5745migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5883{ 5746{
5884 struct task_struct *p;
5885 int cpu = (long)hcpu; 5747 int cpu = (long)hcpu;
5886 unsigned long flags; 5748 unsigned long flags;
5887 struct rq *rq; 5749 struct rq *rq = cpu_rq(cpu);
5888 5750
5889 switch (action) { 5751 switch (action) {
5890 5752
5891 case CPU_UP_PREPARE: 5753 case CPU_UP_PREPARE:
5892 case CPU_UP_PREPARE_FROZEN: 5754 case CPU_UP_PREPARE_FROZEN:
5893 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5894 if (IS_ERR(p))
5895 return NOTIFY_BAD;
5896 kthread_bind(p, cpu);
5897 /* Must be high prio: stop_machine expects to yield to it. */
5898 rq = task_rq_lock(p, &flags);
5899 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5900 task_rq_unlock(rq, &flags);
5901 get_task_struct(p);
5902 cpu_rq(cpu)->migration_thread = p;
5903 rq->calc_load_update = calc_load_update; 5755 rq->calc_load_update = calc_load_update;
5904 break; 5756 break;
5905 5757
5906 case CPU_ONLINE: 5758 case CPU_ONLINE:
5907 case CPU_ONLINE_FROZEN: 5759 case CPU_ONLINE_FROZEN:
5908 /* Strictly unnecessary, as first user will wake it. */
5909 wake_up_process(cpu_rq(cpu)->migration_thread);
5910
5911 /* Update our root-domain */ 5760 /* Update our root-domain */
5912 rq = cpu_rq(cpu);
5913 raw_spin_lock_irqsave(&rq->lock, flags); 5761 raw_spin_lock_irqsave(&rq->lock, flags);
5914 if (rq->rd) { 5762 if (rq->rd) {
5915 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5763 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5920,61 +5768,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5920 break; 5768 break;
5921 5769
5922#ifdef CONFIG_HOTPLUG_CPU 5770#ifdef CONFIG_HOTPLUG_CPU
5923 case CPU_UP_CANCELED:
5924 case CPU_UP_CANCELED_FROZEN:
5925 if (!cpu_rq(cpu)->migration_thread)
5926 break;
5927 /* Unbind it from offline cpu so it can run. Fall thru. */
5928 kthread_bind(cpu_rq(cpu)->migration_thread,
5929 cpumask_any(cpu_online_mask));
5930 kthread_stop(cpu_rq(cpu)->migration_thread);
5931 put_task_struct(cpu_rq(cpu)->migration_thread);
5932 cpu_rq(cpu)->migration_thread = NULL;
5933 break;
5934
5935 case CPU_DEAD: 5771 case CPU_DEAD:
5936 case CPU_DEAD_FROZEN: 5772 case CPU_DEAD_FROZEN:
5937 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5938 migrate_live_tasks(cpu); 5773 migrate_live_tasks(cpu);
5939 rq = cpu_rq(cpu);
5940 kthread_stop(rq->migration_thread);
5941 put_task_struct(rq->migration_thread);
5942 rq->migration_thread = NULL;
5943 /* Idle task back to normal (off runqueue, low prio) */ 5774 /* Idle task back to normal (off runqueue, low prio) */
5944 raw_spin_lock_irq(&rq->lock); 5775 raw_spin_lock_irq(&rq->lock);
5945 update_rq_clock(rq);
5946 deactivate_task(rq, rq->idle, 0); 5776 deactivate_task(rq, rq->idle, 0);
5947 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5777 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5948 rq->idle->sched_class = &idle_sched_class; 5778 rq->idle->sched_class = &idle_sched_class;
5949 migrate_dead_tasks(cpu); 5779 migrate_dead_tasks(cpu);
5950 raw_spin_unlock_irq(&rq->lock); 5780 raw_spin_unlock_irq(&rq->lock);
5951 cpuset_unlock();
5952 migrate_nr_uninterruptible(rq); 5781 migrate_nr_uninterruptible(rq);
5953 BUG_ON(rq->nr_running != 0); 5782 BUG_ON(rq->nr_running != 0);
5954 calc_global_load_remove(rq); 5783 calc_global_load_remove(rq);
5955 /*
5956 * No need to migrate the tasks: it was best-effort if
5957 * they didn't take sched_hotcpu_mutex. Just wake up
5958 * the requestors.
5959 */
5960 raw_spin_lock_irq(&rq->lock);
5961 while (!list_empty(&rq->migration_queue)) {
5962 struct migration_req *req;
5963
5964 req = list_entry(rq->migration_queue.next,
5965 struct migration_req, list);
5966 list_del_init(&req->list);
5967 raw_spin_unlock_irq(&rq->lock);
5968 complete(&req->done);
5969 raw_spin_lock_irq(&rq->lock);
5970 }
5971 raw_spin_unlock_irq(&rq->lock);
5972 break; 5784 break;
5973 5785
5974 case CPU_DYING: 5786 case CPU_DYING:
5975 case CPU_DYING_FROZEN: 5787 case CPU_DYING_FROZEN:
5976 /* Update our root-domain */ 5788 /* Update our root-domain */
5977 rq = cpu_rq(cpu);
5978 raw_spin_lock_irqsave(&rq->lock, flags); 5789 raw_spin_lock_irqsave(&rq->lock, flags);
5979 if (rq->rd) { 5790 if (rq->rd) {
5980 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5791 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6305,6 +6116,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6305 struct rq *rq = cpu_rq(cpu); 6116 struct rq *rq = cpu_rq(cpu);
6306 struct sched_domain *tmp; 6117 struct sched_domain *tmp;
6307 6118
6119 for (tmp = sd; tmp; tmp = tmp->parent)
6120 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6121
6308 /* Remove the sched domains which do not contribute to scheduling. */ 6122 /* Remove the sched domains which do not contribute to scheduling. */
6309 for (tmp = sd; tmp; ) { 6123 for (tmp = sd; tmp; ) {
6310 struct sched_domain *parent = tmp->parent; 6124 struct sched_domain *parent = tmp->parent;
@@ -7788,10 +7602,8 @@ void __init sched_init(void)
7788 rq->push_cpu = 0; 7602 rq->push_cpu = 0;
7789 rq->cpu = i; 7603 rq->cpu = i;
7790 rq->online = 0; 7604 rq->online = 0;
7791 rq->migration_thread = NULL;
7792 rq->idle_stamp = 0; 7605 rq->idle_stamp = 0;
7793 rq->avg_idle = 2*sysctl_sched_migration_cost; 7606 rq->avg_idle = 2*sysctl_sched_migration_cost;
7794 INIT_LIST_HEAD(&rq->migration_queue);
7795 rq_attach_root(rq, &def_root_domain); 7607 rq_attach_root(rq, &def_root_domain);
7796#endif 7608#endif
7797 init_rq_hrtick(rq); 7609 init_rq_hrtick(rq);
@@ -7892,7 +7704,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7892{ 7704{
7893 int on_rq; 7705 int on_rq;
7894 7706
7895 update_rq_clock(rq);
7896 on_rq = p->se.on_rq; 7707 on_rq = p->se.on_rq;
7897 if (on_rq) 7708 if (on_rq)
7898 deactivate_task(rq, p, 0); 7709 deactivate_task(rq, p, 0);
@@ -7919,9 +7730,9 @@ void normalize_rt_tasks(void)
7919 7730
7920 p->se.exec_start = 0; 7731 p->se.exec_start = 0;
7921#ifdef CONFIG_SCHEDSTATS 7732#ifdef CONFIG_SCHEDSTATS
7922 p->se.wait_start = 0; 7733 p->se.statistics.wait_start = 0;
7923 p->se.sleep_start = 0; 7734 p->se.statistics.sleep_start = 0;
7924 p->se.block_start = 0; 7735 p->se.statistics.block_start = 0;
7925#endif 7736#endif
7926 7737
7927 if (!rt_task(p)) { 7738 if (!rt_task(p)) {
@@ -7948,9 +7759,9 @@ void normalize_rt_tasks(void)
7948 7759
7949#endif /* CONFIG_MAGIC_SYSRQ */ 7760#endif /* CONFIG_MAGIC_SYSRQ */
7950 7761
7951#ifdef CONFIG_IA64 7762#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7952/* 7763/*
7953 * These functions are only useful for the IA64 MCA handling. 7764 * These functions are only useful for the IA64 MCA handling, or kdb.
7954 * 7765 *
7955 * They can only be called when the whole system has been 7766 * They can only be called when the whole system has been
7956 * stopped - every CPU needs to be quiescent, and no scheduling 7767 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -7970,6 +7781,9 @@ struct task_struct *curr_task(int cpu)
7970 return cpu_curr(cpu); 7781 return cpu_curr(cpu);
7971} 7782}
7972 7783
7784#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7785
7786#ifdef CONFIG_IA64
7973/** 7787/**
7974 * set_curr_task - set the current task for a given cpu. 7788 * set_curr_task - set the current task for a given cpu.
7975 * @cpu: the processor in question. 7789 * @cpu: the processor in question.
@@ -8254,8 +8068,6 @@ void sched_move_task(struct task_struct *tsk)
8254 8068
8255 rq = task_rq_lock(tsk, &flags); 8069 rq = task_rq_lock(tsk, &flags);
8256 8070
8257 update_rq_clock(rq);
8258
8259 running = task_current(rq, tsk); 8071 running = task_current(rq, tsk);
8260 on_rq = tsk->se.on_rq; 8072 on_rq = tsk->se.on_rq;
8261 8073
@@ -8274,7 +8086,7 @@ void sched_move_task(struct task_struct *tsk)
8274 if (unlikely(running)) 8086 if (unlikely(running))
8275 tsk->sched_class->set_curr_task(rq); 8087 tsk->sched_class->set_curr_task(rq);
8276 if (on_rq) 8088 if (on_rq)
8277 enqueue_task(rq, tsk, 0, false); 8089 enqueue_task(rq, tsk, 0);
8278 8090
8279 task_rq_unlock(rq, &flags); 8091 task_rq_unlock(rq, &flags);
8280} 8092}
@@ -9088,43 +8900,32 @@ struct cgroup_subsys cpuacct_subsys = {
9088 8900
9089#ifndef CONFIG_SMP 8901#ifndef CONFIG_SMP
9090 8902
9091int rcu_expedited_torture_stats(char *page)
9092{
9093 return 0;
9094}
9095EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9096
9097void synchronize_sched_expedited(void) 8903void synchronize_sched_expedited(void)
9098{ 8904{
8905 barrier();
9099} 8906}
9100EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8907EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9101 8908
9102#else /* #ifndef CONFIG_SMP */ 8909#else /* #ifndef CONFIG_SMP */
9103 8910
9104static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8911static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9105static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9106
9107#define RCU_EXPEDITED_STATE_POST -2
9108#define RCU_EXPEDITED_STATE_IDLE -1
9109 8912
9110static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE; 8913static int synchronize_sched_expedited_cpu_stop(void *data)
9111
9112int rcu_expedited_torture_stats(char *page)
9113{ 8914{
9114 int cnt = 0; 8915 /*
9115 int cpu; 8916 * There must be a full memory barrier on each affected CPU
9116 8917 * between the time that try_stop_cpus() is called and the
9117 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8918 * time that it returns.
9118 for_each_online_cpu(cpu) { 8919 *
9119 cnt += sprintf(&page[cnt], " %d:%d", 8920 * In the current initial implementation of cpu_stop, the
9120 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8921 * above condition is already met when the control reaches
9121 } 8922 * this point and the following smp_mb() is not strictly
9122 cnt += sprintf(&page[cnt], "\n"); 8923 * necessary. Do smp_mb() anyway for documentation and
9123 return cnt; 8924 * robustness against future implementation changes.
8925 */
8926 smp_mb(); /* See above comment block. */
8927 return 0;
9124} 8928}
9125EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9126
9127static long synchronize_sched_expedited_count;
9128 8929
9129/* 8930/*
9130 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8931 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9138,18 +8939,14 @@ static long synchronize_sched_expedited_count;
9138 */ 8939 */
9139void synchronize_sched_expedited(void) 8940void synchronize_sched_expedited(void)
9140{ 8941{
9141 int cpu; 8942 int snap, trycount = 0;
9142 unsigned long flags;
9143 bool need_full_sync = 0;
9144 struct rq *rq;
9145 struct migration_req *req;
9146 long snap;
9147 int trycount = 0;
9148 8943
9149 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8944 smp_mb(); /* ensure prior mod happens before capturing snap. */
9150 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8945 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9151 get_online_cpus(); 8946 get_online_cpus();
9152 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8947 while (try_stop_cpus(cpu_online_mask,
8948 synchronize_sched_expedited_cpu_stop,
8949 NULL) == -EAGAIN) {
9153 put_online_cpus(); 8950 put_online_cpus();
9154 if (trycount++ < 10) 8951 if (trycount++ < 10)
9155 udelay(trycount * num_online_cpus()); 8952 udelay(trycount * num_online_cpus());
@@ -9157,41 +8954,15 @@ void synchronize_sched_expedited(void)
9157 synchronize_sched(); 8954 synchronize_sched();
9158 return; 8955 return;
9159 } 8956 }
9160 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8957 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9161 smp_mb(); /* ensure test happens before caller kfree */ 8958 smp_mb(); /* ensure test happens before caller kfree */
9162 return; 8959 return;
9163 } 8960 }
9164 get_online_cpus(); 8961 get_online_cpus();
9165 } 8962 }
9166 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8963 atomic_inc(&synchronize_sched_expedited_count);
9167 for_each_online_cpu(cpu) { 8964 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9168 rq = cpu_rq(cpu);
9169 req = &per_cpu(rcu_migration_req, cpu);
9170 init_completion(&req->done);
9171 req->task = NULL;
9172 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9173 raw_spin_lock_irqsave(&rq->lock, flags);
9174 list_add(&req->list, &rq->migration_queue);
9175 raw_spin_unlock_irqrestore(&rq->lock, flags);
9176 wake_up_process(rq->migration_thread);
9177 }
9178 for_each_online_cpu(cpu) {
9179 rcu_expedited_state = cpu;
9180 req = &per_cpu(rcu_migration_req, cpu);
9181 rq = cpu_rq(cpu);
9182 wait_for_completion(&req->done);
9183 raw_spin_lock_irqsave(&rq->lock, flags);
9184 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9185 need_full_sync = 1;
9186 req->dest_cpu = RCU_MIGRATION_IDLE;
9187 raw_spin_unlock_irqrestore(&rq->lock, flags);
9188 }
9189 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9190 synchronize_sched_expedited_count++;
9191 mutex_unlock(&rcu_sched_expedited_mutex);
9192 put_online_cpus(); 8965 put_online_cpus();
9193 if (need_full_sync)
9194 synchronize_sched();
9195} 8966}
9196EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8967EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9197 8968
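synchronize_sched_expedited() now snapshots an atomic pass counter, retries try_stop_cpus() with backoff, and returns early if enough passes complete behind its back to cover it. A loose userspace analogue of that snapshot-and-retry shape, with a trylock standing in for try_stop_cpus() and the grace-period semantics stripped away; all names are illustrative:

/* Snapshot-and-retry around an exclusive pass; heavily simplified model. */
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_long pass_count;                  /* completed passes */
static pthread_mutex_t pass_lock = PTHREAD_MUTEX_INITIALIZER;

static void expensive_pass(void)
{
    /* stands in for try_stop_cpus() doing the real system-wide work */
}

static void expedited_sync(void)
{
    long snap = atomic_load(&pass_count) + 1;   /* value after the next pass */

    while (pthread_mutex_trylock(&pass_lock) != 0) {
        /*
         * Someone else is mid-pass.  If a pass that began after our
         * snapshot has already finished, it covers us: return early.
         */
        if (atomic_load(&pass_count) - snap > 0)
            return;
        sched_yield();
    }

    expensive_pass();
    atomic_fetch_add(&pass_count, 1);
    pthread_mutex_unlock(&pass_lock);
}

int main(void)
{
    expedited_sync();
    printf("passes completed: %ld\n", atomic_load(&pass_count));
    return 0;
}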
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 9b49db144037..87a330a7185f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -407,40 +404,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 404 PN(se.exec_start);
408 PN(se.vruntime); 405 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 406 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 407
413 nr_switches = p->nvcsw + p->nivcsw; 408 nr_switches = p->nvcsw + p->nivcsw;
414 409
415#ifdef CONFIG_SCHEDSTATS 410#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 411 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 412 PN(se.statistics.sleep_start);
418 PN(se.block_start); 413 PN(se.statistics.block_start);
419 PN(se.sleep_max); 414 PN(se.statistics.sleep_max);
420 PN(se.block_max); 415 PN(se.statistics.block_max);
421 PN(se.exec_max); 416 PN(se.statistics.exec_max);
422 PN(se.slice_max); 417 PN(se.statistics.slice_max);
423 PN(se.wait_max); 418 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 419 PN(se.statistics.wait_sum);
425 P(se.wait_count); 420 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 421 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 422 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 423 P(sched_info.bkl_count);
429 P(se.nr_migrations); 424 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 425 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 426 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 427 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 428 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 429 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 430 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 431 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 432 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 433 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 434 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 435 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 436 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 437 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 438 P(se.statistics.nr_wakeups_idle);
444 439
445 { 440 {
446 u64 avg_atom, avg_per_cpu; 441 u64 avg_atom, avg_per_cpu;
@@ -491,31 +486,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 486void proc_sched_set_task(struct task_struct *p)
492{ 487{
493#ifdef CONFIG_SCHEDSTATS 488#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 489 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 490#endif
521} 491}
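
The proc_sched_set_task() hunk above is the payoff of folding the schedstat fields into one embedded struct: two dozen per-field assignments collapse into a single memset of se.statistics. A minimal userspace sketch of that pattern (task_stats, demo_task and reset_stats are invented names for illustration, not kernel code):

#include <stdio.h>
#include <string.h>

/* all resettable counters live in one embedded struct */
struct task_stats {
        unsigned long wait_count;
        unsigned long long wait_sum;
        unsigned long long sleep_max;
};

struct demo_task {
        unsigned long long sum_exec_runtime;    /* not a statistic, survives the reset */
        struct task_stats statistics;
};

static void reset_stats(struct demo_task *t)
{
        /* one memset replaces a long list of per-field assignments */
        memset(&t->statistics, 0, sizeof(t->statistics));
}

int main(void)
{
        struct demo_task t = {
                .sum_exec_runtime = 42,
                .statistics = { .wait_count = 7, .wait_sum = 100, .sleep_max = 9 },
        };

        reset_stats(&t);
        printf("wait_count=%lu sum_exec_runtime=%llu\n",
               t.statistics.wait_count, t.sum_exec_runtime);
        return 0;
}
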
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
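
As the comment above notes, sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, which is why the default drops from 5000000 / 1000000 = 5 to 6000000 / 2000000 = 3. A trivial stand-alone check of that arithmetic (not kernel code):

#include <stdio.h>

/* mirrors the documented invariant: nr_latency = latency / min_granularity */
static unsigned int nr_latency(unsigned long long latency_ns,
                               unsigned long long min_granularity_ns)
{
        return (unsigned int)(latency_ns / min_granularity_ns);
}

int main(void)
{
        printf("old default: %u\n", nr_latency(5000000ULL, 1000000ULL)); /* 5 */
        printf("new default: %u\n", nr_latency(6000000ULL, 2000000ULL)); /* 3 */
        return 0;
}
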
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
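
With the FAIR_SLEEPERS and NORMALIZED_SLEEPER branches removed, the sleeper credit in place_entity() is simply the full sysctl_sched_latency, halved when GENTLE_FAIR_SLEEPERS is enabled. A rough sketch of the resulting placement arithmetic (the numbers are illustrative only, and the clamp against the task's existing vruntime is omitted):

#include <stdio.h>

int main(void)
{
        unsigned long long min_vruntime = 1000000000ULL; /* queue's min_vruntime (ns)   */
        unsigned long long latency      = 6000000ULL;    /* sysctl_sched_latency (ns)   */
        unsigned long long thresh       = latency / 2;   /* GENTLE_FAIR_SLEEPERS halves */

        /* a waking sleeper is placed at most this far behind min_vruntime */
        unsigned long long place = min_vruntime - thresh;

        printf("sleeper placed at %llu, i.e. %llu ns of credit\n", place, thresh);
        return 0;
}
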
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
777 * through calling update_curr(). 765 * through calling update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
1394 * ran and if it is currently idle, then it the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
1425 * Otherwise, iterate the domain and find an eligible idle cpu. 1400 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Let's stop looking for an idle sibling once we reach
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
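
The rewritten select_idle_sibling() above walks the wakeup target's domains bottom-up, only searches cache-sharing levels, and stops once a level also spans both the waking cpu and prev_cpu. A simplified userspace model of that walk; demo_domain, the span bitmask and the idle[] array are stand-ins for the real sched_domain/idle_cpu() machinery:

#include <stdbool.h>
#include <stdio.h>

#define NCPUS 8

struct demo_domain {
        bool shares_cache;      /* stands in for SD_SHARE_PKG_RESOURCES */
        unsigned long span;     /* bitmask of the cpus in this domain   */
};

static bool spans(const struct demo_domain *d, int cpu)
{
        return d->span & (1UL << cpu);
}

/* walk target's domains looking for an idle cpu, mirroring the new loop */
static int pick_idle_sibling(const struct demo_domain *doms, int ndoms,
                             const bool *idle, int target, int cpu, int prev)
{
        if (idle[target])
                return target;

        for (int level = 0; level < ndoms; level++) {
                const struct demo_domain *d = &doms[level];

                if (!d->shares_cache)   /* only search cache-sharing levels */
                        break;

                for (int i = 0; i < NCPUS; i++) {
                        if (spans(d, i) && idle[i]) {
                                target = i;
                                break;
                        }
                }

                /* stop once this level already spans both cpu and prev_cpu */
                if (spans(d, cpu) && spans(d, prev))
                        break;
        }
        return target;
}

int main(void)
{
        struct demo_domain doms[] = {
                { true,  0x03 },        /* SMT pair: cpus 0-1 */
                { true,  0x0f },        /* package:  cpus 0-3 */
                { false, 0xff },        /* node:     cpus 0-7 */
        };
        bool idle[NCPUS] = { false, false, true, false };

        printf("picked cpu %d\n", pick_idle_sibling(doms, 3, idle, 0, 0, 2));
        return 0;
}
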
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631 * Since its curr running now, convert the gran from real-time 1574 * Since its curr running now, convert the gran from real-time
1632 * to virtual-time in his units. 1575 * to virtual-time in his units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
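
The surviving branch converts the wakeup granularity into the wakee's virtual time via calc_delta_fair(), which scales roughly by NICE_0_LOAD / se->load.weight, so a lighter wakee faces a larger effective granularity. A toy calculation under that assumption (the constants are illustrative; the kernel's real nice-to-weight tables are not reproduced here):

#include <stdio.h>

#define NICE_0_LOAD 1024ULL

/* rough model of calc_delta_fair(): scale by NICE_0_LOAD / weight */
static unsigned long long vgran(unsigned long long gran_ns,
                                unsigned long long weight)
{
        if (weight == NICE_0_LOAD)
                return gran_ns;
        return gran_ns * NICE_0_LOAD / weight;
}

int main(void)
{
        unsigned long long gran = 1000000ULL;   /* 1 ms wakeup granularity */

        printf("nice-0 wakee: %llu ns\n", vgran(gran, 1024)); /* unchanged */
        printf("light wakee : %llu ns\n", vgran(gran, 512));  /* doubled   */
        printf("heavy wakee : %llu ns\n", vgran(gran, 2048)); /* halved    */
        return 0;
}
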
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
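
The active-balance rework above keeps rq->active_balance as the hand-off guard: it is set under the runqueue lock before the asynchronous cpu-stop callback is queued, and only the callback clears it, so at most one push request is outstanding per runqueue. A small pthread model of that handshake (all names are invented and error handling is elided):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;
static bool active_balance;     /* models rq->active_balance */

static void *active_balance_callback(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&rq_lock);
        /* ... re-check conditions and push tasks here ... */
        active_balance = false;         /* allow the next kick */
        pthread_mutex_unlock(&rq_lock);
        return NULL;
}

/* returns true if a new request was queued, false if one is still pending */
static bool kick_active_balance(pthread_t *tid)
{
        bool kick = false;

        pthread_mutex_lock(&rq_lock);
        if (!active_balance) {
                active_balance = true;
                kick = true;
        }
        pthread_mutex_unlock(&rq_lock);

        /* asynchronous hand-off: the caller does not wait for the callback */
        if (kick)
                pthread_create(tid, NULL, active_balance_callback, NULL);
        return kick;
}

int main(void)
{
        pthread_t tid;

        if (kick_active_balance(&tid))
                pthread_join(tid, NULL);
        printf("active_balance is now %d\n", active_balance);
        return 0;
}
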
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
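
sched_features.h remains a plain list of SCHED_FEAT(name, default) lines that other files expand with their own SCHED_FEAT definitions; those consumers are not part of this diff. The following is therefore only a generic, stand-alone sketch of that expand-the-same-list-twice technique, with invented DEMO_FEAT names:

#include <stdio.h>

/* the feature list itself, one line per feature (names invented here) */
#define DEMO_FEAT_LIST                  \
        DEMO_FEAT(GENTLE_SLEEPERS, 1)   \
        DEMO_FEAT(START_DEBIT,     1)   \
        DEMO_FEAT(LAST_BUDDY,      0)

/* expand the list once into bit positions ... */
enum {
#define DEMO_FEAT(name, def) FEAT_##name,
        DEMO_FEAT_LIST
#undef DEMO_FEAT
        NR_DEMO_FEATS
};

/* ... and once more into the default feature bitmask */
static const unsigned int default_feats =
#define DEMO_FEAT(name, def) ((def) << FEAT_##name) |
        DEMO_FEAT_LIST
#undef DEMO_FEAT
        0;

int main(void)
{
        printf("LAST_BUDDY default: %s\n",
               (default_feats >> FEAT_LAST_BUDDY) & 1 ? "on" : "off");
        return 0;
}
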
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks as never migrated */ 12 return task_cpu(p); /* IDLE tasks as never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
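
The enqueue/dequeue signatures above trade separate wakeup/head/sleep parameters for a single flags word (ENQUEUE_WAKEUP, ENQUEUE_HEAD, DEQUEUE_SLEEP). A minimal sketch of the same bool-to-bitmask conversion; ENQ_WAKEUP/ENQ_HEAD and enqueue() are invented stand-ins:

#include <stdio.h>

/* a single flags word replaces separate wakeup/head boolean parameters */
#define ENQ_WAKEUP      0x01
#define ENQ_HEAD        0x02

static void enqueue(int task, unsigned int flags)
{
        if (flags & ENQ_WAKEUP)
                printf("task %d: reset timeout on wakeup\n", task);
        printf("task %d: enqueue at %s\n", task,
               (flags & ENQ_HEAD) ? "head" : "tail");
}

int main(void)
{
        enqueue(1, ENQ_WAKEUP);                 /* was enqueue(p, 1, false) */
        enqueue(2, ENQ_WAKEUP | ENQ_HEAD);      /* was enqueue(p, 1, true)  */
        return 0;
}
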
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd7fe073c55..825a3f24ad76 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2735,3 +2735,43 @@ void __init signals_init(void)
2735{ 2735{
2736 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2736 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2737} 2737}
2738
2739#ifdef CONFIG_KGDB_KDB
2740#include <linux/kdb.h>
2741/*
2742 * kdb_send_sig_info - Allows kdb to send signals without exposing
2743 * signal internals. This function checks if the required locks are
2744 * available before calling the main signal code, to avoid kdb
2745 * deadlocks.
2746 */
2747void
2748kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2749{
2750 static struct task_struct *kdb_prev_t;
2751 int sig, new_t;
2752 if (!spin_trylock(&t->sighand->siglock)) {
2753 kdb_printf("Can't do kill command now.\n"
2754 "The sigmask lock is held somewhere else in "
2755 "kernel, try again later\n");
2756 return;
2757 }
2758 spin_unlock(&t->sighand->siglock);
2759 new_t = kdb_prev_t != t;
2760 kdb_prev_t = t;
2761 if (t->state != TASK_RUNNING && new_t) {
2762 kdb_printf("Process is not RUNNING, sending a signal from "
2763 "kdb risks deadlock\n"
2764 "on the run queue locks. "
2765 "The signal has _not_ been sent.\n"
2766 "Reissue the kill command if you want to risk "
2767 "the deadlock.\n");
2768 return;
2769 }
2770 sig = info->si_signo;
2771 if (send_sig_info(sig, info, t))
2772 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2773 sig, t->pid);
2774 else
2775 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2776}
2777#endif /* CONFIG_KGDB_KDB */
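
kdb_send_sig_info() only probes the siglock with a trylock: if the lock is busy it prints a message and backs off rather than blocking inside the debugger. The same probe-then-back-off idea in a stand-alone pthread sketch (safe_to_signal() is an invented name):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t siglock = PTHREAD_MUTEX_INITIALIZER;

/* probe the lock the way kdb_send_sig_info() does: if it cannot be taken,
 * back off instead of blocking (and possibly deadlocking) the debugger */
static int safe_to_signal(void)
{
        if (pthread_mutex_trylock(&siglock) != 0) {
                fprintf(stderr, "lock busy, try the command again later\n");
                return 0;
        }
        pthread_mutex_unlock(&siglock);
        return 1;
}

int main(void)
{
        if (safe_to_signal())
                printf("would deliver the signal now\n");
        return 0;
}
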
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..0db913a5c60f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..b4e7431e7c78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
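
cpu_stop_done is a countdown completion: each finished work decrements nr_todo and the last one fires the completion the waiter sleeps on. A userspace model of that shape using a mutex/condvar pair (demo_done and the worker names are invented; the kernel's struct completion is not reproduced here):

#include <pthread.h>
#include <stdio.h>

#define NR_WORKS 3

/* userspace model of cpu_stop_done: one waiter, NR_WORKS outstanding works */
struct demo_done {
        int nr_todo;
        pthread_mutex_t lock;
        pthread_cond_t  all_done;
};

/* each finished work calls this; the last one wakes the waiter */
static void signal_done(struct demo_done *d)
{
        pthread_mutex_lock(&d->lock);
        if (--d->nr_todo == 0)
                pthread_cond_signal(&d->all_done);
        pthread_mutex_unlock(&d->lock);
}

static void *worker(void *arg)
{
        /* ... the actual work would run here ... */
        signal_done(arg);
        return NULL;
}

int main(void)
{
        struct demo_done done = { .nr_todo = NR_WORKS };
        pthread_t t[NR_WORKS];

        pthread_mutex_init(&done.lock, NULL);
        pthread_cond_init(&done.all_done, NULL);

        for (int i = 0; i < NR_WORKS; i++)
                pthread_create(&t[i], NULL, worker, &done);

        pthread_mutex_lock(&done.lock);
        while (done.nr_todo)                    /* wait_for_completion() analogue */
                pthread_cond_wait(&done.all_done, &done.lock);
        pthread_mutex_unlock(&done.lock);
        printf("all %d works completed\n", NR_WORKS);

        for (int i = 0; i < NR_WORKS; i++)
                pthread_join(t[i], NULL);
        return 0;
}
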
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
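
For callers, stop_one_cpu() is a synchronous "run fn(arg) on that cpu and hand back its return value, or -ENOENT if the cpu was offline". A schematic use written against the signature above; it assumes built-in kernel code (whether the symbol is exported to modules is not shown in this hunk), and drain_thing()/my_state are invented names:

#include <linux/stop_machine.h>

struct my_state {
        int drained;
};

/* runs on the target cpu, highest priority, preemption disabled: no sleeping */
static int drain_thing(void *arg)
{
        struct my_state *s = arg;

        s->drained = 1;
        return 0;
}

static int drain_on(unsigned int cpu)
{
        struct my_state s = { 0 };

        /*
         * Synchronous: stop_one_cpu() only returns once drain_thing() has
         * run (or -ENOENT if @cpu was offline), so the on-stack argument
         * is safe to pass.
         */
        return stop_one_cpu(cpu, drain_thing, &s);
}
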
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, or any non-zero return value if any of them returned non-zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, or any
224 * non-zero return value if any of them returned non-zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
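
stop_cpus() and try_stop_cpus() differ only in how they treat the shared stop_cpus_mutex: the former sleeps until it owns the facility, the latter returns -EAGAIN if someone else already holds it. A schematic caller under the same assumptions as the previous sketch (poke() and poke_all_online() are invented):

#include <linux/cpumask.h>
#include <linux/stop_machine.h>

static int poke(void *arg)
{
        return 0;       /* per-cpu callback, must not sleep */
}

static int poke_all_online(int may_wait)
{
        /*
         * stop_cpus() sleeps on stop_cpus_mutex if another caller is already
         * stopping cpus; try_stop_cpus() returns -EAGAIN in that case.
         */
        if (may_wait)
                return stop_cpus(cpu_online_mask, poke, NULL);
        return try_stop_cpus(cpu_online_mask, poke, NULL);
}
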
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
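
With stop_machine_create()/stop_machine_destroy() gone, a caller now simply invokes stop_machine() and the per-cpu stopper threads do the rest. Below is a minimal, hedged caller sketch; the names patch_args, do_patch and patch_text are invented for illustration and are not part of this patch.

#include <linux/stop_machine.h>

struct patch_args {				/* hypothetical payload */
	void *addr;
	unsigned long insn;
};

/* Runs while every online CPU spins with interrupts disabled. */
static int do_patch(void *data)
{
	struct patch_args *args = data;

	*(unsigned long *)args->addr = args->insn;	/* must not sleep */
	return 0;
}

static int patch_text(void *addr, unsigned long insn)
{
	struct patch_args args = { .addr = addr, .insn = insn };

	/* NULL cpumask: do_patch() runs on the first online CPU only. */
	return stop_machine(do_patch, &args, NULL);
}

stop_machine() already wraps __stop_machine() in get_online_cpus()/put_online_cpus(), so the sketch does not have to pin CPU hotplug itself.
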
diff --git a/kernel/sys.c b/kernel/sys.c
index 7cb426a58965..0d36d889c74d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
492 return -ENOMEM; 492 return -ENOMEM;
493 old = current_cred(); 493 old = current_cred();
494 494
495 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
496 if (retval)
497 goto error;
498
499 retval = -EPERM; 495 retval = -EPERM;
500 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
501 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
543 return -ENOMEM; 539 return -ENOMEM;
544 old = current_cred(); 540 old = current_cred();
545 541
546 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
547 if (retval)
548 goto error;
549
550 retval = -EPERM; 542 retval = -EPERM;
551 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
552 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
610 return -ENOMEM; 602 return -ENOMEM;
611 old = current_cred(); 603 old = current_cred();
612 604
613 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
614 if (retval)
615 goto error;
616
617 retval = -EPERM; 605 retval = -EPERM;
618 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
619 new->uid = ruid; 607 new->uid = ruid;
@@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
675 return -ENOMEM; 663 return -ENOMEM;
676 old = current_cred(); 664 old = current_cred();
677 665
678 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
679 if (retval)
680 goto error;
681
682 retval = -EPERM; 666 retval = -EPERM;
683 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
684 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
719 if (!new) 703 if (!new)
720 return -ENOMEM; 704 return -ENOMEM;
721 705
722 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
723 if (retval)
724 goto error;
725 old = current_cred(); 706 old = current_cred();
726 707
727 retval = -EPERM; 708 retval = -EPERM;
@@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
788 return -ENOMEM; 769 return -ENOMEM;
789 old = current_cred(); 770 old = current_cred();
790 771
791 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
792 if (retval)
793 goto error;
794
795 retval = -EPERM; 772 retval = -EPERM;
796 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
797 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
851 old = current_cred(); 828 old = current_cred();
852 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
853 830
854 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
855 goto error;
856
857 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
858 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
859 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
864 } 838 }
865 } 839 }
866 840
867error:
868 abort_creds(new); 841 abort_creds(new);
869 return old_fsuid; 842 return old_fsuid;
870 843
@@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
888 old = current_cred(); 861 old = current_cred();
889 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
890 863
891 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
892 goto error;
893
894 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
895 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
896 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
900 } 870 }
901 } 871 }
902 872
903error:
904 abort_creds(new); 873 abort_creds(new);
905 return old_fsgid; 874 return old_fsgid;
906 875
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c649d1c5fe09..18821e77b2a0 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -164,6 +164,27 @@ static int proc_taint(struct ctl_table *table, int write,
164 void __user *buffer, size_t *lenp, loff_t *ppos); 164 void __user *buffer, size_t *lenp, loff_t *ppos);
165#endif 165#endif
166 166
167#ifdef CONFIG_MAGIC_SYSRQ
168static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
169
170static int sysrq_sysctl_handler(ctl_table *table, int write,
171 void __user *buffer, size_t *lenp,
172 loff_t *ppos)
173{
174 int error;
175
176 error = proc_dointvec(table, write, buffer, lenp, ppos);
177 if (error)
178 return error;
179
180 if (write)
181 sysrq_toggle_support(__sysrq_enabled);
182
183 return 0;
184}
185
186#endif
187
167static struct ctl_table root_table[]; 188static struct ctl_table root_table[];
168static struct ctl_table_root sysctl_table_root; 189static struct ctl_table_root sysctl_table_root;
169static struct ctl_table_header root_table_header = { 190static struct ctl_table_header root_table_header = {
@@ -568,7 +589,7 @@ static struct ctl_table kern_table[] = {
568 .data = &__sysrq_enabled, 589 .data = &__sysrq_enabled,
569 .maxlen = sizeof (int), 590 .maxlen = sizeof (int),
570 .mode = 0644, 591 .mode = 0644,
571 .proc_handler = proc_dointvec, 592 .proc_handler = sysrq_sysctl_handler,
572 }, 593 },
573#endif 594#endif
574#ifdef CONFIG_PROC_SYSCTL 595#ifdef CONFIG_PROC_SYSCTL
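
Since writes to /proc/sys/kernel/sysrq are now routed through sysrq_sysctl_handler(), storing the value also calls sysrq_toggle_support() immediately instead of only updating the sysctl variable. A hedged user-space illustration of the interface (plain C, nothing here is kernel code):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/proc/sys/kernel/sysrq", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* "1" enables all SysRq functions; the handler propagates it. */
	if (write(fd, "1\n", 2) != 2)
		perror("write");
	close(fd);
	return 0;
}

Values other than 0 and 1 still act as a bitmask of allowed SysRq functions; only the immediate propagation to the keyboard handler is new.
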
@@ -622,7 +643,7 @@ static struct ctl_table kern_table[] = {
622#endif 643#endif
623 { 644 {
624 .procname = "userprocess_debug", 645 .procname = "userprocess_debug",
625 .data = &sysctl_userprocess_debug, 646 .data = &show_unhandled_signals,
626 .maxlen = sizeof(int), 647 .maxlen = sizeof(int),
627 .mode = 0644, 648 .mode = 0644,
628 .proc_handler = proc_dointvec, 649 .proc_handler = proc_dointvec,
@@ -1440,7 +1461,8 @@ static struct ctl_table fs_table[] = {
1440}; 1461};
1441 1462
1442static struct ctl_table debug_table[] = { 1463static struct ctl_table debug_table[] = {
1443#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1464#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1465 defined(CONFIG_S390)
1444 { 1466 {
1445 .procname = "exception-trace", 1467 .procname = "exception-trace",
1446 .data = &show_unhandled_signals, 1468 .data = &show_unhandled_signals,
@@ -2049,8 +2071,132 @@ int proc_dostring(struct ctl_table *table, int write,
2049 buffer, lenp, ppos); 2071 buffer, lenp, ppos);
2050} 2072}
2051 2073
2074static size_t proc_skip_spaces(char **buf)
2075{
2076 size_t ret;
2077 char *tmp = skip_spaces(*buf);
2078 ret = tmp - *buf;
2079 *buf = tmp;
2080 return ret;
2081}
2082
2083static void proc_skip_char(char **buf, size_t *size, const char v)
2084{
2085 while (*size) {
2086 if (**buf != v)
2087 break;
2088 (*size)--;
2089 (*buf)++;
2090 }
2091}
2092
2093#define TMPBUFLEN 22
2094/**
2095 * proc_get_long - reads an ASCII formatted integer from a kernel buffer
2096 *
2097 * @buf - a kernel buffer
2098 * @size - size of the kernel buffer
2099 * @val - this is where the number will be stored
2100 * @neg - set to %TRUE if number is negative
2101 * @perm_tr - a vector which contains the allowed trailers
2102 * @perm_tr_len - size of the perm_tr vector
2103 * @tr - pointer to store the trailer character
2104 *
2105 * On success 0 is returned and buf and size are updated with the
2106 * number of bytes consumed. If tr is non-NULL and a trailing
2107 * character exists (size is non-zero after returning from this
2108 * function), tr is updated with the trailing character.
2109 */
2110static int proc_get_long(char **buf, size_t *size,
2111 unsigned long *val, bool *neg,
2112 const char *perm_tr, unsigned perm_tr_len, char *tr)
2113{
2114 int len;
2115 char *p, tmp[TMPBUFLEN];
2116
2117 if (!*size)
2118 return -EINVAL;
2119
2120 len = *size;
2121 if (len > TMPBUFLEN - 1)
2122 len = TMPBUFLEN - 1;
2123
2124 memcpy(tmp, *buf, len);
2125
2126 tmp[len] = 0;
2127 p = tmp;
2128 if (*p == '-' && *size > 1) {
2129 *neg = true;
2130 p++;
2131 } else
2132 *neg = false;
2133 if (!isdigit(*p))
2134 return -EINVAL;
2135
2136 *val = simple_strtoul(p, &p, 0);
2137
2138 len = p - tmp;
2139
2140 /* We don't know if the next char is whitespace, thus we may accept
2141 * invalid integers (e.g. 1234...a) or two integers instead of one
2142 * (e.g. 123...1). So let's not allow such large numbers. */
2143 if (len == TMPBUFLEN - 1)
2144 return -EINVAL;
2145
2146 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2147 return -EINVAL;
2148
2149 if (tr && (len < *size))
2150 *tr = *p;
2151
2152 *buf += len;
2153 *size -= len;
2154
2155 return 0;
2156}
2157
2158/**
2159 * proc_put_long - converts an integer to a decimal ASCII formatted string
2160 *
2161 * @buf - the user buffer
2162 * @size - the size of the user buffer
2163 * @val - the integer to be converted
2164 * @neg - sign of the number, %TRUE for negative
2165 *
2166 * On success 0 is returned and buf and size are updated with the
2167 * number of bytes written.
2168 */
2169static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2170 bool neg)
2171{
2172 int len;
2173 char tmp[TMPBUFLEN], *p = tmp;
2174
2175 sprintf(p, "%s%lu", neg ? "-" : "", val);
2176 len = strlen(tmp);
2177 if (len > *size)
2178 len = *size;
2179 if (copy_to_user(*buf, tmp, len))
2180 return -EFAULT;
2181 *size -= len;
2182 *buf += len;
2183 return 0;
2184}
2185#undef TMPBUFLEN
2052 2186
2053static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2187static int proc_put_char(void __user **buf, size_t *size, char c)
2188{
2189 if (*size) {
2190 char __user **buffer = (char __user **)buf;
2191 if (put_user(c, *buffer))
2192 return -EFAULT;
2193 (*size)--, (*buffer)++;
2194 *buf = *buffer;
2195 }
2196 return 0;
2197}
2198
2199static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2054 int *valp, 2200 int *valp,
2055 int write, void *data) 2201 int write, void *data)
2056{ 2202{
@@ -2059,33 +2205,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2059 } else { 2205 } else {
2060 int val = *valp; 2206 int val = *valp;
2061 if (val < 0) { 2207 if (val < 0) {
2062 *negp = -1; 2208 *negp = true;
2063 *lvalp = (unsigned long)-val; 2209 *lvalp = (unsigned long)-val;
2064 } else { 2210 } else {
2065 *negp = 0; 2211 *negp = false;
2066 *lvalp = (unsigned long)val; 2212 *lvalp = (unsigned long)val;
2067 } 2213 }
2068 } 2214 }
2069 return 0; 2215 return 0;
2070} 2216}
2071 2217
2218static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2219
2072static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2220static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2073 int write, void __user *buffer, 2221 int write, void __user *buffer,
2074 size_t *lenp, loff_t *ppos, 2222 size_t *lenp, loff_t *ppos,
2075 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2223 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2076 int write, void *data), 2224 int write, void *data),
2077 void *data) 2225 void *data)
2078{ 2226{
2079#define TMPBUFLEN 21 2227 int *i, vleft, first = 1, err = 0;
2080 int *i, vleft, first = 1, neg; 2228 unsigned long page = 0;
2081 unsigned long lval; 2229 size_t left;
2082 size_t left, len; 2230 char *kbuf;
2083 2231
2084 char buf[TMPBUFLEN], *p; 2232 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2085 char __user *s = buffer;
2086
2087 if (!tbl_data || !table->maxlen || !*lenp ||
2088 (*ppos && !write)) {
2089 *lenp = 0; 2233 *lenp = 0;
2090 return 0; 2234 return 0;
2091 } 2235 }
@@ -2097,89 +2241,69 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2097 if (!conv) 2241 if (!conv)
2098 conv = do_proc_dointvec_conv; 2242 conv = do_proc_dointvec_conv;
2099 2243
2244 if (write) {
2245 if (left > PAGE_SIZE - 1)
2246 left = PAGE_SIZE - 1;
2247 page = __get_free_page(GFP_TEMPORARY);
2248 kbuf = (char *) page;
2249 if (!kbuf)
2250 return -ENOMEM;
2251 if (copy_from_user(kbuf, buffer, left)) {
2252 err = -EFAULT;
2253 goto free;
2254 }
2255 kbuf[left] = 0;
2256 }
2257
2100 for (; left && vleft--; i++, first=0) { 2258 for (; left && vleft--; i++, first=0) {
2101 if (write) { 2259 unsigned long lval;
2102 while (left) { 2260 bool neg;
2103 char c;
2104 if (get_user(c, s))
2105 return -EFAULT;
2106 if (!isspace(c))
2107 break;
2108 left--;
2109 s++;
2110 }
2111 if (!left)
2112 break;
2113 neg = 0;
2114 len = left;
2115 if (len > sizeof(buf) - 1)
2116 len = sizeof(buf) - 1;
2117 if (copy_from_user(buf, s, len))
2118 return -EFAULT;
2119 buf[len] = 0;
2120 p = buf;
2121 if (*p == '-' && left > 1) {
2122 neg = 1;
2123 p++;
2124 }
2125 if (*p < '0' || *p > '9')
2126 break;
2127 2261
2128 lval = simple_strtoul(p, &p, 0); 2262 if (write) {
2263 left -= proc_skip_spaces(&kbuf);
2129 2264
2130 len = p-buf; 2265 err = proc_get_long(&kbuf, &left, &lval, &neg,
2131 if ((len < left) && *p && !isspace(*p)) 2266 proc_wspace_sep,
2267 sizeof(proc_wspace_sep), NULL);
2268 if (err)
2132 break; 2269 break;
2133 s += len; 2270 if (conv(&neg, &lval, i, 1, data)) {
2134 left -= len; 2271 err = -EINVAL;
2135
2136 if (conv(&neg, &lval, i, 1, data))
2137 break; 2272 break;
2273 }
2138 } else { 2274 } else {
2139 p = buf; 2275 if (conv(&neg, &lval, i, 0, data)) {
2276 err = -EINVAL;
2277 break;
2278 }
2140 if (!first) 2279 if (!first)
2141 *p++ = '\t'; 2280 err = proc_put_char(&buffer, &left, '\t');
2142 2281 if (err)
2143 if (conv(&neg, &lval, i, 0, data)) 2282 break;
2283 err = proc_put_long(&buffer, &left, lval, neg);
2284 if (err)
2144 break; 2285 break;
2145
2146 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2147 len = strlen(buf);
2148 if (len > left)
2149 len = left;
2150 if(copy_to_user(s, buf, len))
2151 return -EFAULT;
2152 left -= len;
2153 s += len;
2154 } 2286 }
2155 } 2287 }
2156 2288
2157 if (!write && !first && left) { 2289 if (!write && !first && left && !err)
2158 if(put_user('\n', s)) 2290 err = proc_put_char(&buffer, &left, '\n');
2159 return -EFAULT; 2291 if (write && !err)
2160 left--, s++; 2292 left -= proc_skip_spaces(&kbuf);
2161 } 2293free:
2162 if (write) { 2294 if (write) {
2163 while (left) { 2295 free_page(page);
2164 char c; 2296 if (first)
2165 if (get_user(c, s++)) 2297 return err ? : -EINVAL;
2166 return -EFAULT;
2167 if (!isspace(c))
2168 break;
2169 left--;
2170 }
2171 } 2298 }
2172 if (write && first)
2173 return -EINVAL;
2174 *lenp -= left; 2299 *lenp -= left;
2175 *ppos += *lenp; 2300 *ppos += *lenp;
2176 return 0; 2301 return err;
2177#undef TMPBUFLEN
2178} 2302}
2179 2303
2180static int do_proc_dointvec(struct ctl_table *table, int write, 2304static int do_proc_dointvec(struct ctl_table *table, int write,
2181 void __user *buffer, size_t *lenp, loff_t *ppos, 2305 void __user *buffer, size_t *lenp, loff_t *ppos,
2182 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2306 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2183 int write, void *data), 2307 int write, void *data),
2184 void *data) 2308 void *data)
2185{ 2309{
@@ -2247,8 +2371,8 @@ struct do_proc_dointvec_minmax_conv_param {
2247 int *max; 2371 int *max;
2248}; 2372};
2249 2373
2250static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2374static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2251 int *valp, 2375 int *valp,
2252 int write, void *data) 2376 int write, void *data)
2253{ 2377{
2254 struct do_proc_dointvec_minmax_conv_param *param = data; 2378 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2261,10 +2385,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2261 } else { 2385 } else {
2262 int val = *valp; 2386 int val = *valp;
2263 if (val < 0) { 2387 if (val < 0) {
2264 *negp = -1; 2388 *negp = true;
2265 *lvalp = (unsigned long)-val; 2389 *lvalp = (unsigned long)-val;
2266 } else { 2390 } else {
2267 *negp = 0; 2391 *negp = false;
2268 *lvalp = (unsigned long)val; 2392 *lvalp = (unsigned long)val;
2269 } 2393 }
2270 } 2394 }
@@ -2304,102 +2428,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2304 unsigned long convmul, 2428 unsigned long convmul,
2305 unsigned long convdiv) 2429 unsigned long convdiv)
2306{ 2430{
2307#define TMPBUFLEN 21 2431 unsigned long *i, *min, *max;
2308 unsigned long *i, *min, *max, val; 2432 int vleft, first = 1, err = 0;
2309 int vleft, first=1, neg; 2433 unsigned long page = 0;
2310 size_t len, left; 2434 size_t left;
2311 char buf[TMPBUFLEN], *p; 2435 char *kbuf;
2312 char __user *s = buffer; 2436
2313 2437 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2314 if (!data || !table->maxlen || !*lenp ||
2315 (*ppos && !write)) {
2316 *lenp = 0; 2438 *lenp = 0;
2317 return 0; 2439 return 0;
2318 } 2440 }
2319 2441
2320 i = (unsigned long *) data; 2442 i = (unsigned long *) data;
2321 min = (unsigned long *) table->extra1; 2443 min = (unsigned long *) table->extra1;
2322 max = (unsigned long *) table->extra2; 2444 max = (unsigned long *) table->extra2;
2323 vleft = table->maxlen / sizeof(unsigned long); 2445 vleft = table->maxlen / sizeof(unsigned long);
2324 left = *lenp; 2446 left = *lenp;
2325 2447
2448 if (write) {
2449 if (left > PAGE_SIZE - 1)
2450 left = PAGE_SIZE - 1;
2451 page = __get_free_page(GFP_TEMPORARY);
2452 kbuf = (char *) page;
2453 if (!kbuf)
2454 return -ENOMEM;
2455 if (copy_from_user(kbuf, buffer, left)) {
2456 err = -EFAULT;
2457 goto free;
2458 }
2459 kbuf[left] = 0;
2460 }
2461
2326 for (; left && vleft--; i++, min++, max++, first=0) { 2462 for (; left && vleft--; i++, min++, max++, first=0) {
2463 unsigned long val;
2464
2327 if (write) { 2465 if (write) {
2328 while (left) { 2466 bool neg;
2329 char c; 2467
2330 if (get_user(c, s)) 2468 left -= proc_skip_spaces(&kbuf);
2331 return -EFAULT; 2469
2332 if (!isspace(c)) 2470 err = proc_get_long(&kbuf, &left, &val, &neg,
2333 break; 2471 proc_wspace_sep,
2334 left--; 2472 sizeof(proc_wspace_sep), NULL);
2335 s++; 2473 if (err)
2336 }
2337 if (!left)
2338 break;
2339 neg = 0;
2340 len = left;
2341 if (len > TMPBUFLEN-1)
2342 len = TMPBUFLEN-1;
2343 if (copy_from_user(buf, s, len))
2344 return -EFAULT;
2345 buf[len] = 0;
2346 p = buf;
2347 if (*p == '-' && left > 1) {
2348 neg = 1;
2349 p++;
2350 }
2351 if (*p < '0' || *p > '9')
2352 break;
2353 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2354 len = p-buf;
2355 if ((len < left) && *p && !isspace(*p))
2356 break; 2474 break;
2357 if (neg) 2475 if (neg)
2358 val = -val;
2359 s += len;
2360 left -= len;
2361
2362 if(neg)
2363 continue; 2476 continue;
2364 if ((min && val < *min) || (max && val > *max)) 2477 if ((min && val < *min) || (max && val > *max))
2365 continue; 2478 continue;
2366 *i = val; 2479 *i = val;
2367 } else { 2480 } else {
2368 p = buf; 2481 val = convdiv * (*i) / convmul;
2369 if (!first) 2482 if (!first)
2370 *p++ = '\t'; 2483 err = proc_put_char(&buffer, &left, '\t');
2371 sprintf(p, "%lu", convdiv * (*i) / convmul); 2484 err = proc_put_long(&buffer, &left, val, false);
2372 len = strlen(buf); 2485 if (err)
2373 if (len > left) 2486 break;
2374 len = left;
2375 if(copy_to_user(s, buf, len))
2376 return -EFAULT;
2377 left -= len;
2378 s += len;
2379 } 2487 }
2380 } 2488 }
2381 2489
2382 if (!write && !first && left) { 2490 if (!write && !first && left && !err)
2383 if(put_user('\n', s)) 2491 err = proc_put_char(&buffer, &left, '\n');
2384 return -EFAULT; 2492 if (write && !err)
2385 left--, s++; 2493 left -= proc_skip_spaces(&kbuf);
2386 } 2494free:
2387 if (write) { 2495 if (write) {
2388 while (left) { 2496 free_page(page);
2389 char c; 2497 if (first)
2390 if (get_user(c, s++)) 2498 return err ? : -EINVAL;
2391 return -EFAULT;
2392 if (!isspace(c))
2393 break;
2394 left--;
2395 }
2396 } 2499 }
2397 if (write && first)
2398 return -EINVAL;
2399 *lenp -= left; 2500 *lenp -= left;
2400 *ppos += *lenp; 2501 *ppos += *lenp;
2401 return 0; 2502 return err;
2402#undef TMPBUFLEN
2403} 2503}
2404 2504
2405static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2505static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2460,7 +2560,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2460} 2560}
2461 2561
2462 2562
2463static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2563static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2464 int *valp, 2564 int *valp,
2465 int write, void *data) 2565 int write, void *data)
2466{ 2566{
@@ -2472,10 +2572,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2472 int val = *valp; 2572 int val = *valp;
2473 unsigned long lval; 2573 unsigned long lval;
2474 if (val < 0) { 2574 if (val < 0) {
2475 *negp = -1; 2575 *negp = true;
2476 lval = (unsigned long)-val; 2576 lval = (unsigned long)-val;
2477 } else { 2577 } else {
2478 *negp = 0; 2578 *negp = false;
2479 lval = (unsigned long)val; 2579 lval = (unsigned long)val;
2480 } 2580 }
2481 *lvalp = lval / HZ; 2581 *lvalp = lval / HZ;
@@ -2483,7 +2583,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2483 return 0; 2583 return 0;
2484} 2584}
2485 2585
2486static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2586static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2487 int *valp, 2587 int *valp,
2488 int write, void *data) 2588 int write, void *data)
2489{ 2589{
@@ -2495,10 +2595,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2495 int val = *valp; 2595 int val = *valp;
2496 unsigned long lval; 2596 unsigned long lval;
2497 if (val < 0) { 2597 if (val < 0) {
2498 *negp = -1; 2598 *negp = true;
2499 lval = (unsigned long)-val; 2599 lval = (unsigned long)-val;
2500 } else { 2600 } else {
2501 *negp = 0; 2601 *negp = false;
2502 lval = (unsigned long)val; 2602 lval = (unsigned long)val;
2503 } 2603 }
2504 *lvalp = jiffies_to_clock_t(lval); 2604 *lvalp = jiffies_to_clock_t(lval);
@@ -2506,7 +2606,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2506 return 0; 2606 return 0;
2507} 2607}
2508 2608
2509static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2609static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2510 int *valp, 2610 int *valp,
2511 int write, void *data) 2611 int write, void *data)
2512{ 2612{
@@ -2516,10 +2616,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2516 int val = *valp; 2616 int val = *valp;
2517 unsigned long lval; 2617 unsigned long lval;
2518 if (val < 0) { 2618 if (val < 0) {
2519 *negp = -1; 2619 *negp = true;
2520 lval = (unsigned long)-val; 2620 lval = (unsigned long)-val;
2521 } else { 2621 } else {
2522 *negp = 0; 2622 *negp = false;
2523 lval = (unsigned long)val; 2623 lval = (unsigned long)val;
2524 } 2624 }
2525 *lvalp = jiffies_to_msecs(lval); 2625 *lvalp = jiffies_to_msecs(lval);
@@ -2616,6 +2716,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2616 return 0; 2716 return 0;
2617} 2717}
2618 2718
2719/**
2720 * proc_do_large_bitmap - read/write from/to a large bitmap
2721 * @table: the sysctl table
2722 * @write: %TRUE if this is a write to the sysctl file
2723 * @buffer: the user buffer
2724 * @lenp: the size of the user buffer
2725 * @ppos: file position
2726 *
2727 * The bitmap is stored at table->data and the bitmap length (in bits)
2728 * in table->maxlen.
2729 *
2730 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2731 * large bitmaps may be represented in a compact manner. Writing into
2732 * the file will clear the bitmap then update it with the given input.
2733 *
2734 * Returns 0 on success.
2735 */
2736int proc_do_large_bitmap(struct ctl_table *table, int write,
2737 void __user *buffer, size_t *lenp, loff_t *ppos)
2738{
2739 int err = 0;
2740 bool first = 1;
2741 size_t left = *lenp;
2742 unsigned long bitmap_len = table->maxlen;
2743 unsigned long *bitmap = (unsigned long *) table->data;
2744 unsigned long *tmp_bitmap = NULL;
2745 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2746
2747 if (!bitmap_len || !left || (*ppos && !write)) {
2748 *lenp = 0;
2749 return 0;
2750 }
2751
2752 if (write) {
2753 unsigned long page = 0;
2754 char *kbuf;
2755
2756 if (left > PAGE_SIZE - 1)
2757 left = PAGE_SIZE - 1;
2758
2759 page = __get_free_page(GFP_TEMPORARY);
2760 kbuf = (char *) page;
2761 if (!kbuf)
2762 return -ENOMEM;
2763 if (copy_from_user(kbuf, buffer, left)) {
2764 free_page(page);
2765 return -EFAULT;
2766 }
2767 kbuf[left] = 0;
2768
2769 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2770 GFP_KERNEL);
2771 if (!tmp_bitmap) {
2772 free_page(page);
2773 return -ENOMEM;
2774 }
2775 proc_skip_char(&kbuf, &left, '\n');
2776 while (!err && left) {
2777 unsigned long val_a, val_b;
2778 bool neg;
2779
2780 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2781 sizeof(tr_a), &c);
2782 if (err)
2783 break;
2784 if (val_a >= bitmap_len || neg) {
2785 err = -EINVAL;
2786 break;
2787 }
2788
2789 val_b = val_a;
2790 if (left) {
2791 kbuf++;
2792 left--;
2793 }
2794
2795 if (c == '-') {
2796 err = proc_get_long(&kbuf, &left, &val_b,
2797 &neg, tr_b, sizeof(tr_b),
2798 &c);
2799 if (err)
2800 break;
2801 if (val_b >= bitmap_len || neg ||
2802 val_a > val_b) {
2803 err = -EINVAL;
2804 break;
2805 }
2806 if (left) {
2807 kbuf++;
2808 left--;
2809 }
2810 }
2811
2812 while (val_a <= val_b)
2813 set_bit(val_a++, tmp_bitmap);
2814
2815 first = 0;
2816 proc_skip_char(&kbuf, &left, '\n');
2817 }
2818 free_page(page);
2819 } else {
2820 unsigned long bit_a, bit_b = 0;
2821
2822 while (left) {
2823 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2824 if (bit_a >= bitmap_len)
2825 break;
2826 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2827 bit_a + 1) - 1;
2828
2829 if (!first) {
2830 err = proc_put_char(&buffer, &left, ',');
2831 if (err)
2832 break;
2833 }
2834 err = proc_put_long(&buffer, &left, bit_a, false);
2835 if (err)
2836 break;
2837 if (bit_a != bit_b) {
2838 err = proc_put_char(&buffer, &left, '-');
2839 if (err)
2840 break;
2841 err = proc_put_long(&buffer, &left, bit_b, false);
2842 if (err)
2843 break;
2844 }
2845
2846 first = 0; bit_b++;
2847 }
2848 if (!err)
2849 err = proc_put_char(&buffer, &left, '\n');
2850 }
2851
2852 if (!err) {
2853 if (write) {
2854 if (*ppos)
2855 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2856 else
2857 memcpy(bitmap, tmp_bitmap,
2858 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2859 }
2860 kfree(tmp_bitmap);
2861 *lenp -= left;
2862 *ppos += *lenp;
2863 return 0;
2864 } else {
2865 kfree(tmp_bitmap);
2866 return err;
2867 }
2868}
2869
2619#else /* CONFIG_PROC_FS */ 2870#else /* CONFIG_PROC_FS */
2620 2871
2621int proc_dostring(struct ctl_table *table, int write, 2872int proc_dostring(struct ctl_table *table, int write,
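
The old per-element copy_from_user() parsing in __do_proc_dointvec() is replaced by proc_skip_spaces()/proc_get_long()/proc_put_long() operating on a page-sized kernel copy of the write. The stand-alone user-space analog below only illustrates the parsing contract of proc_get_long() (optional sign, digits, then an optional trailer that must come from a permitted set); it is a sketch, not the kernel code, and it omits the kernel's TMPBUFLEN digit cap.

#include <ctype.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int get_long(char **buf, size_t *size, unsigned long *val, bool *neg,
		    const char *perm_tr, unsigned perm_tr_len, char *tr)
{
	char *p = *buf, *end;

	if (!*size)
		return -EINVAL;

	*neg = (*p == '-' && *size > 1);
	if (*neg)
		p++;
	if (!isdigit((unsigned char)*p))
		return -EINVAL;

	*val = strtoul(p, &end, 0);

	/* A trailing character is only legal if it is in the permitted set. */
	if ((size_t)(end - *buf) < *size && perm_tr_len &&
	    !memchr(perm_tr, *end, perm_tr_len))
		return -EINVAL;
	if (tr && (size_t)(end - *buf) < *size)
		*tr = *end;

	*size -= end - *buf;
	*buf = end;
	return 0;
}

int main(void)
{
	char input[] = "123 456\n", *p = input;
	size_t left = strlen(input);
	unsigned long v;
	bool neg;
	char tr;

	while (!get_long(&p, &left, &v, &neg, " \n", 2, &tr)) {
		printf("%s%lu\n", neg ? "-" : "", v);
		if (left) {		/* consume the single trailer char */
			p++;
			left--;
		}
	}
	return 0;
}

proc_do_large_bitmap() builds on the same helper, with '-', ',' and '\n' as the permitted trailers, to parse range lists such as 1,3-4,10.
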
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 59030570f5ca..937d31dc8566 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -224,7 +224,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
227 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
228 {} 227 {}
229}; 228};
230 229
diff --git a/kernel/time.c b/kernel/time.c
index 656dccfe1cbb..50612faa9baf 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 write_seqlock_irq(&xtime_lock); 135 struct timespec delta, adjust;
136 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136 delta.tv_sec = sys_tz.tz_minuteswest * 60;
137 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 delta.tv_nsec = 0;
138 update_xtime_cache(0); 138 adjust = timespec_add_safe(current_kernel_time(), delta);
139 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
140 clock_was_set();
141} 140}
142 141
143/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (i.e. clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @cs: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz() helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
628/** 676/**
629 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 678 * @t: clocksource to be registered
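
A driver written against the new interface only supplies its counter frequency; mult and shift are derived by clocks_calc_mult_shift() with MAX_UPDATE_LENGTH as the bound. A hedged driver sketch follows; the device name "foo", its MMIO counter register and the 13.5 MHz rate are invented for illustration.

#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

static void __iomem *foo_counter_reg;	/* hypothetical free-running counter */

static cycle_t foo_read(struct clocksource *cs)
{
	return (cycle_t)readl(foo_counter_reg);
}

static struct clocksource foo_cs = {
	.name	= "foo",
	.rating	= 300,
	.read	= foo_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init foo_cs_init(void)
{
	/* mult/shift are computed internally from the 13.5 MHz rate. */
	return clocksource_register_hz(&foo_cs, 13500000);
}
device_initcall(foo_cs_init);

clocksource_register_khz() is the same call with a scale of 1000 for drivers that know their rate in kHz.
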
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/**
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
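
Both accessors now funnel through update_ts_time_stats(), so a reader always sees statistics that are current as of the ktime_get() it triggered. A hedged sketch of an in-kernel consumer; the function name report_cpu_sleep_times is made up for illustration.

#include <linux/kernel.h>
#include <linux/tick.h>

static void report_cpu_sleep_times(int cpu)
{
	u64 now_us, idle_us, iowait_us;

	idle_us = get_cpu_idle_time_us(cpu, &now_us);
	iowait_us = get_cpu_iowait_time_us(cpu, &now_us);

	if (idle_us == (u64)-1)			/* NOHZ not enabled */
		return;

	pr_info("cpu%d: idle %llu us, iowait %llu us (as of %llu us)\n",
		cpu, (unsigned long long)idle_us,
		(unsigned long long)iowait_us, (unsigned long long)now_us);
}

As the comment above notes, the idle time deliberately includes iowait time, so iowait_us is a subset of idle_us rather than an addition to it.
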
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 39f6177fafac..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -788,7 +777,6 @@ void update_wall_time(void)
788{ 777{
789 struct clocksource *clock; 778 struct clocksource *clock;
790 cycle_t offset; 779 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 780 int shift = 0, maxshift;
793 781
794 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -847,7 +835,9 @@ void update_wall_time(void)
847 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
848 } 836 }
849 837
850 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
851 * add the remainder to the error difference. 841 * add the remainder to the error difference.
852 */ 842 */
853 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -855,8 +845,15 @@ void update_wall_time(void)
855 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
856 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
857 847
858 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
859 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
860 857
861 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -896,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
896 893
897unsigned long get_seconds(void) 894unsigned long get_seconds(void)
898{ 895{
899 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
900} 897}
901EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
902 899
903struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
904{ 901{
905 return xtime_cache; 902 return xtime;
906} 903}
907 904
908struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -913,7 +910,7 @@ struct timespec current_kernel_time(void)
913 do { 910 do {
914 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
915 912
916 now = xtime_cache; 913 now = xtime;
917 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
918 915
919 return now; 916 return now;
@@ -928,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
928 do { 925 do {
929 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
930 927
931 now = xtime_cache; 928 now = xtime;
932 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
933 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
934 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..9199f3c52215 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -319,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
319} 319}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of slack, in jiffies, that a certain timer is
327 * allowed. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
322 340
323static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
324 struct timer_list *timer) 342 struct timer_list *timer)
@@ -550,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
550{ 568{
551 timer->entry.next = NULL; 569 timer->entry.next = NULL;
552 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
553#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
554 timer->start_site = NULL; 573 timer->start_site = NULL;
555 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -715,6 +734,41 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
715} 734}
716EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
717 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) calculate the highest bit where the expires and new max are different
743 * 3) use this bit to make a mask
744 * 4) use the bitmask to round down the maximum time, so that all last
745 * bits are zeros
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires + timer->slack;
754
755 if (timer->slack < 0) /* auto slack: use 0.4% */
756 expires_limit = expires + (expires - jiffies)/256;
757
758 mask = expires ^ expires_limit;
759
760 if (mask == 0)
761 return expires;
762
763 bit = find_last_bit(&mask, BITS_PER_LONG);
764
765 mask = (1 << bit) - 1;
766
767 expires_limit = expires_limit & ~(mask);
768
769 return expires_limit;
770}
771
718/** 772/**
719 * mod_timer - modify a timer's timeout 773 * mod_timer - modify a timer's timeout
720 * @timer: the timer to be modified 774 * @timer: the timer to be modified
@@ -745,6 +799,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
745 if (timer_pending(timer) && timer->expires == expires) 799 if (timer_pending(timer) && timer->expires == expires)
746 return 1; 800 return 1;
747 801
802 expires = apply_slack(timer, expires);
803
748 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 804 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
749} 805}
750EXPORT_SYMBOL(mod_timer); 806EXPORT_SYMBOL(mod_timer);
@@ -955,6 +1011,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
955 return index; 1011 return index;
956} 1012}
957 1013
1014static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1015 unsigned long data)
1016{
1017 int preempt_count = preempt_count();
1018
1019#ifdef CONFIG_LOCKDEP
1020 /*
1021 * It is permissible to free the timer from inside the
1022 * function that is called from it, this we need to take into
1023 * account for lockdep too. To avoid bogus "held lock freed"
1024 * warnings as well as problems when looking into
1025 * timer->lockdep_map, make a copy and use that here.
1026 */
1027 struct lockdep_map lockdep_map = timer->lockdep_map;
1028#endif
1029 /*
1030 * Couple the lock chain with the lock chain at
1031 * del_timer_sync() by acquiring the lock_map around the fn()
1032 * call here and in del_timer_sync().
1033 */
1034 lock_map_acquire(&lockdep_map);
1035
1036 trace_timer_expire_entry(timer);
1037 fn(data);
1038 trace_timer_expire_exit(timer);
1039
1040 lock_map_release(&lockdep_map);
1041
1042 if (preempt_count != preempt_count()) {
1043 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1044 fn, preempt_count, preempt_count());
1045 /*
1046 * Restore the preempt count. That gives us a decent
1047 * chance to survive and extract information. If the
1048 * callback kept a lock held, bad luck, but not worse
1049 * than the BUG() we had.
1050 */
1051 preempt_count() = preempt_count;
1052 }
1053}
1054
958#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1055#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
959 1056
960/** 1057/**
@@ -998,45 +1095,7 @@ static inline void __run_timers(struct tvec_base *base)
998 detach_timer(timer, 1); 1095 detach_timer(timer, 1);
999 1096
1000 spin_unlock_irq(&base->lock); 1097 spin_unlock_irq(&base->lock);
1001 { 1098 call_timer_fn(timer, fn, data);
1002 int preempt_count = preempt_count();
1003
1004#ifdef CONFIG_LOCKDEP
1005 /*
1006 * It is permissible to free the timer from
1007 * inside the function that is called from
1008 * it, this we need to take into account for
1009 * lockdep too. To avoid bogus "held lock
1010 * freed" warnings as well as problems when
1011 * looking into timer->lockdep_map, make a
1012 * copy and use that here.
1013 */
1014 struct lockdep_map lockdep_map =
1015 timer->lockdep_map;
1016#endif
1017 /*
1018 * Couple the lock chain with the lock chain at
1019 * del_timer_sync() by acquiring the lock_map
1020 * around the fn() call here and in
1021 * del_timer_sync().
1022 */
1023 lock_map_acquire(&lockdep_map);
1024
1025 trace_timer_expire_entry(timer);
1026 fn(data);
1027 trace_timer_expire_exit(timer);
1028
1029 lock_map_release(&lockdep_map);
1030
1031 if (preempt_count != preempt_count()) {
1032 printk(KERN_ERR "huh, entered %p "
1033 "with preempt_count %08x, exited"
1034 " with %08x?\n",
1035 fn, preempt_count,
1036 preempt_count());
1037 BUG();
1038 }
1039 }
1040 spin_lock_irq(&base->lock); 1099 spin_lock_irq(&base->lock);
1041 } 1100 }
1042 } 1101 }
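
The effect of apply_slack() can be reproduced in plain user space: find the highest bit at which expires and expires+slack differ, then clear every bit below it so nearby timers collapse onto one wakeup. The demo below only mirrors that arithmetic; it is not the kernel function.

#include <stdio.h>

static unsigned long apply_slack_demo(unsigned long jiffies_now,
				      unsigned long expires, long slack)
{
	unsigned long expires_limit, mask;
	int bit;

	expires_limit = expires + slack;
	if (slack < 0)		/* auto slack: ~0.4% of the remaining delay */
		expires_limit = expires + (expires - jiffies_now) / 256;

	mask = expires ^ expires_limit;
	if (!mask)
		return expires;

	/* highest bit at which expires and the allowed maximum differ */
	bit = (int)(sizeof(long) * 8) - 1 - __builtin_clzl(mask);
	mask = (1UL << bit) - 1;

	return expires_limit & ~mask;
}

int main(void)
{
	/* e.g. a timer 10000 jiffies out with the default auto slack of -1 */
	unsigned long now = 1000000, expires = now + 10000;

	printf("%lu -> %lu\n", expires, apply_slack_demo(now, expires, -1));
	return 0;
}

Here the deadline is rounded from 1010000 up to 1010016, well inside the 39-jiffy auto slack, so timers that would have expired a few jiffies apart can share a single interrupt.
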
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -374,14 +371,6 @@ config STACK_TRACER
374 371
375 Say N if unsure. 372 Say N if unsure.
376 373
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE 374config KMEMTRACE
386 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
387 select GENERIC_TRACER 376 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..32837e19e3bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
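
Editor's note: the profiler now keeps a sum of squared run times (time_squared) next to the existing sum, and derives the sample variance as s^2 = (sum of squares - n * avg^2) / (n - 1). Despite the local variable being named stddev, the value printed under the new "s^2" column is the variance, divided by 1000 here and again by trace_print_graph_duration(). A small standalone sketch of the same running-sums arithmetic, with hypothetical sample data in nanoseconds:

#include <stdio.h>

int main(void)
{
        unsigned long long times[] = { 1200, 1500, 900, 1400 };  /* ns, made up */
        unsigned long long n = 0, sum = 0, sum_sq = 0;

        for (size_t i = 0; i < sizeof(times) / sizeof(times[0]); i++) {
                n++;
                sum += times[i];                   /* like rec->time         */
                sum_sq += times[i] * times[i];     /* like rec->time_squared */
        }

        unsigned long long avg = sum / n;
        /* Sample variance: s^2 = (sum of squares - n * avg^2) / (n - 1) */
        unsigned long long s2 = (sum_sq - n * avg * avg) / (n - 1);

        printf("avg = %llu ns, s^2 = %llu ns^2\n", avg, s2);
        return 0;
}

As in the kernel version, the integer average truncates, which slightly biases the result; that is acceptable for a profiling summary.
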
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -3212,8 +3234,7 @@ free:
3212} 3234}
3213 3235
3214static void 3236static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next)
3216 struct task_struct *next)
3217{ 3238{
3218 unsigned long long timestamp; 3239 unsigned long long timestamp;
3219 int index; 3240 int index;
@@ -3339,11 +3360,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3360 goto out;
3340 3361
3341 ftrace_graph_active--; 3362 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3363 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3364 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3365 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3366 unregister_pm_notifier(&ftrace_suspend_notifier);
3367 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3347 3368
3348 out: 3369 out:
3349 mutex_unlock(&ftrace_lock); 3370 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
443 local_t commit_overrun; 457 local_t commit_overrun;
444 local_t overrun; 458 local_t overrun;
445 local_t entries; 459 local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1763 1777
1764 /* 1778 /*
1779 * Save the original length to the meta data.
1780 * This will be used by the reader to add lost event
1781 * counter.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1765 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1980{ 2001{
1981 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2003 int ret;
1984 2004
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2010
1994 /* 2011 /*
1995 * The delta is too big, we to add a 2012 * The delta is too big, we to add a
@@ -2838,6 +2855,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2856{
2840 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2841 unsigned long flags; 2859 unsigned long flags;
2842 int nr_loops = 0; 2860 int nr_loops = 0;
2843 int ret; 2861 int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2882 2901
2883 spin: 2902 spin:
2884 /* 2903 /*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2919
2901 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2902 * Here's the tricky part. 2933 * Here's the tricky part.
2903 * 2934 *
2904 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2931 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2932 goto again; 2968 goto again;
2933 2969
2934 out: 2970 out:
@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
3006} 3042}
3007 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
3008static struct ring_buffer_event * 3049static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
3010{ 3052{
3011 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3060 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3105 return event;
3062 3106
3063 default: 3107 default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
3169 * @cpu: The cpu to peak at 3213 * @cpu: The cpu to peak at
3170 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
3215 * @lost_events: a variable to store if events were lost (may be NULL)
3171 * 3216 *
3172 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3173 * not consume the data. 3218 * not consume the data.
3174 */ 3219 */
3175struct ring_buffer_event * 3220struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3177{ 3223{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3234 local_irq_save(flags);
3189 if (dolock) 3235 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3240 if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3276/**
3231 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
3281 * @lost_events: a variable to store if events were lost (may be NULL)
3233 * 3282 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3286 */
3238struct ring_buffer_event * 3287struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3240{ 3290{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3307 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3259 3309
3260 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3263 3315
3264 if (dolock) 3316 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3329
3278/** 3330/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3282 * 3334 *
3283 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3338 *
 3288 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3289 */ 3349 */
3290struct ring_buffer_iter * 3350struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3352{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3355
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3357 return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3307 3366
3308 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
 3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3309 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3310 3408
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3414}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3416
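
Editor's note: the single ring_buffer_read_start() entry point is split into a prepare/sync/start triple so that a caller opening iterators for many CPUs pays for synchronize_sched() once rather than once per CPU. The sketch below assumes a kernel build context and omits error handling (ring_buffer_read_start() now tolerates a NULL iterator, per the hunk above); it mirrors the sequence __tracing_open() adopts later in this diff.

#include <linux/cpumask.h>
#include <linux/ring_buffer.h>

static struct ring_buffer_iter *iters[NR_CPUS];

static void open_all_iterators(struct ring_buffer *buffer)
{
        int cpu;

        /* Step 1: allocate and prepare an iterator per CPU. */
        for_each_online_cpu(cpu)
                iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

        /* Step 2: one synchronize_sched() covers the whole batch. */
        ring_buffer_read_prepare_sync();

        /* Step 3: finish starting each iterator. */
        for_each_online_cpu(cpu)
                ring_buffer_read_start(iters[cpu]);
}
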
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3410 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3411 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3412} 3511}
3413 3512
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3686 unsigned long flags; 3786 unsigned long flags;
3687 unsigned int commit; 3787 unsigned int commit;
3688 unsigned int read; 3788 unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3819 read = reader->read;
3720 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3721 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3722 /* 3825 /*
3723 * If this page has been partially read or 3826 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3780 reader->read = 0; 3883 reader->read = 0;
3781 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
3887 * Use the real_end for the data size,
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3782 } 3893 }
3783 ret = read; 3894 ret = read;
3784 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3785 out_unlock: 3914 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3916
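
Editor's note: with the lost-event accounting above, a page returned by ring_buffer_read_page() can carry two flag bits in the upper end of its commit field: RB_MISSED_EVENTS marks that events were overwritten, and RB_MISSED_STORED marks that the missed count was appended right after the event data. A reader must mask the flags off before treating commit as a length. The following is a simplified, userspace-style sketch of the decode; the struct layout is abbreviated and only the flag values come from the hunk above.

#include <stdio.h>
#include <string.h>

#define RB_MISSED_EVENTS  (1UL << 31)   /* events were overwritten            */
#define RB_MISSED_STORED  (1UL << 30)   /* missed count stored after the data */
#define RB_FLAG_MASK      (RB_MISSED_EVENTS | RB_MISSED_STORED)

struct data_page {                      /* abbreviated buffer_data_page layout */
        unsigned long long time_stamp;
        unsigned long commit;           /* length, possibly with flag bits set */
        unsigned char data[4080];
};

static void decode(const struct data_page *bpage)
{
        unsigned long commit = bpage->commit & ~RB_FLAG_MASK;   /* real length */

        if (bpage->commit & RB_MISSED_EVENTS) {
                unsigned long missed = 0;        /* 0 means "count not stored" */

                if (bpage->commit & RB_MISSED_STORED)
                        memcpy(&missed, &bpage->data[commit], sizeof(missed));
                fprintf(stderr, "lost %lu events before this page\n", missed);
        }

        printf("%lu bytes of event data\n", commit);
}

int main(void)
{
        static struct data_page page;   /* zeroed: no data, no flags */

        decode(&page);
        return 0;
}
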
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
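
Editor's note: the benchmark simply passes NULL for the new lost_events argument and masks the flag bits off the commit value. A caller that does care about drops can supply a counter instead; a kernel-context sketch against the new ring_buffer_consume() signature, with buffer and cpu assumed to come from the caller:

#include <linux/kernel.h>
#include <linux/ring_buffer.h>

static void drain_cpu(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        unsigned long lost;
        u64 ts;

        while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost))) {
                if (lost)
                        printk(KERN_WARNING "cpu %d: lost %lu events\n",
                               cpu, lost);
                /* process ring_buffer_event_data(event) here */
        }
}
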
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b155a0e6f31..8a76339a9e65 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
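
Editor's note: the boot handler now distinguishes a bare "ftrace_dump_on_oops" (dump every CPU) from "ftrace_dump_on_oops=orig_cpu" (dump only the CPU that oopsed). With __setup() the handler receives only what follows the option name, so the first character is either '\0' or '='. The generic sketch below shows that idiom for a hypothetical option named myopt and mirrors the parsing above; it is illustrative, not part of the patch.

#include <linux/init.h>
#include <linux/string.h>

static int my_mode;     /* 0 = off, 1 = all CPUs, 2 = originating CPU only */

static int __init set_my_mode(char *str)
{
        if (*str++ != '=' || !*str) {   /* "myopt" with no value */
                my_mode = 1;
                return 1;
        }
        if (!strcmp(str, "orig_cpu")) { /* "myopt=orig_cpu" */
                my_mode = 2;
                return 1;
        }
        return 0;                       /* unknown value: reject */
}
__setup("myopt", set_my_mode);
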
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1995,7 +2017,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
1996} 2018}
1997 2019
1998static int trace_empty(struct trace_iterator *iter) 2020int trace_empty(struct trace_iterator *iter)
1999{ 2021{
2000 int cpu; 2022 int cpu;
2001 2023
@@ -2030,6 +2052,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2052{
2031 enum print_line_t ret; 2053 enum print_line_t ret;
2032 2054
2055 if (iter->lost_events)
2056 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2057 iter->cpu, iter->lost_events);
2058
2033 if (iter->trace && iter->trace->print_line) { 2059 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2060 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2061 if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2084,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2084 return print_trace_fmt(iter);
2059} 2085}
2060 2086
2087void trace_default_header(struct seq_file *m)
2088{
2089 struct trace_iterator *iter = m->private;
2090
2091 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2092 /* print nothing if the buffers are empty */
2093 if (trace_empty(iter))
2094 return;
2095 print_trace_header(m, iter);
2096 if (!(trace_flags & TRACE_ITER_VERBOSE))
2097 print_lat_help_header(m);
2098 } else {
2099 if (!(trace_flags & TRACE_ITER_VERBOSE))
2100 print_func_help_header(m);
2101 }
2102}
2103
2061static int s_show(struct seq_file *m, void *v) 2104static int s_show(struct seq_file *m, void *v)
2062{ 2105{
2063 struct trace_iterator *iter = v; 2106 struct trace_iterator *iter = v;
@@ -2070,17 +2113,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2113 }
2071 if (iter->trace && iter->trace->print_header) 2114 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2115 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2116 else
2074 /* print nothing if the buffers are empty */ 2117 trace_default_header(m);
2075 if (trace_empty(iter)) 2118
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2119 } else if (iter->leftover) {
2085 /* 2120 /*
2086 * If we filled the seq_file buffer earlier, we 2121 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2201,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2201
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2202 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2203 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2204 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2205 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2206 }
2207 ring_buffer_read_prepare_sync();
2208 for_each_tracing_cpu(cpu) {
2209 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2210 tracing_iter_reset(iter, cpu);
2173 } 2211 }
2174 } else { 2212 } else {
2175 cpu = iter->cpu_file; 2213 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2214 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2215 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2216 ring_buffer_read_prepare_sync();
2217 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2218 tracing_iter_reset(iter, cpu);
2179 } 2219 }
2180 2220
@@ -4336,7 +4376,7 @@ static int trace_panic_handler(struct notifier_block *this,
4336 unsigned long event, void *unused) 4376 unsigned long event, void *unused)
4337{ 4377{
4338 if (ftrace_dump_on_oops) 4378 if (ftrace_dump_on_oops)
4339 ftrace_dump(); 4379 ftrace_dump(ftrace_dump_on_oops);
4340 return NOTIFY_OK; 4380 return NOTIFY_OK;
4341} 4381}
4342 4382
@@ -4353,7 +4393,7 @@ static int trace_die_handler(struct notifier_block *self,
4353 switch (val) { 4393 switch (val) {
4354 case DIE_OOPS: 4394 case DIE_OOPS:
4355 if (ftrace_dump_on_oops) 4395 if (ftrace_dump_on_oops)
4356 ftrace_dump(); 4396 ftrace_dump(ftrace_dump_on_oops);
4357 break; 4397 break;
4358 default: 4398 default:
4359 break; 4399 break;
@@ -4394,7 +4434,8 @@ trace_printk_seq(struct trace_seq *s)
4394 trace_seq_init(s); 4434 trace_seq_init(s);
4395} 4435}
4396 4436
4397static void __ftrace_dump(bool disable_tracing) 4437static void
4438__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4398{ 4439{
4399 static arch_spinlock_t ftrace_dump_lock = 4440 static arch_spinlock_t ftrace_dump_lock =
4400 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4441 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4427,12 +4468,25 @@ static void __ftrace_dump(bool disable_tracing)
4427 /* don't look at user memory in panic mode */ 4468 /* don't look at user memory in panic mode */
4428 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4469 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4429 4470
4430 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4431
4432 /* Simulate the iterator */ 4471 /* Simulate the iterator */
4433 iter.tr = &global_trace; 4472 iter.tr = &global_trace;
4434 iter.trace = current_trace; 4473 iter.trace = current_trace;
4435 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4474
4475 switch (oops_dump_mode) {
4476 case DUMP_ALL:
4477 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4478 break;
4479 case DUMP_ORIG:
4480 iter.cpu_file = raw_smp_processor_id();
4481 break;
4482 case DUMP_NONE:
4483 goto out_enable;
4484 default:
4485 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4486 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4487 }
4488
4489 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4436 4490
4437 /* 4491 /*
4438 * We need to stop all tracing on all CPUS to read the 4492 * We need to stop all tracing on all CPUS to read the
@@ -4471,6 +4525,7 @@ static void __ftrace_dump(bool disable_tracing)
4471 else 4525 else
4472 printk(KERN_TRACE "---------------------------------\n"); 4526 printk(KERN_TRACE "---------------------------------\n");
4473 4527
4528 out_enable:
4474 /* Re-enable tracing if requested */ 4529 /* Re-enable tracing if requested */
4475 if (!disable_tracing) { 4530 if (!disable_tracing) {
4476 trace_flags |= old_userobj; 4531 trace_flags |= old_userobj;
@@ -4487,9 +4542,9 @@ static void __ftrace_dump(bool disable_tracing)
4487} 4542}
4488 4543
4489/* By default: disable tracing after the dump */ 4544/* By default: disable tracing after the dump */
4490void ftrace_dump(void) 4545void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4491{ 4546{
4492 __ftrace_dump(true); 4547 __ftrace_dump(true, oops_dump_mode);
4493} 4548}
4494 4549
4495__init static int tracer_alloc_buffers(void) 4550__init static int tracer_alloc_buffers(void)
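
Editor's note: ftrace_dump() now takes an enum ftrace_dump_mode, so each call site chooses how much to dump, and __ftrace_dump() selects the cpu_file accordingly (all CPUs, the originating CPU, or nothing). A minimal sketch of a caller using the new signature, assuming the enum and prototype land in include/linux/kernel.h as part of this series:

#include <linux/kernel.h>

static void my_report_failure(void)
{
        /* Dump only the ftrace buffer of the CPU this code runs on. */
        ftrace_dump(DUMP_ORIG);
}

Calling ftrace_dump(DUMP_ALL) instead keeps the old behaviour of dumping every CPU's buffer.
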
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..d1ce0bec1b3f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -467,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
467 struct trace_array *tr); 456 struct trace_array *tr);
468extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
469 struct trace_array *tr); 458 struct trace_array *tr);
470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr); 460 struct trace_array *tr);
474#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -491,9 +478,29 @@ extern int trace_clock_id;
491 478
492/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
493#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
494extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
495extern enum print_line_t 493extern enum print_line_t
496trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
497 504
498#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
499/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -524,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
524#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
525#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
526static inline enum print_line_t 533static inline enum print_line_t
527print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
528{ 535{
529 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
530} 537}
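
Editor's note: trace.h now exports the TRACE_GRAPH_PRINT_* bits and the _flags variants of the graph output helpers, so tracers other than the function-graph tracer can render graph-style output with an explicit flag mask instead of relying on the graph tracer's private tracer_flags. A sketch of how a tracer's print_line callback might use the new helper; the particular flag combination is arbitrary.

#include "trace.h"

#define MY_GRAPH_FLAGS  (TRACE_GRAPH_PRINT_CPU | \
                         TRACE_GRAPH_PRINT_DURATION | \
                         TRACE_GRAPH_PRINT_OVERHEAD)

static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
{
        return print_graph_function_flags(iter, MY_GRAPH_FLAGS);
}
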
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7fe..58092d844a1f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1398,7 +1398,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1398 } 1398 }
1399 1399
1400 err = -EINVAL; 1400 err = -EINVAL;
1401 if (!call) 1401 if (&call->list == &ftrace_events)
1402 goto out_unlock; 1402 goto out_unlock;
1403 1403
1404 err = -EEXIST; 1404 err = -EEXIST;
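
Editor's note: the one-line filter fix above replaces a NULL check that could never fire. When list_for_each_entry() walks off the end of ftrace_events, the cursor is not NULL but a bogus pointer computed from the list head itself, so "not found" has to be detected by comparing the embedded list_head against the head. A kernel-context sketch of the idiom with a hypothetical list:

#include <linux/list.h>

struct my_item {
        struct list_head list;
        int id;
};

static LIST_HEAD(my_items);

static int item_exists(int id)
{
        struct my_item *item;

        list_for_each_entry(item, &my_items, list) {
                if (item->id == id)
                        break;
        }

        /* Wrong:  if (!item)  -- the cursor is never NULL here.           */
        /* Right:  compare against the head, as the filter fix above does. */
        return &item->list != &my_items;
}
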
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..dd11c830eb84 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 527
527/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
528static int 529static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
530{ 532{
531 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 535 return 1;
534 536
535 /* Non nested entry or return */ 537 /* Non nested entry or return */
536 if (duration == -1) 538 if (duration == -1)
537 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
538 540
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 543 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 564
563static enum print_line_t 565static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 568{
567 int ret; 569 int ret;
568 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
573 575
574 /* Absolute time */ 576 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 579 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
579 } 581 }
580 582
581 /* Cpu */ 583 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
586 } 588 }
587 589
588 /* Proc */ 590 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 598 }
597 599
598 /* No overhead */ 600 /* No overhead */
599 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 602 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
602 604
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
610 612
611 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
615 617
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 681static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
683{ 686{
684 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 714 }
712 715
713 /* Overhead */ 716 /* Overhead */
714 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 718 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
717 720
718 /* Duration */ 721 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 742static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
743{ 746{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 762 }
760 763
761 /* No overhead */ 764 /* No overhead */
762 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 766 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
765 768
766 /* No time */ 769 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
769 if (!ret) 772 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 793
791static enum print_line_t 794static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
794{ 797{
795 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 806
804 if (type) { 807 if (type) {
805 /* Interrupt */ 808 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
809 } 812 }
810 813
811 /* Absolute time */ 814 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 817 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
816 } 819 }
817 820
818 /* Cpu */ 821 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
823 } 826 }
824 827
825 /* Proc */ 828 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 848
846static enum print_line_t 849static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
849{ 852{
850 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 856 static enum print_line_t ret;
854 int cpu = iter->cpu; 857 int cpu = iter->cpu;
855 858
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
858 861
859 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 863 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 865 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 867
865 if (data) { 868 if (data) {
866 /* 869 /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 882
880static enum print_line_t 883static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
883{ 887{
884 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 913 }
910 } 914 }
911 915
912 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
914 918
915 /* Overhead */ 919 /* Overhead */
916 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 921 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
919 923
920 /* Duration */ 924 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 952 }
949 953
950 /* Overrun */ 954 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 957 trace->overrun);
954 if (!ret) 958 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
956 } 960 }
957 961
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
961 966
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 968}
964 969
965static enum print_line_t 970static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
968{ 973{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 981 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 983
979 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
981 986
982 /* No overhead */ 987 /* No overhead */
983 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 989 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
986 991
987 /* No time */ 992 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
990 if (!ret) 995 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1045
1041 1046
1042enum print_line_t 1047enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1049{
1045 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1066 if (data && data->failed) {
1062 field = &data->ent; 1067 field = &data->ent;
1063 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,49 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1083 saved = *field; 1088 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1085 } 1090 }
1086 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
 1098		/* don't trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1091 default: 1101 default:
1092 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1093 } 1103 }
1094 1104
1095 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1096} 1106}
1097 1107
1098static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags)
1116{
1117 return print_graph_function(iter);
1118}
1119
1120static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1121{
1100 static const char spaces[] = " " /* 16 spaces */ 1122 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1123 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1124 " "; /* 17 spaces */
1103 int size = 0; 1125 int size = 0;
1104 1126
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1127 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1128 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1129 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1130 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1131 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1132 size += 17;
1111 1133
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1134 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1139,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1139 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1140}
1119 1141
1120static void print_graph_headers(struct seq_file *s) 1142void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1143{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1144 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1145
1124 if (lat) 1146 if (lat)
1125 print_lat_header(s); 1147 print_lat_header(s, flags);
1126 1148
1127 /* 1st line */ 1149 /* 1st line */
1128 seq_printf(s, "#"); 1150 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1151 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1152 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1153 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1154 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1155 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1156 seq_printf(s, " TASK/PID ");
1135 if (lat) 1157 if (lat)
1136 seq_printf(s, "|||||"); 1158 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1159 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1160 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1161 seq_printf(s, " FUNCTION CALLS\n");
1140 1162
1141 /* 2nd line */ 1163 /* 2nd line */
1142 seq_printf(s, "#"); 1164 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1165 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1166 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1167 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1168 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1169 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1170 seq_printf(s, " | | ");
1149 if (lat) 1171 if (lat)
1150 seq_printf(s, "|||||"); 1172 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1173 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1174 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1175 seq_printf(s, " | | | |\n");
1154} 1176}
1155 1177
1156static void graph_trace_open(struct trace_iterator *iter) 1178void print_graph_headers(struct seq_file *s)
1179{
1180 print_graph_headers_flags(s, tracer_flags.val);
1181}
1182
1183void graph_trace_open(struct trace_iterator *iter)
1157{ 1184{
1158 /* pid and depth on the last trace processed */ 1185 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1186 struct fgraph_data *data;
@@ -1188,7 +1215,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1215 pr_warning("function graph tracer: not enough memory\n");
1189} 1216}
1190 1217
1191static void graph_trace_close(struct trace_iterator *iter) 1218void graph_trace_close(struct trace_iterator *iter)
1192{ 1219{
1193 struct fgraph_data *data = iter->private; 1220 struct fgraph_data *data = iter->private;
1194 1221
@@ -1198,6 +1225,16 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1225 }
1199} 1226}
1200 1227
1228static struct trace_event graph_trace_entry_event = {
1229 .type = TRACE_GRAPH_ENT,
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_ret_event = {
1234 .type = TRACE_GRAPH_RET,
1235 .trace = print_graph_function_event,
1236};
1237
1201static struct tracer graph_trace __read_mostly = { 1238static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1239 .name = "function_graph",
1203 .open = graph_trace_open, 1240 .open = graph_trace_open,
@@ -1219,6 +1256,16 @@ static __init int init_graph_trace(void)
1219{ 1256{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1257 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1258
1259 if (!register_ftrace_event(&graph_trace_entry_event)) {
1260 pr_warning("Warning: could not register graph trace events\n");
1261 return 1;
1262 }
1263
1264 if (!register_ftrace_event(&graph_trace_ret_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1222 return register_tracer(&graph_trace); 1269 return register_tracer(&graph_trace);
1223} 1270}
1224 1271
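For context: the hunks above turn the graph printer into a reusable backend by threading an explicit u32 flags argument through the output helpers and by registering trace_event handlers for TRACE_GRAPH_ENT/TRACE_GRAPH_RET. A minimal sketch (mine, not part of this patch; the my_tracer_* names are invented, and it assumes the print_graph_*_flags declarations are visible to other tracers, e.g. via kernel/trace/trace.h) of how a caller can now drive that backend with its own flag set:

	#include "trace.h"	/* assumed to declare print_graph_*_flags() */

	#define MY_GRAPH_FLAGS	(TRACE_GRAPH_PRINT_CPU |	\
				 TRACE_GRAPH_PRINT_PROC |	\
				 TRACE_GRAPH_PRINT_DURATION)

	static enum print_line_t my_tracer_print_line(struct trace_iterator *iter)
	{
		/* Render graph entry/return records with our own flag set. */
		return print_graph_function_flags(iter, MY_GRAPH_FLAGS);
	}

	static void my_tracer_print_header(struct seq_file *s)
	{
		/* Emit header columns that match the flags chosen above. */
		print_graph_headers_flags(s, MY_GRAPH_FLAGS);
	}

The trace_irqsoff.c changes further down in this diff follow exactly this pattern.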
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
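Not visible in the diff itself: because the graph view is now a per-tracer option (TRACER_OPT(display-graph, ...) above) handled by irqsoff_set_flag(), it is presumably toggled at run time through the usual tracer-options interface while irqsoff, preemptoff or preemptirqsoff is the current tracer, switching the latency trace between the flat function output and the call-graph output without changing tracers; the exact control-file location depends on where the tracing debugfs is mounted.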
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae9..a7514326052b 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{ 93{
68 return f->func(regs, f->data); 94 return fprm->fn(regs, fprm->data, dest);
69} 95}
70 96
71/* fetch handlers */ 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 98/*
73 void *offset) 99 * Define macro for basic types - we don't need to define s* types, because
74{ 100 * we have to care only about bitwidth at recording time.
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 101 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
76} 121}
77 122DEFINE_BASIC_FETCH_FUNCS(reg)
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 123
79 void *num) 124#define DEFINE_FETCH_stack(type) \
80{ 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
81 return regs_get_kernel_stack_nth(regs, 126 void *offset, void *dest) \
82 (unsigned int)((unsigned long)num)); 127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
83} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
84 132
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 133#define DEFINE_FETCH_retval(type) \
86{ 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
87 unsigned long retval; 135 void *dummy, void *dest) \
88 136{ \
89 if (probe_kernel_address(addr, retval)) 137 *(type *)dest = (type)regs_return_value(regs); \
90 return 0;
91 return retval;
92} 138}
93 139DEFINE_BASIC_FETCH_FUNCS(retval)
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 140
95 void *dummy) 141#define DEFINE_FETCH_memory(type) \
96{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
97 return regs_return_value(regs); 143 void *addr, void *dest) \
98} 144{ \
99 145 type retval; \
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 146 if (probe_kernel_address(addr, retval)) \
101 void *dummy) 147 *(type *)dest = 0; \
102{ 148 else \
103 return kernel_stack_pointer(regs); 149 *(type *)dest = retval; \
104} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
105 152
106/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
107struct symbol_cache { 154struct symbol_cache {
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 192 return sc;
146} 193}
147 194
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
149{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 197 void *data, void *dest) \
151 198{ \
152 if (sc->addr) 199 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
154 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 202 else \
203 *(type *)dest = 0; \
156} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
157 206
158/* Special indirect memory access interface */ 207/* Dereference memory access function */
159struct indirect_fetch_data { 208struct deref_fetch_param {
160 struct fetch_func orig; 209 struct fetch_param orig;
161 long offset; 210 long offset;
162}; 211};
163 212
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
165{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
167 unsigned long addr; 216{ \
168 217 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
170 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 220 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
173 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 223 } else \
224 *(type *)dest = 0; \
175} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
176 227
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 229{
179 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
183 kfree(data); 234 kfree(data);
184} 235}
185 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
 266	const char *fmt;		/* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
186/** 305/**
187 * Kprobe event core functions 306 * Kprobe event core functions
188 */ 307 */
189 308
190struct probe_arg { 309struct probe_arg {
191 struct fetch_func fetch; 310 struct fetch_param fetch;
192 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
193}; 315};
194 316
195/* Flags for trace_probe */ 317/* Flags for trace_probe */
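Illustration only (my expansion, not from the patch): the fetch_type_table[] entry that ASSIGN_FETCH_TYPE(s32, u32, 1) above produces once ASSIGN_FETCH_FUNC() and the name macros are expanded:

	{
		.name		= "s32",
		.size		= sizeof(u32),
		.is_signed	= 1,
		.print		= print_type_s32,
		.fmt		= print_type_format_s32,
		.reg		= fetch_reg_u32,
		.stack		= fetch_stack_u32,
		.retval		= fetch_retval_u32,
		.memory		= fetch_memory_u32,
		.symbol		= fetch_symbol_u32,
		.deref		= fetch_deref_u32,
	},

Signed and unsigned entries of the same width share the fetch functions; only .is_signed, .print and .fmt differ, which is why no s8/s16/s32/s64 fetch templates are needed.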
@@ -204,6 +326,7 @@ struct trace_probe {
204 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
205 struct ftrace_event_call call; 327 struct ftrace_event_call call;
206 struct trace_event event; 328 struct trace_event event;
329 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 330 unsigned int nr_args;
208 struct probe_arg args[]; 331 struct probe_arg args[];
209}; 332};
@@ -212,6 +335,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
214 337
338
215static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 340{
217 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
223} 347}
224 348
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
270 351
@@ -347,11 +428,12 @@ error:
347 428
348static void free_probe_arg(struct probe_arg *arg) 429static void free_probe_arg(struct probe_arg *arg)
349{ 430{
350 if (arg->fetch.func == fetch_symbol) 431 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
432 free_deref_fetch_param(arg->fetch.data);
433 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 434 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 435 kfree(arg->name);
436 kfree(arg->comm);
355} 437}
356 438
357static void free_trace_probe(struct trace_probe *tp) 439static void free_trace_probe(struct trace_probe *tp)
@@ -457,28 +539,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 539#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 540#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 541
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 542static int parse_probe_vars(char *arg, const struct fetch_type *t,
543 struct fetch_param *f, int is_return)
461{ 544{
462 int ret = 0; 545 int ret = 0;
463 unsigned long param; 546 unsigned long param;
464 547
465 if (strcmp(arg, "retval") == 0) { 548 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 549 if (is_return)
467 ff->func = fetch_retvalue; 550 f->fn = t->retval;
468 ff->data = NULL; 551 else
469 } else
470 ret = -EINVAL; 552 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 553 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 554 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 555 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 556 f->fn = fetch_stack_address;
557 else
558 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 559 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 560 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 561 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 562 ret = -EINVAL;
479 else { 563 else {
480 ff->func = fetch_stack; 564 f->fn = t->stack;
481 ff->data = (void *)param; 565 f->data = (void *)param;
482 } 566 }
483 } else 567 } else
484 ret = -EINVAL; 568 ret = -EINVAL;
@@ -488,7 +572,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 572}
489 573
490/* Recursive argument parser */ 574/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 575static int __parse_probe_arg(char *arg, const struct fetch_type *t,
576 struct fetch_param *f, int is_return)
492{ 577{
493 int ret = 0; 578 int ret = 0;
494 unsigned long param; 579 unsigned long param;
@@ -497,13 +582,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 582
498 switch (arg[0]) { 583 switch (arg[0]) {
499 case '$': 584 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 585 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 586 break;
502 case '%': /* named register */ 587 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 588 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 589 if (ret >= 0) {
505 ff->func = fetch_register; 590 f->fn = t->reg;
506 ff->data = (void *)(unsigned long)ret; 591 f->data = (void *)(unsigned long)ret;
507 ret = 0; 592 ret = 0;
508 } 593 }
509 break; 594 break;
@@ -512,26 +597,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 597 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 598 if (ret)
514 break; 599 break;
515 ff->func = fetch_memory; 600 f->fn = t->memory;
516 ff->data = (void *)param; 601 f->data = (void *)param;
517 } else { 602 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 603 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 604 if (ret)
520 break; 605 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 606 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 607 if (f->data)
523 ff->func = fetch_symbol; 608 f->fn = t->symbol;
524 else
525 ret = -EINVAL;
526 } 609 }
527 break; 610 break;
528 case '+': /* indirect memory */ 611 case '+': /* deref memory */
529 case '-': 612 case '-':
530 tmp = strchr(arg, '('); 613 tmp = strchr(arg, '(');
531 if (!tmp) { 614 if (!tmp)
532 ret = -EINVAL;
533 break; 615 break;
534 }
535 *tmp = '\0'; 616 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 617 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 618 if (ret)
@@ -541,38 +622,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 622 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 623 tmp = strrchr(arg, ')');
543 if (tmp) { 624 if (tmp) {
544 struct indirect_fetch_data *id; 625 struct deref_fetch_param *dprm;
626 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 627 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 628 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 629 GFP_KERNEL);
548 if (!id) 630 if (!dprm)
549 return -ENOMEM; 631 return -ENOMEM;
550 id->offset = offset; 632 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 633 ret = __parse_probe_arg(arg, t2, &dprm->orig,
634 is_return);
552 if (ret) 635 if (ret)
553 kfree(id); 636 kfree(dprm);
554 else { 637 else {
555 ff->func = fetch_indirect; 638 f->fn = t->deref;
556 ff->data = (void *)id; 639 f->data = (void *)dprm;
557 } 640 }
558 } else 641 }
559 ret = -EINVAL;
560 break; 642 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 } 643 }
644 if (!ret && !f->fn)
645 ret = -EINVAL;
565 return ret; 646 return ret;
566} 647}
567 648
568/* String length checking wrapper */ 649/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 650static int parse_probe_arg(char *arg, struct trace_probe *tp,
651 struct probe_arg *parg, int is_return)
570{ 652{
653 const char *t;
654
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 655 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long.: %s\n", arg); 656 pr_info("Argument is too long.: %s\n", arg);
573 return -ENOSPC; 657 return -ENOSPC;
574 } 658 }
575 return __parse_probe_arg(arg, ff, is_return); 659 parg->comm = kstrdup(arg, GFP_KERNEL);
660 if (!parg->comm) {
661 pr_info("Failed to allocate memory for command '%s'.\n", arg);
662 return -ENOMEM;
663 }
664 t = strchr(parg->comm, ':');
665 if (t) {
666 arg[t - parg->comm] = '\0';
667 t++;
668 }
669 parg->type = find_fetch_type(t);
670 if (!parg->type) {
671 pr_info("Unsupported type: %s\n", t);
672 return -EINVAL;
673 }
674 parg->offset = tp->size;
675 tp->size += parg->type->size;
676 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
576} 677}
577 678
578/* Return 1 if name is reserved or already used by another argument */ 679/* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +703,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 703 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 704 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 705 * %REG : fetch register REG
605 * Indirect memory fetch: 706 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 707 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 708 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 709 * NAME=FETCHARG : set NAME as alias of FETCHARG.
710 * Type of args:
711 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 712 */
610 struct trace_probe *tp; 713 struct trace_probe *tp;
611 int i, ret = 0; 714 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 715 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 716 char *symbol = NULL, *event = NULL, *group = NULL;
717 char *arg, *tmp;
614 unsigned long offset = 0; 718 unsigned long offset = 0;
615 void *addr = NULL; 719 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 720 char buf[MAX_EVENT_NAME_LEN];
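Illustration only (my example, not taken from the patch): with the FETCHARG:TYPE syntax documented above, a definition written to the kprobe_events control file could look like

	p:myopen do_sys_open dfd=%ax:s32 flags=%cx:u32

recording dfd as a signed 32-bit value and flags as an unsigned 32-bit value instead of the default unsigned long. The register names here are purely illustrative and architecture-dependent; note that a ':' in a derived argument name is converted to '_', as the hunk below shows.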
@@ -723,13 +827,6 @@ static int create_trace_probe(int argc, char **argv)
723 else 827 else
724 arg = argv[i]; 828 arg = argv[i];
725 829
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 830 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) { 831 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n", 832 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +834,19 @@ static int create_trace_probe(int argc, char **argv)
737 ret = -ENOMEM; 834 ret = -ENOMEM;
738 goto error; 835 goto error;
739 } 836 }
837 tmp = strchr(tp->args[i].name, ':');
838 if (tmp)
839 *tmp = '_'; /* convert : to _ */
840
841 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
842 pr_info("Argument%d name '%s' conflicts with "
843 "another field.\n", i, argv[i]);
844 ret = -EINVAL;
845 goto error;
846 }
740 847
741 /* Parse fetch argument */ 848 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 849 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 850 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 851 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name); 852 kfree(tp->args[i].name);
@@ -794,8 +901,7 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 901static int probes_seq_show(struct seq_file *m, void *v)
795{ 902{
796 struct trace_probe *tp = v; 903 struct trace_probe *tp = v;
797 int i, ret; 904 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 905
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 906 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 907 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name);
@@ -807,15 +913,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 913 else
808 seq_printf(m, " %s", probe_symbol(tp)); 914 seq_printf(m, " %s", probe_symbol(tp));
809 915
810 for (i = 0; i < tp->nr_args; i++) { 916 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 917 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 918 seq_printf(m, "\n");
919
819 return 0; 920 return 0;
820} 921}
821 922
@@ -945,9 +1046,10 @@ static const struct file_operations kprobe_profile_ops = {
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1046static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1047{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1048 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1049 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1050 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1051 struct ring_buffer *buffer;
1052 u8 *data;
951 int size, i, pc; 1053 int size, i, pc;
952 unsigned long irq_flags; 1054 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1055 struct ftrace_event_call *call = &tp->call;
@@ -957,7 +1059,7 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1059 local_save_flags(irq_flags);
958 pc = preempt_count(); 1060 pc = preempt_count();
959 1061
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1062 size = sizeof(*entry) + tp->size;
961 1063
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1064 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
963 irq_flags, pc); 1065 irq_flags, pc);
@@ -965,10 +1067,10 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
965 return; 1067 return;
966 1068
967 entry = ring_buffer_event_data(event); 1069 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1070 entry->ip = (unsigned long)kp->addr;
1071 data = (u8 *)&entry[1];
970 for (i = 0; i < tp->nr_args; i++) 1072 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1073 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
972 1074
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1075 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1076 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -979,9 +1081,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1081 struct pt_regs *regs)
980{ 1082{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1083 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1084 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1085 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1086 struct ring_buffer *buffer;
1087 u8 *data;
985 int size, i, pc; 1088 int size, i, pc;
986 unsigned long irq_flags; 1089 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1090 struct ftrace_event_call *call = &tp->call;
@@ -989,7 +1092,7 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
989 local_save_flags(irq_flags); 1092 local_save_flags(irq_flags);
990 pc = preempt_count(); 1093 pc = preempt_count();
991 1094
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1095 size = sizeof(*entry) + tp->size;
993 1096
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1097 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
995 irq_flags, pc); 1098 irq_flags, pc);
@@ -997,11 +1100,11 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
997 return; 1100 return;
998 1101
999 entry = ring_buffer_event_data(event); 1102 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1103 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1104 entry->ret_ip = (unsigned long)ri->ret_addr;
1105 data = (u8 *)&entry[1];
1003 for (i = 0; i < tp->nr_args; i++) 1106 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1107 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1005 1108
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1109 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1110 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1011,13 +1114,14 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1011enum print_line_t 1114enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1115print_kprobe_event(struct trace_iterator *iter, int flags)
1013{ 1116{
1014 struct kprobe_trace_entry *field; 1117 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1118 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event; 1119 struct trace_event *event;
1017 struct trace_probe *tp; 1120 struct trace_probe *tp;
1121 u8 *data;
1018 int i; 1122 int i;
1019 1123
1020 field = (struct kprobe_trace_entry *)iter->ent; 1124 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1125 event = ftrace_find_event(field->ent.type);
1022 tp = container_of(event, struct trace_probe, event); 1126 tp = container_of(event, struct trace_probe, event);
1023 1127
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1135 goto partial;
1032 1136
1033 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1036 goto partial; 1141 goto partial;
1037 1142
1038 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1046,13 +1151,14 @@ partial:
1046enum print_line_t 1151enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags)
1048{ 1153{
1049 struct kretprobe_trace_entry *field; 1154 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1155 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event; 1156 struct trace_event *event;
1052 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1053 int i; 1159 int i;
1054 1160
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1162 event = ftrace_find_event(field->ent.type);
1057 tp = container_of(event, struct trace_probe, event); 1163 tp = container_of(event, struct trace_probe, event);
1058 1164
@@ -1071,9 +1177,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1177 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1178 goto partial;
1073 1179
1074 for (i = 0; i < field->nargs; i++) 1180 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1181 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1182 if (!tp->args[i].type->print(s, tp->args[i].name,
1183 data + tp->args[i].offset))
1077 goto partial; 1184 goto partial;
1078 1185
1079 if (!trace_seq_puts(s, "\n")) 1186 if (!trace_seq_puts(s, "\n"))
@@ -1129,29 +1236,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1236static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1237{
1131 int ret, i; 1238 int ret, i;
1132 struct kprobe_trace_entry field; 1239 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1240 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1241
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1242 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1243 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1244 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1245 ret = trace_define_field(event_call, tp->args[i].type->name,
1246 tp->args[i].name,
1247 sizeof(field) + tp->args[i].offset,
1248 tp->args[i].type->size,
1249 tp->args[i].type->is_signed,
1250 FILTER_OTHER);
1251 if (ret)
1252 return ret;
1253 }
1140 return 0; 1254 return 0;
1141} 1255}
1142 1256
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1257static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1258{
1145 int ret, i; 1259 int ret, i;
1146 struct kretprobe_trace_entry field; 1260 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1261 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1262
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1263 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1264 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1265 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1266 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1267 ret = trace_define_field(event_call, tp->args[i].type->name,
1268 tp->args[i].name,
1269 sizeof(field) + tp->args[i].offset,
1270 tp->args[i].type->size,
1271 tp->args[i].type->is_signed,
1272 FILTER_OTHER);
1273 if (ret)
1274 return ret;
1275 }
1155 return 0; 1276 return 0;
1156} 1277}
1157 1278
@@ -1176,8 +1297,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1298
1178 for (i = 0; i < tp->nr_args; i++) { 1299 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1300 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1301 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1302 }
1182 1303
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1304 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,12 +1340,13 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1219{ 1340{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1341 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1342 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1343 struct kprobe_trace_entry_head *entry;
1344 u8 *data;
1223 int size, __size, i; 1345 int size, __size, i;
1224 unsigned long irq_flags; 1346 unsigned long irq_flags;
1225 int rctx; 1347 int rctx;
1226 1348
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1349 __size = sizeof(*entry) + tp->size;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1350 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1351 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1352 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1235,10 +1357,10 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1235 if (!entry) 1357 if (!entry)
1236 return; 1358 return;
1237 1359
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1360 entry->ip = (unsigned long)kp->addr;
1361 data = (u8 *)&entry[1];
1240 for (i = 0; i < tp->nr_args; i++) 1362 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1363 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1242 1364
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1365 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs);
1244} 1366}
@@ -1249,12 +1371,13 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1249{ 1371{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1372 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1373 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1374 struct kretprobe_trace_entry_head *entry;
1375 u8 *data;
1253 int size, __size, i; 1376 int size, __size, i;
1254 unsigned long irq_flags; 1377 unsigned long irq_flags;
1255 int rctx; 1378 int rctx;
1256 1379
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1380 __size = sizeof(*entry) + tp->size;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1381 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1382 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1383 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
@@ -1265,11 +1388,11 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1265 if (!entry) 1388 if (!entry)
1266 return; 1389 return;
1267 1390
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1391 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1392 entry->ret_ip = (unsigned long)ri->ret_addr;
1393 data = (u8 *)&entry[1];
1271 for (i = 0; i < tp->nr_args; i++) 1394 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1395 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1273 1396
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1397 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1,
1275 irq_flags, regs); 1398 irq_flags, regs);
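Taken together, the trace_kprobe.c hunks replace the old fixed "nargs + args[]" record with a bare *_entry_head struct followed by a raw payload: each parsed argument is assigned an offset (parg->offset = tp->size; tp->size += type->size), the fetch callbacks write into data + offset, and the type descriptor's size/is_signed/print/fmt drive field definition and output. A minimal userspace sketch of that header-plus-typed-payload idea, using made-up names rather than the kernel's structures:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /*
     * Hypothetical illustration of a fixed header followed by a typed,
     * offset-addressed payload, mirroring the *_entry_head + data layout
     * above.  Names here are invented for the sketch.
     */
    struct entry_head { unsigned long ip; };

    struct arg_desc {
        const char *name;
        size_t offset;               /* offset into the payload area */
        size_t size;                 /* sizeof the stored value */
    };

    int main(void)
    {
        struct arg_desc args[] = {
            { "fd",    0,           sizeof(int)  },
            { "count", sizeof(int), sizeof(long) },
        };
        size_t payload = sizeof(int) + sizeof(long);
        unsigned char *rec = malloc(sizeof(struct entry_head) + payload);
        unsigned char *data;
        int fd = 3, v0;
        long count = 4096, v1;

        if (!rec)
            return 1;
        ((struct entry_head *)rec)->ip = 0xc0ffee;
        data = rec + sizeof(struct entry_head);

        /* "fetch" step: write each value at its recorded offset */
        memcpy(data + args[0].offset, &fd, args[0].size);
        memcpy(data + args[1].offset, &count, args[1].size);

        /* "print" step: decode the record using only the descriptors */
        memcpy(&v0, data + args[0].offset, sizeof(v0));
        memcpy(&v1, data + args[1].offset, sizeof(v1));
        printf("ip=%lx %s=%d %s=%ld\n", ((struct entry_head *)rec)->ip,
               args[0].name, v0, args[1].name, v1);

        free(rec);
        return 0;
    }

The same descriptor-driven decoding is why probes_seq_show() and the print routines above no longer need probe_arg_string() or a per-event nargs field.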
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd6879477..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
34 34
35#include <asm/atomic.h> 35#include <asm/atomic.h>
36 36
37/*
38 * For now, let us restrict the no. of symbols traced simultaneously to number
39 * of available hardware breakpoint registers.
40 */
41#define KSYM_TRACER_MAX HBP_NUM
42
43#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
44 38
45struct trace_ksym { 39struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
53 47
54static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
55 49
56static unsigned int ksym_filter_entry_count;
57static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
58 51
59static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
181 struct trace_ksym *entry; 174 struct trace_ksym *entry;
182 int ret = -ENOMEM; 175 int ret = -ENOMEM;
183 176
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry) 178 if (!entry)
193 return -ENOMEM; 179 return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
203 189
204 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
207 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
208 goto err; 199 goto err;
209 } 200 }
210 201
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213 203
214 return 0; 204 return 0;
215 205
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
265 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
266 ksym_hlist) { 256 ksym_hlist) {
267 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
268 ksym_filter_entry_count--;
269 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
270 synchronize_rcu(); 259 synchronize_rcu();
271 kfree(entry); 260 kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
338 goto out_unlock; 327 goto out_unlock;
339 } 328 }
340 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
341 ksym_filter_entry_count--;
342 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
343 synchronize_rcu(); 331 synchronize_rcu();
344 kfree(entry); 332 kfree(entry);
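The trace_ksym.c hunks above drop the tracer's private KSYM_TRACER_MAX cap and ksym_filter_entry_count bookkeeping; capacity is now enforced by the hw-breakpoint layer itself, and the tracer merely maps the -ENOSPC it returns into the old "maximum limit reached" message. A small sketch of that pattern -- reporting the allocator's error code instead of duplicating its accounting -- with a hypothetical helper name:

    #include <errno.h>
    #include <stdio.h>

    /*
     * Hypothetical helper: describe a registration failure from the error
     * code the underlying allocator returned, instead of keeping a
     * separate count of how many slots are in use.
     */
    static const char *describe_register_error(int err)
    {
        switch (err) {
        case -ENOSPC:
            return "maximum limit reached, no new requests accepted";
        case -ENOMEM:
            return "out of memory";
        default:
            return "request failed, try again later";
        }
    }

    int main(void)
    {
        printf("%s\n", describe_register_error(-ENOSPC));
        printf("%s\n", describe_register_error(-EAGAIN));
        return 0;
    }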
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..ab13d7008061 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 254 void *ret;
254 255
255 if (s->full) 256 if (s->full)
256 return 0; 257 return NULL;
257 258
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 259 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 260 s->full = 1;
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..a55fccfede5d 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..8052446ceeaa 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -107,8 +107,7 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 107}
108 108
109static void notrace 109static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 110probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next)
111 struct task_struct *next)
112{ 111{
113 struct trace_array_cpu *data; 112 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 113 cycle_t T0, T1, delta;
@@ -200,7 +199,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 199}
201 200
202static void 201static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 202probe_wakeup(struct task_struct *p, int success)
204{ 203{
205 struct trace_array_cpu *data; 204 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 205 int cpu = smp_processor_id();
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 81003b4d617f..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_BRANCH: 17 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
20 case TRACE_HW_BRANCHES:
21 case TRACE_KSYM: 20 case TRACE_KSYM:
22 return 1; 21 return 1;
23 } 22 }
@@ -30,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
30 struct trace_entry *entry; 29 struct trace_entry *entry;
31 unsigned int loops = 0; 30 unsigned int loops = 0;
32 31
33 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
34 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
35 34
36 /* 35 /*
@@ -256,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
256/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
257#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
258 257
259static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
260static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
261 261
262/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -267,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
267 ftrace_graph_stop(); 267 ftrace_graph_stop();
268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
269 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
270 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
271 return 0; 271 return 0;
272 } 272 }
273 273
@@ -755,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 755}
756#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
757 757
758#ifdef CONFIG_HW_BRANCH_TRACER
759int
760trace_selftest_startup_hw_branches(struct tracer *trace,
761 struct trace_array *tr)
762{
763 struct trace_iterator *iter;
764 struct tracer tracer;
765 unsigned long count;
766 int ret;
767
768 if (!trace->open) {
769 printk(KERN_CONT "missing open function...");
770 return -1;
771 }
772
773 ret = tracer_init(trace, tr);
774 if (ret) {
775 warn_failed_init_tracer(trace, ret);
776 return ret;
777 }
778
779 /*
780 * The hw-branch tracer needs to collect the trace from the various
781 * cpu trace buffers - before tracing is stopped.
782 */
783 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
784 if (!iter)
785 return -ENOMEM;
786
787 memcpy(&tracer, trace, sizeof(tracer));
788
789 iter->trace = &tracer;
790 iter->tr = tr;
791 iter->pos = -1;
792 mutex_init(&iter->mutex);
793
794 trace->open(iter);
795
796 mutex_destroy(&iter->mutex);
797 kfree(iter);
798
799 tracing_stop();
800
801 ret = trace_test_buffer(tr, &count);
802 trace->reset(tr);
803 tracing_start();
804
805 if (!ret && !count) {
806 printk(KERN_CONT "no entries found..");
807 ret = -1;
808 }
809
810 return ret;
811}
812#endif /* CONFIG_HW_BRANCH_TRACER */
813
814#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
815static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
816 760
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..b2d70d38dff4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -54,8 +54,8 @@ int create_user_ns(struct cred *new)
54#endif 54#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 56
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */ 57 /* root_user holds a reference to ns, our reference can be dropped */
58 kref_set(&ns->kref, 1); 58 put_user_ns(ns);
59 59
60 return 0; 60 return 0;
61} 61}
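The user_namespace.c change above stops forcing the new namespace's kref back to 1 and instead drops the creator's temporary reference with put_user_ns(), relying on root_user's reference to keep the namespace alive. A minimal sketch of that ownership pattern -- every holder takes and drops its own reference, nobody rewrites the counter -- with hypothetical names:

    #include <stdio.h>
    #include <stdlib.h>

    /*
     * Hypothetical refcounted object: the creator starts with one
     * reference, a long-lived holder takes its own, and the creator then
     * drops its temporary one -- nobody rewrites the counter directly.
     */
    struct obj { int refs; };

    static struct obj *obj_create(void)
    {
        struct obj *o = malloc(sizeof(*o));

        if (o)
            o->refs = 1;                /* the creator's reference */
        return o;
    }

    static struct obj *obj_get(struct obj *o)
    {
        o->refs++;
        return o;
    }

    static void obj_put(struct obj *o)
    {
        if (--o->refs == 0) {
            free(o);
            printf("freed\n");
        }
    }

    int main(void)
    {
        struct obj *ns, *holder;

        ns = obj_create();              /* refs == 1 */
        if (!ns)
            return 1;
        holder = obj_get(ns);           /* e.g. root_user's reference */
        obj_put(ns);                    /* creator drops its reference */
        printf("still alive, refs=%d\n", holder->refs);
        obj_put(holder);                /* last reference: prints "freed" */
        return 0;
    }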
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..77dabbf64b8f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work); 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
@@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 784{
775 if (del_timer_sync(&dwork->timer)) { 785 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 786 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu()); 787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 788 __queue_work(cwq, &dwork->work);
779 put_cpu(); 789 put_cpu();
780 } 790 }
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);
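The workqueue.c hunks above add clear_wq_data(), which wipes the packed work->data word while preserving only the WORK_STRUCT_STATIC bit, make flush_delayed_work() requeue on the workqueue recorded in the work item rather than always keventd_wq, and document why flush_scheduled_work() is deadlock-prone (prefer cancel_work_sync()/cancel_delayed_work_sync() when only one item matters). A small sketch of the bit-preserving clear, with hypothetical flag names standing in for the work_struct encoding:

    #include <stdio.h>

    /*
     * Hypothetical flag layout standing in for the packed work->data word:
     * clearing state drops the pointer bits and PENDING but keeps STATIC,
     * in the spirit of the new clear_wq_data().
     */
    #define FLAG_PENDING (1UL << 0)
    #define FLAG_STATIC  (1UL << 1)

    static unsigned long clear_state_keep_static(unsigned long data)
    {
        return data & FLAG_STATIC;
    }

    int main(void)
    {
        unsigned long data = 0xffffff00UL | FLAG_PENDING | FLAG_STATIC;

        printf("before=%#lx after=%#lx\n",
               data, clear_state_keep_static(data));
        return 0;
    }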