Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 20
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 5
-rw-r--r--  kernel/cgroup_freezer.c | 21
-rw-r--r--  kernel/compat.c | 25
-rw-r--r--  kernel/cpu.c | 155
-rw-r--r--  kernel/cpuset.c | 145
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 65
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 983
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1017
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 564
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2849
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 300
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/exit.c | 43
-rw-r--r--  kernel/fork.c | 57
-rw-r--r--  kernel/groups.c | 6
-rw-r--r--  kernel/hrtimer.c | 69
-rw-r--r--  kernel/hw_breakpoint.c | 196
-rw-r--r--  kernel/irq/handle.c | 3
-rw-r--r--  kernel/irq/manage.c | 89
-rw-r--r--  kernel/irq/proc.c | 60
-rw-r--r--  kernel/kallsyms.c | 21
-rw-r--r--  kernel/kgdb.c | 1764
-rw-r--r--  kernel/kmod.c | 193
-rw-r--r--  kernel/kprobes.c | 132
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/lockdep.c | 93
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 30
-rw-r--r--  kernel/padata.c | 189
-rw-r--r--  kernel/panic.c | 27
-rw-r--r--  kernel/perf_event.c | 770
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pm_qos_params.c | 218
-rw-r--r--  kernel/posix-cpu-timers.c | 310
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/Makefile | 3
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/snapshot.c | 145
-rw-r--r--  kernel/power/swap.c | 333
-rw-r--r--  kernel/power/user.c | 37
-rw-r--r--  kernel/printk.c | 25
-rw-r--r--  kernel/profile.c | 8
-rw-r--r--  kernel/ptrace.c | 38
-rw-r--r--  kernel/rcupdate.c | 19
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree.c | 131
-rw-r--r--  kernel/rcutree.h | 2
-rw-r--r--  kernel/rcutree_plugin.h | 69
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/relay.c | 17
-rw-r--r--  kernel/resource.c | 16
-rw-r--r--  kernel/sched.c | 787
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  kernel/sched_debug.c | 118
-rw-r--r--  kernel/sched_fair.c | 350
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 15
-rw-r--r--  kernel/signal.c | 63
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 4
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 37
-rw-r--r--  kernel/sysctl.c | 613
-rw-r--r--  kernel/sysctl_binary.c | 10
-rw-r--r--  kernel/time.c | 11
-rw-r--r--  kernel/time/clocksource.c | 48
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-sched.c | 84
-rw-r--r--  kernel/time/timekeeping.c | 35
-rw-r--r--  kernel/time/timer_list.c | 1
-rw-r--r--  kernel/timer.c | 149
-rw-r--r--  kernel/trace/Kconfig | 11
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 138
-rw-r--r--  kernel/trace/ftrace.c | 36
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/ring_buffer.c | 179
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 196
-rw-r--r--  kernel/trace/trace.h | 56
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_event_perf.c | 185
-rw-r--r--  kernel/trace/trace_events.c | 139
-rw-r--r--  kernel/trace/trace_events_filter.c | 30
-rw-r--r--  kernel/trace/trace_export.c | 16
-rw-r--r--  kernel/trace/trace_functions_graph.c | 176
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 648
-rw-r--r--  kernel/trace/trace_ksym.c | 26
-rw-r--r--  kernel/trace/trace_output.c | 155
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 64
-rw-r--r--  kernel/trace/trace_syscalls.c | 146
-rw-r--r--  kernel/trace/trace_workqueue.c | 26
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 4
-rw-r--r--  kernel/workqueue.c | 45
120 files changed, 14067 insertions, 6115 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..057472fbc272 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
 obj-$(CONFIG_PID_NS) += pid_namespace.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
-obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
+obj-$(CONFIG_SMP) += stop_machine.o
 obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
 obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
 obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
-obj-$(CONFIG_KGDB) += kgdb.o
+obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
diff --git a/kernel/acct.c b/kernel/acct.c
index e4c0e1fee9b0..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
 {
 	struct file *file;
 	struct vfsmount *mnt;
-	int error;
 	struct pid_namespace *ns;
 	struct bsd_acct_struct *acct = NULL;
 
@@ -244,13 +243,6 @@ static int acct_on(char *name)
 		}
 	}
 
-	error = security_acct(file);
-	if (error) {
-		kfree(acct);
-		filp_close(file, NULL);
-		return error;
-	}
-
 	spin_lock(&acct_lock);
 	if (ns->bacct == NULL) {
 		ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
  */
 SYSCALL_DEFINE1(acct, const char __user *, name)
 {
-	int error;
+	int error = 0;
 
 	if (!capable(CAP_SYS_PACCT))
 		return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 		if (acct == NULL)
 			return 0;
 
-		error = security_acct(NULL);
-		if (!error) {
-			spin_lock(&acct_lock);
-			acct_file_reopen(acct, NULL, NULL);
-			spin_unlock(&acct_lock);
-		}
+		spin_lock(&acct_lock);
+		acct_file_reopen(acct, NULL, NULL);
+		spin_unlock(&acct_lock);
 	}
+
 	return error;
 }
 
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
 #include <linux/syscalls.h>
 #include <linux/pid_namespace.h>
 #include <asm/uaccess.h>
-#include "cred-internals.h"
 
 /*
  * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 6d870f2d1228..422cb19f156e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2994,7 +2994,6 @@ static void cgroup_event_remove(struct work_struct *work)
 			remove);
 	struct cgroup *cgrp = event->cgrp;
 
-	/* TODO: check return code */
 	event->cft->unregister_event(cgrp, event->cft, event->eventfd);
 
 	eventfd_ctx_put(event->eventfd);
@@ -3016,7 +3015,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
 	unsigned long flags = (unsigned long)key;
 
 	if (flags & POLLHUP) {
-		remove_wait_queue_locked(event->wqh, &event->wait);
+		__remove_wait_queue(event->wqh, &event->wait);
 		spin_lock(&cgrp->event_list_lock);
 		list_del(&event->list);
 		spin_unlock(&cgrp->event_list_lock);
@@ -3615,7 +3614,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  * @ss: the subsystem to load
  *
  * This function should be called in a modular subsystem's initcall. If the
- * subsytem is built as a module, it will be assigned a new subsys_id and set
+ * subsystem is built as a module, it will be assigned a new subsys_id and set
  * up for use. If the subsystem is built-in anyway, work is delegated to the
  * simpler cgroup_init_subsys.
  */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e5c0244962b0..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -89,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
 
 /* Locks taken and their ordering
  * ------------------------------
- * css_set_lock
  * cgroup_mutex (AKA cgroup_lock)
- * task->alloc_lock (AKA task_lock)
  * freezer->lock
+ * css_set_lock
+ * task->alloc_lock (AKA task_lock)
  * task->sighand->siglock
  *
  * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -100,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
  * freezer_create(), freezer_destroy():
  * cgroup_mutex [ by cgroup core ]
  *
- * can_attach():
- * cgroup_mutex
+ * freezer_can_attach():
+ * cgroup_mutex (held by caller of can_attach)
  *
- * cgroup_frozen():
+ * cgroup_freezing_or_frozen():
  * task->alloc_lock (to get task's cgroup)
  *
  * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
- * task->alloc_lock (to get task's cgroup)
  * freezer->lock
  * sighand->siglock (if the cgroup is freezing)
  *
  * freezer_read():
  * cgroup_mutex
  * freezer->lock
+ * write_lock css_set_lock (cgroup iterator start)
+ * task->alloc_lock
  * read_lock css_set_lock (cgroup iterator start)
  *
  * freezer_write() (freeze):
  * cgroup_mutex
  * freezer->lock
+ * write_lock css_set_lock (cgroup iterator start)
+ * task->alloc_lock
  * read_lock css_set_lock (cgroup iterator start)
- * sighand->siglock
+ * sighand->siglock (fake signal delivery inside freeze_task())
  *
  * freezer_write() (unfreeze):
  * cgroup_mutex
  * freezer->lock
+ * write_lock css_set_lock (cgroup iterator start)
+ * task->alloc_lock
  * read_lock css_set_lock (cgroup iterator start)
- * task->alloc_lock (to prevent races with freeze_task())
+ * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
  * sighand->siglock
  */
 static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
diff --git a/kernel/compat.c b/kernel/compat.c
index 7f40e9275fd9..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -495,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
 {
 	int ret;
 	cpumask_var_t mask;
-	unsigned long *k;
-	unsigned int min_length = cpumask_size();
-
-	if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
-		min_length = sizeof(compat_ulong_t);
 
-	if (len < min_length)
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(compat_ulong_t)-1))
 		return -EINVAL;
 
 	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
 		return -ENOMEM;
 
 	ret = sched_getaffinity(pid, mask);
-	if (ret < 0)
-		goto out;
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
 
-	k = cpumask_bits(mask);
-	ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8);
-	if (ret == 0)
-		ret = min_length;
-
-out:
+		if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
 	free_cpumask_var(mask);
+
 	return ret;
 }
 
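The reworked compat_sys_sched_getaffinity() above now rejects buffers that cannot hold nr_cpu_ids bits or are not a multiple of sizeof(compat_ulong_t), and on success it returns the number of bytes actually copied rather than the kernel's internal mask size. For context only (not part of the patch), here is a minimal userspace sketch of the buffer-sizing dance this implies, written against the glibc wrapper and the CPU_ALLOC macros:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	int ncpus = 1024;		/* start with a guess and grow on EINVAL */
	cpu_set_t *set;
	size_t size;

	for (;;) {
		set = CPU_ALLOC(ncpus);
		if (!set)
			return 1;
		size = CPU_ALLOC_SIZE(ncpus);	/* always a multiple of sizeof(unsigned long) */
		if (sched_getaffinity(0, size, set) == 0)
			break;
		CPU_FREE(set);			/* buffer too small for this kernel: retry bigger */
		ncpus *= 2;
	}

	printf("%d CPUs in affinity mask\n", CPU_COUNT_S(size, set));
	CPU_FREE(set);
	return 0;
}
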
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 25bba73b1be3..3097382eb44a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,20 @@
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
21static DEFINE_MUTEX(cpu_add_remove_lock); 21static DEFINE_MUTEX(cpu_add_remove_lock);
22 22
23/*
24 * The following two API's must be used when attempting
25 * to serialize the updates to cpu_online_mask, cpu_present_mask.
26 */
27void cpu_maps_update_begin(void)
28{
29 mutex_lock(&cpu_add_remove_lock);
30}
31
32void cpu_maps_update_done(void)
33{
34 mutex_unlock(&cpu_add_remove_lock);
35}
36
23static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 37static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
24 38
25/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
@@ -27,6 +41,8 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
27 */ 41 */
28static int cpu_hotplug_disabled; 42static int cpu_hotplug_disabled;
29 43
44#ifdef CONFIG_HOTPLUG_CPU
45
30static struct { 46static struct {
31 struct task_struct *active_writer; 47 struct task_struct *active_writer;
32 struct mutex lock; /* Synchronizes accesses to refcount, */ 48 struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -41,8 +57,6 @@ static struct {
41 .refcount = 0, 57 .refcount = 0,
42}; 58};
43 59
44#ifdef CONFIG_HOTPLUG_CPU
45
46void get_online_cpus(void) 60void get_online_cpus(void)
47{ 61{
48 might_sleep(); 62 might_sleep();
@@ -67,22 +81,6 @@ void put_online_cpus(void)
67} 81}
68EXPORT_SYMBOL_GPL(put_online_cpus); 82EXPORT_SYMBOL_GPL(put_online_cpus);
69 83
70#endif /* CONFIG_HOTPLUG_CPU */
71
72/*
73 * The following two API's must be used when attempting
74 * to serialize the updates to cpu_online_mask, cpu_present_mask.
75 */
76void cpu_maps_update_begin(void)
77{
78 mutex_lock(&cpu_add_remove_lock);
79}
80
81void cpu_maps_update_done(void)
82{
83 mutex_unlock(&cpu_add_remove_lock);
84}
85
86/* 84/*
87 * This ensures that the hotplug operation can begin only when the 85 * This ensures that the hotplug operation can begin only when the
88 * refcount goes to zero. 86 * refcount goes to zero.
@@ -124,6 +122,12 @@ static void cpu_hotplug_done(void)
124 cpu_hotplug.active_writer = NULL; 122 cpu_hotplug.active_writer = NULL;
125 mutex_unlock(&cpu_hotplug.lock); 123 mutex_unlock(&cpu_hotplug.lock);
126} 124}
125
126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */
130
127/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
128int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
129{ 133{
@@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
134 return ret; 138 return ret;
135} 139}
136 140
141static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
142 int *nr_calls)
143{
144 int ret;
145
146 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
147 nr_calls);
148
149 return notifier_to_errno(ret);
150}
151
152static int cpu_notify(unsigned long val, void *v)
153{
154 return __cpu_notify(val, v, -1, NULL);
155}
156
137#ifdef CONFIG_HOTPLUG_CPU 157#ifdef CONFIG_HOTPLUG_CPU
138 158
159static void cpu_notify_nofail(unsigned long val, void *v)
160{
161 BUG_ON(cpu_notify(val, v));
162}
163
139EXPORT_SYMBOL(register_cpu_notifier); 164EXPORT_SYMBOL(register_cpu_notifier);
140 165
141void __ref unregister_cpu_notifier(struct notifier_block *nb) 166void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -164,6 +189,7 @@ static inline void check_for_tasks(int cpu)
164} 189}
165 190
166struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
167 unsigned long mod; 193 unsigned long mod;
168 void *hcpu; 194 void *hcpu;
169}; 195};
@@ -172,6 +198,7 @@ struct take_cpu_down_param {
172static int __ref take_cpu_down(void *_param) 198static int __ref take_cpu_down(void *_param)
173{ 199{
174 struct take_cpu_down_param *param = _param; 200 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
175 int err; 202 int err;
176 203
177 /* Ensure this CPU doesn't handle any more interrupts. */ 204 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -179,9 +206,10 @@ static int __ref take_cpu_down(void *_param)
179 if (err < 0) 206 if (err < 0)
180 return err; 207 return err;
181 208
182 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 209 cpu_notify(CPU_DYING | param->mod, param->hcpu);
183 param->hcpu);
184 210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
185 /* Force idle task to run as soon as we yield: it should 213 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 214 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 215 sched_idle_next();
@@ -192,10 +220,10 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 220static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 221{
194 int err, nr_calls = 0; 222 int err, nr_calls = 0;
195 cpumask_var_t old_allowed;
196 void *hcpu = (void *)(long)cpu; 223 void *hcpu = (void *)(long)cpu;
197 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
198 struct take_cpu_down_param tcd_param = { 225 struct take_cpu_down_param tcd_param = {
226 .caller = current,
199 .mod = mod, 227 .mod = mod,
200 .hcpu = hcpu, 228 .hcpu = hcpu,
201 }; 229 };
@@ -206,38 +234,26 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
206 if (!cpu_online(cpu)) 234 if (!cpu_online(cpu))
207 return -EINVAL; 235 return -EINVAL;
208 236
209 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
210 return -ENOMEM;
211
212 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
213 set_cpu_active(cpu, false); 238 set_cpu_active(cpu, false);
214 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
215 hcpu, -1, &nr_calls); 240 if (err) {
216 if (err == NOTIFY_BAD) {
217 set_cpu_active(cpu, true); 241 set_cpu_active(cpu, true);
218 242
219 nr_calls--; 243 nr_calls--;
220 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
221 hcpu, nr_calls, NULL);
222 printk("%s: attempt to take down CPU %u failed\n", 245 printk("%s: attempt to take down CPU %u failed\n",
223 __func__, cpu); 246 __func__, cpu);
224 err = -EINVAL;
225 goto out_release; 247 goto out_release;
226 } 248 }
227 249
228 /* Ensure that we are not runnable on dying cpu */
229 cpumask_copy(old_allowed, &current->cpus_allowed);
230 set_cpus_allowed_ptr(current, cpu_active_mask);
231
232 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
233 if (err) { 251 if (err) {
234 set_cpu_active(cpu, true); 252 set_cpu_active(cpu, true);
235 /* CPU didn't die: tell everyone. Can't complain. */ 253 /* CPU didn't die: tell everyone. Can't complain. */
236 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
237 hcpu) == NOTIFY_BAD)
238 BUG();
239 255
240 goto out_allowed; 256 goto out_release;
241 } 257 }
242 BUG_ON(cpu_online(cpu)); 258 BUG_ON(cpu_online(cpu));
243 259
@@ -249,22 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
249 __cpu_die(cpu); 265 __cpu_die(cpu);
250 266
251 /* CPU is completely dead: tell everyone. Too late to complain. */ 267 /* CPU is completely dead: tell everyone. Too late to complain. */
252 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, 268 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
253 hcpu) == NOTIFY_BAD)
254 BUG();
255 269
256 check_for_tasks(cpu); 270 check_for_tasks(cpu);
257 271
258out_allowed:
259 set_cpus_allowed_ptr(current, old_allowed);
260out_release: 272out_release:
261 cpu_hotplug_done(); 273 cpu_hotplug_done();
262 if (!err) { 274 if (!err)
263 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, 275 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
264 hcpu) == NOTIFY_BAD)
265 BUG();
266 }
267 free_cpumask_var(old_allowed);
268 return err; 276 return err;
269} 277}
270 278
@@ -272,9 +280,6 @@ int __ref cpu_down(unsigned int cpu)
272{ 280{
273 int err; 281 int err;
274 282
275 err = stop_machine_create();
276 if (err)
277 return err;
278 cpu_maps_update_begin(); 283 cpu_maps_update_begin();
279 284
280 if (cpu_hotplug_disabled) { 285 if (cpu_hotplug_disabled) {
@@ -286,7 +291,6 @@ int __ref cpu_down(unsigned int cpu)
286 291
287out: 292out:
288 cpu_maps_update_done(); 293 cpu_maps_update_done();
289 stop_machine_destroy();
290 return err; 294 return err;
291} 295}
292EXPORT_SYMBOL(cpu_down); 296EXPORT_SYMBOL(cpu_down);
@@ -303,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
303 return -EINVAL; 307 return -EINVAL;
304 308
305 cpu_hotplug_begin(); 309 cpu_hotplug_begin();
306 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 310 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
307 -1, &nr_calls); 311 if (ret) {
308 if (ret == NOTIFY_BAD) {
309 nr_calls--; 312 nr_calls--;
310 printk("%s: attempt to bring up CPU %u failed\n", 313 printk("%s: attempt to bring up CPU %u failed\n",
311 __func__, cpu); 314 __func__, cpu);
312 ret = -EINVAL;
313 goto out_notify; 315 goto out_notify;
314 } 316 }
315 317
@@ -322,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
322 set_cpu_active(cpu, true); 324 set_cpu_active(cpu, true);
323 325
324 /* Now call notifier in preparation. */ 326 /* Now call notifier in preparation. */
325 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 327 cpu_notify(CPU_ONLINE | mod, hcpu);
326 328
327out_notify: 329out_notify:
328 if (ret != 0) 330 if (ret != 0)
329 __raw_notifier_call_chain(&cpu_chain, 331 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
330 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
331 cpu_hotplug_done(); 332 cpu_hotplug_done();
332 333
333 return ret; 334 return ret;
@@ -336,6 +337,12 @@ out_notify:
336int __cpuinit cpu_up(unsigned int cpu) 337int __cpuinit cpu_up(unsigned int cpu)
337{ 338{
338 int err = 0; 339 int err = 0;
340
341#ifdef CONFIG_MEMORY_HOTPLUG
342 int nid;
343 pg_data_t *pgdat;
344#endif
345
339 if (!cpu_possible(cpu)) { 346 if (!cpu_possible(cpu)) {
340 printk(KERN_ERR "can't online cpu %d because it is not " 347 printk(KERN_ERR "can't online cpu %d because it is not "
341 "configured as may-hotadd at boot time\n", cpu); 348 "configured as may-hotadd at boot time\n", cpu);
@@ -346,6 +353,28 @@ int __cpuinit cpu_up(unsigned int cpu)
346 return -EINVAL; 353 return -EINVAL;
347 } 354 }
348 355
356#ifdef CONFIG_MEMORY_HOTPLUG
357 nid = cpu_to_node(cpu);
358 if (!node_online(nid)) {
359 err = mem_online_node(nid);
360 if (err)
361 return err;
362 }
363
364 pgdat = NODE_DATA(nid);
365 if (!pgdat) {
366 printk(KERN_ERR
367 "Can't online cpu %d due to NULL pgdat\n", cpu);
368 return -ENOMEM;
369 }
370
371 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
372 mutex_lock(&zonelists_mutex);
373 build_all_zonelists(NULL);
374 mutex_unlock(&zonelists_mutex);
375 }
376#endif
377
349 cpu_maps_update_begin(); 378 cpu_maps_update_begin();
350 379
351 if (cpu_hotplug_disabled) { 380 if (cpu_hotplug_disabled) {
@@ -367,9 +396,6 @@ int disable_nonboot_cpus(void)
367{ 396{
368 int cpu, first_cpu, error; 397 int cpu, first_cpu, error;
369 398
370 error = stop_machine_create();
371 if (error)
372 return error;
373 cpu_maps_update_begin(); 399 cpu_maps_update_begin();
374 first_cpu = cpumask_first(cpu_online_mask); 400 first_cpu = cpumask_first(cpu_online_mask);
375 /* 401 /*
@@ -400,7 +426,6 @@ int disable_nonboot_cpus(void)
400 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 426 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
401 } 427 }
402 cpu_maps_update_done(); 428 cpu_maps_update_done();
403 stop_machine_destroy();
404 return error; 429 return error;
405} 430}
406 431
@@ -467,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
467 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 492 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
468 val = CPU_STARTING_FROZEN; 493 val = CPU_STARTING_FROZEN;
469#endif /* CONFIG_PM_SLEEP_SMP */ 494#endif /* CONFIG_PM_SLEEP_SMP */
470 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 495 cpu_notify(val, (void *)(long)cpu);
471} 496}
472 497
473#endif /* CONFIG_SMP */ 498#endif /* CONFIG_SMP */
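Most of the kernel/cpu.c churn above replaces open-coded raw_notifier_call_chain() calls with the new cpu_notify()/__cpu_notify()/cpu_notify_nofail() helpers, which also convert NOTIFY_BAD into a proper -errno via notifier_to_errno(). For readers less familiar with the consumer side of that chain, the following is a hedged sketch of how a subsystem of this era (2.6.3x API) hooks CPU hotplug; the my_subsys_* names are made up for illustration:

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/init.h>

/* hypothetical per-CPU setup/teardown for some subsystem */
static void my_subsys_online(unsigned int cpu)  { /* allocate per-CPU state */ }
static void my_subsys_offline(unsigned int cpu) { /* free per-CPU state */ }

static int my_cpu_callback(struct notifier_block *nb,
			   unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		my_subsys_online(cpu);
		break;
	case CPU_DOWN_PREPARE:
		/* returning notifier_from_errno(-EBUSY) here would veto the unplug */
		break;
	case CPU_DEAD:
		my_subsys_offline(cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cpu_nb = {
	.notifier_call = my_cpu_callback,
};

static int __init my_subsys_init(void)
{
	return register_cpu_notifier(&my_cpu_nb);
}
core_initcall(my_subsys_init);
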
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d10946748ec2..02b9611eadde 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -946,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
946 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
947 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
948 * disallowed ones. 948 * disallowed ones.
949 *
950 * Called with task's alloc_lock held
951 */ 949 */
952static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
953 nodemask_t *newmems) 951 nodemask_t *newmems)
954{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
955 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
956 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
957 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969 * ensure checking ->mems_allowed_change_disable after setting all new
970 * allowed nodes.
971 *
972 * the read-side task can see an nodemask with new allowed nodes and
973 * old allowed nodes. and if it allocates page when cpuset clears newly
974 * disallowed ones continuous, it can see the new allowed bits.
975 *
976 * And if setting all new allowed nodes is after the checking, setting
977 * all new allowed nodes and clearing newly disallowed ones will be done
978 * continuous, and the read-side task may find no node to alloc page.
979 */
980 smp_mb();
981
982 /*
983 * Allocation of memory is very fast, we needn't sleep when waiting
984 * for the read-side.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994 * ensure checking ->mems_allowed_change_disable before clearing all new
995 * disallowed nodes.
996 *
997 * if clearing newly disallowed bits before the checking, the read-side
998 * task may find no node to alloc page.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
958 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
959} 1005}
960 1006
961/* 1007/*
@@ -978,9 +1024,7 @@ static void cpuset_change_nodemask(struct task_struct *p,
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p);
982 cpuset_change_task_nodemask(p, newmems); 1027 cpuset_change_task_nodemask(p, newmems);
983 task_unlock(p);
984 1028
985 NODEMASK_FREE(newmems); 1029 NODEMASK_FREE(newmems);
986 1030
@@ -1383,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1383 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1384 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1385 1429
1386 task_lock(tsk);
1387 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1388 task_unlock(tsk);
1389 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1390 1432
1391} 1433}
@@ -2182,19 +2224,52 @@ void __init cpuset_init_smp(void)
2182void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2224void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2183{ 2225{
2184 mutex_lock(&callback_mutex); 2226 mutex_lock(&callback_mutex);
2185 cpuset_cpus_allowed_locked(tsk, pmask); 2227 task_lock(tsk);
2228 guarantee_online_cpus(task_cs(tsk), pmask);
2229 task_unlock(tsk);
2186 mutex_unlock(&callback_mutex); 2230 mutex_unlock(&callback_mutex);
2187} 2231}
2188 2232
2189/** 2233int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2190 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2191 * Must be called with callback_mutex held.
2192 **/
2193void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2194{ 2234{
2195 task_lock(tsk); 2235 const struct cpuset *cs;
2196 guarantee_online_cpus(task_cs(tsk), pmask); 2236 int cpu;
2197 task_unlock(tsk); 2237
2238 rcu_read_lock();
2239 cs = task_cs(tsk);
2240 if (cs)
2241 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2242 rcu_read_unlock();
2243
2244 /*
2245 * We own tsk->cpus_allowed, nobody can change it under us.
2246 *
2247 * But we used cs && cs->cpus_allowed lockless and thus can
2248 * race with cgroup_attach_task() or update_cpumask() and get
2249 * the wrong tsk->cpus_allowed. However, both cases imply the
2250 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2251 * which takes task_rq_lock().
2252 *
2253 * If we are called after it dropped the lock we must see all
2254 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporary
2255 * set any mask even if it is not right from task_cs() pov,
2256 * the pending set_cpus_allowed_ptr() will fix things.
2257 */
2258
2259 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2260 if (cpu >= nr_cpu_ids) {
2261 /*
2262 * Either tsk->cpus_allowed is wrong (see above) or it
2263 * is actually empty. The latter case is only possible
2264 * if we are racing with remove_tasks_in_empty_cpuset().
2265 * Like above we can temporary set any mask and rely on
2266 * set_cpus_allowed_ptr() as synchronization point.
2267 */
2268 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2269 cpu = cpumask_any(cpu_active_mask);
2270 }
2271
2272 return cpu;
2198} 2273}
2199 2274
2200void cpuset_init_current_mems_allowed(void) 2275void cpuset_init_current_mems_allowed(void)
@@ -2383,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2383} 2458}
2384 2459
2385/** 2460/**
2386 * cpuset_lock - lock out any changes to cpuset structures
2387 *
2388 * The out of memory (oom) code needs to mutex_lock cpusets
2389 * from being changed while it scans the tasklist looking for a
2390 * task in an overlapping cpuset. Expose callback_mutex via this
2391 * cpuset_lock() routine, so the oom code can lock it, before
2392 * locking the task list. The tasklist_lock is a spinlock, so
2393 * must be taken inside callback_mutex.
2394 */
2395
2396void cpuset_lock(void)
2397{
2398 mutex_lock(&callback_mutex);
2399}
2400
2401/**
2402 * cpuset_unlock - release lock on cpuset changes 2461 * cpuset_unlock - release lock on cpuset changes
2403 * 2462 *
2404 * Undo the lock taken in a previous cpuset_lock() call. 2463 * Undo the lock taken in a previous cpuset_lock() call.
@@ -2410,7 +2469,8 @@ void cpuset_unlock(void)
2410} 2469}
2411 2470
2412/** 2471/**
2413 * cpuset_mem_spread_node() - On which node to begin search for a page 2472 * cpuset_mem_spread_node() - On which node to begin search for a file page
2473 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2414 * 2474 *
2415 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2475 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2416 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2476 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2435,16 +2495,27 @@ void cpuset_unlock(void)
2435 * See kmem_cache_alloc_node(). 2495 * See kmem_cache_alloc_node().
2436 */ 2496 */
2437 2497
2438int cpuset_mem_spread_node(void) 2498static int cpuset_spread_node(int *rotor)
2439{ 2499{
2440 int node; 2500 int node;
2441 2501
2442 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); 2502 node = next_node(*rotor, current->mems_allowed);
2443 if (node == MAX_NUMNODES) 2503 if (node == MAX_NUMNODES)
2444 node = first_node(current->mems_allowed); 2504 node = first_node(current->mems_allowed);
2445 current->cpuset_mem_spread_rotor = node; 2505 *rotor = node;
2446 return node; 2506 return node;
2447} 2507}
2508
2509int cpuset_mem_spread_node(void)
2510{
2511 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2512}
2513
2514int cpuset_slab_spread_node(void)
2515{
2516 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2517}
2518
2448EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2519EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2449 2520
2450/** 2521/**
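The cpuset change above folds cpuset_mem_spread_node() and the new cpuset_slab_spread_node() into a shared cpuset_spread_node() helper: it walks a per-task rotor through mems_allowed and wraps around with first_node() when next_node() runs off the end. A standalone illustration of that wrap-around rotor, in plain C over a toy bitmask rather than the kernel nodemask API:

#include <stdio.h>

#define MAX_NODES 8

/* return the next set bit after 'start', or MAX_NODES if there is none */
static int next_node(int start, unsigned int allowed)
{
	for (int n = start + 1; n < MAX_NODES; n++)
		if (allowed & (1u << n))
			return n;
	return MAX_NODES;
}

static int first_node(unsigned int allowed)
{
	return next_node(-1, allowed);
}

/* round-robin rotor over the allowed set, mirroring cpuset_spread_node() */
static int spread_node(int *rotor, unsigned int allowed)
{
	int node = next_node(*rotor, allowed);

	if (node == MAX_NODES)
		node = first_node(allowed);
	*rotor = node;
	return node;
}

int main(void)
{
	unsigned int allowed = 0x2a;	/* nodes 1, 3 and 5 allowed */
	int rotor = -1;

	for (int i = 0; i < 6; i++)
		printf("%d ", spread_node(&rotor, allowed));
	printf("\n");			/* prints: 1 3 5 1 3 5 */
	return 0;
}
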
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* Internal credentials stuff
- *
- * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-/*
- * user.c
- */
-static inline void sched_switch_user(struct task_struct *p)
-{
-#ifdef CONFIG_USER_SCHED
-	sched_move_task(p);
-#endif /* CONFIG_USER_SCHED */
-}
-
diff --git a/kernel/cred.c b/kernel/cred.c
index 62af1816c235..a2d5504fbcc2 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -17,7 +17,6 @@
17#include <linux/init_task.h> 17#include <linux/init_task.h>
18#include <linux/security.h> 18#include <linux/security.h>
19#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
20#include "cred-internals.h"
21 20
22#if 0 21#if 0
23#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -348,66 +347,6 @@ struct cred *prepare_exec_creds(void)
348} 347}
349 348
350/* 349/*
351 * prepare new credentials for the usermode helper dispatcher
352 */
353struct cred *prepare_usermodehelper_creds(void)
354{
355#ifdef CONFIG_KEYS
356 struct thread_group_cred *tgcred = NULL;
357#endif
358 struct cred *new;
359
360#ifdef CONFIG_KEYS
361 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
362 if (!tgcred)
363 return NULL;
364#endif
365
366 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
367 if (!new)
368 goto free_tgcred;
369
370 kdebug("prepare_usermodehelper_creds() alloc %p", new);
371
372 memcpy(new, &init_cred, sizeof(struct cred));
373
374 atomic_set(&new->usage, 1);
375 set_cred_subscribers(new, 0);
376 get_group_info(new->group_info);
377 get_uid(new->user);
378
379#ifdef CONFIG_KEYS
380 new->thread_keyring = NULL;
381 new->request_key_auth = NULL;
382 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
383
384 atomic_set(&tgcred->usage, 1);
385 spin_lock_init(&tgcred->lock);
386 new->tgcred = tgcred;
387#endif
388
389#ifdef CONFIG_SECURITY
390 new->security = NULL;
391#endif
392 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
393 goto error;
394 validate_creds(new);
395
396 BUG_ON(atomic_read(&new->usage) != 1);
397 return new;
398
399error:
400 put_cred(new);
401 return NULL;
402
403free_tgcred:
404#ifdef CONFIG_KEYS
405 kfree(tgcred);
406#endif
407 return NULL;
408}
409
410/*
411 * Copy credentials for the new process created by fork() 350 * Copy credentials for the new process created by fork()
412 * 351 *
413 * We share if we can, but under some circumstances we have to generate a new 352 * We share if we can, but under some circumstances we have to generate a new
@@ -523,8 +462,6 @@ int commit_creds(struct cred *new)
523#endif 462#endif
524 BUG_ON(atomic_read(&new->usage) < 1); 463 BUG_ON(atomic_read(&new->usage) < 1);
525 464
526 security_commit_creds(new, old);
527
528 get_cred(new); /* we will require a ref for the subj creds too */ 465 get_cred(new); /* we will require a ref for the subj creds too */
529 466
530 /* dumpability changes */ 467 /* dumpability changes */
@@ -560,8 +497,6 @@ int commit_creds(struct cred *new)
560 atomic_dec(&old->user->processes); 497 atomic_dec(&old->user->processes);
561 alter_cred_subscribers(old, -2); 498 alter_cred_subscribers(old, -2);
562 499
563 sched_switch_user(task);
564
565 /* send notifications */ 500 /* send notifications */
566 if (new->uid != old->uid || 501 if (new->uid != old->uid ||
567 new->euid != old->euid || 502 new->euid != old->euid ||
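With prepare_usermodehelper_creds() removed and the security_commit_creds()/sched_switch_user() calls dropped from commit_creds(), credential changes still follow the usual prepare/modify/commit pattern. A brief sketch of that standard usage, assuming the era's plain uid_t fields (this is background, not something the patch introduces):

#include <linux/cred.h>
#include <linux/errno.h>

/* change the current task's fsuid via the usual prepare/commit dance */
static int set_my_fsuid(uid_t fsuid)
{
	struct cred *new;

	new = prepare_creds();		/* private, refcounted copy of current creds */
	if (!new)
		return -ENOMEM;

	new->fsuid = fsuid;

	/* commit_creds() consumes the reference; use abort_creds(new) to back out */
	return commit_creds(new);
}
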
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
+#
+# Makefile for the linux kernel debugger
+#
+
+obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
+obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..5cb7cd1de10c
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,983 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
128/* to keep track of the CPU which is doing the single stepping*/
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
155 * can be overriden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
179 /* Validate setting the breakpoint and then removing it. In the
180 * remove fails, the kernel needs to emit a bad message because we
181 * are deep trouble not being able to put things back the way we
182 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
210 * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
215 * handling exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument is only to be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
426 * If the break point removed ok at the place exception
427 * occurred, try to recover and print a warning to the end
428 * user because the user planted a breakpoint in a place that
429 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463 * holds the kgdb_active token. This must be done so that the
464 * that all the cpus wait in for the debug core will not enter
465 * again as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
536 * that was single stepping. To gaurd against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
570 * CPU in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 } else {
609 error = gdb_serial_stub(ks);
610 }
611
612 if (error == DBG_PASS_EVENT) {
613 dbg_kdb_mode = !dbg_kdb_mode;
614 kgdb_connected = 0;
615 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu);
617 goto cpu_loop;
618 } else {
619 kgdb_info[cpu].ret_state = error;
620 break;
621 }
622 }
623
624 /* Call the I/O driver's post_exception routine */
625 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception();
627
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--)
632 atomic_dec(&passive_cpu_wait[i]);
633 /*
634 * Wait till all the CPUs have quit from the debugger,
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 }
647
648kgdb_restore:
649 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
650 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
651 if (kgdb_info[sstep_cpu].task)
652 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
653 else
654 kgdb_sstep_pid = 0;
655 }
656 if (trace_on)
657 tracing_on();
658 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync();
661 clocksource_touch_watchdog();
662 local_irq_restore(flags);
663
664 return kgdb_info[cpu].ret_state;
665}
666
667/*
668 * kgdb_handle_exception() - main entry point from a kernel exception
669 *
670 * Locking hierarchy:
671 * interface locks, if any (begin_session)
672 * kgdb lock (kgdb_active)
673 */
674int
675kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{
677 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680
681 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector;
683 ks->signo = signo;
684 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs;
687
688 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
691 ret = kgdb_cpu_enter(ks, regs);
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
693 DCPU_IS_SLAVE);
694 return ret;
695}
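
As a point of reference for how this entry point gets used, an architecture's trap/notifier glue hands its exceptions in here. A minimal sketch, with the handler name, vector and error code purely illustrative:

	/* Hypothetical arch glue: hand a breakpoint trap to the debug core. */
	static int my_arch_dbg_trap(struct pt_regs *regs, int trapnr, int err_code)
	{
		/* 5 == SIGTRAP, the conventional signal for breakpoint traps */
		if (kgdb_handle_exception(trapnr, 5, err_code, regs))
			return 0;	/* debugger declined; fall back to normal handling */
		return 1;		/* handled by the debugger */
	}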
696
697int kgdb_nmicallback(int cpu, void *regs)
698{
699#ifdef CONFIG_SMP
700 struct kgdb_state kgdb_var;
701 struct kgdb_state *ks = &kgdb_var;
702
703 memset(ks, 0, sizeof(struct kgdb_state));
704 ks->cpu = cpu;
705 ks->linux_regs = regs;
706
707 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
708 atomic_read(&kgdb_active) != -1 &&
709 atomic_read(&kgdb_active) != cpu) {
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0;
714 }
715#endif
716 return 1;
717}
718
719static void kgdb_console_write(struct console *co, const char *s,
720 unsigned count)
721{
722 unsigned long flags;
723
724	/* If we're debugging, or KGDB has not connected, don't try
725	 * to print. */
726 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
727 return;
728
729 local_irq_save(flags);
730 gdbstub_msg_write(s, count);
731 local_irq_restore(flags);
732}
733
734static struct console kgdbcons = {
735 .name = "kgdb",
736 .write = kgdb_console_write,
737 .flags = CON_PRINTBUFFER | CON_ENABLED,
738 .index = -1,
739};
740
741#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty)
743{
744 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
746 return;
747 }
748 if (!kgdb_connected) {
749#ifdef CONFIG_KGDB_KDB
750 if (!dbg_kdb_mode)
751 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
752#else
753 printk(KERN_CRIT "Entering KGDB\n");
754#endif
755 }
756
757 kgdb_breakpoint();
758}
759
760static struct sysrq_key_op sysrq_dbg_op = {
761 .handler = sysrq_handle_dbg,
762 .help_msg = "debug(G)",
763 .action_msg = "DEBUG",
764};
765#endif
766
767static int kgdb_panic_event(struct notifier_block *self,
768 unsigned long val,
769 void *data)
770{
771 if (dbg_kdb_mode)
772 kdb_printf("PANIC: %s\n", (char *)data);
773 kgdb_breakpoint();
774 return NOTIFY_DONE;
775}
776
777static struct notifier_block kgdb_panic_event_nb = {
778 .notifier_call = kgdb_panic_event,
779 .priority = INT_MAX,
780};
781
782void __weak kgdb_arch_late(void)
783{
784}
785
786void __init dbg_late_init(void)
787{
788 dbg_is_early = false;
789 if (kgdb_io_module_registered)
790 kgdb_arch_late();
791 kdb_init(KDB_INIT_FULL);
792}
793
794static void kgdb_register_callbacks(void)
795{
796 if (!kgdb_io_module_registered) {
797 kgdb_io_module_registered = 1;
798 kgdb_arch_init();
799 if (!dbg_is_early)
800 kgdb_arch_late();
801 atomic_notifier_chain_register(&panic_notifier_list,
802 &kgdb_panic_event_nb);
803#ifdef CONFIG_MAGIC_SYSRQ
804 register_sysrq_key('g', &sysrq_dbg_op);
805#endif
806 if (kgdb_use_con && !kgdb_con_registered) {
807 register_console(&kgdbcons);
808 kgdb_con_registered = 1;
809 }
810 }
811}
812
813static void kgdb_unregister_callbacks(void)
814{
815 /*
816 * When this routine is called KGDB should unregister from the
817 * panic handler and clean up, making sure it is not handling any
818 * break exceptions at the time.
819 */
820 if (kgdb_io_module_registered) {
821 kgdb_io_module_registered = 0;
822 atomic_notifier_chain_unregister(&panic_notifier_list,
823 &kgdb_panic_event_nb);
824 kgdb_arch_exit();
825#ifdef CONFIG_MAGIC_SYSRQ
826 unregister_sysrq_key('g', &sysrq_dbg_op);
827#endif
828 if (kgdb_con_registered) {
829 unregister_console(&kgdbcons);
830 kgdb_con_registered = 0;
831 }
832 }
833}
834
835/*
836 * There are times a tasklet needs to be used instead of a compiled-in
837 * breakpoint, so that the exception is raised outside a kgdb I/O module.
838 * This is the case with kgdboe, where calling a breakpoint from within
839 * the I/O driver itself would be fatal.
840 */
841static void kgdb_tasklet_bpt(unsigned long ing)
842{
843 kgdb_breakpoint();
844 atomic_set(&kgdb_break_tasklet_var, 0);
845}
846
847static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
848
849void kgdb_schedule_breakpoint(void)
850{
851 if (atomic_read(&kgdb_break_tasklet_var) ||
852 atomic_read(&kgdb_active) != -1 ||
853 atomic_read(&kgdb_setting_breakpoint))
854 return;
855 atomic_inc(&kgdb_break_tasklet_var);
856 tasklet_schedule(&kgdb_tasklet_breakpoint);
857}
858EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
859
860static void kgdb_initial_breakpoint(void)
861{
862 kgdb_break_asap = 0;
863
864 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
865 kgdb_breakpoint();
866}
867
868/**
869 * kgdb_register_io_module - register KGDB IO module
870 * @new_dbg_io_ops: the io ops vector
871 *
872 * Register it with the KGDB core.
873 */
874int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
875{
876 int err;
877
878 spin_lock(&kgdb_registration_lock);
879
880 if (dbg_io_ops) {
881 spin_unlock(&kgdb_registration_lock);
882
883 printk(KERN_ERR "kgdb: Another I/O driver is already "
884 "registered with KGDB.\n");
885 return -EBUSY;
886 }
887
888 if (new_dbg_io_ops->init) {
889 err = new_dbg_io_ops->init();
890 if (err) {
891 spin_unlock(&kgdb_registration_lock);
892 return err;
893 }
894 }
895
896 dbg_io_ops = new_dbg_io_ops;
897
898 spin_unlock(&kgdb_registration_lock);
899
900 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
901 new_dbg_io_ops->name);
902
903 /* Arm KGDB now. */
904 kgdb_register_callbacks();
905
906 if (kgdb_break_asap)
907 kgdb_initial_breakpoint();
908
909 return 0;
910}
911EXPORT_SYMBOL_GPL(kgdb_register_io_module);
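
For context, a back-end I/O driver supplies its polled character hooks through struct kgdb_io and then calls the function above. A minimal sketch; the UART helpers are hypothetical placeholders and the signatures are abbreviated:

	static int my_uart_get_char(void)	/* return next char or NO_POLL_CHAR */
	{
		return my_uart_poll_rx();	/* hypothetical hardware poll */
	}

	static void my_uart_put_char(u8 c)
	{
		my_uart_poll_tx(c);		/* hypothetical hardware write */
	}

	static struct kgdb_io my_kgdb_io_ops = {
		.name		= "my_uart_kgdb",
		.read_char	= my_uart_get_char,
		.write_char	= my_uart_put_char,
	};

	/* from the driver's init path: */
	err = kgdb_register_io_module(&my_kgdb_io_ops);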
912
913/**
914 * kgdb_unregister_io_module - unregister KGDB IO module
915 * @old_dbg_io_ops: the io ops vector
916 *
917 * Unregister it with the KGDB core.
918 */
919void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
920{
921 BUG_ON(kgdb_connected);
922
923 /*
924 * KGDB is no longer able to communicate out, so
925 * unregister our callbacks and reset state.
926 */
927 kgdb_unregister_callbacks();
928
929 spin_lock(&kgdb_registration_lock);
930
931 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
932 dbg_io_ops = NULL;
933
934 spin_unlock(&kgdb_registration_lock);
935
936 printk(KERN_INFO
937 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
938 old_dbg_io_ops->name);
939}
940EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
941
942int dbg_io_get_char(void)
943{
944 int ret = dbg_io_ops->read_char();
945 if (ret == NO_POLL_CHAR)
946 return -1;
947 if (!dbg_kdb_mode)
948 return ret;
949 if (ret == 127)
950 return 8;
951 return ret;
952}
953
954/**
955 * kgdb_breakpoint - generate breakpoint exception
956 *
957 * This function will generate a breakpoint exception. It is used at the
958 * beginning of a program to sync up with a debugger and can be used
959 * otherwise as a quick means to stop program execution and "break" into
960 * the debugger.
961 */
962void kgdb_breakpoint(void)
963{
964 atomic_inc(&kgdb_setting_breakpoint);
965 wmb(); /* Sync point before breakpoint */
966 arch_kgdb_breakpoint();
967 wmb(); /* Sync point after breakpoint */
968 atomic_dec(&kgdb_setting_breakpoint);
969}
970EXPORT_SYMBOL_GPL(kgdb_breakpoint);
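
In other words, a programmed break into the debugger from anywhere in the kernel is just a call to this helper, for example (trigger condition purely illustrative):

	if (unlikely(suspect_state))	/* hypothetical condition under test */
		kgdb_breakpoint();	/* stop here and hand control to gdb/kdb */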
971
972static int __init opt_kgdb_wait(char *str)
973{
974 kgdb_break_asap = 1;
975
976 kdb_init(KDB_INIT_EARLY);
977 if (kgdb_io_module_registered)
978 kgdb_initial_breakpoint();
979
980 return 0;
981}
982
983early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
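
The exception_state field above is a bitmask of the DCPU_* values, so a CPU's role can change while an exception is in flight. The intended usage pattern, mirroring the checks in debug_core.c, looks roughly like:

	/* this CPU wants to own the debugger */
	kgdb_info[cpu].exception_state |= DCPU_WANT_MASTER;
	...
	/* later: is this CPU merely a rounded-up slave? */
	if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE)
		cpu_relax();	/* spin until the master releases the slaves */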
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..4b17b3269525
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1017 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void)
68{
69 int ret = -1;
70 int i;
71
72 /* poll any additional I/O interfaces that are defined */
73 while (ret < 0)
74 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
75 ret = kdb_poll_funcs[i]();
76 if (ret > 0)
77 break;
78 }
79 return ret;
80}
81#else
82static int gdbstub_read_wait(void)
83{
84 int ret = dbg_io_ops->read_char();
85 while (ret == NO_POLL_CHAR)
86 ret = dbg_io_ops->read_char();
87 return ret;
88}
89#endif
90/* scan for the sequence $<data>#<checksum> */
91static void get_packet(char *buffer)
92{
93 unsigned char checksum;
94 unsigned char xmitcsum;
95 int count;
96 char ch;
97
98 do {
99 /*
100 * Spin and wait around for the start character, ignore all
101 * other characters:
102 */
103 while ((ch = (gdbstub_read_wait())) != '$')
104 /* nothing */;
105
106 kgdb_connected = 1;
107 checksum = 0;
108 xmitcsum = -1;
109
110 count = 0;
111
112 /*
113 * now, read until a # or end of buffer is found:
114 */
115 while (count < (BUFMAX - 1)) {
116 ch = gdbstub_read_wait();
117 if (ch == '#')
118 break;
119 checksum = checksum + ch;
120 buffer[count] = ch;
121 count = count + 1;
122 }
123 buffer[count] = 0;
124
125 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait());
128
129 if (checksum != xmitcsum)
130 /* failed checksum */
131 dbg_io_ops->write_char('-');
132 else
133 /* successful transfer */
134 dbg_io_ops->write_char('+');
135 if (dbg_io_ops->flush)
136 dbg_io_ops->flush();
137 }
138 } while (checksum != xmitcsum);
139}
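
The framing handled by get_packet()/put_packet() is the standard GDB remote serial protocol: '$', the payload, '#', then two hex digits carrying the modulo-256 sum of the payload bytes. A small illustrative helper showing just the checksum rule (not part of the stub itself):

	/* e.g. for "m4000,4" this yields the two hex digits sent after '#' */
	static unsigned char gdb_payload_csum(const char *payload)
	{
		unsigned char sum = 0;

		while (*payload)
			sum += *payload++;	/* 8-bit wrap-around sum */
		return sum;
	}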
140
141/*
142 * Send the packet in buffer.
143 * Check for gdb connection if asked for.
144 */
145static void put_packet(char *buffer)
146{
147 unsigned char checksum;
148 int count;
149 char ch;
150
151 /*
152 * $<packet info>#<checksum>.
153 */
154 while (1) {
155 dbg_io_ops->write_char('$');
156 checksum = 0;
157 count = 0;
158
159 while ((ch = buffer[count])) {
160 dbg_io_ops->write_char(ch);
161 checksum += ch;
162 count++;
163 }
164
165 dbg_io_ops->write_char('#');
166 dbg_io_ops->write_char(hex_asc_hi(checksum));
167 dbg_io_ops->write_char(hex_asc_lo(checksum));
168 if (dbg_io_ops->flush)
169 dbg_io_ops->flush();
170
171 /* Now see what we get in reply. */
172 ch = gdbstub_read_wait();
173
174 if (ch == 3)
175 ch = gdbstub_read_wait();
176
177 /* If we get an ACK, we are done. */
178 if (ch == '+')
179 return;
180
181 /*
182 * If we get the start of another packet, this means
183 * that GDB is attempting to reconnect. We will NAK
184 * the packet being sent, and stop trying to send this
185 * packet.
186 */
187 if (ch == '$') {
188 dbg_io_ops->write_char('-');
189 if (dbg_io_ops->flush)
190 dbg_io_ops->flush();
191 return;
192 }
193 }
194}
195
196static char gdbmsgbuf[BUFMAX + 1];
197
198void gdbstub_msg_write(const char *s, int len)
199{
200 char *bufptr;
201 int wcount;
202 int i;
203
204 if (len == 0)
205 len = strlen(s);
206
207 /* 'O'utput */
208 gdbmsgbuf[0] = 'O';
209
210 /* Fill and send buffers... */
211 while (len > 0) {
212 bufptr = gdbmsgbuf + 1;
213
214 /* Calculate how many this time */
215 if ((len << 1) > (BUFMAX - 2))
216 wcount = (BUFMAX - 2) >> 1;
217 else
218 wcount = len;
219
220 /* Pack in hex chars */
221 for (i = 0; i < wcount; i++)
222 bufptr = pack_hex_byte(bufptr, s[i]);
223 *bufptr = '\0';
224
225 /* Move up */
226 s += wcount;
227 len -= wcount;
228
229 /* Write packet */
230 put_packet(gdbmsgbuf);
231 }
232}
233
234/*
235 * Convert the memory pointed to by mem into hex, placing result in
236 * buf.  Return 0 on success, or a negative error code if the
237 * memory could not be read.
238 */
239int kgdb_mem2hex(char *mem, char *buf, int count)
240{
241 char *tmp;
242 int err;
243
244 /*
245 * We use the upper half of buf as an intermediate buffer for the
246 * raw memory copy. Hex conversion will work against this one.
247 */
248 tmp = buf + count;
249
250 err = probe_kernel_read(tmp, mem, count);
251 if (!err) {
252 while (count > 0) {
253 buf = pack_hex_byte(buf, *tmp);
254 tmp++;
255 count--;
256 }
257
258 *buf = 0;
259 }
260
261 return err;
262}
263
264/*
265 * Convert the hex array pointed to by buf into binary to be placed in
266 * mem.  Return 0 on success, or a negative error code if the
267 * memory could not be written.
268 */
269int kgdb_hex2mem(char *buf, char *mem, int count)
270{
271 char *tmp_raw;
272 char *tmp_hex;
273
274 /*
275 * We use the upper half of buf as an intermediate buffer for the
276 * raw memory that is converted from hex.
277 */
278 tmp_raw = buf + count * 2;
279
280 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) {
282 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4;
285 }
286
287 return probe_kernel_write(mem, tmp_raw, count);
288}
289
290/*
291 * While we find nice hex chars, build a long_val.
292 * Return number of chars processed.
293 */
294int kgdb_hex2long(char **ptr, unsigned long *long_val)
295{
296 int hex_val;
297 int num = 0;
298 int negate = 0;
299
300 *long_val = 0;
301
302 if (**ptr == '-') {
303 negate = 1;
304 (*ptr)++;
305 }
306 while (**ptr) {
307 hex_val = hex(**ptr);
308 if (hex_val < 0)
309 break;
310
311 *long_val = (*long_val << 4) | hex_val;
312 num++;
313 (*ptr)++;
314 }
315
316 if (negate)
317 *long_val = -*long_val;
318
319 return num;
320}
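
For example, with *ptr pointing at "1f,4:" the first call returns 2, leaves *ptr on the ',' and sets the value to 0x1f; a leading '-' negates the result. A typical caller in this file looks like:

	char *p = &remcom_in_buffer[1];
	unsigned long addr;

	if (kgdb_hex2long(&p, &addr) > 0 && *p++ == ',')
		/* addr parsed, p now points past the separator */;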
321
322/*
323 * Copy the binary array pointed to by buf into mem. Fix $, #, and
324 * 0x7d escaped with 0x7d. Return -EFAULT on failure or 0 on success.
325 * The input buf is overwitten with the result to write to mem.
326 * The input buf is overwritten with the result to write to mem.
327static int kgdb_ebin2mem(char *buf, char *mem, int count)
328{
329 int size = 0;
330 char *c = buf;
331
332 while (count-- > 0) {
333 c[size] = *buf++;
334 if (c[size] == 0x7d)
335 c[size] = *buf++ ^ 0x20;
336 size++;
337 }
338
339 return probe_kernel_write(mem, c, size);
340}
341
342/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary)
344{
345 char *ptr = &remcom_in_buffer[1];
346 unsigned long addr;
347 unsigned long length;
348 int err;
349
350 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
351 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
352 if (binary)
353 err = kgdb_ebin2mem(ptr, (char *)addr, length);
354 else
355 err = kgdb_hex2mem(ptr, (char *)addr, length);
356 if (err)
357 return err;
358 if (CACHE_FLUSH_IS_SAFE)
359 flush_icache_range(addr, addr + length);
360 return 0;
361 }
362
363 return -EINVAL;
364}
365
366static void error_packet(char *pkt, int error)
367{
368 error = -error;
369 pkt[0] = 'E';
370 pkt[1] = hex_asc[(error / 10)];
371 pkt[2] = hex_asc[(error % 10)];
372 pkt[3] = '\0';
373}
374
375/*
376 * Thread ID accessors. We represent a flat TID space to GDB, where
377 * the per CPU idle threads (which under Linux all have PID 0) are
378 * remapped to negative TIDs.
379 */
380
381#define BUF_THREAD_ID_SIZE 16
382
383static char *pack_threadid(char *pkt, unsigned char *id)
384{
385 char *limit;
386
387 limit = pkt + BUF_THREAD_ID_SIZE;
388 while (pkt < limit)
389 pkt = pack_hex_byte(pkt, *id++);
390
391 return pkt;
392}
393
394static void int_to_threadref(unsigned char *id, int value)
395{
396 unsigned char *scan;
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403}
404
405static struct task_struct *getthread(struct pt_regs *regs, int tid)
406{
407 /*
408 * Non-positive TIDs are remapped to the cpu shadow information
409 */
410 if (tid == 0 || tid == -1)
411 tid = -atomic_read(&kgdb_active) - 2;
412 if (tid < -1 && tid > -NR_CPUS - 2) {
413 if (kgdb_info[-tid - 2].task)
414 return kgdb_info[-tid - 2].task;
415 else
416 return idle_task(-tid - 2);
417 }
418 if (tid <= 0) {
419 printk(KERN_ERR "KGDB: Internal thread select error\n");
420 dump_stack();
421 return NULL;
422 }
423
424 /*
425 * find_task_by_pid_ns() does not take the tasklist lock anymore
426 * but is nicely RCU locked - hence is a pretty resilient
427 * thing to use:
428 */
429 return find_task_by_pid_ns(tid, &init_pid_ns);
430}
431
432
433/*
434 * Remap normal tasks to their real PID,
435 * CPU shadow threads are mapped to -CPU - 2
436 */
437static inline int shadow_pid(int realpid)
438{
439 if (realpid)
440 return realpid;
441
442 return -raw_smp_processor_id() - 2;
443}
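
So an ordinary task is reported under its real PID, while the per-CPU idle tasks (which all have PID 0) appear to GDB as negative shadow threads, matching the -tid - 2 decoding in getthread() above:

	shadow_pid(4711) == 4711	/* normal task keeps its pid (example pid) */
	shadow_pid(0)    == -cpu - 2	/* idle task: CPU 0 -> -2, CPU 1 -> -3, ... */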
444
445/*
446 * All the functions that start with gdb_cmd are the various
447 * operations to implement the handlers for the gdbserial protocol
448 * where KGDB is communicating with an external debugger
449 */
450
451/* Handle the '?' status packets */
452static void gdb_cmd_status(struct kgdb_state *ks)
453{
454 /*
455 * We know that this packet is only sent
456 * during initial connect. So to be safe,
457 * we clear out our breakpoints now in case
458 * GDB is reconnecting.
459 */
460 dbg_remove_all_break();
461
462 remcom_out_buffer[0] = 'S';
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464}
465
466/* Handle the 'g' get registers request */
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{
469 struct task_struct *thread;
470 void *local_debuggerinfo;
471 int i;
472
473 thread = kgdb_usethread;
474 if (!thread) {
475 thread = kgdb_info[ks->cpu].task;
476 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
477 } else {
478 local_debuggerinfo = NULL;
479 for_each_online_cpu(i) {
480 /*
481			 * Try to find the task on some other
482			 * CPU, or possibly this one.  If we do
483			 * not find the matching task, we
484			 * approximate the results.
485 */
486 if (thread == kgdb_info[i].task)
487 local_debuggerinfo = kgdb_info[i].debuggerinfo;
488 }
489 }
490
491 /*
492 * All threads that don't have debuggerinfo should be
493 * in schedule() sleeping, since all other CPUs
494 * are in kgdb_wait, and thus have debuggerinfo.
495 */
496 if (local_debuggerinfo) {
497 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
498 } else {
499 /*
500 * Pull stuff saved during switch_to; nothing
501 * else is accessible (or even particularly
502 * relevant).
503 *
504 * This should be enough for a stack trace.
505 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 }
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509}
510
511/* Handle the 'G' set registers request */
512static void gdb_cmd_setregs(struct kgdb_state *ks)
513{
514 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
515
516 if (kgdb_usethread && kgdb_usethread != current) {
517 error_packet(remcom_out_buffer, -EINVAL);
518 } else {
519 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
520 strcpy(remcom_out_buffer, "OK");
521 }
522}
523
524/* Handle the 'm' memory read bytes */
525static void gdb_cmd_memread(struct kgdb_state *ks)
526{
527 char *ptr = &remcom_in_buffer[1];
528 unsigned long length;
529 unsigned long addr;
530 int err;
531
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err)
536 error_packet(remcom_out_buffer, err);
537 } else {
538 error_packet(remcom_out_buffer, -EINVAL);
539 }
540}
541
542/* Handle the 'M' memory write bytes */
543static void gdb_cmd_memwrite(struct kgdb_state *ks)
544{
545 int err = write_mem_msg(0);
546
547 if (err)
548 error_packet(remcom_out_buffer, err);
549 else
550 strcpy(remcom_out_buffer, "OK");
551}
552
553/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{
556 int err = write_mem_msg(1);
557
558 if (err)
559 error_packet(remcom_out_buffer, err);
560 else
561 strcpy(remcom_out_buffer, "OK");
562}
563
564/* Handle the 'D' or 'k', detach or kill packets */
565static void gdb_cmd_detachkill(struct kgdb_state *ks)
566{
567 int error;
568
569 /* The detach case */
570 if (remcom_in_buffer[0] == 'D') {
571 error = dbg_remove_all_break();
572 if (error < 0) {
573 error_packet(remcom_out_buffer, error);
574 } else {
575 strcpy(remcom_out_buffer, "OK");
576 kgdb_connected = 0;
577 }
578 put_packet(remcom_out_buffer);
579 } else {
580 /*
581 * Assume the kill case, with no exit code checking,
582 * trying to force detach the debugger:
583 */
584 dbg_remove_all_break();
585 kgdb_connected = 0;
586 }
587}
588
589/* Handle the 'R' reboot packets */
590static int gdb_cmd_reboot(struct kgdb_state *ks)
591{
592 /* For now, only honor R0 */
593 if (strcmp(remcom_in_buffer, "R0") == 0) {
594 printk(KERN_CRIT "Executing emergency reboot\n");
595 strcpy(remcom_out_buffer, "OK");
596 put_packet(remcom_out_buffer);
597
598 /*
599 * Execution should not return from
600 * machine_emergency_restart()
601 */
602 machine_emergency_restart();
603 kgdb_connected = 0;
604
605 return 1;
606 }
607 return 0;
608}
609
610/* Handle the 'q' query packets */
611static void gdb_cmd_query(struct kgdb_state *ks)
612{
613 struct task_struct *g;
614 struct task_struct *p;
615 unsigned char thref[8];
616 char *ptr;
617 int i;
618 int cpu;
619 int finished = 0;
620
621 switch (remcom_in_buffer[1]) {
622 case 's':
623 case 'f':
624 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
625 error_packet(remcom_out_buffer, -EINVAL);
626 break;
627 }
628
629 i = 0;
630 remcom_out_buffer[0] = 'm';
631 ptr = remcom_out_buffer + 1;
632 if (remcom_in_buffer[1] == 'f') {
633 /* Each cpu is a shadow thread */
634 for_each_online_cpu(cpu) {
635 ks->thr_query = 0;
636 int_to_threadref(thref, -cpu - 2);
637 pack_threadid(ptr, thref);
638 ptr += BUF_THREAD_ID_SIZE;
639 *(ptr++) = ',';
640 i++;
641 }
642 }
643
644 do_each_thread(g, p) {
645 if (i >= ks->thr_query && !finished) {
646 int_to_threadref(thref, p->pid);
647 pack_threadid(ptr, thref);
648 ptr += BUF_THREAD_ID_SIZE;
649 *(ptr++) = ',';
650 ks->thr_query++;
651 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
652 finished = 1;
653 }
654 i++;
655 } while_each_thread(g, p);
656
657 *(--ptr) = '\0';
658 break;
659
660 case 'C':
661 /* Current thread id */
662 strcpy(remcom_out_buffer, "QC");
663 ks->threadid = shadow_pid(current->pid);
664 int_to_threadref(thref, ks->threadid);
665 pack_threadid(remcom_out_buffer + 2, thref);
666 break;
667 case 'T':
668 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
669 error_packet(remcom_out_buffer, -EINVAL);
670 break;
671 }
672 ks->threadid = 0;
673 ptr = remcom_in_buffer + 17;
674 kgdb_hex2long(&ptr, &ks->threadid);
675 if (!getthread(ks->linux_regs, ks->threadid)) {
676 error_packet(remcom_out_buffer, -EINVAL);
677 break;
678 }
679 if ((int)ks->threadid > 0) {
680 kgdb_mem2hex(getthread(ks->linux_regs,
681 ks->threadid)->comm,
682 remcom_out_buffer, 16);
683 } else {
684 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
685
686 sprintf(tmpstr, "shadowCPU%d",
687 (int)(-ks->threadid - 2));
688 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
689 }
690 break;
691#ifdef CONFIG_KGDB_KDB
692 case 'R':
693 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
694 int len = strlen(remcom_in_buffer + 6);
695
696 if ((len % 2) != 0) {
697 strcpy(remcom_out_buffer, "E01");
698 break;
699 }
700 kgdb_hex2mem(remcom_in_buffer + 6,
701 remcom_out_buffer, len);
702 len = len / 2;
703 remcom_out_buffer[len++] = 0;
704
705 kdb_parse(remcom_out_buffer);
706 strcpy(remcom_out_buffer, "OK");
707 }
708 break;
709#endif
710 }
711}
712
713/* Handle the 'H' task query packets */
714static void gdb_cmd_task(struct kgdb_state *ks)
715{
716 struct task_struct *thread;
717 char *ptr;
718
719 switch (remcom_in_buffer[1]) {
720 case 'g':
721 ptr = &remcom_in_buffer[2];
722 kgdb_hex2long(&ptr, &ks->threadid);
723 thread = getthread(ks->linux_regs, ks->threadid);
724 if (!thread && ks->threadid > 0) {
725 error_packet(remcom_out_buffer, -EINVAL);
726 break;
727 }
728 kgdb_usethread = thread;
729 ks->kgdb_usethreadid = ks->threadid;
730 strcpy(remcom_out_buffer, "OK");
731 break;
732 case 'c':
733 ptr = &remcom_in_buffer[2];
734 kgdb_hex2long(&ptr, &ks->threadid);
735 if (!ks->threadid) {
736 kgdb_contthread = NULL;
737 } else {
738 thread = getthread(ks->linux_regs, ks->threadid);
739 if (!thread && ks->threadid > 0) {
740 error_packet(remcom_out_buffer, -EINVAL);
741 break;
742 }
743 kgdb_contthread = thread;
744 }
745 strcpy(remcom_out_buffer, "OK");
746 break;
747 }
748}
749
750/* Handle the 'T' thread query packets */
751static void gdb_cmd_thread(struct kgdb_state *ks)
752{
753 char *ptr = &remcom_in_buffer[1];
754 struct task_struct *thread;
755
756 kgdb_hex2long(&ptr, &ks->threadid);
757 thread = getthread(ks->linux_regs, ks->threadid);
758 if (thread)
759 strcpy(remcom_out_buffer, "OK");
760 else
761 error_packet(remcom_out_buffer, -EINVAL);
762}
763
764/* Handle the 'z' or 'Z' breakpoint remove or set packets */
765static void gdb_cmd_break(struct kgdb_state *ks)
766{
767 /*
768	 * Since GDB 5.3 the protocol defines '0' as a software
769	 * breakpoint and '1' as a hardware breakpoint, so follow that.
770 */
771 char *bpt_type = &remcom_in_buffer[1];
772 char *ptr = &remcom_in_buffer[2];
773 unsigned long addr;
774 unsigned long length;
775 int error = 0;
776
777 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
778 /* Unsupported */
779 if (*bpt_type > '4')
780 return;
781 } else {
782 if (*bpt_type != '0' && *bpt_type != '1')
783 /* Unsupported. */
784 return;
785 }
786
787 /*
788 * Test if this is a hardware breakpoint, and
789 * if we support it:
790 */
791 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
792 /* Unsupported. */
793 return;
794
795 if (*(ptr++) != ',') {
796 error_packet(remcom_out_buffer, -EINVAL);
797 return;
798 }
799 if (!kgdb_hex2long(&ptr, &addr)) {
800 error_packet(remcom_out_buffer, -EINVAL);
801 return;
802 }
803 if (*(ptr++) != ',' ||
804 !kgdb_hex2long(&ptr, &length)) {
805 error_packet(remcom_out_buffer, -EINVAL);
806 return;
807 }
808
809 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
810 error = dbg_set_sw_break(addr);
811 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
812 error = dbg_remove_sw_break(addr);
813 else if (remcom_in_buffer[0] == 'Z')
814 error = arch_kgdb_ops.set_hw_breakpoint(addr,
815 (int)length, *bpt_type - '0');
816 else if (remcom_in_buffer[0] == 'z')
817 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
818 (int) length, *bpt_type - '0');
819
820 if (error == 0)
821 strcpy(remcom_out_buffer, "OK");
822 else
823 error_packet(remcom_out_buffer, error);
824}
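
Concretely, the breakpoint packets parsed above look like the following (addresses and lengths illustrative):

	Z0,ffffffff810482a0,1	set a software breakpoint at that address
	z0,ffffffff810482a0,1	remove it again
	Z1,ffffffff81234568,4	set a 4-byte hardware breakpoint, if the arch supports it

The leading 'Z' or 'z' selects set or remove, the next digit is the breakpoint type, and the two comma-separated hex fields are the address and length.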
825
826/* Handle the 'C' signal / exception passing packets */
827static int gdb_cmd_exception_pass(struct kgdb_state *ks)
828{
829 /* C09 == pass exception
830 * C15 == detach kgdb, pass exception
831 */
832 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
833
834 ks->pass_exception = 1;
835 remcom_in_buffer[0] = 'c';
836
837 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
838
839 ks->pass_exception = 1;
840 remcom_in_buffer[0] = 'D';
841 dbg_remove_all_break();
842 kgdb_connected = 0;
843 return 1;
844
845 } else {
846 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
847 " and 15 (pass and disconnect)\n"
848 "Executing a continue without signal passing\n", 0);
849 remcom_in_buffer[0] = 'c';
850 }
851
852 /* Indicate fall through */
853 return -1;
854}
855
856/*
857 * This function performs all gdbserial command processing
858 */
859int gdb_serial_stub(struct kgdb_state *ks)
860{
861 int error = 0;
862 int tmp;
863
864 /* Clear the out buffer. */
865 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
866
867 if (kgdb_connected) {
868 unsigned char thref[8];
869 char *ptr;
870
871 /* Reply to host that an exception has occurred */
872 ptr = remcom_out_buffer;
873 *ptr++ = 'T';
874 ptr = pack_hex_byte(ptr, ks->signo);
875 ptr += strlen(strcpy(ptr, "thread:"));
876 int_to_threadref(thref, shadow_pid(current->pid));
877 ptr = pack_threadid(ptr, thref);
878 *ptr++ = ';';
879 put_packet(remcom_out_buffer);
880 }
881
882 kgdb_usethread = kgdb_info[ks->cpu].task;
883 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
884 ks->pass_exception = 0;
885
886 while (1) {
887 error = 0;
888
889 /* Clear the out buffer. */
890 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
891
892 get_packet(remcom_in_buffer);
893
894 switch (remcom_in_buffer[0]) {
895 case '?': /* gdbserial status */
896 gdb_cmd_status(ks);
897 break;
898 case 'g': /* return the value of the CPU registers */
899 gdb_cmd_getregs(ks);
900 break;
901 case 'G': /* set the value of the CPU registers - return OK */
902 gdb_cmd_setregs(ks);
903 break;
904 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
905 gdb_cmd_memread(ks);
906 break;
907 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_memwrite(ks);
909 break;
910 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
911 gdb_cmd_binwrite(ks);
912 break;
913 /* kill or detach. KGDB should treat this like a
914 * continue.
915 */
916 case 'D': /* Debugger detach */
917 case 'k': /* Debugger detach via kill */
918 gdb_cmd_detachkill(ks);
919 goto default_handle;
920 case 'R': /* Reboot */
921 if (gdb_cmd_reboot(ks))
922 goto default_handle;
923 break;
924 case 'q': /* query command */
925 gdb_cmd_query(ks);
926 break;
927 case 'H': /* task related */
928 gdb_cmd_task(ks);
929 break;
930 case 'T': /* Query thread status */
931 gdb_cmd_thread(ks);
932 break;
933 case 'z': /* Break point remove */
934 case 'Z': /* Break point set */
935 gdb_cmd_break(ks);
936 break;
937#ifdef CONFIG_KGDB_KDB
938		case '3': /* Escape back into kdb */
939 if (remcom_in_buffer[1] == '\0') {
940 gdb_cmd_detachkill(ks);
941 return DBG_PASS_EVENT;
942 }
943#endif
944 case 'C': /* Exception passing */
945 tmp = gdb_cmd_exception_pass(ks);
946 if (tmp > 0)
947 goto default_handle;
948 if (tmp == 0)
949 break;
950 /* Fall through on tmp < 0 */
951 case 'c': /* Continue packet */
952 case 's': /* Single step packet */
953 if (kgdb_contthread && kgdb_contthread != current) {
954 /* Can't switch threads in kgdb */
955 error_packet(remcom_out_buffer, -EINVAL);
956 break;
957 }
958 dbg_activate_sw_breakpoints();
959 /* Fall through to default processing */
960 default:
961default_handle:
962 error = kgdb_arch_handle_exception(ks->ex_vector,
963 ks->signo,
964 ks->err_code,
965 remcom_in_buffer,
966 remcom_out_buffer,
967 ks->linux_regs);
968 /*
969 * Leave cmd processing on error, detach,
970 * kill, continue, or single step.
971 */
972 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
973 remcom_in_buffer[0] == 'k') {
974 error = 0;
975 goto kgdb_exit;
976 }
977
978 }
979
980 /* reply to the request */
981 put_packet(remcom_out_buffer);
982 }
983
984kgdb_exit:
985 if (ks->pass_exception)
986 error = 1;
987 return error;
988}
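
For reference, the exception reply built at the top of this function comes out as something like "T05thread:000000000000007b;" — the 'T', the signal number as two hex digits (05 for SIGTRAP), and the 16-hex-digit shadow thread id from pack_threadid() — which is the stop reply a connected gdb uses to learn why and where the target stopped. (The thread id shown is illustrative, corresponding to pid 123.)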
989
990int gdbstub_state(struct kgdb_state *ks, char *cmd)
991{
992 int error;
993
994 switch (cmd[0]) {
995 case 'e':
996 error = kgdb_arch_handle_exception(ks->ex_vector,
997 ks->signo,
998 ks->err_code,
999 remcom_in_buffer,
1000 remcom_out_buffer,
1001 ks->linux_regs);
1002 return error;
1003 case 's':
1004 case 'c':
1005 strcpy(remcom_in_buffer, cmd);
1006 return 0;
1007 case '?':
1008 gdb_cmd_status(ks);
1009 break;
1010 case '\0':
1011 strcpy(remcom_out_buffer, "");
1012 break;
1013 }
1014 dbg_io_ops->write_char('+');
1015 put_packet(remcom_out_buffer);
1016 return 0;
1017}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
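
For a kdb_cmds line such as "set LINES 10000", the awk rule above generates roughly the following gen-kdb_cmds.c (sketch, whitespace simplified); the resulting kdb_cmds[] array is presumably consumed by the kdb core at init time:

	#include <linux/stddef.h>
	#include <linux/init.h>
	static __initdata char kdb_cmd0[] = "set LINES 10000\n";
	extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {
		kdb_cmd0,
		NULL
	};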
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
165 * kernel debugger. This allows the kdb_breakpoints to be set
166 * upon functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
223 * None.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
268 *	bp	Set breakpoint on all cpus.  Only use hardware assist if needed.
269 *	bph	Set breakpoint on all cpus.  Force use of a hardware register.
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 int free;
278 char *symname = NULL;
279 long offset = 0ul;
280 int nextarg;
281 kdb_bp_t template = {0};
282
283 if (argc == 0) {
284 /*
285 * Display breakpoint table
286 */
287 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
288 bpno++, bp++) {
289 if (bp->bp_free)
290 continue;
291 kdb_printbp(bp, bpno);
292 }
293
294 return 0;
295 }
296
297 nextarg = 1;
298 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
299 &offset, &symname);
300 if (diag)
301 return diag;
302 if (!template.bp_addr)
303 return KDB_BADINT;
304
305 /*
306 * Find an empty bp structure to allocate
307 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free)
311 break;
312 }
313
314 if (bpno == KDB_MAXBPT)
315 return KDB_TOOMANYBPT;
316
317 if (strcmp(argv[0], "bph") == 0) {
318 template.bp_type = BP_HARDWARE_BREAKPOINT;
319 diag = kdb_parsebp(argc, argv, &nextarg, &template);
320 if (diag)
321 return diag;
322 } else {
323 template.bp_type = BP_BREAKPOINT;
324 }
325
326 /*
327 * Check for clashing breakpoints.
328 *
329 * Note, in this design we can't have hardware breakpoints
330 * enabled for both read and write on the same address.
331 */
332 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
333 i++, bp_check++) {
334 if (!bp_check->bp_free &&
335 bp_check->bp_addr == template.bp_addr) {
336 kdb_printf("You already have a breakpoint at "
337 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
338 return KDB_DUPBPT;
339 }
340 }
341
342 template.bp_enabled = 1;
343
344 /*
345 * Actually allocate the breakpoint found earlier
346 */
347 *bp = template;
348 bp->bp_free = 0;
349
350 kdb_printbp(bp, bpno);
351
352 return 0;
353}
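
At the kdb prompt the syntax documented above is used along these lines (the symbol and address are illustrative):

	kdb> bp schedule
	kdb> bph 0xffff880012345678 dataw 4
	kdb> bp

The first sets an ordinary software breakpoint, the second forces a 4-byte hardware write watchpoint, and a bare bp lists the breakpoint table.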
354
355/*
356 * kdb_bc
357 *
358 * Handles the 'bc', 'be', and 'bd' commands
359 *
360 * [bd|bc|be] <breakpoint-number>
361 * [bd|bc|be] *
362 *
363 * Parameters:
364 * argc Count of arguments in argv
365 * argv Space delimited command line arguments
366 * Outputs:
367 * None.
368 * Returns:
369 * Zero for success, a kdb diagnostic for failure
370 * Locking:
371 * None.
372 * Remarks:
373 */
374static int kdb_bc(int argc, const char **argv)
375{
376 unsigned long addr;
377 kdb_bp_t *bp = NULL;
378 int lowbp = KDB_MAXBPT;
379 int highbp = 0;
380 int done = 0;
381 int i;
382 int diag = 0;
383
384 int cmd; /* KDBCMD_B? */
385#define KDBCMD_BC 0
386#define KDBCMD_BE 1
387#define KDBCMD_BD 2
388
389 if (strcmp(argv[0], "be") == 0)
390 cmd = KDBCMD_BE;
391 else if (strcmp(argv[0], "bd") == 0)
392 cmd = KDBCMD_BD;
393 else
394 cmd = KDBCMD_BC;
395
396 if (argc != 1)
397 return KDB_ARGCOUNT;
398
399 if (strcmp(argv[1], "*") == 0) {
400 lowbp = 0;
401 highbp = KDB_MAXBPT;
402 } else {
403 diag = kdbgetularg(argv[1], &addr);
404 if (diag)
405 return diag;
406
407 /*
408 * For addresses less than the maximum breakpoint number,
409 * assume that the breakpoint number is desired.
410 */
411 if (addr < KDB_MAXBPT) {
412 bp = &kdb_breakpoints[addr];
413 lowbp = highbp = addr;
414 highbp++;
415 } else {
416 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
417 i++, bp++) {
418 if (bp->bp_addr == addr) {
419 lowbp = highbp = i;
420 highbp++;
421 break;
422 }
423 }
424 }
425 }
426
427 /*
428 * Now operate on the set of breakpoints matching the input
429 * criteria (either '*' for all, or an individual breakpoint).
430 */
431 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
432 i < highbp;
433 i++, bp++) {
434 if (bp->bp_free)
435 continue;
436
437 done++;
438
439 switch (cmd) {
440 case KDBCMD_BC:
441 bp->bp_enabled = 0;
442
443 kdb_printf("Breakpoint %d at "
444 kdb_bfd_vma_fmt " cleared\n",
445 i, bp->bp_addr);
446
447 bp->bp_addr = 0;
448 bp->bp_free = 1;
449
450 break;
451 case KDBCMD_BE:
452 bp->bp_enabled = 1;
453
454 kdb_printf("Breakpoint %d at "
455 kdb_bfd_vma_fmt " enabled",
456 i, bp->bp_addr);
457
458 kdb_printf("\n");
459 break;
460 case KDBCMD_BD:
461 if (!bp->bp_enabled)
462 break;
463
464 bp->bp_enabled = 0;
465
466 kdb_printf("Breakpoint %d at "
467 kdb_bfd_vma_fmt " disabled\n",
468 i, bp->bp_addr);
469
470 break;
471 }
472 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
473 bp->bp_delay = 0;
474 KDB_STATE_CLEAR(SSBPT);
475 }
476 }
477
478 return (!done) ? KDB_BPTNOTFOUND : 0;
479}
480
481/*
482 * kdb_ss
483 *
484 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
485 * commands.
486 *
487 * ss
488 * ssb
489 *
490 * Parameters:
491 * argc Argument count
492 * argv Argument vector
493 * Outputs:
494 * None.
495 * Returns:
496 * KDB_CMD_SS[B] for success, a kdb error if failure.
497 * Locking:
498 * None.
499 * Remarks:
500 *
501 * Set the arch specific option to trigger a debug trap after the next
502 * instruction.
503 *
504 * For 'ssb', set the trace flag in the debug trap handler
505 * after printing the current insn and return directly without
506 * invoking the kdb command processor, until a branch instruction
507 * is encountered.
508 */
509
510static int kdb_ss(int argc, const char **argv)
511{
512 int ssb = 0;
513
514 ssb = (strcmp(argv[0], "ssb") == 0);
515 if (argc != 0)
516 return KDB_ARGCOUNT;
517 /*
518 * Set trace flag and go.
519 */
520 KDB_STATE_SET(DOING_SS);
521 if (ssb) {
522 KDB_STATE_SET(DOING_SSB);
523 return KDB_CMD_SSB;
524 }
525 return KDB_CMD_SS;
526}
527
528/* Initialize the breakpoint table and register breakpoint commands. */
529
530void __init kdb_initbptab(void)
531{
532 int i;
533 kdb_bp_t *bp;
534
535 /*
536 * First time initialization.
537 */
538 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
539
540 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
541 bp->bp_free = 1;
542
543 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
544 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
546 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
547 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
548 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
549 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
550 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
551 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("be", kdb_bc, "<bpnum>",
553 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
554 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
555 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
556
557 kdb_register_repeat("ss", kdb_ss, "",
558 "Single Step", 1, KDB_REPEAT_NO_ARGS);
559 kdb_register_repeat("ssb", kdb_ss, "",
560 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
561 /*
562 * Architecture dependent initialization.
563 */
564}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
73 *	Backtrace works best when the code uses frame pointers.  But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
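/*
 * Summary of the backtrace variants dispatched above on argv[0]
 * (added for reference; derived directly from the code):
 *   bt  [<addr>]   backtrace the current task, or a given stack address
 *   btp <pid>      switch to and backtrace the task with that pid
 *   btt <addr>     backtrace the task whose task_struct is at <addr>
 *   bta [<state>]  backtrace all tasks, optionally filtered by state
 *   btc [<cpu>]    backtrace the task running on each (or one) cpu
 */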
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type archkdb
8# or archkdbcpu or archkdbshort at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
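/*
 * Illustrative sketch (not part of this patch): an additional polled
 * input routine can be slotted into kdb_poll_funcs[] the same way the
 * kgdb I/O driver registers its sources.  my_poll_char() and
 * my_register_kdb_input() are hypothetical; a real poll routine must
 * return a character or -1 when no key is pending, and must not sleep.
 */
static int my_poll_char(void)
{
	return -1;		/* no key pending */
}

static void my_register_kdb_input(void)
{
	/* entries after kdb_poll_idx are NULL; the poll loop in
	 * kdb_io.c restarts at the first NULL entry */
	kdb_poll_funcs[kdb_poll_idx++] = my_poll_char;
}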
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129 * This is the interface glue that allows kdb to transition
130 * into the gdb stub.  The '?' or '' gdb serial packet
131 * response is processed here, and then control is passed
132 * to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* dropthrough */
123 case '3': /* dropthrough */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
165
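/*
 * For reference only (these symbolic names are not used by the code in
 * this patch): both the escape-sequence decoder above and
 * kdb_get_kbd_char() in kdb_keyboard.c fold cursor and editing keys
 * down to the single-byte codes that kdb_read() switches on below.
 */
enum kdb_key_code {		/* hypothetical names, values from the code */
	KDB_KEY_HOME      = 1,
	KDB_KEY_LEFT      = 2,
	KDB_KEY_DEL       = 4,
	KDB_KEY_END       = 5,
	KDB_KEY_RIGHT     = 6,
	KDB_KEY_BACKSPACE = 8,
	KDB_KEY_TAB       = 9,
	KDB_KEY_ENTER     = 13,
	KDB_KEY_DOWN      = 14,
	KDB_KEY_UP        = 16,
};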
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
181 * function. It is not reentrant - it relies on the fact
182 * that kdb runs on only one "master debug" cpu at a time.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
436
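/*
 * Minimal usage sketch (not part of this patch; it mirrors what
 * kdb_bt1() in kdb_bt.c does): a two-byte buffer makes kdb_read()
 * return after a single key, which is how per-task prompting works.
 */
static int example_pause(void)
{
	char buf[2];

	kdb_getstr(buf, sizeof(buf), "Enter <q> to end, <cr> to continue:");
	return buf[0] == 'q';
}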
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 * 0
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
500
501/*
502 * search arg1 to see if it contains arg2
503 * (kdb_main.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
536
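/*
 * How the grep flags set by parse_grep() in kdb_main.c map to matches
 * here (summary added for reference):
 *   "| grep pat"    substring match anywhere in the line
 *   "| grep ^pat"   kdb_grep_leading: match at the start of the line
 *   "| grep pat$"   kdb_grep_trailing: match at the end of the line
 *   "| grep ^pat$"  both flags: the whole line must equal the pattern
 */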
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that don't end with newlines
598 * but should be written without one:
599 * The "[nn]kdb> " prompt should
600 * appear at the front of the buffer.
601 *
602 * The "[nn]more " prompt (MOREPROMPT -> moreprompt) should
603 * also be written without a newline, but since we print
604 * that prompt ourselves, we simply set the suspend_grep
605 * flag to make its output unconditional for this
606 * recursion.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
826
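/*
 * Usage sketch (hypothetical, not part of this patch): a command
 * handler only needs to call kdb_printf(); the pager above kicks in
 * once LINES lines have been printed, and "cmdname | grep pattern"
 * filtering is applied transparently by vkdb_printf().
 */
static int kdb_count(int argc, const char **argv)
{
	int i;

	for (i = 0; i < 100; i++)
		kdb_printf("line %d\n", i);
	return 0;
}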
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
31 * Some parts (Enter release, LED change) are still polled here in a
32 * blocking fashion, but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..b724c791b6d4
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2849 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
224 * bytes The number of bytes to allocate
225 * Returns:
226 * A pointer to the allocated space on success, or NULL if the
227 * static environment buffer (KDB_ENVBUFSIZE bytes) has been
228 * exhausted.
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never free'd, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
256 * *value the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
302 * *value the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
327
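/*
 * Usage sketch (not part of this patch; it mirrors kdb_bt() and
 * kdb_read()): read an integer environment variable with a fallback
 * default, then parse one numeric command argument.
 */
static int example_arg(int argc, const char **argv)
{
	int lines = 24;		/* default if LINES is unset */
	unsigned long val;
	int diag;

	kdbgetintenv("LINES", &lines);	/* leaves the default on failure */
	if (argc != 1)
		return KDB_ARGCOUNT;
	diag = kdbgetularg(argv[1], &val);
	if (diag)
		return diag;
	kdb_printf("%d lines, argument is 0x%lx\n", lines, val);
	return 0;
}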
328/*
329 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one.
331 */
332int kdb_set(int argc, const char **argv)
333{
334 int i;
335 char *ep;
336 size_t varlen, vallen;
337
338 /*
339 * we can be invoked in two ways:
340 * set var=value argv[1]="var", argv[2]="value"
341 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
342 * - if the latter, shift 'em down.
343 */
344 if (argc == 3) {
345 argv[2] = argv[3];
346 argc--;
347 }
348
349 if (argc != 2)
350 return KDB_ARGCOUNT;
351
352 /*
353 * Check for internal variables
354 */
355 if (strcmp(argv[1], "KDBDEBUG") == 0) {
356 unsigned int debugflags;
357 char *cp;
358
359 debugflags = simple_strtoul(argv[2], &cp, 0);
360 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
361 kdb_printf("kdb: illegal debug flags '%s'\n",
362 argv[2]);
363 return 0;
364 }
365 kdb_flags = (kdb_flags &
366 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
367 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
368
369 return 0;
370 }
371
372 /*
373 * Tokenizer squashed the '=' sign. argv[1] is variable
374 * name, argv[2] = value.
375 */
376 varlen = strlen(argv[1]);
377 vallen = strlen(argv[2]);
378 ep = kdballocenv(varlen + vallen + 2);
379 if (ep == (char *)0)
380 return KDB_ENVBUFFULL;
381
382 sprintf(ep, "%s=%s", argv[1], argv[2]);
383
384 ep[varlen+vallen+1] = '\0';
385
386 for (i = 0; i < __nenv; i++) {
387 if (__env[i]
388 && ((strncmp(__env[i], argv[1], varlen) == 0)
389 && ((__env[i][varlen] == '\0')
390 || (__env[i][varlen] == '=')))) {
391 __env[i] = ep;
392 return 0;
393 }
394 }
395
396 /*
397 * Not an existing variable. Fit it into a free slot.
398 */
399 for (i = 0; i < __nenv-1; i++) {
400 if (__env[i] == (char *)0) {
401 __env[i] = ep;
402 return 0;
403 }
404 }
405
406 return KDB_ENVFULL;
407}
408
409static int kdb_check_regs(void)
410{
411 if (!kdb_current_regs) {
412 kdb_printf("No current kdb registers."
413 " You may need to select another task\n");
414 return KDB_BADREG;
415 }
416 return 0;
417}
418
419/*
420 * kdbgetaddrarg - This function is responsible for parsing an
421 * address-expression and returning the value of the expression,
422 * symbol name, and offset to the caller.
423 *
424 * The argument may consist of a numeric value (decimal or
425 * hexadecimal), a symbol name, a register name (preceded by the
426 * percent sign), an environment variable with a numeric value
427 * (preceded by a dollar sign) or a simple arithmetic expression
428 * consisting of a symbol name, +/-, and a numeric constant value
429 * (offset).
430 * Parameters:
431 * argc - count of arguments in argv
432 * argv - argument vector
433 * *nextarg - index to next unparsed argument in argv[]
435 * Outputs:
436 * *value - receives the value of the address-expression
437 * *offset - receives the offset specified, if any
438 * *name - receives the symbol name, if any
439 * *nextarg - index to next unparsed argument in argv[]
440 * Returns:
441 * zero is returned on success, a kdb diagnostic code is
442 * returned on error.
443 */
444int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
445 unsigned long *value, long *offset,
446 char **name)
447{
448 unsigned long addr;
449 unsigned long off = 0;
450 int positive;
451 int diag;
452 int found = 0;
453 char *symname;
454 char symbol = '\0';
455 char *cp;
456 kdb_symtab_t symtab;
457
458 /*
459 * Process arguments which follow the following syntax:
460 *
461 * symbol | numeric-address [+/- numeric-offset]
462 * %register
463 * $environment-variable
464 */
465
466 if (*nextarg > argc)
467 return KDB_ARGCOUNT;
468
469 symname = (char *)argv[*nextarg];
470
471 /*
472 * If there is no whitespace between the symbol
473 * or address and the '+' or '-' symbols, we
474 * remember the character and replace it with a
475 * null so the symbol/value can be properly parsed
476 */
477 cp = strpbrk(symname, "+-");
478 if (cp != NULL) {
479 symbol = *cp;
480 *cp++ = '\0';
481 }
482
483 if (symname[0] == '$') {
484 diag = kdbgetulenv(&symname[1], &addr);
485 if (diag)
486 return diag;
487 } else if (symname[0] == '%') {
488 diag = kdb_check_regs();
489 if (diag)
490 return diag;
491 /* Implement register values with % at a later time as it is
492 * arch optional.
493 */
494 return KDB_NOTIMP;
495 } else {
496 found = kdbgetsymval(symname, &symtab);
497 if (found) {
498 addr = symtab.sym_start;
499 } else {
500 diag = kdbgetularg(argv[*nextarg], &addr);
501 if (diag)
502 return diag;
503 }
504 }
505
506 if (!found)
507 found = kdbnearsym(addr, &symtab);
508
509 (*nextarg)++;
510
511 if (name)
512 *name = symname;
513 if (value)
514 *value = addr;
515 if (offset && name && *name)
516 *offset = addr - symtab.sym_start;
517
518 if ((*nextarg > argc)
519 && (symbol == '\0'))
520 return 0;
521
522 /*
523 * check for +/- and offset
524 */
525
526 if (symbol == '\0') {
527 if ((argv[*nextarg][0] != '+')
528 && (argv[*nextarg][0] != '-')) {
529 /*
530 * Not our argument. Return.
531 */
532 return 0;
533 } else {
534 positive = (argv[*nextarg][0] == '+');
535 (*nextarg)++;
536 }
537 } else
538 positive = (symbol == '+');
539
540 /*
541 * Now there must be an offset!
542 */
543 if ((*nextarg > argc)
544 && (symbol == '\0')) {
545 return KDB_INVADDRFMT;
546 }
547
548 if (!symbol) {
549 cp = (char *)argv[*nextarg];
550 (*nextarg)++;
551 }
552
553 diag = kdbgetularg(cp, &off);
554 if (diag)
555 return diag;
556
557 if (!positive)
558 off = -off;
559
560 if (offset)
561 *offset += off;
562
563 if (value)
564 *value += off;
565
566 return 0;
567}
568
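/*
 * Usage sketch (not part of this patch; it mirrors the address handling
 * in kdb_bt()): resolve a "symbol", "symbol+offset" or numeric argument.
 */
static int example_addr(int argc, const char **argv)
{
	unsigned long addr;
	long offset = 0;
	char *name = NULL;
	int nextarg = 1;
	int diag;

	if (argc < 1)
		return KDB_ARGCOUNT;
	diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, &name);
	if (diag)
		return diag;
	kdb_printf("%s resolves to 0x%lx (offset %ld)\n",
		   name ? name : argv[1], addr, offset);
	return 0;
}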
569static void kdb_cmderror(int diag)
570{
571 int i;
572
573 if (diag >= 0) {
574 kdb_printf("no error detected (diagnostic is %d)\n", diag);
575 return;
576 }
577
578 for (i = 0; i < __nkdb_err; i++) {
579 if (kdbmsgs[i].km_diag == diag) {
580 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
581 return;
582 }
583 }
584
585 kdb_printf("Unknown diag %d\n", -diag);
586}
587
588/*
589 * kdb_defcmd, kdb_defcmd2 - This function implements the 'defcmd'
590 * command which defines one command as a set of other commands,
591 * terminated by endefcmd. kdb_defcmd processes the initial
592 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
593 * the following commands until 'endefcmd'.
594 * Inputs:
595 * argc argument count
596 * argv argument vector
597 * Returns:
598 * zero for success, a kdb diagnostic if error
599 */
600struct defcmd_set {
601 int count;
602 int usable;
603 char *name;
604 char *usage;
605 char *help;
606 char **command;
607};
608static struct defcmd_set *defcmd_set;
609static int defcmd_set_count;
610static int defcmd_in_progress;
611
612/* Forward references */
613static int kdb_exec_defcmd(int argc, const char **argv);
614
615static int kdb_defcmd2(const char *cmdstr, const char *argv0)
616{
617 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
618 char **save_command = s->command;
619 if (strcmp(argv0, "endefcmd") == 0) {
620 defcmd_in_progress = 0;
621 if (!s->count)
622 s->usable = 0;
623 if (s->usable)
624 kdb_register(s->name, kdb_exec_defcmd,
625 s->usage, s->help, 0);
626 return 0;
627 }
628 if (!s->usable)
629 return KDB_NOTIMP;
630 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
631 if (!s->command) {
632 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
633 cmdstr);
634 s->usable = 0;
635 return KDB_NOTIMP;
636 }
637 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
638 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
639 kfree(save_command);
640 return 0;
641}
642
643static int kdb_defcmd(int argc, const char **argv)
644{
645 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
646 if (defcmd_in_progress) {
647 kdb_printf("kdb: nested defcmd detected, assuming missing "
648 "endefcmd\n");
649 kdb_defcmd2("endefcmd", "endefcmd");
650 }
651 if (argc == 0) {
652 int i;
653 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
654 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
655 s->usage, s->help);
656 for (i = 0; i < s->count; ++i)
657 kdb_printf("%s", s->command[i]);
658 kdb_printf("endefcmd\n");
659 }
660 return 0;
661 }
662 if (argc != 3)
663 return KDB_ARGCOUNT;
664 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
665 GFP_KDB);
666 if (!defcmd_set) {
667 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
668 argv[1]);
669 defcmd_set = save_defcmd_set;
670 return KDB_NOTIMP;
671 }
672 memcpy(defcmd_set, save_defcmd_set,
673 defcmd_set_count * sizeof(*defcmd_set));
674 kfree(save_defcmd_set);
675 s = defcmd_set + defcmd_set_count;
676 memset(s, 0, sizeof(*s));
677 s->usable = 1;
678 s->name = kdb_strdup(argv[1], GFP_KDB);
679 s->usage = kdb_strdup(argv[2], GFP_KDB);
680 s->help = kdb_strdup(argv[3], GFP_KDB);
681 if (s->usage[0] == '"') {
682 strcpy(s->usage, s->usage+1);
683 s->usage[strlen(s->usage)-1] = '\0';
684 }
685 if (s->help[0] == '"') {
686 strcpy(s->help, s->help+1);
687 s->help[strlen(s->help)-1] = '\0';
688 }
689 ++defcmd_set_count;
690 defcmd_in_progress = 1;
691 return 0;
692}
693
694/*
695 * kdb_exec_defcmd - Execute the set of commands associated with this
696 * defcmd name.
697 * Inputs:
698 * argc argument count
699 * argv argument vector
700 * Returns:
701 * zero for success, a kdb diagnostic if error
702 */
703static int kdb_exec_defcmd(int argc, const char **argv)
704{
705 int i, ret;
706 struct defcmd_set *s;
707 if (argc != 0)
708 return KDB_ARGCOUNT;
709 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
710 if (strcmp(s->name, argv[0]) == 0)
711 break;
712 }
713 if (i == defcmd_set_count) {
714 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
715 argv[0]);
716 return KDB_NOTIMP;
717 }
718 for (i = 0; i < s->count; ++i) {
719 /* Recursive use of kdb_parse, do not use argv after
720 * this point */
721 argv = NULL;
722 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
723 ret = kdb_parse(s->command[i]);
724 if (ret)
725 return ret;
726 }
727 return 0;
728}
729
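/*
 * Example (abridged from the kdb_cmds file added by this patch):
 * commands bracketed by defcmd/endefcmd become a macro that
 * kdb_defcmd2() stores and kdb_exec_defcmd() later replays through
 * kdb_parse():
 *
 *   defcmd dumpcommon "" "Common kdb debugging"
 *     set BTAPROMPT 0
 *     set LINES 10000
 *     -summary
 *     -bt
 *   endefcmd
 */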
730/* Command history */
731#define KDB_CMD_HISTORY_COUNT 32
732#define CMD_BUFLEN 200 /* kdb_printf: max printline
733 * size == 256 */
734static unsigned int cmd_head, cmd_tail;
735static unsigned int cmdptr;
736static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
737static char cmd_cur[CMD_BUFLEN];
738
739/*
740 * The "str" argument may point to something like | grep xyz
741 */
742static void parse_grep(const char *str)
743{
744 int len;
745 char *cp = (char *)str, *cp2;
746
747 /* sanity check: we should have been called with the '|' first */
748 if (*cp != '|')
749 return;
750 cp++;
751 while (isspace(*cp))
752 cp++;
753 if (strncmp(cp, "grep ", 5)) {
754 kdb_printf("invalid 'pipe', see grephelp\n");
755 return;
756 }
757 cp += 5;
758 while (isspace(*cp))
759 cp++;
760 cp2 = strchr(cp, '\n');
761 if (cp2)
762 *cp2 = '\0'; /* remove the trailing newline */
763 len = strlen(cp);
764 if (len == 0) {
765 kdb_printf("invalid 'pipe', see grephelp\n");
766 return;
767 }
768 /* now cp points to a nonzero length search string */
769 if (*cp == '"') {
770 /* allow it to be "x y z" by removing the "'s - there must
771 be two of them */
772 cp++;
773 cp2 = strchr(cp, '"');
774 if (!cp2) {
775 kdb_printf("invalid quoted string, see grephelp\n");
776 return;
777 }
778 *cp2 = '\0'; /* end the string where the 2nd " was */
779 }
780 kdb_grep_leading = 0;
781 if (*cp == '^') {
782 kdb_grep_leading = 1;
783 cp++;
784 }
785 len = strlen(cp);
786 kdb_grep_trailing = 0;
787 if (*(cp+len-1) == '$') {
788 kdb_grep_trailing = 1;
789 *(cp+len-1) = '\0';
790 }
791 len = strlen(cp);
792 if (!len)
793 return;
794 if (len >= GREP_LEN) {
795 kdb_printf("search string too long\n");
796 return;
797 }
798 strcpy(kdb_grep_string, cp);
799 kdb_grepping_flag++;
800 return;
801}
802
803/*
804 * kdb_parse - Parse the command line, search the command table for a
805 * matching command and invoke the command function. This
806 * function may be called recursively, if it is, the second call
807 * will overwrite argv and cbuf. It is the caller's
808 * responsibility to save their argv if they recursively call
809 * kdb_parse().
810 * Parameters:
811 * cmdstr The input command line to be parsed.
813 * Returns:
814 * Zero for success, a kdb diagnostic if failure.
815 * Remarks:
816 * Limited to 20 tokens.
817 *
818 * Really rudimentary tokenization. Basically only whitespace
819 * is considered a token delimiter (but special consideration
820 * is taken of the '=' sign as used by the 'set' command).
821 *
822 * The algorithm used to tokenize the input string relies on
823 * there being at least one whitespace (or otherwise useless)
824 * character between tokens as the character immediately following
825 * the token is altered in-place to a null-byte to terminate the
826 * token string.
827 */
828
829#define MAXARGC 20
830
831int kdb_parse(const char *cmdstr)
832{
833 static char *argv[MAXARGC];
834 static int argc;
835 static char cbuf[CMD_BUFLEN+2];
836 char *cp;
837 char *cpp, quoted;
838 kdbtab_t *tp;
839 int i, escaped, ignore_errors = 0, check_grep;
840
841 /*
842 * First tokenize the command string.
843 */
844 cp = (char *)cmdstr;
845 kdb_grepping_flag = check_grep = 0;
846
847 if (KDB_FLAG(CMD_INTERRUPT)) {
848 /* Previous command was interrupted, newline must not
849 * repeat the command */
850 KDB_FLAG_CLEAR(CMD_INTERRUPT);
851 KDB_STATE_SET(PAGER);
852 argc = 0; /* no repeat */
853 }
854
855 if (*cp != '\n' && *cp != '\0') {
856 argc = 0;
857 cpp = cbuf;
858 while (*cp) {
859 /* skip whitespace */
860 while (isspace(*cp))
861 cp++;
862 if ((*cp == '\0') || (*cp == '\n') ||
863 (*cp == '#' && !defcmd_in_progress))
864 break;
865 /* special case: check for | grep pattern */
866 if (*cp == '|') {
867 check_grep++;
868 break;
869 }
870 if (cpp >= cbuf + CMD_BUFLEN) {
871 kdb_printf("kdb_parse: command buffer "
872 "overflow, command ignored\n%s\n",
873 cmdstr);
874 return KDB_NOTFOUND;
875 }
876 if (argc >= MAXARGC - 1) {
877 kdb_printf("kdb_parse: too many arguments, "
878 "command ignored\n%s\n", cmdstr);
879 return KDB_NOTFOUND;
880 }
881 argv[argc++] = cpp;
882 escaped = 0;
883 quoted = '\0';
884 /* Copy to next unquoted and unescaped
885 * whitespace or '=' */
886 while (*cp && *cp != '\n' &&
887 (escaped || quoted || !isspace(*cp))) {
888 if (cpp >= cbuf + CMD_BUFLEN)
889 break;
890 if (escaped) {
891 escaped = 0;
892 *cpp++ = *cp++;
893 continue;
894 }
895 if (*cp == '\\') {
896 escaped = 1;
897 ++cp;
898 continue;
899 }
900 if (*cp == quoted)
901 quoted = '\0';
902 else if (*cp == '\'' || *cp == '"')
903 quoted = *cp;
904 *cpp = *cp++;
905 if (*cpp == '=' && !quoted)
906 break;
907 ++cpp;
908 }
909 *cpp++ = '\0'; /* Squash a ws or '=' character */
910 }
911 }
912 if (!argc)
913 return 0;
914 if (check_grep)
915 parse_grep(cp);
916 if (defcmd_in_progress) {
917 int result = kdb_defcmd2(cmdstr, argv[0]);
918 if (!defcmd_in_progress) {
919 argc = 0; /* avoid repeat on endefcmd */
920 *(argv[0]) = '\0';
921 }
922 return result;
923 }
924 if (argv[0][0] == '-' && argv[0][1] &&
925 (argv[0][1] < '0' || argv[0][1] > '9')) {
926 ignore_errors = 1;
927 ++argv[0];
928 }
929
930 for_each_kdbcmd(tp, i) {
931 if (tp->cmd_name) {
932 /*
933 * If this command is allowed to be abbreviated,
934 * check to see if this is it.
935 */
936
937 if (tp->cmd_minlen
938 && (strlen(argv[0]) <= tp->cmd_minlen)) {
939 if (strncmp(argv[0],
940 tp->cmd_name,
941 tp->cmd_minlen) == 0) {
942 break;
943 }
944 }
945
946 if (strcmp(argv[0], tp->cmd_name) == 0)
947 break;
948 }
949 }
950
951 /*
952 * If we don't find a command by this name, see if the first
953 * few characters of this match any of the known commands.
954 * e.g., md1c20 should match md.
955 */
956 if (i == kdb_max_commands) {
957 for_each_kdbcmd(tp, i) {
958 if (tp->cmd_name) {
959 if (strncmp(argv[0],
960 tp->cmd_name,
961 strlen(tp->cmd_name)) == 0) {
962 break;
963 }
964 }
965 }
966 }
967
968 if (i < kdb_max_commands) {
969 int result;
970 KDB_STATE_SET(CMD);
971 result = (*tp->cmd_func)(argc-1, (const char **)argv);
972 if (result && ignore_errors && result > KDB_CMD_GO)
973 result = 0;
974 KDB_STATE_CLEAR(CMD);
975 switch (tp->cmd_repeat) {
976 case KDB_REPEAT_NONE:
977 argc = 0;
978 if (argv[0])
979 *(argv[0]) = '\0';
980 break;
981 case KDB_REPEAT_NO_ARGS:
982 argc = 1;
983 if (argv[1])
984 *(argv[1]) = '\0';
985 break;
986 case KDB_REPEAT_WITH_ARGS:
987 break;
988 }
989 return result;
990 }
991
992 /*
993 * If the input with which we were presented does not
994 * map to an existing command, attempt to parse it as an
995 * address argument and display the result. Useful for
996 * obtaining the address of a variable, or the nearest symbol
997 * to an address contained in a register.
998 */
999 {
1000 unsigned long value;
1001 char *name = NULL;
1002 long offset;
1003 int nextarg = 0;
1004
1005 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1006 &value, &offset, &name)) {
1007 return KDB_NOTFOUND;
1008 }
1009
1010 kdb_printf("%s = ", argv[0]);
1011 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1012 kdb_printf("\n");
1013 return 0;
1014 }
1015}
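
The tokenization rule spelled out in the kdb_parse() block comment is easier to see in isolation. Below is a hypothetical, user-space sketch of the same copy loop (separate input and copy buffers, backslash escapes, quote grouping, and an unquoted '=' squashed so that "VAR=value" splits into two tokens for 'set'); overflow diagnostics are omitted, and this illustrates the algorithm rather than the kdb code path itself.

#include <ctype.h>
#include <stdio.h>

#define MAXARGC 20

/* Copy tokens from cmdstr into cbuf, splitting on unquoted, unescaped
 * whitespace; an unquoted '=' also ends a token and is squashed. */
static int tokenize(const char *cmdstr, char *cbuf, char *argv[])
{
	const char *cp = cmdstr;
	char *cpp = cbuf;
	int argc = 0, escaped = 0;
	char quoted = '\0';

	while (*cp) {
		while (isspace((unsigned char)*cp))
			cp++;
		if (*cp == '\0' || argc >= MAXARGC - 1)
			break;
		argv[argc++] = cpp;
		escaped = 0;
		quoted = '\0';
		while (*cp && (escaped || quoted ||
			       !isspace((unsigned char)*cp))) {
			if (escaped) {
				escaped = 0;
				*cpp++ = *cp++;
				continue;
			}
			if (*cp == '\\') {
				escaped = 1;
				cp++;
				continue;
			}
			if (*cp == quoted)
				quoted = '\0';
			else if (*cp == '\'' || *cp == '"')
				quoted = *cp;
			*cpp = *cp++;
			if (*cpp == '=' && !quoted)
				break;
			cpp++;
		}
		*cpp++ = '\0';	/* terminate the token in the copy buffer */
	}
	return argc;
}

int main(void)
{
	char cbuf[256];
	char *argv[MAXARGC];
	int i, argc = tokenize("set PROMPT=\"kdb> \"", cbuf, argv);

	for (i = 0; i < argc; i++)
		printf("argv[%d] = '%s'\n", i, argv[i]);
	/* prints: set / PROMPT / "kdb> " */
	return 0;
}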
1016
1017
1018static int handle_ctrl_cmd(char *cmd)
1019{
1020#define CTRL_P 16
1021#define CTRL_N 14
1022
1023 /* initial situation */
1024 if (cmd_head == cmd_tail)
1025 return 0;
1026 switch (*cmd) {
1027 case CTRL_P:
1028 if (cmdptr != cmd_tail)
1029 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1030 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1031 return 1;
1032 case CTRL_N:
1033 if (cmdptr != cmd_head)
1034 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1035 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1036 return 1;
1037 }
1038 return 0;
1039}
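
handle_ctrl_cmd() above treats cmd_hist[] as a ring indexed modulo KDB_CMD_HISTORY_COUNT. A minimal stand-alone sketch of such a Ctrl-P/Ctrl-N history ring follows, with the decrement written as (i + N - 1) % N so the cursor index never goes negative; the names and sizes are hypothetical, not the kdb implementation.

#include <stdio.h>
#include <string.h>

#define HIST_COUNT 8

struct hist {
	char buf[HIST_COUNT][64];
	int head;	/* next slot to write */
	int tail;	/* oldest stored entry */
	int ptr;	/* cursor moved by Ctrl-P / Ctrl-N */
};

static void hist_add(struct hist *h, const char *line)
{
	strncpy(h->buf[h->head], line, sizeof(h->buf[0]) - 1);
	h->head = (h->head + 1) % HIST_COUNT;
	if (h->head == h->tail)
		h->tail = (h->tail + 1) % HIST_COUNT;
	h->ptr = h->head;
}

static const char *hist_prev(struct hist *h)	/* Ctrl-P */
{
	if (h->ptr != h->tail)
		h->ptr = (h->ptr + HIST_COUNT - 1) % HIST_COUNT;
	return h->buf[h->ptr];
}

static const char *hist_next(struct hist *h)	/* Ctrl-N */
{
	if (h->ptr != h->head)
		h->ptr = (h->ptr + 1) % HIST_COUNT;
	return h->buf[h->ptr];
}

int main(void)
{
	struct hist h = { .head = 0, .tail = 0, .ptr = 0 };

	hist_add(&h, "md 0xc0000000");
	hist_add(&h, "bt");
	printf("%s\n", hist_prev(&h));	/* bt */
	printf("%s\n", hist_prev(&h));	/* md 0xc0000000 */
	printf("%s\n", hist_next(&h));	/* bt */
	return 0;
}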
1040
1041/*
1042 * kdb_reboot - This function implements the 'reboot' command. Reboot
 1043 *	the system immediately, or loop forever on failure.
1044 */
1045static int kdb_reboot(int argc, const char **argv)
1046{
1047 emergency_restart();
1048 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1049 while (1)
1050 cpu_relax();
1051 /* NOTREACHED */
1052 return 0;
1053}
1054
1055static void kdb_dumpregs(struct pt_regs *regs)
1056{
1057 int old_lvl = console_loglevel;
1058 console_loglevel = 15;
1059 kdb_trap_printk++;
1060 show_regs(regs);
1061 kdb_trap_printk--;
1062 kdb_printf("\n");
1063 console_loglevel = old_lvl;
1064}
1065
1066void kdb_set_current_task(struct task_struct *p)
1067{
1068 kdb_current_task = p;
1069
1070 if (kdb_task_has_cpu(p)) {
1071 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1072 return;
1073 }
1074 kdb_current_regs = NULL;
1075}
1076
1077/*
1078 * kdb_local - The main code for kdb. This routine is invoked on a
1079 * specific processor, it is not global. The main kdb() routine
1080 * ensures that only one processor at a time is in this routine.
1081 * This code is called with the real reason code on the first
1082 * entry to a kdb session, thereafter it is called with reason
1083 * SWITCH, even if the user goes back to the original cpu.
1084 * Inputs:
1085 * reason The reason KDB was invoked
1086 * error The hardware-defined error code
1087 * regs The exception frame at time of fault/breakpoint.
1088 * db_result Result code from the break or debug point.
1089 * Returns:
 1090 *	0	KDB was invoked for an event for which it was not responsible
1091 * 1 KDB handled the event for which it was invoked.
1092 * KDB_CMD_GO User typed 'go'.
1093 * KDB_CMD_CPU User switched to another cpu.
1094 * KDB_CMD_SS Single step.
1095 * KDB_CMD_SSB Single step until branch.
1096 */
1097static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1098 kdb_dbtrap_t db_result)
1099{
1100 char *cmdbuf;
1101 int diag;
1102 struct task_struct *kdb_current =
1103 kdb_curr_task(raw_smp_processor_id());
1104
1105 KDB_DEBUG_STATE("kdb_local 1", reason);
1106 kdb_go_count = 0;
1107 if (reason == KDB_REASON_DEBUG) {
1108 /* special case below */
1109 } else {
1110 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1111 kdb_current, kdb_current->pid);
1112#if defined(CONFIG_SMP)
1113 kdb_printf("on processor %d ", raw_smp_processor_id());
1114#endif
1115 }
1116
1117 switch (reason) {
1118 case KDB_REASON_DEBUG:
1119 {
1120 /*
1121 * If re-entering kdb after a single step
1122 * command, don't print the message.
1123 */
1124 switch (db_result) {
1125 case KDB_DB_BPT:
1126 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1127 kdb_current, kdb_current->pid);
1128#if defined(CONFIG_SMP)
1129 kdb_printf("on processor %d ", raw_smp_processor_id());
1130#endif
1131 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1132 instruction_pointer(regs));
1133 break;
1134 case KDB_DB_SSB:
1135 /*
1136 * In the midst of ssb command. Just return.
1137 */
1138 KDB_DEBUG_STATE("kdb_local 3", reason);
1139 return KDB_CMD_SSB; /* Continue with SSB command */
1140
1141 break;
1142 case KDB_DB_SS:
1143 break;
1144 case KDB_DB_SSBPT:
1145 KDB_DEBUG_STATE("kdb_local 4", reason);
1146 return 1; /* kdba_db_trap did the work */
1147 default:
1148 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1149 db_result);
1150 break;
1151 }
1152
1153 }
1154 break;
1155 case KDB_REASON_ENTER:
1156 if (KDB_STATE(KEYBOARD))
1157 kdb_printf("due to Keyboard Entry\n");
1158 else
1159 kdb_printf("due to KDB_ENTER()\n");
1160 break;
1161 case KDB_REASON_KEYBOARD:
1162 KDB_STATE_SET(KEYBOARD);
1163 kdb_printf("due to Keyboard Entry\n");
1164 break;
1165 case KDB_REASON_ENTER_SLAVE:
1166 /* drop through, slaves only get released via cpu switch */
1167 case KDB_REASON_SWITCH:
1168 kdb_printf("due to cpu switch\n");
1169 break;
1170 case KDB_REASON_OOPS:
1171 kdb_printf("Oops: %s\n", kdb_diemsg);
1172 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1173 instruction_pointer(regs));
1174 kdb_dumpregs(regs);
1175 break;
1176 case KDB_REASON_NMI:
1177 kdb_printf("due to NonMaskable Interrupt @ "
1178 kdb_machreg_fmt "\n",
1179 instruction_pointer(regs));
1180 kdb_dumpregs(regs);
1181 break;
1182 case KDB_REASON_SSTEP:
1183 case KDB_REASON_BREAK:
1184 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1185 reason == KDB_REASON_BREAK ?
1186 "Breakpoint" : "SS trap", instruction_pointer(regs));
1187 /*
1188 * Determine if this breakpoint is one that we
1189 * are interested in.
1190 */
1191 if (db_result != KDB_DB_BPT) {
1192 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1193 db_result);
1194 KDB_DEBUG_STATE("kdb_local 6", reason);
1195 return 0; /* Not for us, dismiss it */
1196 }
1197 break;
1198 case KDB_REASON_RECURSE:
1199 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1200 instruction_pointer(regs));
1201 break;
1202 default:
1203 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1204 KDB_DEBUG_STATE("kdb_local 8", reason);
1205 return 0; /* Not for us, dismiss it */
1206 }
1207
1208 while (1) {
1209 /*
1210 * Initialize pager context.
1211 */
1212 kdb_nextline = 1;
1213 KDB_STATE_CLEAR(SUPPRESS);
1214
1215 cmdbuf = cmd_cur;
1216 *cmdbuf = '\0';
1217 *(cmd_hist[cmd_head]) = '\0';
1218
1219 if (KDB_FLAG(ONLY_DO_DUMP)) {
1220 /* kdb is off but a catastrophic error requires a dump.
1221 * Take the dump and reboot.
1222 * Turn on logging so the kdb output appears in the log
1223 * buffer in the dump.
1224 */
1225 const char *setargs[] = { "set", "LOGGING", "1" };
1226 kdb_set(2, setargs);
1227 kdb_reboot(0, NULL);
1228 /*NOTREACHED*/
1229 }
1230
1231do_full_getstr:
1232#if defined(CONFIG_SMP)
1233 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1234 raw_smp_processor_id());
1235#else
1236 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1237#endif
1238 if (defcmd_in_progress)
1239 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1240
1241 /*
1242 * Fetch command from keyboard
1243 */
1244 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1245 if (*cmdbuf != '\n') {
1246 if (*cmdbuf < 32) {
1247 if (cmdptr == cmd_head) {
1248 strncpy(cmd_hist[cmd_head], cmd_cur,
1249 CMD_BUFLEN);
1250 *(cmd_hist[cmd_head] +
1251 strlen(cmd_hist[cmd_head])-1) = '\0';
1252 }
1253 if (!handle_ctrl_cmd(cmdbuf))
1254 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1255 cmdbuf = cmd_cur;
1256 goto do_full_getstr;
1257 } else {
1258 strncpy(cmd_hist[cmd_head], cmd_cur,
1259 CMD_BUFLEN);
1260 }
1261
1262 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1263 if (cmd_head == cmd_tail)
1264 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1265 }
1266
1267 cmdptr = cmd_head;
1268 diag = kdb_parse(cmdbuf);
1269 if (diag == KDB_NOTFOUND) {
1270 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1271 diag = 0;
1272 }
1273 if (diag == KDB_CMD_GO
1274 || diag == KDB_CMD_CPU
1275 || diag == KDB_CMD_SS
1276 || diag == KDB_CMD_SSB
1277 || diag == KDB_CMD_KGDB)
1278 break;
1279
1280 if (diag)
1281 kdb_cmderror(diag);
1282 }
1283 KDB_DEBUG_STATE("kdb_local 9", diag);
1284 return diag;
1285}
1286
1287
1288/*
1289 * kdb_print_state - Print the state data for the current processor
1290 * for debugging.
1291 * Inputs:
1292 * text Identifies the debug point
1293 * value Any integer value to be printed, e.g. reason code.
1294 */
1295void kdb_print_state(const char *text, int value)
1296{
1297 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1298 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1299 kdb_state);
1300}
1301
1302/*
1303 * kdb_main_loop - After initial setup and assignment of the
1304 * controlling cpu, all cpus are in this loop. One cpu is in
1305 * control and will issue the kdb prompt, the others will spin
1306 * until 'go' or cpu switch.
1307 *
1308 * To get a consistent view of the kernel stacks for all
1309 * processes, this routine is invoked from the main kdb code via
1310 * an architecture specific routine. kdba_main_loop is
1311 * responsible for making the kernel stacks consistent for all
 1312 *	processes; there should be no difference between a blocked
1313 * process and a running process as far as kdb is concerned.
1314 * Inputs:
1315 * reason The reason KDB was invoked
1316 * error The hardware-defined error code
1317 * reason2 kdb's current reason code.
1318 * Initially error but can change
 1319 *	according to kdb state.
1320 * db_result Result code from break or debug point.
1321 * regs The exception frame at time of fault/breakpoint.
1322 * should always be valid.
1323 * Returns:
 1324 *	0	KDB was invoked for an event for which it was not responsible
1325 * 1 KDB handled the event for which it was invoked.
1326 */
1327int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1328 kdb_dbtrap_t db_result, struct pt_regs *regs)
1329{
1330 int result = 1;
1331 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1332 while (1) {
1333 /*
1334 * All processors except the one that is in control
1335 * will spin here.
1336 */
1337 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1338 while (KDB_STATE(HOLD_CPU)) {
1339 /* state KDB is turned off by kdb_cpu to see if the
1340 * other cpus are still live, each cpu in this loop
1341 * turns it back on.
1342 */
1343 if (!KDB_STATE(KDB))
1344 KDB_STATE_SET(KDB);
1345 }
1346
1347 KDB_STATE_CLEAR(SUPPRESS);
1348 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1349 if (KDB_STATE(LEAVING))
1350 break; /* Another cpu said 'go' */
1351 /* Still using kdb, this processor is in control */
1352 result = kdb_local(reason2, error, regs, db_result);
1353 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1354
1355 if (result == KDB_CMD_CPU)
1356 break;
1357
1358 if (result == KDB_CMD_SS) {
1359 KDB_STATE_SET(DOING_SS);
1360 break;
1361 }
1362
1363 if (result == KDB_CMD_SSB) {
1364 KDB_STATE_SET(DOING_SS);
1365 KDB_STATE_SET(DOING_SSB);
1366 break;
1367 }
1368
1369 if (result == KDB_CMD_KGDB) {
1370 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
 1371 				kdb_printf("Entering kgdb mode; please attach a "
 1372 					   "debugger or use $D#44+ or $3#33\n");
1373 break;
1374 }
1375 if (result && result != 1 && result != KDB_CMD_GO)
1376 kdb_printf("\nUnexpected kdb_local return code %d\n",
1377 result);
1378 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1379 break;
1380 }
1381 if (KDB_STATE(DOING_SS))
1382 KDB_STATE_CLEAR(SSBPT);
1383
1384 return result;
1385}
1386
1387/*
1388 * kdb_mdr - This function implements the guts of the 'mdr', memory
1389 * read command.
1390 * mdr <addr arg>,<byte count>
1391 * Inputs:
1392 * addr Start address
1393 * count Number of bytes
1394 * Returns:
1395 * Always 0. Any errors are detected and printed by kdb_getarea.
1396 */
1397static int kdb_mdr(unsigned long addr, unsigned int count)
1398{
1399 unsigned char c;
1400 while (count--) {
1401 if (kdb_getarea(c, addr))
1402 return 0;
1403 kdb_printf("%02x", c);
1404 addr++;
1405 }
1406 kdb_printf("\n");
1407 return 0;
1408}
1409
1410/*
1411 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
 1412 *	'md8', 'mdr' and 'mds' commands.
1413 *
1414 * md|mds [<addr arg> [<line count> [<radix>]]]
1415 * mdWcN [<addr arg> [<line count> [<radix>]]]
 1416 *	where W is the width (1, 2, 4 or 8) and N is the count.
1417 * for eg., md1c20 reads 20 bytes, 1 at a time.
1418 * mdr <addr arg>,<byte count>
1419 */
1420static void kdb_md_line(const char *fmtstr, unsigned long addr,
1421 int symbolic, int nosect, int bytesperword,
1422 int num, int repeat, int phys)
1423{
1424 /* print just one line of data */
1425 kdb_symtab_t symtab;
1426 char cbuf[32];
1427 char *c = cbuf;
1428 int i;
1429 unsigned long word;
1430
1431 memset(cbuf, '\0', sizeof(cbuf));
1432 if (phys)
1433 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1434 else
1435 kdb_printf(kdb_machreg_fmt0 " ", addr);
1436
1437 for (i = 0; i < num && repeat--; i++) {
1438 if (phys) {
1439 if (kdb_getphysword(&word, addr, bytesperword))
1440 break;
1441 } else if (kdb_getword(&word, addr, bytesperword))
1442 break;
1443 kdb_printf(fmtstr, word);
1444 if (symbolic)
1445 kdbnearsym(word, &symtab);
1446 else
1447 memset(&symtab, 0, sizeof(symtab));
1448 if (symtab.sym_name) {
1449 kdb_symbol_print(word, &symtab, 0);
1450 if (!nosect) {
1451 kdb_printf("\n");
1452 kdb_printf(" %s %s "
1453 kdb_machreg_fmt " "
1454 kdb_machreg_fmt " "
1455 kdb_machreg_fmt, symtab.mod_name,
1456 symtab.sec_name, symtab.sec_start,
1457 symtab.sym_start, symtab.sym_end);
1458 }
1459 addr += bytesperword;
1460 } else {
1461 union {
1462 u64 word;
1463 unsigned char c[8];
1464 } wc;
1465 unsigned char *cp;
1466#ifdef __BIG_ENDIAN
1467 cp = wc.c + 8 - bytesperword;
1468#else
1469 cp = wc.c;
1470#endif
1471 wc.word = word;
1472#define printable_char(c) \
1473 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1474 switch (bytesperword) {
1475 case 8:
1476 *c++ = printable_char(*cp++);
1477 *c++ = printable_char(*cp++);
1478 *c++ = printable_char(*cp++);
1479 *c++ = printable_char(*cp++);
1480 addr += 4;
1481 case 4:
1482 *c++ = printable_char(*cp++);
1483 *c++ = printable_char(*cp++);
1484 addr += 2;
1485 case 2:
1486 *c++ = printable_char(*cp++);
1487 addr++;
1488 case 1:
1489 *c++ = printable_char(*cp++);
1490 addr++;
1491 break;
1492 }
1493#undef printable_char
1494 }
1495 }
1496 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1497 " ", cbuf);
1498}
1499
1500static int kdb_md(int argc, const char **argv)
1501{
1502 static unsigned long last_addr;
1503 static int last_radix, last_bytesperword, last_repeat;
1504 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1505 int nosect = 0;
1506 char fmtchar, fmtstr[64];
1507 unsigned long addr;
1508 unsigned long word;
1509 long offset = 0;
1510 int symbolic = 0;
1511 int valid = 0;
1512 int phys = 0;
1513
1514 kdbgetintenv("MDCOUNT", &mdcount);
1515 kdbgetintenv("RADIX", &radix);
1516 kdbgetintenv("BYTESPERWORD", &bytesperword);
1517
1518 /* Assume 'md <addr>' and start with environment values */
1519 repeat = mdcount * 16 / bytesperword;
1520
1521 if (strcmp(argv[0], "mdr") == 0) {
1522 if (argc != 2)
1523 return KDB_ARGCOUNT;
1524 valid = 1;
1525 } else if (isdigit(argv[0][2])) {
1526 bytesperword = (int)(argv[0][2] - '0');
1527 if (bytesperword == 0) {
1528 bytesperword = last_bytesperword;
1529 if (bytesperword == 0)
1530 bytesperword = 4;
1531 }
1532 last_bytesperword = bytesperword;
1533 repeat = mdcount * 16 / bytesperword;
1534 if (!argv[0][3])
1535 valid = 1;
1536 else if (argv[0][3] == 'c' && argv[0][4]) {
1537 char *p;
1538 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1539 mdcount = ((repeat * bytesperword) + 15) / 16;
1540 valid = !*p;
1541 }
1542 last_repeat = repeat;
1543 } else if (strcmp(argv[0], "md") == 0)
1544 valid = 1;
1545 else if (strcmp(argv[0], "mds") == 0)
1546 valid = 1;
1547 else if (strcmp(argv[0], "mdp") == 0) {
1548 phys = valid = 1;
1549 }
1550 if (!valid)
1551 return KDB_NOTFOUND;
1552
1553 if (argc == 0) {
1554 if (last_addr == 0)
1555 return KDB_ARGCOUNT;
1556 addr = last_addr;
1557 radix = last_radix;
1558 bytesperword = last_bytesperword;
1559 repeat = last_repeat;
1560 mdcount = ((repeat * bytesperword) + 15) / 16;
1561 }
1562
1563 if (argc) {
1564 unsigned long val;
1565 int diag, nextarg = 1;
1566 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1567 &offset, NULL);
1568 if (diag)
1569 return diag;
1570 if (argc > nextarg+2)
1571 return KDB_ARGCOUNT;
1572
1573 if (argc >= nextarg) {
1574 diag = kdbgetularg(argv[nextarg], &val);
1575 if (!diag) {
1576 mdcount = (int) val;
1577 repeat = mdcount * 16 / bytesperword;
1578 }
1579 }
1580 if (argc >= nextarg+1) {
1581 diag = kdbgetularg(argv[nextarg+1], &val);
1582 if (!diag)
1583 radix = (int) val;
1584 }
1585 }
1586
1587 if (strcmp(argv[0], "mdr") == 0)
1588 return kdb_mdr(addr, mdcount);
1589
1590 switch (radix) {
1591 case 10:
1592 fmtchar = 'd';
1593 break;
1594 case 16:
1595 fmtchar = 'x';
1596 break;
1597 case 8:
1598 fmtchar = 'o';
1599 break;
1600 default:
1601 return KDB_BADRADIX;
1602 }
1603
1604 last_radix = radix;
1605
1606 if (bytesperword > KDB_WORD_SIZE)
1607 return KDB_BADWIDTH;
1608
1609 switch (bytesperword) {
1610 case 8:
1611 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1612 break;
1613 case 4:
1614 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1615 break;
1616 case 2:
1617 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1618 break;
1619 case 1:
1620 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1621 break;
1622 default:
1623 return KDB_BADWIDTH;
1624 }
1625
1626 last_repeat = repeat;
1627 last_bytesperword = bytesperword;
1628
1629 if (strcmp(argv[0], "mds") == 0) {
1630 symbolic = 1;
1631 /* Do not save these changes as last_*, they are temporary mds
1632 * overrides.
1633 */
1634 bytesperword = KDB_WORD_SIZE;
1635 repeat = mdcount;
1636 kdbgetintenv("NOSECT", &nosect);
1637 }
1638
1639 /* Round address down modulo BYTESPERWORD */
1640
1641 addr &= ~(bytesperword-1);
1642
1643 while (repeat > 0) {
1644 unsigned long a;
1645 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1646
1647 if (KDB_FLAG(CMD_INTERRUPT))
1648 return 0;
1649 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1650 if (phys) {
1651 if (kdb_getphysword(&word, a, bytesperword)
1652 || word)
1653 break;
1654 } else if (kdb_getword(&word, a, bytesperword) || word)
1655 break;
1656 }
1657 n = min(num, repeat);
1658 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1659 num, repeat, phys);
1660 addr += bytesperword * n;
1661 repeat -= n;
1662 z = (z + num - 1) / num;
1663 if (z > 2) {
1664 int s = num * (z-2);
1665 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1666 " zero suppressed\n",
1667 addr, addr + bytesperword * s - 1);
1668 addr += bytesperword * s;
1669 repeat -= s;
1670 }
1671 }
1672 last_addr = addr;
1673
1674 return 0;
1675}
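
The mdWcN name parsing at the top of kdb_md() packs the word width and repeat count into the command name itself. A hypothetical stand-alone sketch of the same arithmetic, for "md8c20" (eight-byte words, 20 of them, rounded up to ten 16-byte lines):

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *cmd = "md8c20";
	int bytesperword = 0, repeat = 0, mdcount;
	char *p;

	if (isdigit((unsigned char)cmd[2]))
		bytesperword = cmd[2] - '0';		/* W: word width */
	if (cmd[3] == 'c' && cmd[4])
		repeat = (int)strtoul(cmd + 4, &p, 10);	/* N: word count */
	mdcount = ((repeat * bytesperword) + 15) / 16;	/* 16-byte lines */
	printf("width=%d repeat=%d lines=%d\n",
	       bytesperword, repeat, mdcount);	/* width=8 repeat=20 lines=10 */
	return 0;
}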
1676
1677/*
1678 * kdb_mm - This function implements the 'mm' command.
1679 * mm address-expression new-value
1680 * Remarks:
1681 * mm works on machine words, mmW works on bytes.
1682 */
1683static int kdb_mm(int argc, const char **argv)
1684{
1685 int diag;
1686 unsigned long addr;
1687 long offset = 0;
1688 unsigned long contents;
1689 int nextarg;
1690 int width;
1691
1692 if (argv[0][2] && !isdigit(argv[0][2]))
1693 return KDB_NOTFOUND;
1694
1695 if (argc < 2)
1696 return KDB_ARGCOUNT;
1697
1698 nextarg = 1;
1699 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1700 if (diag)
1701 return diag;
1702
1703 if (nextarg > argc)
1704 return KDB_ARGCOUNT;
1705 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1706 if (diag)
1707 return diag;
1708
1709 if (nextarg != argc + 1)
1710 return KDB_ARGCOUNT;
1711
1712 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1713 diag = kdb_putword(addr, contents, width);
1714 if (diag)
1715 return diag;
1716
1717 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1718
1719 return 0;
1720}
1721
1722/*
1723 * kdb_go - This function implements the 'go' command.
1724 * go [address-expression]
1725 */
1726static int kdb_go(int argc, const char **argv)
1727{
1728 unsigned long addr;
1729 int diag;
1730 int nextarg;
1731 long offset;
1732
1733 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL);
1743 if (diag)
1744 return diag;
1745 } else if (argc) {
1746 return KDB_ARGCOUNT;
1747 }
1748
1749 diag = KDB_CMD_GO;
1750 if (KDB_FLAG(CATASTROPHIC)) {
1751 kdb_printf("Catastrophic error detected\n");
1752 kdb_printf("kdb_continue_catastrophic=%d, ",
1753 kdb_continue_catastrophic);
1754 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1755 kdb_printf("type go a second time if you really want "
1756 "to continue\n");
1757 return 0;
1758 }
1759 if (kdb_continue_catastrophic == 2) {
1760 kdb_printf("forcing reboot\n");
1761 kdb_reboot(0, NULL);
1762 }
1763 kdb_printf("attempting to continue\n");
1764 }
1765 return diag;
1766}
1767
1768/*
1769 * kdb_rd - This function implements the 'rd' command.
1770 */
1771static int kdb_rd(int argc, const char **argv)
1772{
1773 int diag = kdb_check_regs();
1774 if (diag)
1775 return diag;
1776
1777 kdb_dumpregs(kdb_current_regs);
1778 return 0;
1779}
1780
1781/*
1782 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents
1784 * Remarks:
1785 * Currently doesn't allow modification of control or
1786 * debug registers.
1787 */
1788static int kdb_rm(int argc, const char **argv)
1789{
1790 int diag;
1791 int ind = 0;
1792 unsigned long contents;
1793
1794 if (argc != 2)
1795 return KDB_ARGCOUNT;
1796 /*
1797 * Allow presence or absence of leading '%' symbol.
1798 */
1799 if (argv[1][0] == '%')
1800 ind = 1;
1801
1802 diag = kdbgetularg(argv[2], &contents);
1803 if (diag)
1804 return diag;
1805
1806 diag = kdb_check_regs();
1807 if (diag)
1808 return diag;
1809 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0;
1811}
1812
1813#if defined(CONFIG_MAGIC_SYSRQ)
1814/*
1815 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1816 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1817 * sr <magic-sysrq-code>
1818 */
1819static int kdb_sr(int argc, const char **argv)
1820{
1821 if (argc != 1)
1822 return KDB_ARGCOUNT;
1823 sysrq_toggle_support(1);
1824 kdb_trap_printk++;
1825 handle_sysrq(*argv[1], NULL);
1826 kdb_trap_printk--;
1827
1828 return 0;
1829}
1830#endif /* CONFIG_MAGIC_SYSRQ */
1831
1832/*
1833 * kdb_ef - This function implements the 'regs' (display exception
1834 * frame) command. This command takes an address and expects to
1835 * find an exception frame at that address, formats and prints
1836 * it.
1837 * regs address-expression
1838 * Remarks:
1839 * Not done yet.
1840 */
1841static int kdb_ef(int argc, const char **argv)
1842{
1843 int diag;
1844 unsigned long addr;
1845 long offset;
1846 int nextarg;
1847
1848 if (argc != 1)
1849 return KDB_ARGCOUNT;
1850
1851 nextarg = 1;
1852 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1853 if (diag)
1854 return diag;
1855 show_regs((struct pt_regs *)addr);
1856 return 0;
1857}
1858
1859#if defined(CONFIG_MODULES)
1860/* modules using other modules */
1861struct module_use {
1862 struct list_head list;
1863 struct module *module_which_uses;
1864};
1865
1866/*
1867 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1868 * currently loaded kernel modules.
1869 * Mostly taken from userland lsmod.
1870 */
1871static int kdb_lsmod(int argc, const char **argv)
1872{
1873 struct module *mod;
1874
1875 if (argc != 0)
1876 return KDB_ARGCOUNT;
1877
1878 kdb_printf("Module Size modstruct Used by\n");
1879 list_for_each_entry(mod, kdb_modules, list) {
1880
1881 kdb_printf("%-20s%8u 0x%p ", mod->name,
1882 mod->core_size, (void *)mod);
1883#ifdef CONFIG_MODULE_UNLOAD
1884 kdb_printf("%4d ", module_refcount(mod));
1885#endif
1886 if (mod->state == MODULE_STATE_GOING)
1887 kdb_printf(" (Unloading)");
1888 else if (mod->state == MODULE_STATE_COMING)
1889 kdb_printf(" (Loading)");
1890 else
1891 kdb_printf(" (Live)");
1892
1893#ifdef CONFIG_MODULE_UNLOAD
1894 {
1895 struct module_use *use;
1896 kdb_printf(" [ ");
1897 list_for_each_entry(use, &mod->modules_which_use_me,
1898 list)
1899 kdb_printf("%s ", use->module_which_uses->name);
1900 kdb_printf("]\n");
1901 }
1902#endif
1903 }
1904
1905 return 0;
1906}
1907
1908#endif /* CONFIG_MODULES */
1909
1910/*
1911 * kdb_env - This function implements the 'env' command. Display the
1912 * current environment variables.
1913 */
1914
1915static int kdb_env(int argc, const char **argv)
1916{
1917 int i;
1918
1919 for (i = 0; i < __nenv; i++) {
1920 if (__env[i])
1921 kdb_printf("%s\n", __env[i]);
1922 }
1923
1924 if (KDB_DEBUG(MASK))
1925 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
1926
1927 return 0;
1928}
1929
1930#ifdef CONFIG_PRINTK
1931/*
1932 * kdb_dmesg - This function implements the 'dmesg' command to display
1933 * the contents of the syslog buffer.
1934 * dmesg [lines] [adjust]
1935 */
1936static int kdb_dmesg(int argc, const char **argv)
1937{
1938 char *syslog_data[4], *start, *end, c = '\0', *p;
1939 int diag, logging, logsize, lines = 0, adjust = 0, n;
1940
1941 if (argc > 2)
1942 return KDB_ARGCOUNT;
1943 if (argc) {
1944 char *cp;
1945 lines = simple_strtol(argv[1], &cp, 0);
1946 if (*cp)
1947 lines = 0;
1948 if (argc > 1) {
1949 adjust = simple_strtoul(argv[2], &cp, 0);
1950 if (*cp || adjust < 0)
1951 adjust = 0;
1952 }
1953 }
1954
1955 /* disable LOGGING if set */
1956 diag = kdbgetintenv("LOGGING", &logging);
1957 if (!diag && logging) {
1958 const char *setargs[] = { "set", "LOGGING", "0" };
1959 kdb_set(2, setargs);
1960 }
1961
1962 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
1963 * logical start, end+1. */
1964 kdb_syslog_data(syslog_data);
1965 if (syslog_data[2] == syslog_data[3])
1966 return 0;
1967 logsize = syslog_data[1] - syslog_data[0];
1968 start = syslog_data[2];
1969 end = syslog_data[3];
1970#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
1971 for (n = 0, p = start; p < end; ++p) {
1972 c = *KDB_WRAP(p);
1973 if (c == '\n')
1974 ++n;
1975 }
1976 if (c != '\n')
1977 ++n;
1978 if (lines < 0) {
1979 if (adjust >= n)
1980 kdb_printf("buffer only contains %d lines, nothing "
1981 "printed\n", n);
1982 else if (adjust - lines >= n)
1983 kdb_printf("buffer only contains %d lines, last %d "
1984 "lines printed\n", n, n - adjust);
1985 if (adjust) {
1986 for (; start < end && adjust; ++start) {
1987 if (*KDB_WRAP(start) == '\n')
1988 --adjust;
1989 }
1990 if (start < end)
1991 ++start;
1992 }
1993 for (p = start; p < end && lines; ++p) {
1994 if (*KDB_WRAP(p) == '\n')
1995 ++lines;
1996 }
1997 end = p;
1998 } else if (lines > 0) {
1999 int skip = n - (adjust + lines);
2000 if (adjust >= n) {
2001 kdb_printf("buffer only contains %d lines, "
2002 "nothing printed\n", n);
2003 skip = n;
2004 } else if (skip < 0) {
2005 lines += skip;
2006 skip = 0;
2007 kdb_printf("buffer only contains %d lines, first "
2008 "%d lines printed\n", n, lines);
2009 }
2010 for (; start < end && skip; ++start) {
2011 if (*KDB_WRAP(start) == '\n')
2012 --skip;
2013 }
2014 for (p = start; p < end && lines; ++p) {
2015 if (*KDB_WRAP(p) == '\n')
2016 --lines;
2017 }
2018 end = p;
2019 }
2020 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2021 c = '\n';
2022 while (start != end) {
2023 char buf[201];
2024 p = buf;
2025 if (KDB_FLAG(CMD_INTERRUPT))
2026 return 0;
2027 while (start < end && (c = *KDB_WRAP(start)) &&
2028 (p - buf) < sizeof(buf)-1) {
2029 ++start;
2030 *p++ = c;
2031 if (c == '\n')
2032 break;
2033 }
2034 *p = '\0';
2035 kdb_printf("%s", buf);
2036 }
2037 if (c != '\n')
2038 kdb_printf("\n");
2039
2040 return 0;
2041}
2042#endif /* CONFIG_PRINTK */
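
The KDB_WRAP() macro in kdb_dmesg() folds a logical position in the syslog stream back into the physical ring buffer by taking the offset modulo the buffer size. A tiny stand-alone illustration of the same folding, with made-up buffer contents:

#include <stdio.h>

int main(void)
{
	const char ring[] = "ABCDEFGH";	/* physical buffer, 8 bytes */
	int logsize = 8;
	int lp;

	/* Logical positions 5..12 are the newest 8 characters after the
	 * writer has wrapped once; fold them back with a modulo on the
	 * offset, as KDB_WRAP() does. */
	for (lp = 5; lp < 13; lp++)
		putchar(ring[lp % logsize]);
	putchar('\n');	/* prints FGHABCDE */
	return 0;
}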
2043/*
2044 * kdb_cpu - This function implements the 'cpu' command.
2045 * cpu [<cpunum>]
2046 * Returns:
2047 * KDB_CMD_CPU for success, a kdb diagnostic if error
2048 */
2049static void kdb_cpu_status(void)
2050{
2051 int i, start_cpu, first_print = 1;
2052 char state, prev_state = '?';
2053
2054 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2055 kdb_printf("Available cpus: ");
2056 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2057 if (!cpu_online(i)) {
2058 state = 'F'; /* cpu is offline */
2059 } else {
2060 state = ' '; /* cpu is responding to kdb */
2061 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2062 state = 'I'; /* idle task */
2063 }
2064 if (state != prev_state) {
2065 if (prev_state != '?') {
2066 if (!first_print)
2067 kdb_printf(", ");
2068 first_print = 0;
2069 kdb_printf("%d", start_cpu);
2070 if (start_cpu < i-1)
2071 kdb_printf("-%d", i-1);
2072 if (prev_state != ' ')
2073 kdb_printf("(%c)", prev_state);
2074 }
2075 prev_state = state;
2076 start_cpu = i;
2077 }
2078 }
2079 /* print the trailing cpus, ignoring them if they are all offline */
2080 if (prev_state != 'F') {
2081 if (!first_print)
2082 kdb_printf(", ");
2083 kdb_printf("%d", start_cpu);
2084 if (start_cpu < i-1)
2085 kdb_printf("-%d", i-1);
2086 if (prev_state != ' ')
2087 kdb_printf("(%c)", prev_state);
2088 }
2089 kdb_printf("\n");
2090}
2091
2092static int kdb_cpu(int argc, const char **argv)
2093{
2094 unsigned long cpunum;
2095 int diag;
2096
2097 if (argc == 0) {
2098 kdb_cpu_status();
2099 return 0;
2100 }
2101
2102 if (argc != 1)
2103 return KDB_ARGCOUNT;
2104
2105 diag = kdbgetularg(argv[1], &cpunum);
2106 if (diag)
2107 return diag;
2108
2109 /*
2110 * Validate cpunum
2111 */
 2112 	if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2113 return KDB_BADCPUNUM;
2114
2115 dbg_switch_cpu = cpunum;
2116
2117 /*
2118 * Switch to other cpu
2119 */
2120 return KDB_CMD_CPU;
2121}
2122
2123/* The user may not realize that ps/bta with no parameters does not print idle
2124 * or sleeping system daemon processes, so tell them how many were suppressed.
2125 */
2126void kdb_ps_suppressed(void)
2127{
2128 int idle = 0, daemon = 0;
2129 unsigned long mask_I = kdb_task_state_string("I"),
2130 mask_M = kdb_task_state_string("M");
2131 unsigned long cpu;
2132 const struct task_struct *p, *g;
2133 for_each_online_cpu(cpu) {
2134 p = kdb_curr_task(cpu);
2135 if (kdb_task_state(p, mask_I))
2136 ++idle;
2137 }
2138 kdb_do_each_thread(g, p) {
2139 if (kdb_task_state(p, mask_M))
2140 ++daemon;
2141 } kdb_while_each_thread(g, p);
2142 if (idle || daemon) {
2143 if (idle)
2144 kdb_printf("%d idle process%s (state I)%s\n",
2145 idle, idle == 1 ? "" : "es",
2146 daemon ? " and " : "");
2147 if (daemon)
2148 kdb_printf("%d sleeping system daemon (state M) "
2149 "process%s", daemon,
2150 daemon == 1 ? "" : "es");
2151 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2152 }
2153}
2154
2155/*
2156 * kdb_ps - This function implements the 'ps' command which shows a
2157 * list of the active processes.
2158 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2159 */
2160void kdb_ps1(const struct task_struct *p)
2161{
2162 int cpu;
2163 unsigned long tmp;
2164
2165 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2166 return;
2167
2168 cpu = kdb_process_cpu(p);
2169 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2170 (void *)p, p->pid, p->parent->pid,
2171 kdb_task_has_cpu(p), kdb_process_cpu(p),
2172 kdb_task_state_char(p),
2173 (void *)(&p->thread),
2174 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2175 p->comm);
2176 if (kdb_task_has_cpu(p)) {
2177 if (!KDB_TSK(cpu)) {
2178 kdb_printf(" Error: no saved data for this cpu\n");
2179 } else {
2180 if (KDB_TSK(cpu) != p)
2181 kdb_printf(" Error: does not match running "
2182 "process table (0x%p)\n", KDB_TSK(cpu));
2183 }
2184 }
2185}
2186
2187static int kdb_ps(int argc, const char **argv)
2188{
2189 struct task_struct *g, *p;
2190 unsigned long mask, cpu;
2191
2192 if (argc == 0)
2193 kdb_ps_suppressed();
2194 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2195 (int)(2*sizeof(void *))+2, "Task Addr",
2196 (int)(2*sizeof(void *))+2, "Thread");
2197 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2198 /* Run the active tasks first */
2199 for_each_online_cpu(cpu) {
2200 if (KDB_FLAG(CMD_INTERRUPT))
2201 return 0;
2202 p = kdb_curr_task(cpu);
2203 if (kdb_task_state(p, mask))
2204 kdb_ps1(p);
2205 }
2206 kdb_printf("\n");
2207 /* Now the real tasks */
2208 kdb_do_each_thread(g, p) {
2209 if (KDB_FLAG(CMD_INTERRUPT))
2210 return 0;
2211 if (kdb_task_state(p, mask))
2212 kdb_ps1(p);
2213 } kdb_while_each_thread(g, p);
2214
2215 return 0;
2216}
2217
2218/*
2219 * kdb_pid - This function implements the 'pid' command which switches
2220 * the currently active process.
2221 * pid [<pid> | R]
2222 */
2223static int kdb_pid(int argc, const char **argv)
2224{
2225 struct task_struct *p;
2226 unsigned long val;
2227 int diag;
2228
2229 if (argc > 1)
2230 return KDB_ARGCOUNT;
2231
2232 if (argc) {
2233 if (strcmp(argv[1], "R") == 0) {
2234 p = KDB_TSK(kdb_initial_cpu);
2235 } else {
2236 diag = kdbgetularg(argv[1], &val);
2237 if (diag)
2238 return KDB_BADINT;
2239
2240 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2241 if (!p) {
2242 kdb_printf("No task with pid=%d\n", (pid_t)val);
2243 return 0;
2244 }
2245 }
2246 kdb_set_current_task(p);
2247 }
2248 kdb_printf("KDB current process is %s(pid=%d)\n",
2249 kdb_current_task->comm,
2250 kdb_current_task->pid);
2251
2252 return 0;
2253}
2254
2255/*
2256 * kdb_ll - This function implements the 'll' command which follows a
2257 * linked list and executes an arbitrary command for each
2258 * element.
2259 */
2260static int kdb_ll(int argc, const char **argv)
2261{
2262 int diag;
2263 unsigned long addr;
2264 long offset = 0;
2265 unsigned long va;
2266 unsigned long linkoffset;
2267 int nextarg;
2268 const char *command;
2269
2270 if (argc != 3)
2271 return KDB_ARGCOUNT;
2272
2273 nextarg = 1;
2274 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2275 if (diag)
2276 return diag;
2277
2278 diag = kdbgetularg(argv[2], &linkoffset);
2279 if (diag)
2280 return diag;
2281
2282 /*
 2283	 * Use the starting address as the first element in the
 2284	 * list, and assume that the list ends with a null
 2285	 * pointer.
2286 */
2287
2288 va = addr;
2289 command = kdb_strdup(argv[3], GFP_KDB);
2290 if (!command) {
2291 kdb_printf("%s: cannot duplicate command\n", __func__);
2292 return 0;
2293 }
2294 /* Recursive use of kdb_parse, do not use argv after this point */
2295 argv = NULL;
2296
2297 while (va) {
2298 char buf[80];
2299
2300 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2301 diag = kdb_parse(buf);
2302 if (diag)
2303 return diag;
2304
2305 addr = va + linkoffset;
2306 if (kdb_getword(&va, addr, sizeof(va)))
2307 return 0;
2308 }
2309 kfree(command);
2310
2311 return 0;
2312}
2313
2314static int kdb_kgdb(int argc, const char **argv)
2315{
2316 return KDB_CMD_KGDB;
2317}
2318
2319/*
2320 * kdb_help - This function implements the 'help' and '?' commands.
2321 */
2322static int kdb_help(int argc, const char **argv)
2323{
2324 kdbtab_t *kt;
2325 int i;
2326
2327 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2328 kdb_printf("-----------------------------"
2329 "-----------------------------\n");
2330 for_each_kdbcmd(kt, i) {
2331 if (kt->cmd_name)
2332 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2333 kt->cmd_usage, kt->cmd_help);
2334 if (KDB_FLAG(CMD_INTERRUPT))
2335 return 0;
2336 }
2337 return 0;
2338}
2339
2340/*
 2341 * kdb_kill - This function implements the 'kill' command.
2342 */
2343static int kdb_kill(int argc, const char **argv)
2344{
2345 long sig, pid;
2346 char *endp;
2347 struct task_struct *p;
2348 struct siginfo info;
2349
2350 if (argc != 2)
2351 return KDB_ARGCOUNT;
2352
2353 sig = simple_strtol(argv[1], &endp, 0);
2354 if (*endp)
2355 return KDB_BADINT;
2356 if (sig >= 0) {
 2357 		kdb_printf("Invalid signal parameter; use a negative <-signal>\n");
2358 return 0;
2359 }
2360 sig = -sig;
2361
2362 pid = simple_strtol(argv[2], &endp, 0);
2363 if (*endp)
2364 return KDB_BADINT;
2365 if (pid <= 0) {
 2366 		kdb_printf("Process ID must be greater than 0.\n");
2367 return 0;
2368 }
2369
2370 /* Find the process. */
2371 p = find_task_by_pid_ns(pid, &init_pid_ns);
2372 if (!p) {
 2373 		kdb_printf("The specified process was not found.\n");
2374 return 0;
2375 }
2376 p = p->group_leader;
2377 info.si_signo = sig;
2378 info.si_errno = 0;
2379 info.si_code = SI_USER;
2380 info.si_pid = pid; /* same capabilities as process being signalled */
2381 info.si_uid = 0; /* kdb has root authority */
2382 kdb_send_sig_info(p, &info);
2383 return 0;
2384}
2385
2386struct kdb_tm {
2387 int tm_sec; /* seconds */
2388 int tm_min; /* minutes */
2389 int tm_hour; /* hours */
2390 int tm_mday; /* day of the month */
2391 int tm_mon; /* month */
2392 int tm_year; /* year */
2393};
2394
2395static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2396{
2397 /* This will work from 1970-2099, 2100 is not a leap year */
2398 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2399 31, 30, 31, 30, 31 };
2400 memset(tm, 0, sizeof(*tm));
2401 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2402 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2403 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2404 tm->tm_min = tm->tm_sec / 60 % 60;
2405 tm->tm_hour = tm->tm_sec / 60 / 60;
2406 tm->tm_sec = tm->tm_sec % 60;
2407 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2408 tm->tm_mday %= (4*365+1);
2409 mon_day[1] = 29;
2410 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2411 tm->tm_mday -= mon_day[tm->tm_mon];
2412 if (++tm->tm_mon == 12) {
2413 tm->tm_mon = 0;
2414 ++tm->tm_year;
2415 mon_day[1] = 28;
2416 }
2417 }
2418 ++tm->tm_mday;
2419}
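
kdb_gmtime() works by shifting the day base from 1970 to 1968 so that every four-year block starts with a leap year, then peeling off whole months. The stand-alone sketch below repeats that arithmetic and checks it against a well-known timestamp; it is an illustration, not kernel code, and like the original it is only valid for 1970-2099.

#include <stdio.h>

struct tm_example {
	int sec, min, hour, mday, mon, year;
};

static void epoch_to_tm(long tv_sec, struct tm_example *tm)
{
	static const int mon_day[2][12] = {
		{ 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
		{ 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 },
	};
	int leap = 1;	/* each 4-year block starts on a leap year (1968, ...) */

	tm->sec = tv_sec % (24 * 60 * 60);
	tm->mday = tv_sec / (24 * 60 * 60) + (2 * 365 + 1); /* base 1968 */
	tm->min = tm->sec / 60 % 60;
	tm->hour = tm->sec / 60 / 60;
	tm->sec = tm->sec % 60;
	tm->year = 68 + 4 * (tm->mday / (4 * 365 + 1));
	tm->mday %= (4 * 365 + 1);
	tm->mon = 0;
	while (tm->mday >= mon_day[leap][tm->mon]) {
		tm->mday -= mon_day[leap][tm->mon];
		if (++tm->mon == 12) {
			tm->mon = 0;
			++tm->year;
			leap = 0;	/* later years in the block are not leap */
		}
	}
	++tm->mday;
}

int main(void)
{
	struct tm_example tm;

	epoch_to_tm(1234567890L, &tm);
	printf("%04d-%02d-%02d %02d:%02d:%02d\n", 1900 + tm.year,
	       tm.mon + 1, tm.mday, tm.hour, tm.min, tm.sec);
	/* prints 2009-02-13 23:31:30 (UTC) */
	return 0;
}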
2420
2421/*
2422 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
 2423 * I cannot call that code directly from kdb; it has an unconditional
2424 * cli()/sti() and calls routines that take locks which can stop the debugger.
2425 */
2426static void kdb_sysinfo(struct sysinfo *val)
2427{
2428 struct timespec uptime;
2429 do_posix_clock_monotonic_gettime(&uptime);
2430 memset(val, 0, sizeof(*val));
2431 val->uptime = uptime.tv_sec;
2432 val->loads[0] = avenrun[0];
2433 val->loads[1] = avenrun[1];
2434 val->loads[2] = avenrun[2];
2435 val->procs = nr_threads-1;
2436 si_meminfo(val);
2437
2438 return;
2439}
2440
2441/*
2442 * kdb_summary - This function implements the 'summary' command.
2443 */
2444static int kdb_summary(int argc, const char **argv)
2445{
2446 struct kdb_tm tm;
2447 struct sysinfo val;
2448
2449 if (argc)
2450 return KDB_ARGCOUNT;
2451
2452 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2453 kdb_printf("release %s\n", init_uts_ns.name.release);
2454 kdb_printf("version %s\n", init_uts_ns.name.version);
2455 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2456 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2457 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2458 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2459
2460 kdb_gmtime(&xtime, &tm);
2461 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2462 "tz_minuteswest %d\n",
2463 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2464 tm.tm_hour, tm.tm_min, tm.tm_sec,
2465 sys_tz.tz_minuteswest);
2466
2467 kdb_sysinfo(&val);
2468 kdb_printf("uptime ");
2469 if (val.uptime > (24*60*60)) {
2470 int days = val.uptime / (24*60*60);
2471 val.uptime %= (24*60*60);
2472 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2473 }
2474 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2475
2476 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2477
2478#define LOAD_INT(x) ((x) >> FSHIFT)
2479#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2480 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2481 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2482 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2483 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2484#undef LOAD_INT
2485#undef LOAD_FRAC
2486 /* Display in kilobytes */
2487#define K(x) ((x) << (PAGE_SHIFT - 10))
2488 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2489 "Buffers: %8lu kB\n",
2490 val.totalram, val.freeram, val.bufferram);
2491 return 0;
2492}
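
The LOAD_INT()/LOAD_FRAC() pair above converts the kernel's fixed-point load averages, which carry FSHIFT fractional bits (FIXED_1 = 1 << FSHIFT, with FSHIFT = 11 in the kernel's sched.h), into the usual two-decimal form. A user-space sketch with a made-up sample value:

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	unsigned long avenrun0 = 1259;	/* hypothetical avenrun[0] sample */

	/* integer part: 1259 >> 11 = 0; fraction: (1259 * 100) >> 11 = 61 */
	printf("load avg %lu.%02lu\n",
	       LOAD_INT(avenrun0), LOAD_FRAC(avenrun0));	/* 0.61 */
	return 0;
}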
2493
2494/*
2495 * kdb_per_cpu - This function implements the 'per_cpu' command.
2496 */
2497static int kdb_per_cpu(int argc, const char **argv)
2498{
2499 char buf[256], fmtstr[64];
2500 kdb_symtab_t symtab;
2501 cpumask_t suppress = CPU_MASK_NONE;
2502 int cpu, diag;
2503 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2504
2505 if (argc < 1 || argc > 3)
2506 return KDB_ARGCOUNT;
2507
2508 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2509 if (!kdbgetsymval(buf, &symtab)) {
2510 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2511 return KDB_BADADDR;
2512 }
2513 if (argc >= 2) {
2514 diag = kdbgetularg(argv[2], &bytesperword);
2515 if (diag)
2516 return diag;
2517 }
2518 if (!bytesperword)
2519 bytesperword = KDB_WORD_SIZE;
2520 else if (bytesperword > KDB_WORD_SIZE)
2521 return KDB_BADWIDTH;
2522 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2523 if (argc >= 3) {
2524 diag = kdbgetularg(argv[3], &whichcpu);
2525 if (diag)
2526 return diag;
2527 if (!cpu_online(whichcpu)) {
2528 kdb_printf("cpu %ld is not online\n", whichcpu);
2529 return KDB_BADCPUNUM;
2530 }
2531 }
2532
2533 /* Most architectures use __per_cpu_offset[cpu], some use
 2534 	 * __per_cpu_offset(cpu), and non-SMP builds have no __per_cpu_offset.
2535 */
2536#ifdef __per_cpu_offset
2537#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2538#else
2539#ifdef CONFIG_SMP
2540#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2541#else
2542#define KDB_PCU(cpu) 0
2543#endif
2544#endif
2545
2546 for_each_online_cpu(cpu) {
2547 if (whichcpu != ~0UL && whichcpu != cpu)
2548 continue;
2549 addr = symtab.sym_start + KDB_PCU(cpu);
2550 diag = kdb_getword(&val, addr, bytesperword);
2551 if (diag) {
2552 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2553 "read, diag=%d\n", cpu, addr, diag);
2554 continue;
2555 }
2556#ifdef CONFIG_SMP
2557 if (!val) {
2558 cpu_set(cpu, suppress);
2559 continue;
2560 }
2561#endif /* CONFIG_SMP */
2562 kdb_printf("%5d ", cpu);
2563 kdb_md_line(fmtstr, addr,
2564 bytesperword == KDB_WORD_SIZE,
2565 1, bytesperword, 1, 1, 0);
2566 }
2567 if (cpus_weight(suppress) == 0)
2568 return 0;
2569 kdb_printf("Zero suppressed cpu(s):");
2570 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2571 cpu = next_cpu(cpu, suppress)) {
2572 kdb_printf(" %d", cpu);
2573 if (cpu == num_possible_cpus() - 1 ||
2574 next_cpu(cpu, suppress) != cpu + 1)
2575 continue;
2576 while (cpu < num_possible_cpus() &&
2577 next_cpu(cpu, suppress) == cpu + 1)
2578 ++cpu;
2579 kdb_printf("-%d", cpu);
2580 }
2581 kdb_printf("\n");
2582
2583#undef KDB_PCU
2584
2585 return 0;
2586}
2587
2588/*
2589 * display help for the use of cmd | grep pattern
2590 */
2591static int kdb_grep_help(int argc, const char **argv)
2592{
2593 kdb_printf("Usage of cmd args | grep pattern:\n");
2594 kdb_printf(" Any command's output may be filtered through an ");
2595 kdb_printf("emulated 'pipe'.\n");
 2596 	kdb_printf("  'grep' is just a keyword.\n");
2597 kdb_printf(" The pattern may include a very limited set of "
2598 "metacharacters:\n");
2599 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2600 kdb_printf(" And if there are spaces in the pattern, you may "
2601 "quote it:\n");
2602 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2603 " or \"^pat tern$\"\n");
2604 return 0;
2605}
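
parse_grep() only records whether the pattern was anchored with '^' and/or '$'; the actual filtering is applied when output is printed. A stand-alone sketch of that anchored-substring test follows; it illustrates the matching rule and is not the kdb_io.c implementation.

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool grep_match(const char *line, const char *pat,
		       bool leading, bool trailing)
{
	size_t llen = strlen(line), plen = strlen(pat);

	if (leading && trailing)	/* ^pattern$ : whole line */
		return llen == plen && strcmp(line, pat) == 0;
	if (leading)			/* ^pattern  : line prefix */
		return strncmp(line, pat, plen) == 0;
	if (trailing)			/* pattern$  : line suffix */
		return llen >= plen &&
		       strcmp(line + llen - plen, pat) == 0;
	return strstr(line, pat) != NULL;	/* plain substring */
}

int main(void)
{
	printf("%d\n", grep_match("kdb_main.c", "main", false, false)); /* 1 */
	printf("%d\n", grep_match("kdb_main.c", "main", true, false));  /* 0 */
	printf("%d\n", grep_match("kdb_main.c", ".c", false, true));    /* 1 */
	return 0;
}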
2606
2607/*
2608 * kdb_register_repeat - This function is used to register a kernel
2609 * debugger command.
2610 * Inputs:
2611 * cmd Command name
2612 * func Function to execute the command
2613 * usage A simple usage string showing arguments
2614 * help A simple help string describing command
2615 * repeat Does the command auto repeat on enter?
2616 * Returns:
2617 * zero for success, one if a duplicate command.
2618 */
2619#define kdb_command_extend 50 /* arbitrary */
2620int kdb_register_repeat(char *cmd,
2621 kdb_func_t func,
2622 char *usage,
2623 char *help,
2624 short minlen,
2625 kdb_repeat_t repeat)
2626{
2627 int i;
2628 kdbtab_t *kp;
2629
2630 /*
2631 * Brute force method to determine duplicates
2632 */
2633 for_each_kdbcmd(kp, i) {
2634 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2635 kdb_printf("Duplicate kdb command registered: "
2636 "%s, func %p help %s\n", cmd, func, help);
2637 return 1;
2638 }
2639 }
2640
2641 /*
2642 * Insert command into first available location in table
2643 */
2644 for_each_kdbcmd(kp, i) {
2645 if (kp->cmd_name == NULL)
2646 break;
2647 }
2648
2649 if (i >= kdb_max_commands) {
2650 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2651 kdb_command_extend) * sizeof(*new), GFP_KDB);
2652 if (!new) {
2653 kdb_printf("Could not allocate new kdb_command "
2654 "table\n");
2655 return 1;
2656 }
2657 if (kdb_commands) {
2658 memcpy(new, kdb_commands,
2659 kdb_max_commands * sizeof(*new));
2660 kfree(kdb_commands);
2661 }
2662 memset(new + kdb_max_commands, 0,
2663 kdb_command_extend * sizeof(*new));
2664 kdb_commands = new;
2665 kp = kdb_commands + kdb_max_commands;
2666 kdb_max_commands += kdb_command_extend;
2667 }
2668
2669 kp->cmd_name = cmd;
2670 kp->cmd_func = func;
2671 kp->cmd_usage = usage;
2672 kp->cmd_help = help;
2673 kp->cmd_flags = 0;
2674 kp->cmd_minlen = minlen;
2675 kp->cmd_repeat = repeat;
2676
2677 return 0;
2678}
2679
2680/*
2681 * kdb_register - Compatibility register function for commands that do
2682 * not need to specify a repeat state. Equivalent to
2683 * kdb_register_repeat with KDB_REPEAT_NONE.
2684 * Inputs:
2685 * cmd Command name
2686 * func Function to execute the command
2687 * usage A simple usage string showing arguments
2688 * help A simple help string describing command
2689 * Returns:
2690 * zero for success, one if a duplicate command.
2691 */
2692int kdb_register(char *cmd,
2693 kdb_func_t func,
2694 char *usage,
2695 char *help,
2696 short minlen)
2697{
2698 return kdb_register_repeat(cmd, func, usage, help, minlen,
2699 KDB_REPEAT_NONE);
2700}
2701
2702/*
2703 * kdb_unregister - This function is used to unregister a kernel
2704 * debugger command. It is generally called when a module which
2705 * implements kdb commands is unloaded.
2706 * Inputs:
2707 * cmd Command name
2708 * Returns:
 2709 *	zero for success, one if the command was not registered.
2710 */
2711int kdb_unregister(char *cmd)
2712{
2713 int i;
2714 kdbtab_t *kp;
2715
2716 /*
2717 * find the command.
2718 */
2719 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2720 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2721 kp->cmd_name = NULL;
2722 return 0;
2723 }
2724 }
2725
2726 /* Couldn't find it. */
2727 return 1;
2728}
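
For reference, a hedged sketch of how a loadable module might use the registration API above; the command name, handler and module boilerplate are hypothetical, and the declarations are assumed to be visible through the kdb headers.

#include <linux/module.h>
#include <linux/kdb.h>	/* assumed to declare kdb_printf and the register API */

/* argc counts only the arguments after the command name, because
 * kdb_parse() invokes handlers with argc-1. */
static int kdb_hello(int argc, const char **argv)
{
	kdb_printf("hello from a module, argc=%d\n", argc);
	return 0;
}

static int __init kdb_hello_init(void)
{
	/* kdb_register_repeat() returns non-zero for a duplicate name */
	return kdb_register_repeat("hello", kdb_hello, "[<arg>]",
				   "Print a test greeting", 0,
				   KDB_REPEAT_NONE) ? -EBUSY : 0;
}

static void __exit kdb_hello_exit(void)
{
	kdb_unregister("hello");
}

module_init(kdb_hello_init);
module_exit(kdb_hello_exit);
MODULE_LICENSE("GPL");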
2729
2730/* Initialize the kdb command table. */
2731static void __init kdb_inittab(void)
2732{
2733 int i;
2734 kdbtab_t *kp;
2735
2736 for_each_kdbcmd(kp, i)
2737 kp->cmd_name = NULL;
2738
2739 kdb_register_repeat("md", kdb_md, "<vaddr>",
2740 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2741 KDB_REPEAT_NO_ARGS);
2742 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2743 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2744 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2745 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2746 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2747 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2748 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2749 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2750 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2751 "Continue Execution", 1, KDB_REPEAT_NONE);
2752 kdb_register_repeat("rd", kdb_rd, "",
2753 "Display Registers", 0, KDB_REPEAT_NONE);
2754 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2755 "Modify Registers", 0, KDB_REPEAT_NONE);
2756 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2757 "Display exception frame", 0, KDB_REPEAT_NONE);
2758 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2759 "Stack traceback", 1, KDB_REPEAT_NONE);
2760 kdb_register_repeat("btp", kdb_bt, "<pid>",
2761 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2762 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2763 "Display stack all processes", 0, KDB_REPEAT_NONE);
2764 kdb_register_repeat("btc", kdb_bt, "",
2765 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2766 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2767 "Backtrace process given its struct task address", 0,
2768 KDB_REPEAT_NONE);
2769 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2770 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2771 kdb_register_repeat("env", kdb_env, "",
2772 "Show environment variables", 0, KDB_REPEAT_NONE);
2773 kdb_register_repeat("set", kdb_set, "",
2774 "Set environment variables", 0, KDB_REPEAT_NONE);
2775 kdb_register_repeat("help", kdb_help, "",
2776 "Display Help Message", 1, KDB_REPEAT_NONE);
2777 kdb_register_repeat("?", kdb_help, "",
2778 "Display Help Message", 0, KDB_REPEAT_NONE);
2779 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2780 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2781 kdb_register_repeat("kgdb", kdb_kgdb, "",
2782 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2783 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2784 "Display active task list", 0, KDB_REPEAT_NONE);
2785 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2786 "Switch to another task", 0, KDB_REPEAT_NONE);
2787 kdb_register_repeat("reboot", kdb_reboot, "",
2788 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2789#if defined(CONFIG_MODULES)
2790 kdb_register_repeat("lsmod", kdb_lsmod, "",
2791 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2792#endif
2793#if defined(CONFIG_MAGIC_SYSRQ)
2794 kdb_register_repeat("sr", kdb_sr, "<key>",
2795 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2796#endif
2797#if defined(CONFIG_PRINTK)
2798 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2799 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2800#endif
2801 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2802 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2803 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2804 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2805 kdb_register_repeat("summary", kdb_summary, "",
2806 "Summarize the system", 4, KDB_REPEAT_NONE);
2807 kdb_register_repeat("per_cpu", kdb_per_cpu, "",
2808 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2809 kdb_register_repeat("grephelp", kdb_grep_help, "",
2810 "Display help on | grep", 0, KDB_REPEAT_NONE);
2811}
2812
2813/* Execute any commands defined in kdb_cmds. */
2814static void __init kdb_cmd_init(void)
2815{
2816 int i, diag;
2817 for (i = 0; kdb_cmds[i]; ++i) {
2818 diag = kdb_parse(kdb_cmds[i]);
2819 if (diag)
2820 kdb_printf("kdb command %s failed, kdb diag %d\n",
2821 kdb_cmds[i], diag);
2822 }
2823 if (defcmd_in_progress) {
2824 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2825 kdb_parse("endefcmd");
2826 }
2827}
2828
 2829/* Initialize kdb_printf, breakpoint tables and kdb state */
2830void __init kdb_init(int lvl)
2831{
2832 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2833 int i;
2834
2835 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2836 return;
2837 for (i = kdb_init_lvl; i < lvl; i++) {
2838 switch (i) {
2839 case KDB_NOT_INITIALIZED:
2840 kdb_inittab(); /* Initialize Command Table */
2841 kdb_initbptab(); /* Initialize Breakpoints */
2842 break;
2843 case KDB_INIT_EARLY:
2844 kdb_cmd_init(); /* Build kdb_cmds tables */
2845 break;
2846 }
2847 }
2848 kdb_init_lvl = lvl;
2849}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..97d3ba69775d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,300 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
 92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
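The three externs above are the registration interface a loadable module uses to add its own shell commands. As a rough usage sketch that is not part of this patch: the command name, handler and help strings below are invented, and kdb_printf() is assumed to come from <linux/kdb.h>.

    /* Hypothetical module command: echo its arguments back. */
    static int kdb_echo_args(int argc, const char **argv)
    {
        int i;

        if (argc < 1)
            return KDB_ARGCOUNT;        /* error code defined above */
        for (i = 1; i <= argc; i++)     /* argv[0] is the command name */
            kdb_printf("%s ", argv[i]);
        kdb_printf("\n");
        return 0;
    }

    /* From the module's init function, matching the prototype above: */
    /* kdb_register("echoargs", kdb_echo_args, "<arg>...", "Echo arguments", 0); */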
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
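The comment above is the whole point of these two macros: callers hand over an lvalue and sizeof()/address-of supply both the pointer and the length. A standalone userspace sketch of the same wrapper pattern, with invented names and memcpy() standing in for the kernel accessors:

    #include <stdio.h>
    #include <string.h>

    /* Size-based worker: copies 'size' bytes from 'src' into 'res'. */
    static int getarea_size(void *res, const void *src, size_t size)
    {
        memcpy(res, src, size);
        return 0;
    }

    /* Variable-name wrapper: derives pointer and size from the lvalue. */
    #define getarea(x, src) getarea_size(&(x), (src), sizeof(x))

    int main(void)
    {
        unsigned long src = 0x1234, dst;

        getarea(dst, &src);             /* no explicit &dst or sizeof needed */
        printf("0x%lx\n", dst);
        return 0;
    }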
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *);
153extern int kdbnearsym(unsigned long, kdb_symtab_t *);
154extern void kdbnearsym_cleanup(void);
155extern char *kdb_strdup(const char *str, gfp_t type);
156extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
157
158/* Routine for debugging the debugger state. */
159extern void kdb_print_state(const char *, int);
160
161extern int kdb_state;
162#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
163#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
164#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
165#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
166 * kdb control */
167#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
168#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
169#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
170 * DOING_SS is also set */
171#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
172 * after one ss, independent of
173 * DOING_SS */
174#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
175#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
176#define KDB_STATE_PAGER 0x00000400 /* pager is available */
177#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
178 * back to initial cpu */
179#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
180#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
181#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
182#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
183 * adjusted */
184#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
185#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
186 * keyboard on this cpu */
187#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
188#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
189#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
190#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
191#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
192 * specific use */
193
194#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
195#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
196#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
197
198extern int kdb_nextline; /* Current number of lines displayed */
199
200typedef struct _kdb_bp {
201 unsigned long bp_addr; /* Address breakpoint is present at */
202 unsigned int bp_free:1; /* This entry is available */
203 unsigned int bp_enabled:1; /* Breakpoint is active in register */
204 unsigned int bp_type:4; /* Uses hardware register */
205 unsigned int bp_installed:1; /* Breakpoint is installed */
206 unsigned int bp_delay:1; /* Do delayed bp handling */
207 unsigned int bp_delayed:1; /* Delayed breakpoint */
208 unsigned int bph_length; /* HW break length */
209} kdb_bp_t;
210
211#ifdef CONFIG_KGDB_KDB
212extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
213
214/* The KDB shell command table */
215typedef struct _kdbtab {
216 char *cmd_name; /* Command name */
217 kdb_func_t cmd_func; /* Function to execute command */
218 char *cmd_usage; /* Usage String for this command */
219 char *cmd_help; /* Help message for this command */
220 short cmd_flags; /* Parsing flags */
221 short cmd_minlen; /* Minimum legal # command
222 * chars required */
223 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
224} kdbtab_t;
225
226extern int kdb_bt(int, const char **); /* KDB display back trace */
227
228/* KDB breakpoint management functions */
229extern void kdb_initbptab(void);
230extern void kdb_bp_install(struct pt_regs *);
231extern void kdb_bp_remove(void);
232
233typedef enum {
234 KDB_DB_BPT, /* Breakpoint */
235 KDB_DB_SS, /* Single-step trap */
236 KDB_DB_SSB, /* Single step to branch */
237 KDB_DB_SSBPT, /* Single step over breakpoint */
238 KDB_DB_NOBPT /* Spurious breakpoint */
239} kdb_dbtrap_t;
240
241extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
242 int, kdb_dbtrap_t, struct pt_regs *);
243
244/* Miscellaneous functions and data areas */
245extern int kdb_grepping_flag;
246extern char kdb_grep_string[];
247extern int kdb_grep_leading;
248extern int kdb_grep_trailing;
249extern char *kdb_cmds[];
250extern void kdb_syslog_data(char *syslog_data[]);
251extern unsigned long kdb_task_state_string(const char *);
252extern char kdb_task_state_char (const struct task_struct *);
253extern unsigned long kdb_task_state(const struct task_struct *p,
254 unsigned long mask);
255extern void kdb_ps_suppressed(void);
256extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *);
262
263/* Defines for kdb_symbol_print */
264#define KDB_SP_SPACEB 0x0001 /* Space before string */
265#define KDB_SP_SPACEA 0x0002 /* Space after string */
266#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
267#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
268#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
269#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
270#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
271
272#define KDB_TSK(cpu) kgdb_info[cpu].task
273#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
274
275extern struct task_struct *kdb_curr_task(int);
276
277#define kdb_task_has_cpu(p) (task_curr(p))
278
279/* Simplify coexistence with NPTL */
280#define kdb_do_each_thread(g, p) do_each_thread(g, p)
281#define kdb_while_each_thread(g, p) while_each_thread(g, p)
282
283#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
284
285extern void *debug_kmalloc(size_t size, gfp_t flags);
286extern void debug_kfree(void *);
287extern void debug_kusage(void);
288
289extern void kdb_set_current_task(struct task_struct *);
290extern struct task_struct *kdb_current_task;
291#ifdef CONFIG_MODULES
292extern struct list_head *kdb_modules;
293#endif /* CONFIG_MODULES */
294
295extern char kdb_prompt_str[];
296
297#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
298
299#endif /* CONFIG_KGDB_KDB */
300#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
 79 * hold active strings; no kdb caller of kdbnearsym makes more
80 * than ~20 later calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize;
86 unsigned long offset;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
 102 symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132 memcpy(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138 memcpy(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
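The remarks block above kdbnearsym() explains the name table: kallsyms reuses its output buffer, so kdbnearsym caches the last few unique symbol strings, evicting the oldest entry (index 0) when the table is full and always storing the most recently used name at the highest index. A minimal userspace sketch of that eviction policy, using a small table, plain malloc()/free() instead of debug_kmalloc()/debug_kfree(), and made-up names:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    #define NAMES 4                         /* small table for the example */
    static char *name_table[NAMES];

    /* Remember 'name', keeping at most NAMES unique strings, newest last. */
    static const char *remember(const char *name)
    {
        char *s;
        int i;

        for (i = 0; i < NAMES; i++)
            if (name_table[i] && strcmp(name_table[i], name) == 0)
                break;
        if (i >= NAMES) {                   /* not cached: evict the oldest */
            free(name_table[0]);
            memmove(name_table, name_table + 1,
                    sizeof(name_table[0]) * (NAMES - 1));
            s = strdup(name);
        } else {                            /* cached: move it to the end */
            s = name_table[i];
            memmove(name_table + i, name_table + i + 1,
                    sizeof(name_table[0]) * (NAMES - i - 1));
        }
        name_table[NAMES - 1] = s;
        return s;
    }

    int main(void)
    {
        const char *probes[] = { "a", "b", "c", "d", "e", "b" };
        size_t i;

        for (i = 0; i < sizeof(probes) / sizeof(probes[0]); i++)
            remember(probes[i]);
        for (i = 0; i < NAMES; i++)         /* expect: c d e b */
            printf("%s ", name_table[i] ? name_table[i] : "-");
        printf("\n");
        return 0;
    }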
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
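kallsyms_symbol_complete() above performs tab completion by reducing, across every symbol that matches the typed prefix, the longest prefix they all share. The same reduction in standalone form, iterating over a fixed string list instead of kdb_walk_kallsyms(); the symbol list is illustrative:

    #include <stdio.h>
    #include <string.h>

    /* Shrink 'common' to the longest prefix it shares with 'name'. */
    static void reduce_common(char *common, const char *name)
    {
        size_t i;

        for (i = 0; common[i] && common[i] == name[i]; i++)
            ;
        common[i] = '\0';
    }

    int main(void)
    {
        const char *syms[] = { "kdb_register", "kdb_register_repeat",
                               "kdb_registered" };
        const char *prefix = "kdb_";        /* what the user typed */
        char common[128] = "";
        int i, matches = 0;

        for (i = 0; i < 3; i++) {
            if (strncmp(syms[i], prefix, strlen(prefix)) != 0)
                continue;
            if (++matches == 1)
                strcpy(common, syms[i]);    /* first match seeds the prefix */
            else
                reduce_common(common, syms[i]);
        }
        /* 3 matches; completion extends "kdb_" to "kdb_register" */
        printf("%d matches, longest common prefix: %s\n", matches, common);
        return 0;
    }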
217
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353 int ret = probe_kernel_read((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
 496 * addr Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
535
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
 548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
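The notes above describe the folding: ordinary TASK_* state bits sit in the low half of the mask, EXIT_* bits are shifted left 16 so they cannot collide, and synthetic states (running, idle, daemon, unrunnable) take bits from the top of the word. A compact userspace sketch of that folding; the letter set and bit values here are made up, not the kernel's:

    #include <stdio.h>

    #define ST_INTERRUPTIBLE   0x0001       /* low half: "real" states */
    #define ST_UNINTERRUPTIBLE 0x0002
    #define EX_ZOMBIE          0x0010       /* exit states, shifted by 16 */
    #define ST_RUNNING (1UL << (8 * sizeof(unsigned long) - 2))  /* synthetic */

    static unsigned long state_mask(const char *s)
    {
        unsigned long res = 0;

        for (; *s; s++) {
            switch (*s) {
            case 'S': res |= ST_INTERRUPTIBLE;                  break;
            case 'D': res |= ST_UNINTERRUPTIBLE;                break;
            case 'Z': res |= (unsigned long)EX_ZOMBIE << 16;    break;
            case 'R': res |= ST_RUNNING;                        break;
            case 'A': res = ~0UL;                               break;
            default:  printf("unknown flag '%c' ignored\n", *s); break;
            }
        }
        return res;
    }

    int main(void)
    {
        printf("DRZ -> 0x%lx\n", state_mask("DRZ"));
        printf("A   -> 0x%lx\n", state_mask("A"));
        return 0;
    }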
613
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed. One smallish memory pool, the free
688 * chain is always in ascending address order to allow coalescing,
689 * allocations are done in brute force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment, explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context. Try to get the debug allocator lock, if it cannot
715 * be obtained after a second then give up. If the lock could not be
716 * previously obtained on this cpu then only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
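get_dap_lock() above polls the spinlock for roughly a second (1000 tries, 1 ms apart) and records, per cpu, that it gave up so that the next attempt on that cpu tries only once. A standalone sketch of the bounded-trylock half using POSIX threads; the mutex and timings are illustrative and the per-cpu give-up memory is omitted:

    #include <pthread.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    /* Try to take 'lock' for about a second, then give up. */
    static int get_lock_bounded(void)
    {
        int count = 1000;

        while (1) {
            if (pthread_mutex_trylock(&lock) == 0)
                return 1;               /* got it */
            if (!count--)
                return 0;               /* give up, caller must cope */
            usleep(1000);               /* 1 ms between attempts */
        }
    }

    int main(void)
    {
        if (get_lock_bounded()) {
            printf("lock acquired\n");
            pthread_mutex_unlock(&lock);
        } else {
            printf("gave up\n");
        }
        return 0;
    }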
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
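debug_kmalloc() walks a single free chain kept in ascending address order, takes the smallest block that fits (best fit), and splits off the remainder as a fresh free header whenever the leftover is big enough to carry one. A simplified userspace sketch of that search-and-split step: one static pool, offset-linked headers, no freeing, coalescing or poisoning, and all names invented for the illustration:

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define POOL_SIZE       4096
    #define HDR_ALIGN       8
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    struct hdr {
        uint32_t next;          /* offset of next free header, 0 = end */
        uint32_t size;          /* usable bytes that follow this header */
    };
    #define HDR_OVERHEAD    ALIGN_UP(sizeof(struct hdr), HDR_ALIGN)

    static unsigned char pool[POOL_SIZE];
    static uint32_t first_free;             /* offset of first free header */

    static void pool_init(void)
    {
        struct hdr *h = (struct hdr *)pool;

        h->next = 0;
        h->size = POOL_SIZE - HDR_OVERHEAD;
        first_free = 0;
    }

    static void *pool_alloc(size_t size)
    {
        struct hdr *h, *best = NULL, *prev = NULL, *bestprev = NULL;
        uint32_t new_off;

        size = ALIGN_UP(size, HDR_ALIGN);
        for (h = (struct hdr *)(pool + first_free); ;
             prev = h, h = (struct hdr *)(pool + h->next)) {
            if (h->size >= size && (!best || h->size < best->size)) {
                best = h;
                bestprev = prev;
            }
            if (!h->next)
                break;
        }
        if (!best)
            return NULL;
        /* like the kernel code, keep at least one free header in the chain */
        if (!best->next && !bestprev && best->size - size < HDR_OVERHEAD)
            return NULL;
        if (best->size - size >= HDR_OVERHEAD) {        /* split the block */
            struct hdr *rest;

            new_off = (uint32_t)((unsigned char *)best - pool)
                      + HDR_OVERHEAD + (uint32_t)size;
            rest = (struct hdr *)(pool + new_off);
            rest->size = best->size - (uint32_t)size - HDR_OVERHEAD;
            rest->next = best->next;
            best->size = (uint32_t)size;
        } else {                                        /* take it whole */
            new_off = best->next;
        }
        if (bestprev)
            bestprev->next = new_off;
        else
            first_free = new_off;
        return (unsigned char *)best + HDR_OVERHEAD;
    }

    int main(void)
    {
        pool_init();
        void *a = pool_alloc(100);
        void *b = pool_alloc(200);
        printf("a=%p b=%p (%ld bytes apart)\n", a, b,
               (long)((unsigned char *)b - (unsigned char *)a));
        return 0;
    }

The real allocator additionally poisons the returned memory and, in debug_kfree() below, re-links blocks in address order and coalesces neighbours; the sketch leaves those parts out.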
802
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
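kdb_save_flags()/kdb_restore_flags() above are nothing more than a fixed-depth stack so a recursive kdb entry can modify kdb_flags and put it back. The same idea as a tiny standalone sketch, with an arbitrary depth and invented names:

    #include <assert.h>

    static int flags, flags_stack[4], flags_index;

    static void save_flags(void)
    {
        assert(flags_index < 4);
        flags_stack[flags_index++] = flags;
    }

    static void restore_flags(void)
    {
        assert(flags_index > 0);
        flags = flags_stack[--flags_index];
    }

    int main(void)
    {
        flags = 1;
        save_flags();
        flags = 2;          /* a nested user changes the flags... */
        restore_flags();    /* ...and the outer level gets 1 back */
        return flags == 1 ? 0 : 1;
    }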
diff --git a/kernel/exit.c b/kernel/exit.c
index 7f2683a10ac4..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,15 +55,14 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
62static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p, bool group_dead)
63{ 62{
64 nr_threads--; 63 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 64 detach_pid(p, PIDTYPE_PID);
66 if (thread_group_leader(p)) { 65 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 66 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 67 detach_pid(p, PIDTYPE_SID);
69 68
@@ -80,10 +79,9 @@ static void __unhash_process(struct task_struct *p)
80static void __exit_signal(struct task_struct *tsk) 79static void __exit_signal(struct task_struct *tsk)
81{ 80{
82 struct signal_struct *sig = tsk->signal; 81 struct signal_struct *sig = tsk->signal;
82 bool group_dead = thread_group_leader(tsk);
83 struct sighand_struct *sighand; 83 struct sighand_struct *sighand;
84 84 struct tty_struct *uninitialized_var(tty);
85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count));
87 85
88 sighand = rcu_dereference_check(tsk->sighand, 86 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() || 87 rcu_read_lock_held() ||
@@ -91,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk)
91 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
92 90
93 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
94 if (atomic_dec_and_test(&sig->count)) 92 if (group_dead) {
95 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
96 else { 94 tty = sig->tty;
95 sig->tty = NULL;
96 } else {
97 /* 97 /*
98 * If there is any task waiting for the group exit 98 * If there is any task waiting for the group exit
99 * then notify it: 99 * then notify it:
100 */ 100 */
101 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) 101 if (sig->notify_count > 0 && !--sig->notify_count)
102 wake_up_process(sig->group_exit_task); 102 wake_up_process(sig->group_exit_task);
103 103
104 if (tsk == sig->curr_target) 104 if (tsk == sig->curr_target)
@@ -124,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
124 sig->oublock += task_io_get_oublock(tsk); 124 sig->oublock += task_io_get_oublock(tsk);
125 task_io_accounting_add(&sig->ioac, &tsk->ioac); 125 task_io_accounting_add(&sig->ioac, &tsk->ioac);
126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
127 sig = NULL; /* Marker for below. */
128 } 127 }
129 128
130 __unhash_process(tsk); 129 sig->nr_threads--;
130 __unhash_process(tsk, group_dead);
131 131
132 /* 132 /*
133 * Do this under ->siglock, we can race with another thread 133 * Do this under ->siglock, we can race with another thread
134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
135 */ 135 */
136 flush_sigqueue(&tsk->pending); 136 flush_sigqueue(&tsk->pending);
137
138 tsk->signal = NULL;
139 tsk->sighand = NULL; 137 tsk->sighand = NULL;
140 spin_unlock(&sighand->siglock); 138 spin_unlock(&sighand->siglock);
141 139
142 __cleanup_sighand(sighand); 140 __cleanup_sighand(sighand);
143 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
144 if (sig) { 142 if (group_dead) {
145 flush_sigqueue(&sig->shared_pending); 143 flush_sigqueue(&sig->shared_pending);
146 taskstats_tgid_free(sig); 144 tty_kref_put(tty);
147 /*
148 * Make sure ->signal can't go away under rq->lock,
149 * see account_group_exec_runtime().
150 */
151 task_rq_unlock_wait(tsk);
152 __cleanup_signal(sig);
153 } 145 }
154} 146}
155 147
@@ -857,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
857 849
858 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
859 851
860 /* mt-exec, de_thread() is waiting for us */ 852 /* mt-exec, de_thread() is waiting for group leader */
861 if (thread_group_leader(tsk) && 853 if (unlikely(tsk->signal->notify_count < 0))
862 tsk->signal->group_exit_task &&
863 tsk->signal->notify_count < 0)
864 wake_up_process(tsk->signal->group_exit_task); 854 wake_up_process(tsk->signal->group_exit_task);
865
866 write_unlock_irq(&tasklist_lock); 855 write_unlock_irq(&tasklist_lock);
867 856
868 tracehook_report_death(tsk, signal, cookie, group_dead); 857 tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -1003,8 +992,10 @@ NORET_TYPE void do_exit(long code)
1003 992
1004 exit_notify(tsk, group_dead); 993 exit_notify(tsk, group_dead);
1005#ifdef CONFIG_NUMA 994#ifdef CONFIG_NUMA
995 task_lock(tsk);
1006 mpol_put(tsk->mempolicy); 996 mpol_put(tsk->mempolicy);
1007 tsk->mempolicy = NULL; 997 tsk->mempolicy = NULL;
998 task_unlock(tsk);
1008#endif 999#endif
1009#ifdef CONFIG_FUTEX 1000#ifdef CONFIG_FUTEX
1010 if (unlikely(current->pi_state_cache)) 1001 if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c14942a0ee3..bf9fef6d1bfe 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
165} 165}
166EXPORT_SYMBOL(free_task); 166EXPORT_SYMBOL(free_task);
167 167
168static inline void free_signal_struct(struct signal_struct *sig)
169{
170 taskstats_tgid_free(sig);
171 kmem_cache_free(signal_cachep, sig);
172}
173
174static inline void put_signal_struct(struct signal_struct *sig)
175{
176 if (atomic_dec_and_test(&sig->sigcnt))
177 free_signal_struct(sig);
178}
179
168void __put_task_struct(struct task_struct *tsk) 180void __put_task_struct(struct task_struct *tsk)
169{ 181{
170 WARN_ON(!tsk->exit_state); 182 WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
173 185
174 exit_creds(tsk); 186 exit_creds(tsk);
175 delayacct_tsk_free(tsk); 187 delayacct_tsk_free(tsk);
188 put_signal_struct(tsk->signal);
176 189
177 if (!profile_handoff_task(tsk)) 190 if (!profile_handoff_task(tsk))
178 free_task(tsk); 191 free_task(tsk);
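The new free_signal_struct()/put_signal_struct() pair is the usual drop-the-last-reference pattern: signal_struct now carries its own count (sigcnt) and is destroyed when the last user drops it. A standalone sketch of that pattern with C11 atomics; the structure and field names are invented for the illustration:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct shared {
        atomic_int refcount;
        int payload;
    };

    static struct shared *shared_get(struct shared *s)
    {
        atomic_fetch_add(&s->refcount, 1);
        return s;
    }

    static void shared_put(struct shared *s)
    {
        /* fetch_sub returns the old value; 1 means we dropped the last ref */
        if (atomic_fetch_sub(&s->refcount, 1) == 1) {
            printf("freeing payload %d\n", s->payload);
            free(s);
        }
    }

    int main(void)
    {
        struct shared *s = malloc(sizeof(*s));

        atomic_init(&s->refcount, 1);   /* creator holds the first reference */
        s->payload = 42;
        shared_get(s);                  /* a second user takes a reference */
        shared_put(s);                  /* ...and drops it: not freed yet */
        shared_put(s);                  /* creator drops the last one: freed */
        return 0;
    }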
@@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 if (!sig) 877 if (!sig)
865 return -ENOMEM; 878 return -ENOMEM;
866 879
867 atomic_set(&sig->count, 1); 880 sig->nr_threads = 1;
868 atomic_set(&sig->live, 1); 881 atomic_set(&sig->live, 1);
882 atomic_set(&sig->sigcnt, 1);
869 init_waitqueue_head(&sig->wait_chldexit); 883 init_waitqueue_head(&sig->wait_chldexit);
870 if (clone_flags & CLONE_NEWPID) 884 if (clone_flags & CLONE_NEWPID)
871 sig->flags |= SIGNAL_UNKILLABLE; 885 sig->flags |= SIGNAL_UNKILLABLE;
@@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
889 return 0; 903 return 0;
890} 904}
891 905
892void __cleanup_signal(struct signal_struct *sig)
893{
894 thread_group_cputime_free(sig);
895 tty_kref_put(sig->tty);
896 kmem_cache_free(signal_cachep, sig);
897}
898
899static void copy_flags(unsigned long clone_flags, struct task_struct *p) 906static void copy_flags(unsigned long clone_flags, struct task_struct *p)
900{ 907{
901 unsigned long new_flags = p->flags; 908 unsigned long new_flags = p->flags;
@@ -1079,6 +1086,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1079 } 1086 }
1080 mpol_fix_fork_child_flag(p); 1087 mpol_fix_fork_child_flag(p);
1081#endif 1088#endif
1089#ifdef CONFIG_CPUSETS
1090 p->cpuset_mem_spread_rotor = node_random(p->mems_allowed);
1091 p->cpuset_slab_spread_rotor = node_random(p->mems_allowed);
1092#endif
1082#ifdef CONFIG_TRACE_IRQFLAGS 1093#ifdef CONFIG_TRACE_IRQFLAGS
1083 p->irq_events = 0; 1094 p->irq_events = 0;
1084#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 1095#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
@@ -1112,8 +1123,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1112 p->memcg_batch.memcg = NULL; 1123 p->memcg_batch.memcg = NULL;
1113#endif 1124#endif
1114 1125
1115 p->bts = NULL;
1116
1117 /* Perform scheduler related setup. Assign this task to a CPU. */ 1126 /* Perform scheduler related setup. Assign this task to a CPU. */
1118 sched_fork(p, clone_flags); 1127 sched_fork(p, clone_flags);
1119 1128
@@ -1247,8 +1256,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1247 } 1256 }
1248 1257
1249 if (clone_flags & CLONE_THREAD) { 1258 if (clone_flags & CLONE_THREAD) {
1250 atomic_inc(&current->signal->count); 1259 current->signal->nr_threads++;
1251 atomic_inc(&current->signal->live); 1260 atomic_inc(&current->signal->live);
1261 atomic_inc(&current->signal->sigcnt);
1252 p->group_leader = current->group_leader; 1262 p->group_leader = current->group_leader;
1253 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1263 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1254 } 1264 }
@@ -1261,7 +1271,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1261 p->nsproxy->pid_ns->child_reaper = p; 1271 p->nsproxy->pid_ns->child_reaper = p;
1262 1272
1263 p->signal->leader_pid = pid; 1273 p->signal->leader_pid = pid;
1264 tty_kref_put(p->signal->tty);
1265 p->signal->tty = tty_kref_get(current->signal->tty); 1274 p->signal->tty = tty_kref_get(current->signal->tty);
1266 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1275 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1267 attach_pid(p, PIDTYPE_SID, task_session(current)); 1276 attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1294,7 +1303,7 @@ bad_fork_cleanup_mm:
1294 mmput(p->mm); 1303 mmput(p->mm);
1295bad_fork_cleanup_signal: 1304bad_fork_cleanup_signal:
1296 if (!(clone_flags & CLONE_THREAD)) 1305 if (!(clone_flags & CLONE_THREAD))
1297 __cleanup_signal(p->signal); 1306 free_signal_struct(p->signal);
1298bad_fork_cleanup_sighand: 1307bad_fork_cleanup_sighand:
1299 __cleanup_sighand(p->sighand); 1308 __cleanup_sighand(p->sighand);
1300bad_fork_cleanup_fs: 1309bad_fork_cleanup_fs:
@@ -1329,6 +1338,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
1329 return regs; 1338 return regs;
1330} 1339}
1331 1340
1341static inline void init_idle_pids(struct pid_link *links)
1342{
1343 enum pid_type type;
1344
1345 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1346 INIT_HLIST_NODE(&links[type].node); /* not really needed */
1347 links[type].pid = &init_struct_pid;
1348 }
1349}
1350
1332struct task_struct * __cpuinit fork_idle(int cpu) 1351struct task_struct * __cpuinit fork_idle(int cpu)
1333{ 1352{
1334 struct task_struct *task; 1353 struct task_struct *task;
@@ -1336,8 +1355,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1336 1355
1337 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1356 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1338 &init_struct_pid, 0); 1357 &init_struct_pid, 0);
1339 if (!IS_ERR(task)) 1358 if (!IS_ERR(task)) {
1359 init_idle_pids(task->pids);
1340 init_idle(task, cpu); 1360 init_idle(task, cpu);
1361 }
1341 1362
1342 return task; 1363 return task;
1343} 1364}
@@ -1509,14 +1530,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
1509 *flags_ptr |= CLONE_SIGHAND; 1530 *flags_ptr |= CLONE_SIGHAND;
1510 1531
1511 /* 1532 /*
1512 * If unsharing signal handlers and the task was created
1513 * using CLONE_THREAD, then must unshare the thread
1514 */
1515 if ((*flags_ptr & CLONE_SIGHAND) &&
1516 (atomic_read(&current->signal->count) > 1))
1517 *flags_ptr |= CLONE_THREAD;
1518
1519 /*
1520 * If unsharing namespace, must also unshare filesystem information. 1533 * If unsharing namespace, must also unshare filesystem information.
1521 */ 1534 */
1522 if (*flags_ptr & CLONE_NEWNS) 1535 if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
89 89
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = wall_to_monotonic;
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
 1814 * The kernel gives the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
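The hrtimer hunk above factors the clock out into schedule_hrtimeout_range_clock() and keeps schedule_hrtimeout_range() as a thin wrapper passing CLOCK_MONOTONIC, so existing callers are untouched while new callers can ask for CLOCK_REALTIME. A tiny sketch of that refactoring pattern, with placeholder names rather than the kernel's:

    #include <stdio.h>

    enum clock_id { CLK_MONOTONIC, CLK_REALTIME };

    /* Generalized helper: takes the clock as an explicit parameter. */
    static int sleep_range_clock(long usec, long slack, enum clock_id clock)
    {
        printf("sleep %ld us (+%ld slack) on clock %d\n", usec, slack, clock);
        return 0;
    }

    /* Old entry point survives as a wrapper with the historical default. */
    static int sleep_range(long usec, long slack)
    {
        return sleep_range_clock(usec, slack, CLK_MONOTONIC);
    }

    int main(void)
    {
        sleep_range(1000, 100);                         /* existing callers */
        sleep_range_clock(1000, 100, CLK_REALTIME);     /* new capability */
        return 0;
    }

Keeping the old symbol as a wrapper is what lets the exported interface stay stable while the implementation grows a parameter.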
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a6..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 91 * have in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
 180 * on the same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
288 enum bp_type_idx type;
289 int weight;
249 290
250 fetch_bp_busy_slots(&slots, bp); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
294
295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
251 305
252 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 308 return -ENOSPC;
255 309
256 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
257 311
258 return 0; 312 return 0;
259} 313}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
273 327
274static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp)
275{ 329{
276 toggle_bp_slot(bp, false); 330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
277} 336}
278 337
279void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 367 return 0;
309} 368}
310 369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
391
311int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 393{
313 int ret; 394 int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 397 if (ret)
317 return ret; 398 return ret;
318 399
319 /* 400 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 401
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 403 if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 444 if (attr->disabled)
374 goto end; 445 goto end;
375 446
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
377 if (!err) 448 if (!err)
378 perf_event_enable(bp); 449 perf_event_enable(bp);
379 450
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 551
481static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
482{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
484} 584}
485core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
486 586
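
To make the per-type, weighted slot accounting above easier to follow, here is a stand-alone userspace sketch (NR_SLOTS and every name in it are illustrative stand-ins, not kernel symbols) of the histogram that toggle_bp_task_slot() and max_task_bp_pinned() maintain per cpu and per slot type:

/*
 * tsk_pinned[i] counts the tasks on one cpu that currently pin exactly
 * i + 1 slots of a given breakpoint type; the kernel keeps one such
 * array per cpu and per TYPE_INST/TYPE_DATA.
 */
#include <stdio.h>

#define NR_SLOTS 4        /* stand-in for nr_slots[type] */

static unsigned int tsk_pinned[NR_SLOTS];

/* Mirror of the index arithmetic in toggle_bp_task_slot(). */
static void toggle_task_slot(int old_count, int weight, int enable)
{
        int old_idx = old_count - 1;
        int idx = old_idx + weight;

        if (enable) {
                tsk_pinned[idx]++;
                if (old_count > 0)
                        tsk_pinned[old_idx]--;
        } else {
                tsk_pinned[idx]--;
                if (old_count > 0)
                        tsk_pinned[old_idx]++;
        }
}

/* Mirror of max_task_bp_pinned(): highest non-empty bucket, plus one. */
static unsigned int max_task_pinned(void)
{
        int i;

        for (i = NR_SLOTS - 1; i >= 0; i--) {
                if (tsk_pinned[i] > 0)
                        return i + 1;
        }
        return 0;
}

int main(void)
{
        toggle_task_slot(0, 1, 1);      /* task pins a 1-slot breakpoint */
        toggle_task_slot(1, 2, 1);      /* same task adds a 2-slot one   */
        printf("%u\n", max_task_pinned());      /* prints 3 */
        return 0;
}

With this bookkeeping in place, __reserve_bp_slot() can reject a request as soon as slots.pinned + (!!slots.flexible) would exceed nr_slots[type] once the new event's weight has been added by fetch_this_slot().
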
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 704e488730a5..3164ba7ce151 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -757,16 +773,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
757 if (new->flags & IRQF_ONESHOT) 773 if (new->flags & IRQF_ONESHOT)
758 desc->status |= IRQ_ONESHOT; 774 desc->status |= IRQ_ONESHOT;
759 775
760 /*
761 * Force MSI interrupts to run with interrupts
762 * disabled. The multi vector cards can cause stack
763 * overflows due to nested interrupts when enough of
764 * them are directed to a core and fire at the same
765 * time.
766 */
767 if (desc->msi_desc)
768 new->flags |= IRQF_DISABLED;
769
770 if (!(desc->status & IRQ_NOAUTOEN)) { 776 if (!(desc->status & IRQ_NOAUTOEN)) {
771 desc->depth = 0; 777 desc->depth = 0;
772 desc->status &= ~IRQ_DISABLED; 778 desc->status &= ~IRQ_DISABLED;
@@ -916,6 +922,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
916 desc->chip->disable(irq); 922 desc->chip->disable(irq);
917 } 923 }
918 924
925#ifdef CONFIG_SMP
926 /* make sure affinity_hint is cleaned up */
927 if (WARN_ON_ONCE(desc->affinity_hint))
928 desc->affinity_hint = NULL;
929#endif
930
919 raw_spin_unlock_irqrestore(&desc->lock, flags); 931 raw_spin_unlock_irqrestore(&desc->lock, flags);
920 932
921 unregister_handler_proc(irq, action); 933 unregister_handler_proc(irq, action);
@@ -1027,7 +1039,6 @@ EXPORT_SYMBOL(free_irq);
1027 * Flags: 1039 * Flags:
1028 * 1040 *
1029 * IRQF_SHARED Interrupt is shared 1041 * IRQF_SHARED Interrupt is shared
1030 * IRQF_DISABLED Disable local interrupts while processing
1031 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1042 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1032 * IRQF_TRIGGER_* Specify active edge(s) or level 1043 * IRQF_TRIGGER_* Specify active edge(s) or level
1033 * 1044 *
@@ -1041,25 +1052,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1041 int retval; 1052 int retval;
1042 1053
1043 /* 1054 /*
1044 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1045 * the _first_ irqaction (sigh). That can cause oopsing, but
1046 * the behavior is classified as "will not fix" so we need to
1047 * start nudging drivers away from using that idiom.
1048 */
1049 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1050 (IRQF_SHARED|IRQF_DISABLED)) {
1051 pr_warning(
1052 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1053 irq, devname);
1054 }
1055
1056#ifdef CONFIG_LOCKDEP
1057 /*
1058 * Lockdep wants atomic interrupt handlers:
1059 */
1060 irqflags |= IRQF_DISABLED;
1061#endif
1062 /*
1063 * Sanity-check: shared interrupts must pass in a real dev-ID, 1055 * Sanity-check: shared interrupts must pass in a real dev-ID,
1064 * otherwise we'll have trouble later trying to figure out 1056 * otherwise we'll have trouble later trying to figure out
1065 * which interrupt is which (messes up the interrupt freeing 1057 * which interrupt is which (messes up the interrupt freeing
@@ -1120,3 +1112,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1120 return retval; 1112 return retval;
1121} 1113}
1122EXPORT_SYMBOL(request_threaded_irq); 1114EXPORT_SYMBOL(request_threaded_irq);
1115
1116/**
1117 * request_any_context_irq - allocate an interrupt line
1118 * @irq: Interrupt line to allocate
1119 * @handler: Function to be called when the IRQ occurs.
1120 * Threaded handler for threaded interrupts.
1121 * @flags: Interrupt type flags
1122 * @name: An ascii name for the claiming device
1123 * @dev_id: A cookie passed back to the handler function
1124 *
1125 * This call allocates interrupt resources and enables the
1126 * interrupt line and IRQ handling. It selects either a
1127 * hardirq or threaded handling method depending on the
1128 * context.
1129 *
1130 * On failure, it returns a negative value. On success,
1131 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1132 */
1133int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1134 unsigned long flags, const char *name, void *dev_id)
1135{
1136 struct irq_desc *desc = irq_to_desc(irq);
1137 int ret;
1138
1139 if (!desc)
1140 return -EINVAL;
1141
1142 if (desc->status & IRQ_NESTED_THREAD) {
1143 ret = request_threaded_irq(irq, NULL, handler,
1144 flags, name, dev_id);
1145 return !ret ? IRQC_IS_NESTED : ret;
1146 }
1147
1148 ret = request_irq(irq, handler, flags, name, dev_id);
1149 return !ret ? IRQC_IS_HARDIRQ : ret;
1150}
1151EXPORT_SYMBOL_GPL(request_any_context_irq);
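
The two interfaces added to kernel/irq/manage.c above are meant to be called from driver code. A hypothetical probe path (every "foo" name and the trigger flag are invented for illustration, and it is assumed the new declarations are visible through <linux/interrupt.h>) might combine them like this:

/* Illustrative sketch only, not part of this patch. */
#include <linux/interrupt.h>
#include <linux/cpumask.h>

static irqreturn_t foo_irq_handler(int irq, void *dev_id)
{
        /* ... acknowledge the device ... */
        return IRQ_HANDLED;
}

static int foo_request_irq(unsigned int irq, void *foo_dev)
{
        int ret;

        /*
         * Let the core decide between hardirq and threaded handling,
         * depending on whether this irq_desc is IRQ_NESTED_THREAD.
         */
        ret = request_any_context_irq(irq, foo_irq_handler,
                                      IRQF_TRIGGER_RISING, "foo", foo_dev);
        if (ret < 0)
                return ret;
        /* ret is now IRQC_IS_HARDIRQ or IRQC_IS_NESTED */

        /* Suggest, but do not force, where user space should steer it. */
        irq_set_affinity_hint(irq, cpumask_of(0));

        return 0;
}

Because __free_irq() now warns when an affinity hint is still set, a real driver would also call irq_set_affinity_hint(irq, NULL) before freeing the interrupt.
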
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 7a6eb04ef6b5..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -32,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
32 return 0; 32 return 0;
33} 33}
34 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
35#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
36#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
37#endif 58#endif
@@ -84,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
84 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
85} 106}
86 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
87static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
88 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
89 .read = seq_read, 115 .read = seq_read,
@@ -92,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
92 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
93}; 119};
94 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
95static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
96{ 129{
97 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -147,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
147 .release = single_release, 180 .release = single_release,
148 .write = default_affinity_write, 181 .write = default_affinity_write,
149}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
150#endif 203#endif
151 204
152static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -231,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
231 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
232 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
233 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
234#endif 294#endif
235 295
236 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
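
From user space the two new read-only files behave like any other procfs text file; a small stand-alone reader (the irq number 19 is arbitrary):

/* Illustrative reader for the files created above. */
#include <stdio.h>

int main(void)
{
        char buf[256];
        FILE *f;

        f = fopen("/proc/irq/19/affinity_hint", "r");   /* hex cpumask */
        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("affinity_hint: %s", buf);
                fclose(f);
        }

        f = fopen("/proc/irq/19/node", "r");            /* NUMA node id */
        if (f) {
                if (fgets(buf, sizeof(buf), f))
                        printf("node: %s", buf);
                fclose(f);
        }

        return 0;
}
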
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 13aff293f4de..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
@@ -516,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
516 return ret; 517 return ret;
517} 518}
518 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
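
The iterator added above is driven by a caller-held position. A sketch of a consumer (not part of this patch; the real declaration lives in kdb's own headers, so it is repeated here only for illustration):

#include <linux/kernel.h>

extern const char *kdb_walk_kallsyms(loff_t *pos);

static void kdb_dump_symbol_names(void)
{
        const char *name;
        loff_t pos = 0;

        /* The iterator advances *pos itself and returns NULL at the end. */
        while ((name = kdb_walk_kallsyms(&pos)) != NULL)
                printk(KERN_DEBUG "%s\n", name);
}
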
539
519static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
520 .open = kallsyms_open, 541 .open = kallsyms_open,
521 .read = seq_read, 542 .read = seq_read,
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 11f3515ca83f..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1764 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72/* Exception state values */
73#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
74#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
75#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
76#define DCPU_SSTEP 0x8 /* CPU is single stepping */
77
78static struct debuggerinfo_struct {
79 void *debuggerinfo;
80 struct task_struct *task;
81 int exception_state;
82} kgdb_info[NR_CPUS];
83
84/**
85 * kgdb_connected - Is a host GDB connected to us?
86 */
87int kgdb_connected;
88EXPORT_SYMBOL_GPL(kgdb_connected);
89
90/* All the KGDB handlers are installed */
91static int kgdb_io_module_registered;
92
93/* Guard for recursive entry */
94static int exception_level;
95
96static struct kgdb_io *kgdb_io_ops;
97static DEFINE_SPINLOCK(kgdb_registration_lock);
98
99/* kgdb console driver is loaded */
100static int kgdb_con_registered;
101/* determine if kgdb console output should be used */
102static int kgdb_use_con;
103
104static int __init opt_kgdb_con(char *str)
105{
106 kgdb_use_con = 1;
107 return 0;
108}
109
110early_param("kgdbcon", opt_kgdb_con);
111
112module_param(kgdb_use_con, int, 0644);
113
114/*
115 * Holds information about breakpoints in a kernel. These breakpoints are
116 * added and removed by gdb.
117 */
118static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
119 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
120};
121
122/*
123 * The CPU# of the active CPU, or -1 if none:
124 */
125atomic_t kgdb_active = ATOMIC_INIT(-1);
126
127/*
 128 * We use NR_CPUS not PERCPU, in case kgdb is used to debug early
129 * bootup code (which might not have percpu set up yet):
130 */
131static atomic_t passive_cpu_wait[NR_CPUS];
132static atomic_t cpu_in_kgdb[NR_CPUS];
133atomic_t kgdb_setting_breakpoint;
134
135struct task_struct *kgdb_usethread;
136struct task_struct *kgdb_contthread;
137
138int kgdb_single_step;
139pid_t kgdb_sstep_pid;
140
141/* Our I/O buffers. */
142static char remcom_in_buffer[BUFMAX];
143static char remcom_out_buffer[BUFMAX];
144
145/* Storage for the registers, in GDB format. */
146static unsigned long gdb_regs[(NUMREGBYTES +
147 sizeof(unsigned long) - 1) /
148 sizeof(unsigned long)];
149
 150/* To keep track of the CPU which is doing the single stepping */
151atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
152
153/*
154 * If you are debugging a problem where roundup (the collection of
155 * all other CPUs) is a problem [this should be extremely rare],
156 * then use the nokgdbroundup option to avoid roundup. In that case
157 * the other CPUs might interfere with your debugging context, so
158 * use this with care:
159 */
160static int kgdb_do_roundup = 1;
161
162static int __init opt_nokgdbroundup(char *str)
163{
164 kgdb_do_roundup = 0;
165
166 return 0;
167}
168
169early_param("nokgdbroundup", opt_nokgdbroundup);
170
171/*
172 * Finally, some KGDB code :-)
173 */
174
175/*
176 * Weak aliases for breakpoint management,
 177 * can be overridden by architectures when needed:
178 */
179int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
180{
181 int err;
182
183 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
184 if (err)
185 return err;
186
187 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
188 BREAK_INSTR_SIZE);
189}
190
191int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
192{
193 return probe_kernel_write((char *)addr,
194 (char *)bundle, BREAK_INSTR_SIZE);
195}
196
197int __weak kgdb_validate_break_address(unsigned long addr)
198{
199 char tmp_variable[BREAK_INSTR_SIZE];
200 int err;
 201 /* Validate setting the breakpoint and then removing it. If the
 202 * remove fails, the kernel needs to emit a critical error message
 203 * because we are in deep trouble, unable to put things back the
 204 * way we found them.
205 */
206 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
207 if (err)
208 return err;
209 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
210 if (err)
211 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
212 "memory destroyed at: %lx", addr);
213 return err;
214}
215
216unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
217{
218 return instruction_pointer(regs);
219}
220
221int __weak kgdb_arch_init(void)
222{
223 return 0;
224}
225
226int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
227{
228 return 0;
229}
230
231void __weak
232kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
233{
234 return;
235}
236
237/**
 238 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
239 * @regs: Current &struct pt_regs.
240 *
241 * This function will be called if the particular architecture must
242 * disable hardware debugging while it is processing gdb packets or
 243 * handling an exception.
244 */
245void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
246{
247}
248
249/*
250 * GDB remote protocol parser:
251 */
252
253static int hex(char ch)
254{
255 if ((ch >= 'a') && (ch <= 'f'))
256 return ch - 'a' + 10;
257 if ((ch >= '0') && (ch <= '9'))
258 return ch - '0';
259 if ((ch >= 'A') && (ch <= 'F'))
260 return ch - 'A' + 10;
261 return -1;
262}
263
264/* scan for the sequence $<data>#<checksum> */
265static void get_packet(char *buffer)
266{
267 unsigned char checksum;
268 unsigned char xmitcsum;
269 int count;
270 char ch;
271
272 do {
273 /*
274 * Spin and wait around for the start character, ignore all
275 * other characters:
276 */
277 while ((ch = (kgdb_io_ops->read_char())) != '$')
278 /* nothing */;
279
280 kgdb_connected = 1;
281 checksum = 0;
282 xmitcsum = -1;
283
284 count = 0;
285
286 /*
287 * now, read until a # or end of buffer is found:
288 */
289 while (count < (BUFMAX - 1)) {
290 ch = kgdb_io_ops->read_char();
291 if (ch == '#')
292 break;
293 checksum = checksum + ch;
294 buffer[count] = ch;
295 count = count + 1;
296 }
297 buffer[count] = 0;
298
299 if (ch == '#') {
300 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
301 xmitcsum += hex(kgdb_io_ops->read_char());
302
303 if (checksum != xmitcsum)
304 /* failed checksum */
305 kgdb_io_ops->write_char('-');
306 else
307 /* successful transfer */
308 kgdb_io_ops->write_char('+');
309 if (kgdb_io_ops->flush)
310 kgdb_io_ops->flush();
311 }
312 } while (checksum != xmitcsum);
313}
314
315/*
316 * Send the packet in buffer.
317 * Check for gdb connection if asked for.
318 */
319static void put_packet(char *buffer)
320{
321 unsigned char checksum;
322 int count;
323 char ch;
324
325 /*
326 * $<packet info>#<checksum>.
327 */
328 while (1) {
329 kgdb_io_ops->write_char('$');
330 checksum = 0;
331 count = 0;
332
333 while ((ch = buffer[count])) {
334 kgdb_io_ops->write_char(ch);
335 checksum += ch;
336 count++;
337 }
338
339 kgdb_io_ops->write_char('#');
340 kgdb_io_ops->write_char(hex_asc_hi(checksum));
341 kgdb_io_ops->write_char(hex_asc_lo(checksum));
342 if (kgdb_io_ops->flush)
343 kgdb_io_ops->flush();
344
345 /* Now see what we get in reply. */
346 ch = kgdb_io_ops->read_char();
347
348 if (ch == 3)
349 ch = kgdb_io_ops->read_char();
350
351 /* If we get an ACK, we are done. */
352 if (ch == '+')
353 return;
354
355 /*
356 * If we get the start of another packet, this means
357 * that GDB is attempting to reconnect. We will NAK
358 * the packet being sent, and stop trying to send this
359 * packet.
360 */
361 if (ch == '$') {
362 kgdb_io_ops->write_char('-');
363 if (kgdb_io_ops->flush)
364 kgdb_io_ops->flush();
365 return;
366 }
367 }
368}
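
For readers new to the gdbserial framing implemented by get_packet() and put_packet(): every payload travels as $<data>#<checksum>, where the checksum is the byte-wise sum of <data> modulo 256 printed as two hex digits, and the receiver answers '+' (ACK) or '-' (NAK). A tiny stand-alone encoder:

/* Stand-alone illustration of the $<data>#<checksum> framing. */
#include <stdio.h>
#include <string.h>

static void put_gdb_packet(const char *data)
{
        unsigned char csum = 0;
        size_t i;

        for (i = 0; i < strlen(data); i++)
                csum += (unsigned char)data[i];

        printf("$%s#%02x\n", data, csum);
}

int main(void)
{
        put_gdb_packet("g");            /* prints "$g#67" ('g' == 0x67) */
        put_gdb_packet("mc0100000,4");  /* read 4 bytes at c0100000     */
        return 0;
}
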
369
370/*
371 * Convert the memory pointed to by mem into hex, placing result in buf.
372 * Return a pointer to the last char put in buf (null). May return an error.
373 */
374int kgdb_mem2hex(char *mem, char *buf, int count)
375{
376 char *tmp;
377 int err;
378
379 /*
380 * We use the upper half of buf as an intermediate buffer for the
381 * raw memory copy. Hex conversion will work against this one.
382 */
383 tmp = buf + count;
384
385 err = probe_kernel_read(tmp, mem, count);
386 if (!err) {
387 while (count > 0) {
388 buf = pack_hex_byte(buf, *tmp);
389 tmp++;
390 count--;
391 }
392
393 *buf = 0;
394 }
395
396 return err;
397}
398
399/*
 400 * Copy the binary array pointed to by buf into mem. Unescape $, #, and
 401 * 0x7d that were escaped with 0x7d. Return -EFAULT on failure or 0 on
 402 * success. The input buf is overwritten with the result to write to mem.
403 */
404static int kgdb_ebin2mem(char *buf, char *mem, int count)
405{
406 int size = 0;
407 char *c = buf;
408
409 while (count-- > 0) {
410 c[size] = *buf++;
411 if (c[size] == 0x7d)
412 c[size] = *buf++ ^ 0x20;
413 size++;
414 }
415
416 return probe_kernel_write(mem, c, size);
417}
418
419/*
420 * Convert the hex array pointed to by buf into binary to be placed in mem.
421 * Return a pointer to the character AFTER the last byte written.
422 * May return an error.
423 */
424int kgdb_hex2mem(char *buf, char *mem, int count)
425{
426 char *tmp_raw;
427 char *tmp_hex;
428
429 /*
430 * We use the upper half of buf as an intermediate buffer for the
431 * raw memory that is converted from hex.
432 */
433 tmp_raw = buf + count * 2;
434
435 tmp_hex = tmp_raw - 1;
436 while (tmp_hex >= buf) {
437 tmp_raw--;
438 *tmp_raw = hex(*tmp_hex--);
439 *tmp_raw |= hex(*tmp_hex--) << 4;
440 }
441
442 return probe_kernel_write(mem, tmp_raw, count);
443}
444
445/*
446 * While we find nice hex chars, build a long_val.
447 * Return number of chars processed.
448 */
449int kgdb_hex2long(char **ptr, unsigned long *long_val)
450{
451 int hex_val;
452 int num = 0;
453 int negate = 0;
454
455 *long_val = 0;
456
457 if (**ptr == '-') {
458 negate = 1;
459 (*ptr)++;
460 }
461 while (**ptr) {
462 hex_val = hex(**ptr);
463 if (hex_val < 0)
464 break;
465
466 *long_val = (*long_val << 4) | hex_val;
467 num++;
468 (*ptr)++;
469 }
470
471 if (negate)
472 *long_val = -*long_val;
473
474 return num;
475}
476
477/* Write memory due to an 'M' or 'X' packet. */
478static int write_mem_msg(int binary)
479{
480 char *ptr = &remcom_in_buffer[1];
481 unsigned long addr;
482 unsigned long length;
483 int err;
484
485 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
486 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
487 if (binary)
488 err = kgdb_ebin2mem(ptr, (char *)addr, length);
489 else
490 err = kgdb_hex2mem(ptr, (char *)addr, length);
491 if (err)
492 return err;
493 if (CACHE_FLUSH_IS_SAFE)
494 flush_icache_range(addr, addr + length);
495 return 0;
496 }
497
498 return -EINVAL;
499}
500
501static void error_packet(char *pkt, int error)
502{
503 error = -error;
504 pkt[0] = 'E';
505 pkt[1] = hex_asc[(error / 10)];
506 pkt[2] = hex_asc[(error % 10)];
507 pkt[3] = '\0';
508}
509
510/*
511 * Thread ID accessors. We represent a flat TID space to GDB, where
512 * the per CPU idle threads (which under Linux all have PID 0) are
513 * remapped to negative TIDs.
514 */
515
516#define BUF_THREAD_ID_SIZE 16
517
518static char *pack_threadid(char *pkt, unsigned char *id)
519{
520 char *limit;
521
522 limit = pkt + BUF_THREAD_ID_SIZE;
523 while (pkt < limit)
524 pkt = pack_hex_byte(pkt, *id++);
525
526 return pkt;
527}
528
529static void int_to_threadref(unsigned char *id, int value)
530{
531 unsigned char *scan;
532 int i = 4;
533
534 scan = (unsigned char *)id;
535 while (i--)
536 *scan++ = 0;
537 put_unaligned_be32(value, scan);
538}
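
The two helpers above define how thread ids go on the wire: 8 bytes, 4 zero bytes followed by the big-endian 32-bit value, hex-encoded into 16 characters. A stand-alone model:

/* Model of int_to_threadref() + pack_threadid(): pid 1 -> "0000000000000001". */
#include <stdio.h>

int main(void)
{
        unsigned int pid = 1;
        unsigned char id[8] = { 0 };
        char out[17];
        int i;

        id[4] = pid >> 24;
        id[5] = pid >> 16;
        id[6] = pid >> 8;
        id[7] = pid;

        for (i = 0; i < 8; i++)
                sprintf(out + 2 * i, "%02x", id[i]);
        printf("%s\n", out);
        return 0;
}
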
539
540static struct task_struct *getthread(struct pt_regs *regs, int tid)
541{
542 /*
543 * Non-positive TIDs are remapped to the cpu shadow information
544 */
545 if (tid == 0 || tid == -1)
546 tid = -atomic_read(&kgdb_active) - 2;
547 if (tid < -1 && tid > -NR_CPUS - 2) {
548 if (kgdb_info[-tid - 2].task)
549 return kgdb_info[-tid - 2].task;
550 else
551 return idle_task(-tid - 2);
552 }
553 if (tid <= 0) {
554 printk(KERN_ERR "KGDB: Internal thread select error\n");
555 dump_stack();
556 return NULL;
557 }
558
559 /*
560 * find_task_by_pid_ns() does not take the tasklist lock anymore
561 * but is nicely RCU locked - hence is a pretty resilient
562 * thing to use:
563 */
564 return find_task_by_pid_ns(tid, &init_pid_ns);
565}
566
567/*
568 * Some architectures need cache flushes when we set/clear a
569 * breakpoint:
570 */
571static void kgdb_flush_swbreak_addr(unsigned long addr)
572{
573 if (!CACHE_FLUSH_IS_SAFE)
574 return;
575
576 if (current->mm && current->mm->mmap_cache) {
577 flush_cache_range(current->mm->mmap_cache,
578 addr, addr + BREAK_INSTR_SIZE);
579 }
580 /* Force flush instruction cache if it was outside the mm */
581 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
582}
583
584/*
585 * SW breakpoint management:
586 */
587static int kgdb_activate_sw_breakpoints(void)
588{
589 unsigned long addr;
590 int error;
591 int ret = 0;
592 int i;
593
594 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
595 if (kgdb_break[i].state != BP_SET)
596 continue;
597
598 addr = kgdb_break[i].bpt_addr;
599 error = kgdb_arch_set_breakpoint(addr,
600 kgdb_break[i].saved_instr);
601 if (error) {
602 ret = error;
603 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
604 continue;
605 }
606
607 kgdb_flush_swbreak_addr(addr);
608 kgdb_break[i].state = BP_ACTIVE;
609 }
610 return ret;
611}
612
613static int kgdb_set_sw_break(unsigned long addr)
614{
615 int err = kgdb_validate_break_address(addr);
616 int breakno = -1;
617 int i;
618
619 if (err)
620 return err;
621
622 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
623 if ((kgdb_break[i].state == BP_SET) &&
624 (kgdb_break[i].bpt_addr == addr))
625 return -EEXIST;
626 }
627 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
628 if (kgdb_break[i].state == BP_REMOVED &&
629 kgdb_break[i].bpt_addr == addr) {
630 breakno = i;
631 break;
632 }
633 }
634
635 if (breakno == -1) {
636 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
637 if (kgdb_break[i].state == BP_UNDEFINED) {
638 breakno = i;
639 break;
640 }
641 }
642 }
643
644 if (breakno == -1)
645 return -E2BIG;
646
647 kgdb_break[breakno].state = BP_SET;
648 kgdb_break[breakno].type = BP_BREAKPOINT;
649 kgdb_break[breakno].bpt_addr = addr;
650
651 return 0;
652}
653
654static int kgdb_deactivate_sw_breakpoints(void)
655{
656 unsigned long addr;
657 int error;
658 int ret = 0;
659 int i;
660
661 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
662 if (kgdb_break[i].state != BP_ACTIVE)
663 continue;
664 addr = kgdb_break[i].bpt_addr;
665 error = kgdb_arch_remove_breakpoint(addr,
666 kgdb_break[i].saved_instr);
667 if (error) {
668 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
669 ret = error;
670 }
671
672 kgdb_flush_swbreak_addr(addr);
673 kgdb_break[i].state = BP_SET;
674 }
675 return ret;
676}
677
678static int kgdb_remove_sw_break(unsigned long addr)
679{
680 int i;
681
682 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
683 if ((kgdb_break[i].state == BP_SET) &&
684 (kgdb_break[i].bpt_addr == addr)) {
685 kgdb_break[i].state = BP_REMOVED;
686 return 0;
687 }
688 }
689 return -ENOENT;
690}
691
692int kgdb_isremovedbreak(unsigned long addr)
693{
694 int i;
695
696 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
697 if ((kgdb_break[i].state == BP_REMOVED) &&
698 (kgdb_break[i].bpt_addr == addr))
699 return 1;
700 }
701 return 0;
702}
703
704static int remove_all_break(void)
705{
706 unsigned long addr;
707 int error;
708 int i;
709
710 /* Clear memory breakpoints. */
711 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
712 if (kgdb_break[i].state != BP_ACTIVE)
713 goto setundefined;
714 addr = kgdb_break[i].bpt_addr;
715 error = kgdb_arch_remove_breakpoint(addr,
716 kgdb_break[i].saved_instr);
717 if (error)
718 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
719 addr);
720setundefined:
721 kgdb_break[i].state = BP_UNDEFINED;
722 }
723
724 /* Clear hardware breakpoints. */
725 if (arch_kgdb_ops.remove_all_hw_break)
726 arch_kgdb_ops.remove_all_hw_break();
727
728 return 0;
729}
730
731/*
732 * Remap normal tasks to their real PID,
733 * CPU shadow threads are mapped to -CPU - 2
734 */
735static inline int shadow_pid(int realpid)
736{
737 if (realpid)
738 return realpid;
739
740 return -raw_smp_processor_id() - 2;
741}
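
The remapping is symmetric: a real task keeps its pid, each cpu's shadow (idle) context is presented to gdb as -cpu - 2, and getthread() above inverts that with -tid - 2. A quick stand-alone model:

/* Model of the flat TID space used by the stub. */
#include <stdio.h>

static int shadow_pid(int realpid, int cpu)
{
        return realpid ? realpid : -cpu - 2;
}

int main(void)
{
        int tid = shadow_pid(0, 3);     /* pid 0 == idle/shadow thread of cpu 3 */

        printf("gdb sees tid %d, which maps back to cpu %d\n",
               tid, -tid - 2);          /* tid -5 -> cpu 3 */
        return 0;
}
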
742
743static char gdbmsgbuf[BUFMAX + 1];
744
745static void kgdb_msg_write(const char *s, int len)
746{
747 char *bufptr;
748 int wcount;
749 int i;
750
751 /* 'O'utput */
752 gdbmsgbuf[0] = 'O';
753
754 /* Fill and send buffers... */
755 while (len > 0) {
756 bufptr = gdbmsgbuf + 1;
757
758 /* Calculate how many this time */
759 if ((len << 1) > (BUFMAX - 2))
760 wcount = (BUFMAX - 2) >> 1;
761 else
762 wcount = len;
763
764 /* Pack in hex chars */
765 for (i = 0; i < wcount; i++)
766 bufptr = pack_hex_byte(bufptr, s[i]);
767 *bufptr = '\0';
768
769 /* Move up */
770 s += wcount;
771 len -= wcount;
772
773 /* Write packet */
774 put_packet(gdbmsgbuf);
775 }
776}
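
kgdb_msg_write() above is how kernel console output reaches gdb: the text is hex-encoded into an 'O' packet, which put_packet() then frames and checksums as usual. A stand-alone model of the encoding:

/* "Hi\n" becomes the packet body "O48690a". */
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *msg = "Hi\n";
        size_t i;

        putchar('O');
        for (i = 0; i < strlen(msg); i++)
                printf("%02x", (unsigned char)msg[i]);
        putchar('\n');
        return 0;
}
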
777
778/*
779 * Return true if there is a valid kgdb I/O module. Also if no
 780 * debugger is attached, a message can be printed to the console about
781 * waiting for the debugger to attach.
782 *
783 * The print_wait argument is only to be true when called from inside
784 * the core kgdb_handle_exception, because it will wait for the
785 * debugger to attach.
786 */
787static int kgdb_io_ready(int print_wait)
788{
789 if (!kgdb_io_ops)
790 return 0;
791 if (kgdb_connected)
792 return 1;
793 if (atomic_read(&kgdb_setting_breakpoint))
794 return 1;
795 if (print_wait)
796 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
797 return 1;
798}
799
800/*
801 * All the functions that start with gdb_cmd are the various
802 * operations to implement the handlers for the gdbserial protocol
803 * where KGDB is communicating with an external debugger
804 */
805
806/* Handle the '?' status packets */
807static void gdb_cmd_status(struct kgdb_state *ks)
808{
809 /*
810 * We know that this packet is only sent
811 * during initial connect. So to be safe,
812 * we clear out our breakpoints now in case
813 * GDB is reconnecting.
814 */
815 remove_all_break();
816
817 remcom_out_buffer[0] = 'S';
818 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
819}
820
821/* Handle the 'g' get registers request */
822static void gdb_cmd_getregs(struct kgdb_state *ks)
823{
824 struct task_struct *thread;
825 void *local_debuggerinfo;
826 int i;
827
828 thread = kgdb_usethread;
829 if (!thread) {
830 thread = kgdb_info[ks->cpu].task;
831 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
832 } else {
833 local_debuggerinfo = NULL;
834 for_each_online_cpu(i) {
835 /*
 836 * Try to find the task on some other
 837 * or possibly this node. If we do not
 838 * find the matching task, we try to
 839 * approximate the results.
840 */
841 if (thread == kgdb_info[i].task)
842 local_debuggerinfo = kgdb_info[i].debuggerinfo;
843 }
844 }
845
846 /*
847 * All threads that don't have debuggerinfo should be
848 * in schedule() sleeping, since all other CPUs
849 * are in kgdb_wait, and thus have debuggerinfo.
850 */
851 if (local_debuggerinfo) {
852 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
853 } else {
854 /*
855 * Pull stuff saved during switch_to; nothing
856 * else is accessible (or even particularly
857 * relevant).
858 *
859 * This should be enough for a stack trace.
860 */
861 sleeping_thread_to_gdb_regs(gdb_regs, thread);
862 }
863 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
864}
865
866/* Handle the 'G' set registers request */
867static void gdb_cmd_setregs(struct kgdb_state *ks)
868{
869 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
870
871 if (kgdb_usethread && kgdb_usethread != current) {
872 error_packet(remcom_out_buffer, -EINVAL);
873 } else {
874 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
875 strcpy(remcom_out_buffer, "OK");
876 }
877}
878
879/* Handle the 'm' memory read bytes */
880static void gdb_cmd_memread(struct kgdb_state *ks)
881{
882 char *ptr = &remcom_in_buffer[1];
883 unsigned long length;
884 unsigned long addr;
885 int err;
886
887 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
888 kgdb_hex2long(&ptr, &length) > 0) {
889 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
890 if (err)
891 error_packet(remcom_out_buffer, err);
892 } else {
893 error_packet(remcom_out_buffer, -EINVAL);
894 }
895}
896
897/* Handle the 'M' memory write bytes */
898static void gdb_cmd_memwrite(struct kgdb_state *ks)
899{
900 int err = write_mem_msg(0);
901
902 if (err)
903 error_packet(remcom_out_buffer, err);
904 else
905 strcpy(remcom_out_buffer, "OK");
906}
907
908/* Handle the 'X' memory binary write bytes */
909static void gdb_cmd_binwrite(struct kgdb_state *ks)
910{
911 int err = write_mem_msg(1);
912
913 if (err)
914 error_packet(remcom_out_buffer, err);
915 else
916 strcpy(remcom_out_buffer, "OK");
917}
918
919/* Handle the 'D' or 'k', detach or kill packets */
920static void gdb_cmd_detachkill(struct kgdb_state *ks)
921{
922 int error;
923
924 /* The detach case */
925 if (remcom_in_buffer[0] == 'D') {
926 error = remove_all_break();
927 if (error < 0) {
928 error_packet(remcom_out_buffer, error);
929 } else {
930 strcpy(remcom_out_buffer, "OK");
931 kgdb_connected = 0;
932 }
933 put_packet(remcom_out_buffer);
934 } else {
935 /*
936 * Assume the kill case, with no exit code checking,
937 * trying to force detach the debugger:
938 */
939 remove_all_break();
940 kgdb_connected = 0;
941 }
942}
943
944/* Handle the 'R' reboot packets */
945static int gdb_cmd_reboot(struct kgdb_state *ks)
946{
947 /* For now, only honor R0 */
948 if (strcmp(remcom_in_buffer, "R0") == 0) {
949 printk(KERN_CRIT "Executing emergency reboot\n");
950 strcpy(remcom_out_buffer, "OK");
951 put_packet(remcom_out_buffer);
952
953 /*
954 * Execution should not return from
955 * machine_emergency_restart()
956 */
957 machine_emergency_restart();
958 kgdb_connected = 0;
959
960 return 1;
961 }
962 return 0;
963}
964
965/* Handle the 'q' query packets */
966static void gdb_cmd_query(struct kgdb_state *ks)
967{
968 struct task_struct *g;
969 struct task_struct *p;
970 unsigned char thref[8];
971 char *ptr;
972 int i;
973 int cpu;
974 int finished = 0;
975
976 switch (remcom_in_buffer[1]) {
977 case 's':
978 case 'f':
979 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
980 error_packet(remcom_out_buffer, -EINVAL);
981 break;
982 }
983
984 i = 0;
985 remcom_out_buffer[0] = 'm';
986 ptr = remcom_out_buffer + 1;
987 if (remcom_in_buffer[1] == 'f') {
988 /* Each cpu is a shadow thread */
989 for_each_online_cpu(cpu) {
990 ks->thr_query = 0;
991 int_to_threadref(thref, -cpu - 2);
992 pack_threadid(ptr, thref);
993 ptr += BUF_THREAD_ID_SIZE;
994 *(ptr++) = ',';
995 i++;
996 }
997 }
998
999 do_each_thread(g, p) {
1000 if (i >= ks->thr_query && !finished) {
1001 int_to_threadref(thref, p->pid);
1002 pack_threadid(ptr, thref);
1003 ptr += BUF_THREAD_ID_SIZE;
1004 *(ptr++) = ',';
1005 ks->thr_query++;
1006 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1007 finished = 1;
1008 }
1009 i++;
1010 } while_each_thread(g, p);
1011
1012 *(--ptr) = '\0';
1013 break;
1014
1015 case 'C':
1016 /* Current thread id */
1017 strcpy(remcom_out_buffer, "QC");
1018 ks->threadid = shadow_pid(current->pid);
1019 int_to_threadref(thref, ks->threadid);
1020 pack_threadid(remcom_out_buffer + 2, thref);
1021 break;
1022 case 'T':
1023 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1024 error_packet(remcom_out_buffer, -EINVAL);
1025 break;
1026 }
1027 ks->threadid = 0;
1028 ptr = remcom_in_buffer + 17;
1029 kgdb_hex2long(&ptr, &ks->threadid);
1030 if (!getthread(ks->linux_regs, ks->threadid)) {
1031 error_packet(remcom_out_buffer, -EINVAL);
1032 break;
1033 }
1034 if ((int)ks->threadid > 0) {
1035 kgdb_mem2hex(getthread(ks->linux_regs,
1036 ks->threadid)->comm,
1037 remcom_out_buffer, 16);
1038 } else {
1039 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1040
1041 sprintf(tmpstr, "shadowCPU%d",
1042 (int)(-ks->threadid - 2));
1043 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1044 }
1045 break;
1046 }
1047}
1048
1049/* Handle the 'H' task query packets */
1050static void gdb_cmd_task(struct kgdb_state *ks)
1051{
1052 struct task_struct *thread;
1053 char *ptr;
1054
1055 switch (remcom_in_buffer[1]) {
1056 case 'g':
1057 ptr = &remcom_in_buffer[2];
1058 kgdb_hex2long(&ptr, &ks->threadid);
1059 thread = getthread(ks->linux_regs, ks->threadid);
1060 if (!thread && ks->threadid > 0) {
1061 error_packet(remcom_out_buffer, -EINVAL);
1062 break;
1063 }
1064 kgdb_usethread = thread;
1065 ks->kgdb_usethreadid = ks->threadid;
1066 strcpy(remcom_out_buffer, "OK");
1067 break;
1068 case 'c':
1069 ptr = &remcom_in_buffer[2];
1070 kgdb_hex2long(&ptr, &ks->threadid);
1071 if (!ks->threadid) {
1072 kgdb_contthread = NULL;
1073 } else {
1074 thread = getthread(ks->linux_regs, ks->threadid);
1075 if (!thread && ks->threadid > 0) {
1076 error_packet(remcom_out_buffer, -EINVAL);
1077 break;
1078 }
1079 kgdb_contthread = thread;
1080 }
1081 strcpy(remcom_out_buffer, "OK");
1082 break;
1083 }
1084}
1085
1086/* Handle the 'T' thread query packets */
1087static void gdb_cmd_thread(struct kgdb_state *ks)
1088{
1089 char *ptr = &remcom_in_buffer[1];
1090 struct task_struct *thread;
1091
1092 kgdb_hex2long(&ptr, &ks->threadid);
1093 thread = getthread(ks->linux_regs, ks->threadid);
1094 if (thread)
1095 strcpy(remcom_out_buffer, "OK");
1096 else
1097 error_packet(remcom_out_buffer, -EINVAL);
1098}
1099
1100/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1101static void gdb_cmd_break(struct kgdb_state *ks)
1102{
1103 /*
1104 * Since GDB-5.3, it's been drafted that '0' is a software
1105 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1106 */
1107 char *bpt_type = &remcom_in_buffer[1];
1108 char *ptr = &remcom_in_buffer[2];
1109 unsigned long addr;
1110 unsigned long length;
1111 int error = 0;
1112
1113 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1114 /* Unsupported */
1115 if (*bpt_type > '4')
1116 return;
1117 } else {
1118 if (*bpt_type != '0' && *bpt_type != '1')
1119 /* Unsupported. */
1120 return;
1121 }
1122
1123 /*
1124 * Test if this is a hardware breakpoint, and
1125 * if we support it:
1126 */
1127 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1128 /* Unsupported. */
1129 return;
1130
1131 if (*(ptr++) != ',') {
1132 error_packet(remcom_out_buffer, -EINVAL);
1133 return;
1134 }
1135 if (!kgdb_hex2long(&ptr, &addr)) {
1136 error_packet(remcom_out_buffer, -EINVAL);
1137 return;
1138 }
1139 if (*(ptr++) != ',' ||
1140 !kgdb_hex2long(&ptr, &length)) {
1141 error_packet(remcom_out_buffer, -EINVAL);
1142 return;
1143 }
1144
1145 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1146 error = kgdb_set_sw_break(addr);
1147 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1148 error = kgdb_remove_sw_break(addr);
1149 else if (remcom_in_buffer[0] == 'Z')
1150 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1151 (int)length, *bpt_type - '0');
1152 else if (remcom_in_buffer[0] == 'z')
1153 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1154 (int) length, *bpt_type - '0');
1155
1156 if (error == 0)
1157 strcpy(remcom_out_buffer, "OK");
1158 else
1159 error_packet(remcom_out_buffer, error);
1160}
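
Concretely, the packets handled above look like Z0,<addr>,<len> to set a software breakpoint, z0,<addr>,<len> to remove it, and Z1..Z4 for the hardware breakpoint/watchpoint types when the architecture provides set_hw_breakpoint(). A toy parser, with userspace strtoul() standing in for kgdb_hex2long() and a made-up address:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
        const char *pkt = "Z0,c01713a0,1";
        int set = (pkt[0] == 'Z');              /* 'Z' sets, 'z' removes */
        char type = pkt[1];                     /* '0' = sw, '1'+ = hw   */
        char *p = (char *)pkt + 3;              /* skip "Z0,"            */
        unsigned long addr, len;

        addr = strtoul(p, &p, 16);
        len = strtoul(p + 1, NULL, 16);         /* skip the ','          */

        printf("%s %s breakpoint at %#lx, len %lu\n",
               set ? "set" : "remove",
               type == '0' ? "software" : "hardware", addr, len);
        return 0;
}
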
1161
1162/* Handle the 'C' signal / exception passing packets */
1163static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1164{
1165 /* C09 == pass exception
1166 * C15 == detach kgdb, pass exception
1167 */
1168 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1169
1170 ks->pass_exception = 1;
1171 remcom_in_buffer[0] = 'c';
1172
1173 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1174
1175 ks->pass_exception = 1;
1176 remcom_in_buffer[0] = 'D';
1177 remove_all_break();
1178 kgdb_connected = 0;
1179 return 1;
1180
1181 } else {
1182 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1183 " and 15 (pass and disconnect)\n"
1184 "Executing a continue without signal passing\n", 0);
1185 remcom_in_buffer[0] = 'c';
1186 }
1187
1188 /* Indicate fall through */
1189 return -1;
1190}
1191
1192/*
 1193 * This function performs all gdbserial command processing
1194 */
1195static int gdb_serial_stub(struct kgdb_state *ks)
1196{
1197 int error = 0;
1198 int tmp;
1199
1200 /* Clear the out buffer. */
1201 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1202
1203 if (kgdb_connected) {
1204 unsigned char thref[8];
1205 char *ptr;
1206
1207 /* Reply to host that an exception has occurred */
1208 ptr = remcom_out_buffer;
1209 *ptr++ = 'T';
1210 ptr = pack_hex_byte(ptr, ks->signo);
1211 ptr += strlen(strcpy(ptr, "thread:"));
1212 int_to_threadref(thref, shadow_pid(current->pid));
1213 ptr = pack_threadid(ptr, thref);
1214 *ptr++ = ';';
1215 put_packet(remcom_out_buffer);
1216 }
1217
1218 kgdb_usethread = kgdb_info[ks->cpu].task;
1219 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1220 ks->pass_exception = 0;
1221
1222 while (1) {
1223 error = 0;
1224
1225 /* Clear the out buffer. */
1226 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1227
1228 get_packet(remcom_in_buffer);
1229
1230 switch (remcom_in_buffer[0]) {
1231 case '?': /* gdbserial status */
1232 gdb_cmd_status(ks);
1233 break;
1234 case 'g': /* return the value of the CPU registers */
1235 gdb_cmd_getregs(ks);
1236 break;
1237 case 'G': /* set the value of the CPU registers - return OK */
1238 gdb_cmd_setregs(ks);
1239 break;
1240 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1241 gdb_cmd_memread(ks);
1242 break;
1243 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1244 gdb_cmd_memwrite(ks);
1245 break;
1246 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1247 gdb_cmd_binwrite(ks);
1248 break;
1249 /* kill or detach. KGDB should treat this like a
1250 * continue.
1251 */
1252 case 'D': /* Debugger detach */
1253 case 'k': /* Debugger detach via kill */
1254 gdb_cmd_detachkill(ks);
1255 goto default_handle;
1256 case 'R': /* Reboot */
1257 if (gdb_cmd_reboot(ks))
1258 goto default_handle;
1259 break;
1260 case 'q': /* query command */
1261 gdb_cmd_query(ks);
1262 break;
1263 case 'H': /* task related */
1264 gdb_cmd_task(ks);
1265 break;
1266 case 'T': /* Query thread status */
1267 gdb_cmd_thread(ks);
1268 break;
1269 case 'z': /* Break point remove */
1270 case 'Z': /* Break point set */
1271 gdb_cmd_break(ks);
1272 break;
1273 case 'C': /* Exception passing */
1274 tmp = gdb_cmd_exception_pass(ks);
1275 if (tmp > 0)
1276 goto default_handle;
1277 if (tmp == 0)
1278 break;
1279 /* Fall through on tmp < 0 */
1280 case 'c': /* Continue packet */
1281 case 's': /* Single step packet */
1282 if (kgdb_contthread && kgdb_contthread != current) {
1283 /* Can't switch threads in kgdb */
1284 error_packet(remcom_out_buffer, -EINVAL);
1285 break;
1286 }
1287 kgdb_activate_sw_breakpoints();
1288 /* Fall through to default processing */
1289 default:
1290default_handle:
1291 error = kgdb_arch_handle_exception(ks->ex_vector,
1292 ks->signo,
1293 ks->err_code,
1294 remcom_in_buffer,
1295 remcom_out_buffer,
1296 ks->linux_regs);
1297 /*
1298 * Leave cmd processing on error, detach,
1299 * kill, continue, or single step.
1300 */
1301 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1302 remcom_in_buffer[0] == 'k') {
1303 error = 0;
1304 goto kgdb_exit;
1305 }
1306
1307 }
1308
1309 /* reply to the request */
1310 put_packet(remcom_out_buffer);
1311 }
1312
1313kgdb_exit:
1314 if (ks->pass_exception)
1315 error = 1;
1316 return error;
1317}
1318
1319static int kgdb_reenter_check(struct kgdb_state *ks)
1320{
1321 unsigned long addr;
1322
1323 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1324 return 0;
1325
1326 /* Panic on recursive debugger calls: */
1327 exception_level++;
1328 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1329 kgdb_deactivate_sw_breakpoints();
1330
1331 /*
 1332 * If the breakpoint was removed successfully at the place the
 1333 * exception occurred, try to recover and print a warning to the end
1334 * user because the user planted a breakpoint in a place that
1335 * KGDB needs in order to function.
1336 */
1337 if (kgdb_remove_sw_break(addr) == 0) {
1338 exception_level = 0;
1339 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1340 kgdb_activate_sw_breakpoints();
1341 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1342 addr);
1343 WARN_ON_ONCE(1);
1344
1345 return 1;
1346 }
1347 remove_all_break();
1348 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1349
1350 if (exception_level > 1) {
1351 dump_stack();
1352 panic("Recursive entry to debugger");
1353 }
1354
1355 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1356 dump_stack();
1357 panic("Recursive entry to debugger");
1358
1359 return 1;
1360}
1361
1362static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
1363{
1364 unsigned long flags;
1365 int sstep_tries = 100;
1366 int error = 0;
1367 int i, cpu;
1368 int trace_on = 0;
1369acquirelock:
1370 /*
1371 * Interrupts will be restored by the 'trap return' code, except when
1372 * single stepping.
1373 */
1374 local_irq_save(flags);
1375
1376 cpu = ks->cpu;
1377 kgdb_info[cpu].debuggerinfo = regs;
1378 kgdb_info[cpu].task = current;
1379 /*
1380 * Make sure the above info reaches the primary CPU before
1381 * our cpu_in_kgdb[] flag setting does:
1382 */
1383 atomic_inc(&cpu_in_kgdb[cpu]);
1384
1385 /*
1386 * CPU will loop if it is a slave or request to become a kgdb
1387 * master cpu and acquire the kgdb_active lock:
1388 */
1389 while (1) {
1390 if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
1391 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
1392 break;
1393 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
1394 if (!atomic_read(&passive_cpu_wait[cpu]))
1395 goto return_normal;
1396 } else {
1397return_normal:
1398 /* Return to normal operation by executing any
1399 * hw breakpoint fixup.
1400 */
1401 if (arch_kgdb_ops.correct_hw_break)
1402 arch_kgdb_ops.correct_hw_break();
1403 if (trace_on)
1404 tracing_on();
1405 atomic_dec(&cpu_in_kgdb[cpu]);
1406 touch_softlockup_watchdog_sync();
1407 clocksource_touch_watchdog();
1408 local_irq_restore(flags);
1409 return 0;
1410 }
1411 cpu_relax();
1412 }
1413
1414 /*
1415 * For single stepping, try to only enter on the processor
 1416 * that was single stepping. To guard against a deadlock, the
1417 * kernel will only try for the value of sstep_tries before
1418 * giving up and continuing on.
1419 */
1420 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1421 (kgdb_info[cpu].task &&
1422 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1423 atomic_set(&kgdb_active, -1);
1424 touch_softlockup_watchdog_sync();
1425 clocksource_touch_watchdog();
1426 local_irq_restore(flags);
1427
1428 goto acquirelock;
1429 }
1430
1431 if (!kgdb_io_ready(1)) {
1432 error = 1;
1433 goto kgdb_restore; /* No I/O connection, so resume the system */
1434 }
1435
1436 /*
1437 * Don't enter if we have hit a removed breakpoint.
1438 */
1439 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1440 goto kgdb_restore;
1441
1442 /* Call the I/O driver's pre_exception routine */
1443 if (kgdb_io_ops->pre_exception)
1444 kgdb_io_ops->pre_exception();
1445
1446 kgdb_disable_hw_debug(ks->linux_regs);
1447
1448 /*
1449 * Get the passive CPU lock which will hold all the non-primary
 1450 * CPUs in a spin state while the debugger is active.
1451 */
1452 if (!kgdb_single_step) {
1453 for (i = 0; i < NR_CPUS; i++)
1454 atomic_inc(&passive_cpu_wait[i]);
1455 }
1456
1457#ifdef CONFIG_SMP
1458 /* Signal the other CPUs to enter kgdb_wait() */
1459 if ((!kgdb_single_step) && kgdb_do_roundup)
1460 kgdb_roundup_cpus(flags);
1461#endif
1462
1463 /*
1464 * Wait for the other CPUs to be notified and be waiting for us:
1465 */
1466 for_each_online_cpu(i) {
1467 while (!atomic_read(&cpu_in_kgdb[i]))
1468 cpu_relax();
1469 }
1470
1471 /*
1472 * At this point the primary processor is completely
1473 * in the debugger and all secondary CPUs are quiescent
1474 */
1475 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1476 kgdb_deactivate_sw_breakpoints();
1477 kgdb_single_step = 0;
1478 kgdb_contthread = current;
1479 exception_level = 0;
1480 trace_on = tracing_is_on();
1481 if (trace_on)
1482 tracing_off();
1483
1484 /* Talk to debugger with gdbserial protocol */
1485 error = gdb_serial_stub(ks);
1486
1487 /* Call the I/O driver's post_exception routine */
1488 if (kgdb_io_ops->post_exception)
1489 kgdb_io_ops->post_exception();
1490
1491 atomic_dec(&cpu_in_kgdb[ks->cpu]);
1492
1493 if (!kgdb_single_step) {
1494 for (i = NR_CPUS-1; i >= 0; i--)
1495 atomic_dec(&passive_cpu_wait[i]);
1496 /*
1497 * Wait till all the CPUs have quit
1498 * from the debugger.
1499 */
1500 for_each_online_cpu(i) {
1501 while (atomic_read(&cpu_in_kgdb[i]))
1502 cpu_relax();
1503 }
1504 }
1505
1506kgdb_restore:
1507 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1508 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1509 if (kgdb_info[sstep_cpu].task)
1510 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1511 else
1512 kgdb_sstep_pid = 0;
1513 }
1514 if (trace_on)
1515 tracing_on();
1516 /* Free kgdb_active */
1517 atomic_set(&kgdb_active, -1);
1518 touch_softlockup_watchdog_sync();
1519 clocksource_touch_watchdog();
1520 local_irq_restore(flags);
1521
1522 return error;
1523}
1524
1525/*
1526 * kgdb_handle_exception() - main entry point from a kernel exception
1527 *
1528 * Locking hierarchy:
1529 * interface locks, if any (begin_session)
1530 * kgdb lock (kgdb_active)
1531 */
1532int
1533kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1534{
1535 struct kgdb_state kgdb_var;
1536 struct kgdb_state *ks = &kgdb_var;
1537 int ret;
1538
1539 ks->cpu = raw_smp_processor_id();
1540 ks->ex_vector = evector;
1541 ks->signo = signo;
1542 ks->ex_vector = evector;
1543 ks->err_code = ecode;
1544 ks->kgdb_usethreadid = 0;
1545 ks->linux_regs = regs;
1546
1547 if (kgdb_reenter_check(ks))
1548 return 0; /* Ouch, double exception ! */
1549 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
1550 ret = kgdb_cpu_enter(ks, regs);
1551 kgdb_info[ks->cpu].exception_state &= ~DCPU_WANT_MASTER;
1552 return ret;
1553}
1554
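For orientation, a minimal sketch (not taken from this file) of how an architecture's breakpoint/trap handler would hand control to this entry point and fall back to normal trap handling if the debugger declines; my_arch_debug_trap() and the vector value 0 are hypothetical placeholders.

        /* Sketch only -- hypothetical arch-side caller. */
        #include <linux/kgdb.h>

        static int my_arch_debug_trap(struct pt_regs *regs, int err_code)
        {
                /* Vector 0 and SIGTRAP are placeholders; real values are per-arch. */
                if (kgdb_handle_exception(0, SIGTRAP, err_code, regs))
                        return -1;      /* not consumed, take the normal trap path */
                return 0;               /* handled by KGDB */
        }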
1555int kgdb_nmicallback(int cpu, void *regs)
1556{
1557#ifdef CONFIG_SMP
1558 struct kgdb_state kgdb_var;
1559 struct kgdb_state *ks = &kgdb_var;
1560
1561 memset(ks, 0, sizeof(struct kgdb_state));
1562 ks->cpu = cpu;
1563 ks->linux_regs = regs;
1564
1565 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1566 atomic_read(&kgdb_active) != -1 &&
1567 atomic_read(&kgdb_active) != cpu) {
1568 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
1569 kgdb_cpu_enter(ks, regs);
1570 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
1571 return 0;
1572 }
1573#endif
1574 return 1;
1575}
1576
1577static void kgdb_console_write(struct console *co, const char *s,
1578 unsigned count)
1579{
1580 unsigned long flags;
1581
1582 /* If we're debugging, or KGDB has not connected, don't try
1583 * and print. */
1584 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1585 return;
1586
1587 local_irq_save(flags);
1588 kgdb_msg_write(s, count);
1589 local_irq_restore(flags);
1590}
1591
1592static struct console kgdbcons = {
1593 .name = "kgdb",
1594 .write = kgdb_console_write,
1595 .flags = CON_PRINTBUFFER | CON_ENABLED,
1596 .index = -1,
1597};
1598
1599#ifdef CONFIG_MAGIC_SYSRQ
1600static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1601{
1602 if (!kgdb_io_ops) {
1603 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1604 return;
1605 }
1606 if (!kgdb_connected)
1607 printk(KERN_CRIT "Entering KGDB\n");
1608
1609 kgdb_breakpoint();
1610}
1611
1612static struct sysrq_key_op sysrq_gdb_op = {
1613 .handler = sysrq_handle_gdb,
1614 .help_msg = "debug(G)",
1615 .action_msg = "DEBUG",
1616};
1617#endif
1618
1619static void kgdb_register_callbacks(void)
1620{
1621 if (!kgdb_io_module_registered) {
1622 kgdb_io_module_registered = 1;
1623 kgdb_arch_init();
1624#ifdef CONFIG_MAGIC_SYSRQ
1625 register_sysrq_key('g', &sysrq_gdb_op);
1626#endif
1627 if (kgdb_use_con && !kgdb_con_registered) {
1628 register_console(&kgdbcons);
1629 kgdb_con_registered = 1;
1630 }
1631 }
1632}
1633
1634static void kgdb_unregister_callbacks(void)
1635{
1636 /*
1637 * When this routine is called KGDB should unregister from the
1638 * panic handler and clean up, making sure it is not handling any
1639 * break exceptions at the time.
1640 */
1641 if (kgdb_io_module_registered) {
1642 kgdb_io_module_registered = 0;
1643 kgdb_arch_exit();
1644#ifdef CONFIG_MAGIC_SYSRQ
1645 unregister_sysrq_key('g', &sysrq_gdb_op);
1646#endif
1647 if (kgdb_con_registered) {
1648 unregister_console(&kgdbcons);
1649 kgdb_con_registered = 0;
1650 }
1651 }
1652}
1653
1654static void kgdb_initial_breakpoint(void)
1655{
1656 kgdb_break_asap = 0;
1657
1658 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1659 kgdb_breakpoint();
1660}
1661
1662/**
1663 * kgdb_register_io_module - register KGDB IO module
1664 * @new_kgdb_io_ops: the io ops vector
1665 *
1666 * Register it with the KGDB core.
1667 */
1668int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1669{
1670 int err;
1671
1672 spin_lock(&kgdb_registration_lock);
1673
1674 if (kgdb_io_ops) {
1675 spin_unlock(&kgdb_registration_lock);
1676
1677 printk(KERN_ERR "kgdb: Another I/O driver is already "
1678 "registered with KGDB.\n");
1679 return -EBUSY;
1680 }
1681
1682 if (new_kgdb_io_ops->init) {
1683 err = new_kgdb_io_ops->init();
1684 if (err) {
1685 spin_unlock(&kgdb_registration_lock);
1686 return err;
1687 }
1688 }
1689
1690 kgdb_io_ops = new_kgdb_io_ops;
1691
1692 spin_unlock(&kgdb_registration_lock);
1693
1694 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1695 new_kgdb_io_ops->name);
1696
1697 /* Arm KGDB now. */
1698 kgdb_register_callbacks();
1699
1700 if (kgdb_break_asap)
1701 kgdb_initial_breakpoint();
1702
1703 return 0;
1704}
1705EXPORT_SYMBOL_GPL(kgdb_register_io_module);
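As a usage sketch, a polled I/O driver fills in the read_char/write_char hooks and registers them; my_uart_poll_get()/my_uart_poll_put() are hypothetical hardware accessors, not code from this file.

        #include <linux/kgdb.h>
        #include <linux/module.h>

        static int my_kgdb_read_char(void)
        {
                return my_uart_poll_get();      /* poll one byte from the port */
        }

        static void my_kgdb_write_char(u8 c)
        {
                my_uart_poll_put(c);            /* emit one byte to the port */
        }

        static struct kgdb_io my_kgdb_io_ops = {
                .name           = "my_kgdb_io",
                .read_char      = my_kgdb_read_char,
                .write_char     = my_kgdb_write_char,
        };

        static int __init my_kgdb_io_init(void)
        {
                return kgdb_register_io_module(&my_kgdb_io_ops);
        }
        module_init(my_kgdb_io_init);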
1706
1707/**
 1708 * kgdb_unregister_io_module - unregister KGDB IO module
1709 * @old_kgdb_io_ops: the io ops vector
1710 *
1711 * Unregister it with the KGDB core.
1712 */
1713void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1714{
1715 BUG_ON(kgdb_connected);
1716
1717 /*
1718 * KGDB is no longer able to communicate out, so
1719 * unregister our callbacks and reset state.
1720 */
1721 kgdb_unregister_callbacks();
1722
1723 spin_lock(&kgdb_registration_lock);
1724
1725 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1726 kgdb_io_ops = NULL;
1727
1728 spin_unlock(&kgdb_registration_lock);
1729
1730 printk(KERN_INFO
1731 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1732 old_kgdb_io_ops->name);
1733}
1734EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1735
1736/**
1737 * kgdb_breakpoint - generate breakpoint exception
1738 *
1739 * This function will generate a breakpoint exception. It is used at the
1740 * beginning of a program to sync up with a debugger and can be used
1741 * otherwise as a quick means to stop program execution and "break" into
1742 * the debugger.
1743 */
1744void kgdb_breakpoint(void)
1745{
1746 atomic_inc(&kgdb_setting_breakpoint);
1747 wmb(); /* Sync point before breakpoint */
1748 arch_kgdb_breakpoint();
1749 wmb(); /* Sync point after breakpoint */
1750 atomic_dec(&kgdb_setting_breakpoint);
1751}
1752EXPORT_SYMBOL_GPL(kgdb_breakpoint);
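A caller uses it exactly as the comment above describes, for example to stop at a suspicious state in driver code; this sketch is illustrative and struct my_device is hypothetical.

        static void my_driver_check(struct my_device *dev)
        {
                if (dev->wedged)
                        kgdb_breakpoint();      /* break into the attached debugger here */
        }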
1753
1754static int __init opt_kgdb_wait(char *str)
1755{
1756 kgdb_break_asap = 1;
1757
1758 if (kgdb_io_module_registered)
1759 kgdb_initial_breakpoint();
1760
1761 return 0;
1762}
1763
1764early_param("kgdbwait", opt_kgdb_wait);
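In practice this parameter is paired with an I/O driver on the kernel command line, so boot stops in kgdb_initial_breakpoint() until gdb attaches; the device and baud rate below are placeholders:

        kgdboc=ttyS0,115200 kgdbwait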
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
116 116
117 trace_module_request(module_name, wait, _RET_IP_); 117 trace_module_request(module_name, wait, _RET_IP_);
118 118
119 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
121 NULL, NULL, NULL);
122
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return ret; 124 return ret;
123} 125}
124EXPORT_SYMBOL(__request_module); 126EXPORT_SYMBOL(__request_module);
125#endif /* CONFIG_MODULES */ 127#endif /* CONFIG_MODULES */
126 128
127struct subprocess_info {
128 struct work_struct work;
129 struct completion *complete;
130 struct cred *cred;
131 char *path;
132 char **argv;
133 char **envp;
134 enum umh_wait wait;
135 int retval;
136 struct file *stdin;
137 void (*cleanup)(char **argv, char **envp);
138};
139
140/* 129/*
141 * This is the task which runs the usermode application 130 * This is the task which runs the usermode application
142 */ 131 */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
145 struct subprocess_info *sub_info = data; 134 struct subprocess_info *sub_info = data;
146 int retval; 135 int retval;
147 136
148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
149
150 /* Unblock all signals */
151 spin_lock_irq(&current->sighand->siglock); 137 spin_lock_irq(&current->sighand->siglock);
152 flush_signal_handlers(current, 1); 138 flush_signal_handlers(current, 1);
153 sigemptyset(&current->blocked);
154 recalc_sigpending();
155 spin_unlock_irq(&current->sighand->siglock); 139 spin_unlock_irq(&current->sighand->siglock);
156 140
157 /* Install the credentials */
158 commit_creds(sub_info->cred);
159 sub_info->cred = NULL;
160
161 /* Install input pipe when needed */
162 if (sub_info->stdin) {
163 struct files_struct *f = current->files;
164 struct fdtable *fdt;
165 /* no races because files should be private here */
166 sys_close(0);
167 fd_install(0, sub_info->stdin);
168 spin_lock(&f->file_lock);
169 fdt = files_fdtable(f);
170 FD_SET(0, fdt->open_fds);
171 FD_CLR(0, fdt->close_on_exec);
172 spin_unlock(&f->file_lock);
173
174 /* and disallow core files too */
175 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
176 }
177
178 /* We can run anywhere, unlike our parent keventd(). */ 141 /* We can run anywhere, unlike our parent keventd(). */
179 set_cpus_allowed_ptr(current, cpu_all_mask); 142 set_cpus_allowed_ptr(current, cpu_all_mask);
180 143
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
184 */ 147 */
185 set_user_nice(current, 0); 148 set_user_nice(current, 0);
186 149
150 if (sub_info->init) {
151 retval = sub_info->init(sub_info);
152 if (retval)
153 goto fail;
154 }
155
187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
188 157
189 /* Exec failed? */ 158 /* Exec failed? */
159fail:
190 sub_info->retval = retval; 160 sub_info->retval = retval;
191 do_exit(0); 161 do_exit(0);
192} 162}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 164void call_usermodehelper_freeinfo(struct subprocess_info *info)
195{ 165{
196 if (info->cleanup) 166 if (info->cleanup)
197 (*info->cleanup)(info->argv, info->envp); 167 (*info->cleanup)(info);
198 if (info->cred)
199 put_cred(info->cred);
200 kfree(info); 168 kfree(info);
201} 169}
202EXPORT_SYMBOL(call_usermodehelper_freeinfo); 170EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
207 struct subprocess_info *sub_info = data; 175 struct subprocess_info *sub_info = data;
208 pid_t pid; 176 pid_t pid;
209 177
210 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 178 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
211 * populate the status, but will return -ECHILD. */ 179 spin_lock_irq(&current->sighand->siglock);
212 allow_signal(SIGCHLD); 180 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
181 spin_unlock_irq(&current->sighand->siglock);
213 182
214 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 183 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
215 if (pid < 0) { 184 if (pid < 0) {
216 sub_info->retval = pid; 185 sub_info->retval = pid;
217 } else { 186 } else {
218 int ret; 187 int ret = -ECHILD;
219
220 /* 188 /*
221 * Normally it is bogus to call wait4() from in-kernel because 189 * Normally it is bogus to call wait4() from in-kernel because
222 * wait4() wants to write the exit code to a userspace address. 190 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
237 sub_info->retval = ret; 205 sub_info->retval = ret;
238 } 206 }
239 207
240 if (sub_info->wait == UMH_NO_WAIT) 208 complete(sub_info->complete);
241 call_usermodehelper_freeinfo(sub_info);
242 else
243 complete(sub_info->complete);
244 return 0; 209 return 0;
245} 210}
246 211
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
249{ 214{
250 struct subprocess_info *sub_info = 215 struct subprocess_info *sub_info =
251 container_of(work, struct subprocess_info, work); 216 container_of(work, struct subprocess_info, work);
252 pid_t pid;
253 enum umh_wait wait = sub_info->wait; 217 enum umh_wait wait = sub_info->wait;
254 218 pid_t pid;
255 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
256 219
257 /* CLONE_VFORK: wait until the usermode helper has execve'd 220 /* CLONE_VFORK: wait until the usermode helper has execve'd
258 * successfully We need the data structures to stay around 221 * successfully We need the data structures to stay around
259 * until that is done. */ 222 * until that is done. */
260 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) 223 if (wait == UMH_WAIT_PROC)
261 pid = kernel_thread(wait_for_helper, sub_info, 224 pid = kernel_thread(wait_for_helper, sub_info,
262 CLONE_FS | CLONE_FILES | SIGCHLD); 225 CLONE_FS | CLONE_FILES | SIGCHLD);
263 else 226 else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
266 229
267 switch (wait) { 230 switch (wait) {
268 case UMH_NO_WAIT: 231 case UMH_NO_WAIT:
232 call_usermodehelper_freeinfo(sub_info);
269 break; 233 break;
270 234
271 case UMH_WAIT_PROC: 235 case UMH_WAIT_PROC:
272 if (pid > 0) 236 if (pid > 0)
273 break; 237 break;
274 sub_info->retval = pid;
275 /* FALLTHROUGH */ 238 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC: 239 case UMH_WAIT_EXEC:
240 if (pid < 0)
241 sub_info->retval = pid;
278 complete(sub_info->complete); 242 complete(sub_info->complete);
279 } 243 }
280} 244}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
376 sub_info->path = path; 340 sub_info->path = path;
377 sub_info->argv = argv; 341 sub_info->argv = argv;
378 sub_info->envp = envp; 342 sub_info->envp = envp;
379 sub_info->cred = prepare_usermodehelper_creds();
380 if (!sub_info->cred) {
381 kfree(sub_info);
382 return NULL;
383 }
384
385 out: 343 out:
386 return sub_info; 344 return sub_info;
387} 345}
388EXPORT_SYMBOL(call_usermodehelper_setup); 346EXPORT_SYMBOL(call_usermodehelper_setup);
389 347
390/** 348/**
391 * call_usermodehelper_setkeys - set the session keys for usermode helper 349 * call_usermodehelper_setfns - set a cleanup/init function
392 * @info: a subprocess_info returned by call_usermodehelper_setup
393 * @session_keyring: the session keyring for the process
394 */
395void call_usermodehelper_setkeys(struct subprocess_info *info,
396 struct key *session_keyring)
397{
398#ifdef CONFIG_KEYS
399 struct thread_group_cred *tgcred = info->cred->tgcred;
400 key_put(tgcred->session_keyring);
401 tgcred->session_keyring = key_get(session_keyring);
402#else
403 BUG();
404#endif
405}
406EXPORT_SYMBOL(call_usermodehelper_setkeys);
407
408/**
409 * call_usermodehelper_setcleanup - set a cleanup function
410 * @info: a subprocess_info returned by call_usermodehelper_setup 350 * @info: a subprocess_info returned by call_usermodehelper_setup
411 * @cleanup: a cleanup function 351 * @cleanup: a cleanup function
352 * @init: an init function
353 * @data: arbitrary context sensitive data
412 * 354 *
413 * The cleanup function is just befor ethe subprocess_info is about to 355 * The init function is used to customize the helper process prior to
356 * exec. A non-zero return code causes the process to error out, exit,
357 * and return the failure to the calling process
358 *
 359 * The cleanup function is just before the subprocess_info is about to
414 * be freed. This can be used for freeing the argv and envp. The 360 * be freed. This can be used for freeing the argv and envp. The
 415 * function must be runnable in either a process context or the 361 * function must be runnable in either a process context or the
416 * context in which call_usermodehelper_exec is called. 362 * context in which call_usermodehelper_exec is called.
417 */ 363 */
418void call_usermodehelper_setcleanup(struct subprocess_info *info, 364void call_usermodehelper_setfns(struct subprocess_info *info,
419 void (*cleanup)(char **argv, char **envp)) 365 int (*init)(struct subprocess_info *info),
366 void (*cleanup)(struct subprocess_info *info),
367 void *data)
420{ 368{
421 info->cleanup = cleanup; 369 info->cleanup = cleanup;
370 info->init = init;
371 info->data = data;
422} 372}
423EXPORT_SYMBOL(call_usermodehelper_setcleanup); 373EXPORT_SYMBOL(call_usermodehelper_setfns);
424
425/**
426 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
427 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
428 * @filp: set to the write-end of a pipe
429 *
430 * This constructs a pipe, and sets the read end to be the stdin of the
431 * subprocess, and returns the write-end in *@filp.
432 */
433int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
434 struct file **filp)
435{
436 struct file *f;
437
438 f = create_write_pipe(0);
439 if (IS_ERR(f))
440 return PTR_ERR(f);
441 *filp = f;
442
443 f = create_read_pipe(f, 0);
444 if (IS_ERR(f)) {
445 free_write_pipe(*filp);
446 return PTR_ERR(f);
447 }
448 sub_info->stdin = f;
449
450 return 0;
451}
452EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
453 374
454/** 375/**
455 * call_usermodehelper_exec - start a usermode application 376 * call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
469 DECLARE_COMPLETION_ONSTACK(done); 390 DECLARE_COMPLETION_ONSTACK(done);
470 int retval = 0; 391 int retval = 0;
471 392
472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
474
475 helper_lock(); 393 helper_lock();
476 if (sub_info->path[0] == '\0') 394 if (sub_info->path[0] == '\0')
477 goto out; 395 goto out;
@@ -498,41 +416,6 @@ unlock:
498} 416}
499EXPORT_SYMBOL(call_usermodehelper_exec); 417EXPORT_SYMBOL(call_usermodehelper_exec);
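Taken together, a caller of the reworked API threads its context through the new init/cleanup hooks instead of the removed keyring/stdin helpers. The sketch below is illustrative only; my_umh_init()/my_umh_cleanup() and the assumption that ->data was kmalloc'ed by the caller are not from this patch.

        static int my_umh_init(struct subprocess_info *info)
        {
                /* Runs in the helper child just before exec; nonzero aborts the helper. */
                return 0;
        }

        static void my_umh_cleanup(struct subprocess_info *info)
        {
                kfree(info->data);              /* assumes ->data was kmalloc'ed by the caller */
        }

        static int run_my_helper(char *path, char **argv, char **envp, void *ctx)
        {
                struct subprocess_info *info;

                info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
                if (!info)
                        return -ENOMEM;
                call_usermodehelper_setfns(info, my_umh_init, my_umh_cleanup, ctx);
                return call_usermodehelper_exec(info, UMH_WAIT_PROC);   /* frees info on all paths */
        }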
500 418
501/**
502 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
503 * @path: path to usermode executable
504 * @argv: arg vector for process
505 * @envp: environment for process
506 * @filp: set to the write-end of a pipe
507 *
508 * This is a simple wrapper which executes a usermode-helper function
509 * with a pipe as stdin. It is implemented entirely in terms of
510 * lower-level call_usermodehelper_* functions.
511 */
512int call_usermodehelper_pipe(char *path, char **argv, char **envp,
513 struct file **filp)
514{
515 struct subprocess_info *sub_info;
516 int ret;
517
518 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
519 if (sub_info == NULL)
520 return -ENOMEM;
521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) {
524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
527
528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
531
532 return ret;
533}
534EXPORT_SYMBOL(call_usermodehelper_pipe);
535
536void __init usermodehelper_init(void) 419void __init usermodehelper_init(void)
537{ 420{
538 khelper_wq = create_singlethread_workqueue("khelper"); 421 khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e9..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1588 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1589} 1589}
1590 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone, we couldn't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
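With the pair now built unconditionally (it previously lived under the debugfs-only section further down), a module can park and re-arm a probe without unregistering it; the probed symbol and the no-op handler in this sketch are placeholders.

        #include <linux/kprobes.h>

        static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
        {
                return 0;                       /* observe only, continue execution */
        }

        static struct kprobe my_kp = {
                .symbol_name    = "do_fork",    /* example target */
                .pre_handler    = my_pre_handler,
        };

        static int __init my_probe_init(void)
        {
                int ret = register_kprobe(&my_kp);

                if (ret)
                        return ret;
                disable_kprobe(&my_kp);         /* breakpoint disarmed, probe stays registered */
                enable_kprobe(&my_kp);          /* re-armed on demand */
                return 0;
        }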
1656
1591void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1592{ 1658{
1593 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1805 .release = seq_release, 1871 .release = seq_release,
1806}; 1872};
1807 1873
1808/* Disable one kprobe */
1809int __kprobes disable_kprobe(struct kprobe *kp)
1810{
1811 int ret = 0;
1812 struct kprobe *p;
1813
1814 mutex_lock(&kprobe_mutex);
1815
1816 /* Check whether specified probe is valid. */
1817 p = __get_valid_kprobe(kp);
1818 if (unlikely(p == NULL)) {
1819 ret = -EINVAL;
1820 goto out;
1821 }
1822
1823 /* If the probe is already disabled (or gone), just return */
1824 if (kprobe_disabled(kp))
1825 goto out;
1826
1827 kp->flags |= KPROBE_FLAG_DISABLED;
1828 if (p != kp)
1829 /* When kp != p, p is always enabled. */
1830 try_to_disable_aggr_kprobe(p);
1831
1832 if (!kprobes_all_disarmed && kprobe_disabled(p))
1833 disarm_kprobe(p);
1834out:
1835 mutex_unlock(&kprobe_mutex);
1836 return ret;
1837}
1838EXPORT_SYMBOL_GPL(disable_kprobe);
1839
1840/* Enable one kprobe */
1841int __kprobes enable_kprobe(struct kprobe *kp)
1842{
1843 int ret = 0;
1844 struct kprobe *p;
1845
1846 mutex_lock(&kprobe_mutex);
1847
1848 /* Check whether specified probe is valid. */
1849 p = __get_valid_kprobe(kp);
1850 if (unlikely(p == NULL)) {
1851 ret = -EINVAL;
1852 goto out;
1853 }
1854
1855 if (kprobe_gone(kp)) {
1856 /* This kprobe has gone, we couldn't enable it. */
1857 ret = -EINVAL;
1858 goto out;
1859 }
1860
1861 if (p != kp)
1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1868out:
1869 mutex_unlock(&kprobe_mutex);
1870 return ret;
1871}
1872EXPORT_SYMBOL_GPL(enable_kprobe);
1873
1874static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1875{ 1875{
1876 struct hlist_head *head; 1876 struct hlist_head *head;
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 2594e1ce41cb..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
431/* 431/*
432 * Various lockdep statistics: 432 * Various lockdep statistics:
433 */ 433 */
434atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
435atomic_t chain_lookup_misses;
436atomic_t hardirqs_on_events;
437atomic_t hardirqs_off_events;
438atomic_t redundant_hardirqs_on;
439atomic_t redundant_hardirqs_off;
440atomic_t softirqs_on_events;
441atomic_t softirqs_off_events;
442atomic_t redundant_softirqs_on;
443atomic_t redundant_softirqs_off;
444atomic_t nr_unused_locks;
445atomic_t nr_cyclic_checks;
446atomic_t nr_find_usage_forwards_checks;
447atomic_t nr_find_usage_backwards_checks;
448#endif 435#endif
449 436
450/* 437/*
@@ -748,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
748 return NULL; 735 return NULL;
749 } 736 }
750 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
751 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
752 class->key = key; 739 class->key = key;
753 class->name = lock->name; 740 class->name = lock->name;
754 class->subclass = subclass; 741 class->subclass = subclass;
@@ -818,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
818 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
819 */ 806 */
820static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
821 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
822{ 810{
823 struct lock_list *entry; 811 struct lock_list *entry;
824 /* 812 /*
@@ -829,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
829 if (!entry) 817 if (!entry)
830 return 0; 818 return 0;
831 819
832 if (!save_trace(&entry->trace))
833 return 0;
834
835 entry->class = this; 820 entry->class = this;
836 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
837 /* 823 /*
838 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
839 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1205,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1205{ 1191{
1206 int result; 1192 int result;
1207 1193
1208 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1209 1195
1210 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1211 1197
@@ -1242,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1242{ 1228{
1243 int result; 1229 int result;
1244 1230
1245 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1246 1232
1247 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1248 1234
@@ -1265,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1265{ 1251{
1266 int result; 1252 int result;
1267 1253
1268 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1269 1255
1270 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1271 1257
@@ -1635,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1635 */ 1621 */
1636static int 1622static int
1637check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1638 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1639{ 1625{
1640 struct lock_list *entry; 1626 struct lock_list *entry;
1641 int ret; 1627 int ret;
1642 struct lock_list this; 1628 struct lock_list this;
1643 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1644 1638
1645 /* 1639 /*
1646 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1688,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1688 } 1682 }
1689 } 1683 }
1690 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1691 /* 1688 /*
1692 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1693 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1694 */ 1691 */
1695 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1696 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1697 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1698 1695
1699 if (!ret) 1696 if (!ret)
1700 return 0; 1697 return 0;
1701 1698
1702 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1703 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1704 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1705 if (!ret) 1702 if (!ret)
1706 return 0; 1703 return 0;
1707 1704
@@ -1731,6 +1728,7 @@ static int
1731check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1732{ 1729{
1733 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1734 struct held_lock *hlock; 1732 struct held_lock *hlock;
1735 1733
1736 /* 1734 /*
@@ -1756,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1756 * added: 1754 * added:
1757 */ 1755 */
1758 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1759 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1760 return 0; 1759 return 0;
1761 /* 1760 /*
1762 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1779,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1779 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1780 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1781 break; 1780 break;
1781 trylock_loop = 1;
1782 } 1782 }
1783 return 1; 1783 return 1;
1784out_bug: 1784out_bug:
@@ -1825,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1825 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1826 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1827cache_hit: 1827cache_hit:
1828 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1829 if (very_verbose(class)) 1829 if (very_verbose(class))
1830 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1831 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1890,7 +1890,7 @@ cache_hit:
1890 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1891 } 1891 }
1892 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1893 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1894 inc_chains(); 1894 inc_chains();
1895 1895
1896 return 1; 1896 return 1;
@@ -2311,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2311 return; 2311 return;
2312 2312
2313 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2314 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
 2315 * Neither irqs nor preemption is disabled here,
 2316 * so this is racy by nature, but losing one hit
 2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2315 return; 2320 return;
2316 } 2321 }
2317 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2338,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2338 2343
2339 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2340 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2341 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2342} 2347}
2343EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2344 2349
@@ -2370,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2370 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2371 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2372 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2373 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2374 } else 2379 } else
2375 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2376} 2381}
2377EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2378 2383
@@ -2396,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2396 return; 2401 return;
2397 2402
2398 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2399 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2400 return; 2405 return;
2401 } 2406 }
2402 2407
@@ -2406,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2406 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2407 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2408 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2409 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2410 /* 2415 /*
2411 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2412 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2436,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2436 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2437 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2438 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2439 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2440 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2441 } else 2446 } else
2442 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2443} 2448}
2444 2449
2445static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2644,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2644 return 0; 2649 return 0;
2645 break; 2650 break;
2646 case LOCK_USED: 2651 case LOCK_USED:
2647 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2648 break; 2653 break;
2649 default: 2654 default:
2650 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2706,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2706} 2711}
2707EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2708 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
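The new key marks locks whose deep, driver-controlled nesting lockdep should not try to fully validate (it only downgrades them to the simple checks in __lock_acquire() below). A hypothetical user would attach it with the long-standing class helpers; dev->mutex here is only an example, not code from this patch.

        mutex_init(&dev->mutex);
        lockdep_set_class_and_name(&dev->mutex, &__lockdep_no_validate__,
                                   "&dev->mutex");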
2709/* 2716/*
2710 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2711 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2740,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2740 return 0; 2747 return 0;
2741 } 2748 }
2742 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2743 if (!subclass) 2753 if (!subclass)
2744 class = lock->class_cache; 2754 class = lock->class_cache;
2745 /* 2755 /*
@@ -2750,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 if (!class) 2760 if (!class)
2751 return 0; 2761 return 0;
2752 } 2762 }
2753 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2754 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2755 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2756 if (class->name_version > 1) 2766 if (class->name_version > 1)
@@ -3227,7 +3237,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3227 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3228 check_flags(flags); 3238 check_flags(flags);
3229 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3230 trace_lock_release(lock, nested, ip); 3240 trace_lock_release(lock, ip);
3231 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3232 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3233 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3380,7 +3390,7 @@ found_it:
3380 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3381 } 3391 }
3382 3392
3383 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3384 3394
3385 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3386 if (waittime) { 3396 if (waittime) {
@@ -3801,8 +3811,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3801{ 3811{
3802 struct task_struct *curr = current; 3812 struct task_struct *curr = current;
3803 3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3804 if (!debug_locks_off()) 3815 if (!debug_locks_off())
3805 return; 3816 return;
 3817#endif /* #ifndef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3806 printk("\n===================================================\n"); 3819 printk("\n===================================================\n");
3807 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3808 printk( "---------------------------------------------------\n"); 3821 printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
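To see how a counter flows through this scheme, a hypothetical field (nr_example_events, not part of the patch) would be bumped locally on the fast path and folded across CPUs only when reported; m below is assumed to be the seq_file of a /proc handler.

        /* 1) add "int nr_example_events;" to struct lockdep_stats above */

        /* 2) fast path -- lockdep already runs with IRQs disabled here: */
        debug_atomic_inc(nr_example_events);

        /* 3) reporting path, e.g. in lockdep_proc.c: */
        seq_printf(m, " example events:               %11llu\n",
                   debug_atomic_read(nr_example_events));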
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index 1016b75b026a..333fbcc96978 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -79,6 +77,10 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
79DEFINE_MUTEX(module_mutex); 77DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 78EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 79static LIST_HEAD(modules);
80#ifdef CONFIG_KGDB_KDB
81struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
82#endif /* CONFIG_KGDB_KDB */
83
82 84
83/* Block module loading/unloading? */ 85/* Block module loading/unloading? */
84int modules_disabled = 0; 86int modules_disabled = 0;
@@ -178,8 +180,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
178extern const struct kernel_symbol __stop___ksymtab_gpl[]; 180extern const struct kernel_symbol __stop___ksymtab_gpl[];
179extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
180extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const unsigned long __start___kcrctab[]; 183extern const unsigned long __start___kcrctab[];
184extern const unsigned long __start___kcrctab_gpl[]; 184extern const unsigned long __start___kcrctab_gpl[];
185extern const unsigned long __start___kcrctab_gpl_future[]; 185extern const unsigned long __start___kcrctab_gpl_future[];
@@ -515,6 +515,9 @@ MODINFO_ATTR(srcversion);
515static char last_unloaded_module[MODULE_NAME_LEN+1]; 515static char last_unloaded_module[MODULE_NAME_LEN+1];
516 516
517#ifdef CONFIG_MODULE_UNLOAD 517#ifdef CONFIG_MODULE_UNLOAD
518
519EXPORT_TRACEPOINT_SYMBOL(module_get);
520
518/* Init the unload section of the module. */ 521/* Init the unload section of the module. */
519static void module_unload_init(struct module *mod) 522static void module_unload_init(struct module *mod)
520{ 523{
@@ -723,16 +726,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
723 return -EFAULT; 726 return -EFAULT;
724 name[MODULE_NAME_LEN-1] = '\0'; 727 name[MODULE_NAME_LEN-1] = '\0';
725 728
726 /* Create stop_machine threads since free_module relies on 729 if (mutex_lock_interruptible(&module_mutex) != 0)
727 * a non-failing stop_machine call. */ 730 return -EINTR;
728 ret = stop_machine_create();
729 if (ret)
730 return ret;
731
732 if (mutex_lock_interruptible(&module_mutex) != 0) {
733 ret = -EINTR;
734 goto out_stop;
735 }
736 731
737 mod = find_module(name); 732 mod = find_module(name);
738 if (!mod) { 733 if (!mod) {
@@ -792,8 +787,6 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
792 787
793 out: 788 out:
794 mutex_unlock(&module_mutex); 789 mutex_unlock(&module_mutex);
795out_stop:
796 stop_machine_destroy();
797 return ret; 790 return ret;
798} 791}
799 792
@@ -867,8 +860,7 @@ void module_put(struct module *module)
867 smp_wmb(); /* see comment in module_refcount */ 860 smp_wmb(); /* see comment in module_refcount */
868 __this_cpu_inc(module->refptr->decs); 861 __this_cpu_inc(module->refptr->decs);
869 862
870 trace_module_put(module, _RET_IP_, 863 trace_module_put(module, _RET_IP_);
871 __this_cpu_read(module->refptr->decs));
872 /* Maybe they're waiting for us to drop reference? */ 864 /* Maybe they're waiting for us to drop reference? */
873 if (unlikely(!module_is_live(module))) 865 if (unlikely(!module_is_live(module)))
874 wake_up_process(module->waiter); 866 wake_up_process(module->waiter);
@@ -1192,7 +1184,7 @@ struct module_notes_attrs {
1192 struct bin_attribute attrs[0]; 1184 struct bin_attribute attrs[0];
1193}; 1185};
1194 1186
1195static ssize_t module_notes_read(struct kobject *kobj, 1187static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1196 struct bin_attribute *bin_attr, 1188 struct bin_attribute *bin_attr,
1197 char *buf, loff_t pos, size_t count) 1189 char *buf, loff_t pos, size_t count)
1198{ 1190{
diff --git a/kernel/padata.c b/kernel/padata.c
index fd03513c7327..fdd8ae609ce3 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,7 @@
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30 30
31#define MAX_SEQ_NR INT_MAX - NR_CPUS 31#define MAX_SEQ_NR INT_MAX - NR_CPUS
32#define MAX_OBJ_NUM 10000 * NR_CPUS 32#define MAX_OBJ_NUM 1000
33 33
34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 34static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
35{ 35{
@@ -88,7 +88,7 @@ static void padata_parallel_worker(struct work_struct *work)
88 local_bh_enable(); 88 local_bh_enable();
89} 89}
90 90
91/* 91/**
92 * padata_do_parallel - padata parallelization function 92 * padata_do_parallel - padata parallelization function
93 * 93 *
94 * @pinst: padata instance 94 * @pinst: padata instance
@@ -152,6 +152,23 @@ out:
152} 152}
153EXPORT_SYMBOL(padata_do_parallel); 153EXPORT_SYMBOL(padata_do_parallel);
154 154
155/*
156 * padata_get_next - Get the next object that needs serialization.
157 *
158 * Return values are:
159 *
160 * A pointer to the control struct of the next object that needs
161 * serialization, if present in one of the percpu reorder queues.
162 *
163 * NULL, if all percpu reorder queues are empty.
164 *
165 * -EINPROGRESS, if the next object that needs serialization will
166 * be parallel processed by another cpu and is not yet present in
167 * the cpu's reorder queue.
168 *
169 * -ENODATA, if this cpu has to do the parallel processing for
170 * the next object.
171 */
155static struct padata_priv *padata_get_next(struct parallel_data *pd) 172static struct padata_priv *padata_get_next(struct parallel_data *pd)
156{ 173{
157 int cpu, num_cpus, empty, calc_seq_nr; 174 int cpu, num_cpus, empty, calc_seq_nr;
@@ -173,7 +190,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
173 190
174 /* 191 /*
175 * Calculate the seq_nr of the object that should be 192 * Calculate the seq_nr of the object that should be
176 * next in this queue. 193 * next in this reorder queue.
177 */ 194 */
178 overrun = 0; 195 overrun = 0;
179 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) 196 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
@@ -231,7 +248,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
231 goto out; 248 goto out;
232 } 249 }
233 250
234 if (next_nr % num_cpus == next_queue->cpu_index) { 251 queue = per_cpu_ptr(pd->queue, smp_processor_id());
252 if (queue->cpu_index == next_queue->cpu_index) {
235 padata = ERR_PTR(-ENODATA); 253 padata = ERR_PTR(-ENODATA);
236 goto out; 254 goto out;
237 } 255 }
@@ -247,19 +265,40 @@ static void padata_reorder(struct parallel_data *pd)
247 struct padata_queue *queue; 265 struct padata_queue *queue;
248 struct padata_instance *pinst = pd->pinst; 266 struct padata_instance *pinst = pd->pinst;
249 267
250try_again: 268 /*
269 * We need to ensure that only one cpu can work on dequeueing of
 270 * the reorder queue at a time. Calculating in which percpu reorder
271 * queue the next object will arrive takes some time. A spinlock
272 * would be highly contended. Also it is not clear in which order
 273 * the objects arrive at the reorder queues. So a cpu could wait to
274 * get the lock just to notice that there is nothing to do at the
275 * moment. Therefore we use a trylock and let the holder of the lock
276 * care for all the objects enqueued during the holdtime of the lock.
277 */
251 if (!spin_trylock_bh(&pd->lock)) 278 if (!spin_trylock_bh(&pd->lock))
252 goto out; 279 return;
253 280
254 while (1) { 281 while (1) {
255 padata = padata_get_next(pd); 282 padata = padata_get_next(pd);
256 283
284 /*
285 * All reorder queues are empty, or the next object that needs
286 * serialization is parallel processed by another cpu and is
287 * still on its way to the cpu's reorder queue; nothing to
288 * do for now.
289 */
257 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 290 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
258 break; 291 break;
259 292
293 /*
294 * This cpu has to do the parallel processing of the next
295 * object. It's waiting in the cpu's parallelization queue,
296 * so exit immediately.
297 */
260 if (PTR_ERR(padata) == -ENODATA) { 298 if (PTR_ERR(padata) == -ENODATA) {
299 del_timer(&pd->timer);
261 spin_unlock_bh(&pd->lock); 300 spin_unlock_bh(&pd->lock);
262 goto out; 301 return;
263 } 302 }
264 303
265 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 304 queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
@@ -273,13 +312,27 @@ try_again:
273 312
274 spin_unlock_bh(&pd->lock); 313 spin_unlock_bh(&pd->lock);
275 314
276 if (atomic_read(&pd->reorder_objects)) 315 /*
277 goto try_again; 316 * The next object that needs serialization might have arrived at
317 * the reorder queues in the meantime; we will be called again
318 * from the timer function if no one else cares for it.
319 */
320 if (atomic_read(&pd->reorder_objects)
321 && !(pinst->flags & PADATA_RESET))
322 mod_timer(&pd->timer, jiffies + HZ);
323 else
324 del_timer(&pd->timer);
278 325
279out:
280 return; 326 return;
281} 327}
282 328
329static void padata_reorder_timer(unsigned long arg)
330{
331 struct parallel_data *pd = (struct parallel_data *)arg;
332
333 padata_reorder(pd);
334}
335
283static void padata_serial_worker(struct work_struct *work) 336static void padata_serial_worker(struct work_struct *work)
284{ 337{
285 struct padata_queue *queue; 338 struct padata_queue *queue;
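The comment block above describes the reorder discipline these hunks introduce: a trylock so that only one cpu drains the reorder queue at a time, plus a timer as a safety net for objects that arrive after the lock holder has given up. Below is a minimal user-space sketch of that pattern, assuming nothing beyond pthreads and C11 atomics; the mutex, the pending counter and the retry flag are illustrative stand-ins for pd->lock, pd->reorder_objects and pd->timer, not the padata API.

/* Build with: cc -std=c11 -pthread sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t reorder_lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int pending;      /* objects waiting for serialization */
static atomic_int need_retry;   /* stands in for mod_timer() */

static void serialize_one(void)
{
        printf("serialized one object\n");
}

static void reorder(void)
{
        /* Whoever wins the trylock drains on behalf of everybody else. */
        if (pthread_mutex_trylock(&reorder_lock) != 0)
                return;

        while (atomic_load(&pending) > 0) {
                atomic_fetch_sub(&pending, 1);
                serialize_one();
        }

        pthread_mutex_unlock(&reorder_lock);

        /* Objects may have arrived after the drain loop stopped looking;
         * flag a deferred retry the way padata_reorder() re-arms pd->timer. */
        if (atomic_load(&pending) > 0)
                atomic_store(&need_retry, 1);
}

int main(void)
{
        atomic_store(&pending, 3);
        reorder();
        if (atomic_load(&need_retry))
                reorder();      /* the "timer" firing */
        return 0;
}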
@@ -308,7 +361,7 @@ static void padata_serial_worker(struct work_struct *work)
308 local_bh_enable(); 361 local_bh_enable();
309} 362}
310 363
311/* 364/**
312 * padata_do_serial - padata serialization function 365 * padata_do_serial - padata serialization function
313 * 366 *
314 * @padata: object to be serialized. 367 * @padata: object to be serialized.
@@ -338,6 +391,7 @@ void padata_do_serial(struct padata_priv *padata)
338} 391}
339EXPORT_SYMBOL(padata_do_serial); 392EXPORT_SYMBOL(padata_do_serial);
340 393
394/* Allocate and initialize the internal cpumask-dependent resources. */
341static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 395static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
342 const struct cpumask *cpumask) 396 const struct cpumask *cpumask)
343{ 397{
@@ -358,17 +412,15 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
358 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 412 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
359 goto err_free_queue; 413 goto err_free_queue;
360 414
361 for_each_possible_cpu(cpu) { 415 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
416
417 for_each_cpu(cpu, pd->cpumask) {
362 queue = per_cpu_ptr(pd->queue, cpu); 418 queue = per_cpu_ptr(pd->queue, cpu);
363 419
364 queue->pd = pd; 420 queue->pd = pd;
365 421
366 if (cpumask_test_cpu(cpu, cpumask) 422 queue->cpu_index = cpu_index;
367 && cpumask_test_cpu(cpu, cpu_active_mask)) { 423 cpu_index++;
368 queue->cpu_index = cpu_index;
369 cpu_index++;
370 } else
371 queue->cpu_index = -1;
372 424
373 INIT_LIST_HEAD(&queue->reorder.list); 425 INIT_LIST_HEAD(&queue->reorder.list);
374 INIT_LIST_HEAD(&queue->parallel.list); 426 INIT_LIST_HEAD(&queue->parallel.list);
@@ -382,11 +434,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
382 atomic_set(&queue->num_obj, 0); 434 atomic_set(&queue->num_obj, 0);
383 } 435 }
384 436
385 cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
386
387 num_cpus = cpumask_weight(pd->cpumask); 437 num_cpus = cpumask_weight(pd->cpumask);
388 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 438 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
389 439
440 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
390 atomic_set(&pd->seq_nr, -1); 441 atomic_set(&pd->seq_nr, -1);
391 atomic_set(&pd->reorder_objects, 0); 442 atomic_set(&pd->reorder_objects, 0);
392 atomic_set(&pd->refcnt, 0); 443 atomic_set(&pd->refcnt, 0);
@@ -410,6 +461,31 @@ static void padata_free_pd(struct parallel_data *pd)
410 kfree(pd); 461 kfree(pd);
411} 462}
412 463
464/* Flush all objects out of the padata queues. */
465static void padata_flush_queues(struct parallel_data *pd)
466{
467 int cpu;
468 struct padata_queue *queue;
469
470 for_each_cpu(cpu, pd->cpumask) {
471 queue = per_cpu_ptr(pd->queue, cpu);
472 flush_work(&queue->pwork);
473 }
474
475 del_timer_sync(&pd->timer);
476
477 if (atomic_read(&pd->reorder_objects))
478 padata_reorder(pd);
479
480 for_each_cpu(cpu, pd->cpumask) {
481 queue = per_cpu_ptr(pd->queue, cpu);
482 flush_work(&queue->swork);
483 }
484
485 BUG_ON(atomic_read(&pd->refcnt) != 0);
486}
487
488/* Replace the internal control structure with a new one. */
413static void padata_replace(struct padata_instance *pinst, 489static void padata_replace(struct padata_instance *pinst,
414 struct parallel_data *pd_new) 490 struct parallel_data *pd_new)
415{ 491{
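padata_flush_queues() above replaces the old busy-wait on pd->refcnt with an ordered drain: let every parallel worker finish, run the reorder step once nothing new can arrive, then wait for the serializers. A user-space analogue of that wait ordering is sketched below, with threads standing in for the per-cpu work items; the names are illustrative and this is not the kernel code.

/* Build with: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>

static void *parallel_work(void *arg) { printf("parallel %ld done\n", (long)arg); return NULL; }
static void *serial_work(void *arg)   { printf("serial %ld done\n", (long)arg);   return NULL; }

static void drain_reorder(void)       { printf("reorder queue drained\n"); }

int main(void)
{
        enum { NR = 2 };
        pthread_t pwork[NR], swork[NR];
        long i;

        for (i = 0; i < NR; i++) {
                pthread_create(&pwork[i], NULL, parallel_work, (void *)i);
                pthread_create(&swork[i], NULL, serial_work, (void *)i);
        }

        for (i = 0; i < NR; i++)        /* flush_work(&queue->pwork) */
                pthread_join(pwork[i], NULL);

        drain_reorder();                /* nothing new can enter the reorder queues */

        for (i = 0; i < NR; i++)        /* flush_work(&queue->swork) */
                pthread_join(swork[i], NULL);

        return 0;
}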
@@ -421,17 +497,13 @@ static void padata_replace(struct padata_instance *pinst,
421 497
422 synchronize_rcu(); 498 synchronize_rcu();
423 499
424 while (atomic_read(&pd_old->refcnt) != 0) 500 padata_flush_queues(pd_old);
425 yield();
426
427 flush_workqueue(pinst->wq);
428
429 padata_free_pd(pd_old); 501 padata_free_pd(pd_old);
430 502
431 pinst->flags &= ~PADATA_RESET; 503 pinst->flags &= ~PADATA_RESET;
432} 504}
433 505
434/* 506/**
435 * padata_set_cpumask - set the cpumask that padata should use 507 * padata_set_cpumask - set the cpumask that padata should use
436 * 508 *
437 * @pinst: padata instance 509 * @pinst: padata instance
@@ -443,10 +515,10 @@ int padata_set_cpumask(struct padata_instance *pinst,
443 struct parallel_data *pd; 515 struct parallel_data *pd;
444 int err = 0; 516 int err = 0;
445 517
446 might_sleep();
447
448 mutex_lock(&pinst->lock); 518 mutex_lock(&pinst->lock);
449 519
520 get_online_cpus();
521
450 pd = padata_alloc_pd(pinst, cpumask); 522 pd = padata_alloc_pd(pinst, cpumask);
451 if (!pd) { 523 if (!pd) {
452 err = -ENOMEM; 524 err = -ENOMEM;
@@ -458,6 +530,8 @@ int padata_set_cpumask(struct padata_instance *pinst,
458 padata_replace(pinst, pd); 530 padata_replace(pinst, pd);
459 531
460out: 532out:
533 put_online_cpus();
534
461 mutex_unlock(&pinst->lock); 535 mutex_unlock(&pinst->lock);
462 536
463 return err; 537 return err;
@@ -479,7 +553,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
479 return 0; 553 return 0;
480} 554}
481 555
482/* 556/**
483 * padata_add_cpu - add a cpu to the padata cpumask 557 * padata_add_cpu - add a cpu to the padata cpumask
484 * 558 *
485 * @pinst: padata instance 559 * @pinst: padata instance
@@ -489,12 +563,12 @@ int padata_add_cpu(struct padata_instance *pinst, int cpu)
489{ 563{
490 int err; 564 int err;
491 565
492 might_sleep();
493
494 mutex_lock(&pinst->lock); 566 mutex_lock(&pinst->lock);
495 567
568 get_online_cpus();
496 cpumask_set_cpu(cpu, pinst->cpumask); 569 cpumask_set_cpu(cpu, pinst->cpumask);
497 err = __padata_add_cpu(pinst, cpu); 570 err = __padata_add_cpu(pinst, cpu);
571 put_online_cpus();
498 572
499 mutex_unlock(&pinst->lock); 573 mutex_unlock(&pinst->lock);
500 574
@@ -517,7 +591,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
517 return 0; 591 return 0;
518} 592}
519 593
520/* 594/**
521 * padata_remove_cpu - remove a cpu from the padata cpumask 595 * padata_remove_cpu - remove a cpu from the padata cpumask
522 * 596 *
523 * @pinst: padata instance 597 * @pinst: padata instance
@@ -527,12 +601,12 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
527{ 601{
528 int err; 602 int err;
529 603
530 might_sleep();
531
532 mutex_lock(&pinst->lock); 604 mutex_lock(&pinst->lock);
533 605
606 get_online_cpus();
534 cpumask_clear_cpu(cpu, pinst->cpumask); 607 cpumask_clear_cpu(cpu, pinst->cpumask);
535 err = __padata_remove_cpu(pinst, cpu); 608 err = __padata_remove_cpu(pinst, cpu);
609 put_online_cpus();
536 610
537 mutex_unlock(&pinst->lock); 611 mutex_unlock(&pinst->lock);
538 612
@@ -540,38 +614,35 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
540} 614}
541EXPORT_SYMBOL(padata_remove_cpu); 615EXPORT_SYMBOL(padata_remove_cpu);
542 616
543/* 617/**
544 * padata_start - start the parallel processing 618 * padata_start - start the parallel processing
545 * 619 *
546 * @pinst: padata instance to start 620 * @pinst: padata instance to start
547 */ 621 */
548void padata_start(struct padata_instance *pinst) 622void padata_start(struct padata_instance *pinst)
549{ 623{
550 might_sleep();
551
552 mutex_lock(&pinst->lock); 624 mutex_lock(&pinst->lock);
553 pinst->flags |= PADATA_INIT; 625 pinst->flags |= PADATA_INIT;
554 mutex_unlock(&pinst->lock); 626 mutex_unlock(&pinst->lock);
555} 627}
556EXPORT_SYMBOL(padata_start); 628EXPORT_SYMBOL(padata_start);
557 629
558/* 630/**
559 * padata_stop - stop the parallel processing 631 * padata_stop - stop the parallel processing
560 * 632 *
561 * @pinst: padata instance to stop 633 * @pinst: padata instance to stop
562 */ 634 */
563void padata_stop(struct padata_instance *pinst) 635void padata_stop(struct padata_instance *pinst)
564{ 636{
565 might_sleep();
566
567 mutex_lock(&pinst->lock); 637 mutex_lock(&pinst->lock);
568 pinst->flags &= ~PADATA_INIT; 638 pinst->flags &= ~PADATA_INIT;
569 mutex_unlock(&pinst->lock); 639 mutex_unlock(&pinst->lock);
570} 640}
571EXPORT_SYMBOL(padata_stop); 641EXPORT_SYMBOL(padata_stop);
572 642
573static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, 643#ifdef CONFIG_HOTPLUG_CPU
574 unsigned long action, void *hcpu) 644static int padata_cpu_callback(struct notifier_block *nfb,
645 unsigned long action, void *hcpu)
575{ 646{
576 int err; 647 int err;
577 struct padata_instance *pinst; 648 struct padata_instance *pinst;
@@ -588,7 +659,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
588 err = __padata_add_cpu(pinst, cpu); 659 err = __padata_add_cpu(pinst, cpu);
589 mutex_unlock(&pinst->lock); 660 mutex_unlock(&pinst->lock);
590 if (err) 661 if (err)
591 return NOTIFY_BAD; 662 return notifier_from_errno(err);
592 break; 663 break;
593 664
594 case CPU_DOWN_PREPARE: 665 case CPU_DOWN_PREPARE:
@@ -599,7 +670,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
599 err = __padata_remove_cpu(pinst, cpu); 670 err = __padata_remove_cpu(pinst, cpu);
600 mutex_unlock(&pinst->lock); 671 mutex_unlock(&pinst->lock);
601 if (err) 672 if (err)
602 return NOTIFY_BAD; 673 return notifier_from_errno(err);
603 break; 674 break;
604 675
605 case CPU_UP_CANCELED: 676 case CPU_UP_CANCELED:
@@ -621,8 +692,9 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
621 692
622 return NOTIFY_OK; 693 return NOTIFY_OK;
623} 694}
695#endif
624 696
625/* 697/**
626 * padata_alloc - allocate and initialize a padata instance 698 * padata_alloc - allocate and initialize a padata instance
627 * 699 *
628 * @cpumask: cpumask that padata uses for parallelization 700 * @cpumask: cpumask that padata uses for parallelization
@@ -631,7 +703,6 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
631struct padata_instance *padata_alloc(const struct cpumask *cpumask, 703struct padata_instance *padata_alloc(const struct cpumask *cpumask,
632 struct workqueue_struct *wq) 704 struct workqueue_struct *wq)
633{ 705{
634 int err;
635 struct padata_instance *pinst; 706 struct padata_instance *pinst;
636 struct parallel_data *pd; 707 struct parallel_data *pd;
637 708
@@ -639,6 +710,8 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
639 if (!pinst) 710 if (!pinst)
640 goto err; 711 goto err;
641 712
713 get_online_cpus();
714
642 pd = padata_alloc_pd(pinst, cpumask); 715 pd = padata_alloc_pd(pinst, cpumask);
643 if (!pd) 716 if (!pd)
644 goto err_free_inst; 717 goto err_free_inst;
@@ -654,31 +727,32 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
654 727
655 pinst->flags = 0; 728 pinst->flags = 0;
656 729
730#ifdef CONFIG_HOTPLUG_CPU
657 pinst->cpu_notifier.notifier_call = padata_cpu_callback; 731 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
658 pinst->cpu_notifier.priority = 0; 732 pinst->cpu_notifier.priority = 0;
659 err = register_hotcpu_notifier(&pinst->cpu_notifier); 733 register_hotcpu_notifier(&pinst->cpu_notifier);
660 if (err) 734#endif
661 goto err_free_cpumask; 735
736 put_online_cpus();
662 737
663 mutex_init(&pinst->lock); 738 mutex_init(&pinst->lock);
664 739
665 return pinst; 740 return pinst;
666 741
667err_free_cpumask:
668 free_cpumask_var(pinst->cpumask);
669err_free_pd: 742err_free_pd:
670 padata_free_pd(pd); 743 padata_free_pd(pd);
671err_free_inst: 744err_free_inst:
672 kfree(pinst); 745 kfree(pinst);
746 put_online_cpus();
673err: 747err:
674 return NULL; 748 return NULL;
675} 749}
676EXPORT_SYMBOL(padata_alloc); 750EXPORT_SYMBOL(padata_alloc);
677 751
678/* 752/**
679 * padata_free - free a padata instance 753 * padata_free - free a padata instance
680 * 754 *
681 * @ padata_inst: padata instance to free 755 * @padata_inst: padata instance to free
682 */ 756 */
683void padata_free(struct padata_instance *pinst) 757void padata_free(struct padata_instance *pinst)
684{ 758{
@@ -686,10 +760,13 @@ void padata_free(struct padata_instance *pinst)
686 760
687 synchronize_rcu(); 761 synchronize_rcu();
688 762
689 while (atomic_read(&pinst->pd->refcnt) != 0) 763#ifdef CONFIG_HOTPLUG_CPU
690 yield();
691
692 unregister_hotcpu_notifier(&pinst->cpu_notifier); 764 unregister_hotcpu_notifier(&pinst->cpu_notifier);
765#endif
766 get_online_cpus();
767 padata_flush_queues(pinst->pd);
768 put_online_cpus();
769
693 padata_free_pd(pinst->pd); 770 padata_free_pd(pinst->pd);
694 free_cpumask_var(pinst->cpumask); 771 free_cpumask_var(pinst->cpumask);
695 kfree(pinst); 772 kfree(pinst);
diff --git a/kernel/panic.c b/kernel/panic.c
index 13d966b4c14a..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
87 */ 87 */
88 preempt_disable(); 88 preempt_disable();
89 89
90 console_verbose();
90 bust_spinlocks(1); 91 bust_spinlocks(1);
91 va_start(args, fmt); 92 va_start(args, fmt);
92 vsnprintf(buf, sizeof(buf), fmt, args); 93 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -178,6 +179,7 @@ static const struct tnt tnts[] = {
178 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 179 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
179 { TAINT_WARN, 'W', ' ' }, 180 { TAINT_WARN, 'W', ' ' },
180 { TAINT_CRAP, 'C', ' ' }, 181 { TAINT_CRAP, 'C', ' ' },
182 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
181}; 183};
182 184
183/** 185/**
@@ -194,6 +196,7 @@ static const struct tnt tnts[] = {
194 * 'A' - ACPI table overridden. 196 * 'A' - ACPI table overridden.
195 * 'W' - Taint on warning. 197 * 'W' - Taint on warning.
196 * 'C' - modules from drivers/staging are loaded. 198 * 'C' - modules from drivers/staging are loaded.
199 * 'I' - Working around severe firmware bug.
197 * 200 *
198 * The string is overwritten by the next call to print_tainted(). 201 * The string is overwritten by the next call to print_tainted().
199 */ 202 */
@@ -365,7 +368,8 @@ struct slowpath_args {
365 va_list args; 368 va_list args;
366}; 369};
367 370
368static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 371static void warn_slowpath_common(const char *file, int line, void *caller,
372 unsigned taint, struct slowpath_args *args)
369{ 373{
370 const char *board; 374 const char *board;
371 375
@@ -381,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
381 print_modules(); 385 print_modules();
382 dump_stack(); 386 dump_stack();
383 print_oops_end_marker(); 387 print_oops_end_marker();
384 add_taint(TAINT_WARN); 388 add_taint(taint);
385} 389}
386 390
387void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 391void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -390,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
390 394
391 args.fmt = fmt; 395 args.fmt = fmt;
392 va_start(args.args, fmt); 396 va_start(args.args, fmt);
393 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 397 warn_slowpath_common(file, line, __builtin_return_address(0),
398 TAINT_WARN, &args);
394 va_end(args.args); 399 va_end(args.args);
395} 400}
396EXPORT_SYMBOL(warn_slowpath_fmt); 401EXPORT_SYMBOL(warn_slowpath_fmt);
397 402
403void warn_slowpath_fmt_taint(const char *file, int line,
404 unsigned taint, const char *fmt, ...)
405{
406 struct slowpath_args args;
407
408 args.fmt = fmt;
409 va_start(args.args, fmt);
410 warn_slowpath_common(file, line, __builtin_return_address(0),
411 taint, &args);
412 va_end(args.args);
413}
414EXPORT_SYMBOL(warn_slowpath_fmt_taint);
415
398void warn_slowpath_null(const char *file, int line) 416void warn_slowpath_null(const char *file, int line)
399{ 417{
400 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 418 warn_slowpath_common(file, line, __builtin_return_address(0),
419 TAINT_WARN, NULL);
401} 420}
402EXPORT_SYMBOL(warn_slowpath_null); 421EXPORT_SYMBOL(warn_slowpath_null);
403#endif 422#endif
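The panic.c hunks above thread a taint flag through warn_slowpath_common() so a caller can taint the kernel with something more specific than TAINT_WARN. The sketch below shows the shape of that interface in user space; warn_taint() and the flag values are hypothetical stand-ins, not the kernel API.

#include <stdarg.h>
#include <stdio.h>

#define TAINT_WARN                (1u << 9)   /* illustrative bit assignments */
#define TAINT_FIRMWARE_WORKAROUND (1u << 10)

static unsigned int tainted;

static void warn_taint(unsigned int taint, const char *fmt, ...)
{
        va_list ap;

        va_start(ap, fmt);
        vfprintf(stderr, fmt, ap);
        va_end(ap);

        tainted |= taint;       /* the add_taint(taint) step */
}

int main(void)
{
        warn_taint(TAINT_FIRMWARE_WORKAROUND,
                   "working around a broken firmware table\n");
        printf("taint mask: %#x\n", tainted);
        return 0;
}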
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 3d1552d3c12b..bd7ce8ca5bb9 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -16,6 +16,7 @@
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/hash.h>
19#include <linux/sysfs.h> 20#include <linux/sysfs.h>
20#include <linux/dcache.h> 21#include <linux/dcache.h>
21#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -82,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
82void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
83void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
84 85
85int __weak
86hw_perf_group_sched_in(struct perf_event *group_leader,
87 struct perf_cpu_context *cpuctx,
88 struct perf_event_context *ctx)
89{
90 return 0;
91}
92
93void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
94 87
95static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
@@ -262,6 +255,18 @@ static void update_event_times(struct perf_event *event)
262 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
263} 256}
264 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
265static struct list_head * 270static struct list_head *
266ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
267{ 272{
@@ -315,8 +320,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
315static void 320static void
316list_del_event(struct perf_event *event, struct perf_event_context *ctx) 321list_del_event(struct perf_event *event, struct perf_event_context *ctx)
317{ 322{
318 struct perf_event *sibling, *tmp;
319
320 if (list_empty(&event->group_entry)) 323 if (list_empty(&event->group_entry))
321 return; 324 return;
322 ctx->nr_events--; 325 ctx->nr_events--;
@@ -329,7 +332,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
329 if (event->group_leader != event) 332 if (event->group_leader != event)
330 event->group_leader->nr_siblings--; 333 event->group_leader->nr_siblings--;
331 334
332 update_event_times(event); 335 update_group_times(event);
333 336
334 /* 337 /*
335 * If event was in error state, then keep it 338 * If event was in error state, then keep it
@@ -340,6 +343,12 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
340 */ 343 */
341 if (event->state > PERF_EVENT_STATE_OFF) 344 if (event->state > PERF_EVENT_STATE_OFF)
342 event->state = PERF_EVENT_STATE_OFF; 345 event->state = PERF_EVENT_STATE_OFF;
346}
347
348static void
349perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx)
350{
351 struct perf_event *sibling, *tmp;
343 352
344 /* 353 /*
345 * If this was a group event with sibling events then 354 * If this was a group event with sibling events then
@@ -505,18 +514,6 @@ retry:
505} 514}
506 515
507/* 516/*
508 * Update total_time_enabled and total_time_running for all events in a group.
509 */
510static void update_group_times(struct perf_event *leader)
511{
512 struct perf_event *event;
513
514 update_event_times(leader);
515 list_for_each_entry(event, &leader->sibling_list, group_entry)
516 update_event_times(event);
517}
518
519/*
520 * Cross CPU call to disable a performance event 517 * Cross CPU call to disable a performance event
521 */ 518 */
522static void __perf_event_disable(void *info) 519static void __perf_event_disable(void *info)
@@ -640,15 +637,20 @@ group_sched_in(struct perf_event *group_event,
640 struct perf_cpu_context *cpuctx, 637 struct perf_cpu_context *cpuctx,
641 struct perf_event_context *ctx) 638 struct perf_event_context *ctx)
642{ 639{
643 struct perf_event *event, *partial_group; 640 struct perf_event *event, *partial_group = NULL;
641 const struct pmu *pmu = group_event->pmu;
642 bool txn = false;
644 int ret; 643 int ret;
645 644
646 if (group_event->state == PERF_EVENT_STATE_OFF) 645 if (group_event->state == PERF_EVENT_STATE_OFF)
647 return 0; 646 return 0;
648 647
649 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 648 /* Check if a group transaction is available */
650 if (ret) 649 if (pmu->start_txn)
651 return ret < 0 ? ret : 0; 650 txn = true;
651
652 if (txn)
653 pmu->start_txn(pmu);
652 654
653 if (event_sched_in(group_event, cpuctx, ctx)) 655 if (event_sched_in(group_event, cpuctx, ctx))
654 return -EAGAIN; 656 return -EAGAIN;
@@ -663,9 +665,19 @@ group_sched_in(struct perf_event *group_event,
663 } 665 }
664 } 666 }
665 667
666 return 0; 668 if (!txn)
669 return 0;
670
671 ret = pmu->commit_txn(pmu);
672 if (!ret) {
673 pmu->cancel_txn(pmu);
674 return 0;
675 }
667 676
668group_error: 677group_error:
678 if (txn)
679 pmu->cancel_txn(pmu);
680
669 /* 681 /*
670 * Groups can be scheduled in as one unit only, so undo any 682 * Groups can be scheduled in as one unit only, so undo any
671 * partial group before returning: 683 * partial group before returning:
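group_sched_in() above now brackets the whole group with the pmu's optional start_txn/commit_txn/cancel_txn hooks, so the hardware can accept or reject the group as a unit instead of relying on the weak hw_perf_group_sched_in() hook. The sketch below shows only the commit-or-roll-back shape in user space; the structure, callbacks and the two-slot limit are stand-ins, and it leaves out the flag-clearing detail of the kernel version.

#include <stdio.h>

struct txn_pmu {
        int pending;                          /* events staged in this transaction */
        void (*start_txn)(struct txn_pmu *);
        int  (*commit_txn)(struct txn_pmu *);
        void (*cancel_txn)(struct txn_pmu *);
};

static void start(struct txn_pmu *p)  { p->pending = 0; }
static int  commit(struct txn_pmu *p) { return p->pending <= 2 ? 0 : -1; } /* pretend 2 counters */
static void cancel(struct txn_pmu *p) { p->pending = 0; }

static int group_add(struct txn_pmu *p, int nr_events)
{
        int i;

        p->start_txn(p);

        for (i = 0; i < nr_events; i++)       /* event_sched_in() analogue */
                p->pending++;

        if (p->commit_txn(p) == 0)
                return 0;                     /* the whole group fits */

        p->cancel_txn(p);                     /* undo the partial group */
        return -1;
}

int main(void)
{
        struct txn_pmu pmu = { 0, start, commit, cancel };

        printf("group of 2: %d\n", group_add(&pmu, 2));
        printf("group of 3: %d\n", group_add(&pmu, 3));
        return 0;
}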
@@ -1367,6 +1379,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1367 if (cpuctx->task_ctx == ctx) 1379 if (cpuctx->task_ctx == ctx)
1368 return; 1380 return;
1369 1381
1382 perf_disable();
1383
1370 /* 1384 /*
1371 * We want to keep the following priority order: 1385 * We want to keep the following priority order:
1372 * cpu pinned (that don't need to move), task pinned, 1386 * cpu pinned (that don't need to move), task pinned,
@@ -1379,6 +1393,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1379 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1393 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1380 1394
1381 cpuctx->task_ctx = ctx; 1395 cpuctx->task_ctx = ctx;
1396
1397 perf_enable();
1382} 1398}
1383 1399
1384#define MAX_INTERRUPTS (~0ULL) 1400#define MAX_INTERRUPTS (~0ULL)
@@ -1856,9 +1872,30 @@ int perf_event_release_kernel(struct perf_event *event)
1856{ 1872{
1857 struct perf_event_context *ctx = event->ctx; 1873 struct perf_event_context *ctx = event->ctx;
1858 1874
1875 /*
1876 * Remove from the PMU; it can't get re-enabled since we got
1877 * here because the last ref went.
1878 */
1879 perf_event_disable(event);
1880
1859 WARN_ON_ONCE(ctx->parent_ctx); 1881 WARN_ON_ONCE(ctx->parent_ctx);
1860 mutex_lock(&ctx->mutex); 1882 /*
1861 perf_event_remove_from_context(event); 1883 * There are two ways this annotation is useful:
1884 *
1885 * 1) there is a lock recursion from perf_event_exit_task
1886 * see the comment there.
1887 *
1888 * 2) there is a lock-inversion with mmap_sem through
1889 * perf_event_read_group(), which takes faults while
1890 * holding ctx->mutex; however, this is called after
1891 * the last filedesc died, so there is no possibility
1892 * to trigger the AB-BA case.
1893 */
1894 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1895 raw_spin_lock_irq(&ctx->lock);
1896 list_del_event(event, ctx);
1897 perf_destroy_group(event, ctx);
1898 raw_spin_unlock_irq(&ctx->lock);
1862 mutex_unlock(&ctx->mutex); 1899 mutex_unlock(&ctx->mutex);
1863 1900
1864 mutex_lock(&event->owner->perf_event_mutex); 1901 mutex_lock(&event->owner->perf_event_mutex);
@@ -2260,11 +2297,6 @@ unlock:
2260 rcu_read_unlock(); 2297 rcu_read_unlock();
2261} 2298}
2262 2299
2263static unsigned long perf_data_size(struct perf_mmap_data *data)
2264{
2265 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2266}
2267
2268#ifndef CONFIG_PERF_USE_VMALLOC 2300#ifndef CONFIG_PERF_USE_VMALLOC
2269 2301
2270/* 2302/*
@@ -2283,6 +2315,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2283 return virt_to_page(data->data_pages[pgoff - 1]); 2315 return virt_to_page(data->data_pages[pgoff - 1]);
2284} 2316}
2285 2317
2318static void *perf_mmap_alloc_page(int cpu)
2319{
2320 struct page *page;
2321 int node;
2322
2323 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2324 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2325 if (!page)
2326 return NULL;
2327
2328 return page_address(page);
2329}
2330
2286static struct perf_mmap_data * 2331static struct perf_mmap_data *
2287perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2332perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2288{ 2333{
@@ -2299,17 +2344,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2299 if (!data) 2344 if (!data)
2300 goto fail; 2345 goto fail;
2301 2346
2302 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2347 data->user_page = perf_mmap_alloc_page(event->cpu);
2303 if (!data->user_page) 2348 if (!data->user_page)
2304 goto fail_user_page; 2349 goto fail_user_page;
2305 2350
2306 for (i = 0; i < nr_pages; i++) { 2351 for (i = 0; i < nr_pages; i++) {
2307 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2352 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2308 if (!data->data_pages[i]) 2353 if (!data->data_pages[i])
2309 goto fail_data_pages; 2354 goto fail_data_pages;
2310 } 2355 }
2311 2356
2312 data->data_order = 0;
2313 data->nr_pages = nr_pages; 2357 data->nr_pages = nr_pages;
2314 2358
2315 return data; 2359 return data;
@@ -2345,6 +2389,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2345 kfree(data); 2389 kfree(data);
2346} 2390}
2347 2391
2392static inline int page_order(struct perf_mmap_data *data)
2393{
2394 return 0;
2395}
2396
2348#else 2397#else
2349 2398
2350/* 2399/*
@@ -2353,10 +2402,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2353 * Required for architectures that have d-cache aliasing issues. 2402 * Required for architectures that have d-cache aliasing issues.
2354 */ 2403 */
2355 2404
2405static inline int page_order(struct perf_mmap_data *data)
2406{
2407 return data->page_order;
2408}
2409
2356static struct page * 2410static struct page *
2357perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2411perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2358{ 2412{
2359 if (pgoff > (1UL << data->data_order)) 2413 if (pgoff > (1UL << page_order(data)))
2360 return NULL; 2414 return NULL;
2361 2415
2362 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2416 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2376,7 +2430,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2376 int i, nr; 2430 int i, nr;
2377 2431
2378 data = container_of(work, struct perf_mmap_data, work); 2432 data = container_of(work, struct perf_mmap_data, work);
2379 nr = 1 << data->data_order; 2433 nr = 1 << page_order(data);
2380 2434
2381 base = data->user_page; 2435 base = data->user_page;
2382 for (i = 0; i < nr + 1; i++) 2436 for (i = 0; i < nr + 1; i++)
@@ -2415,7 +2469,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2415 2469
2416 data->user_page = all_buf; 2470 data->user_page = all_buf;
2417 data->data_pages[0] = all_buf + PAGE_SIZE; 2471 data->data_pages[0] = all_buf + PAGE_SIZE;
2418 data->data_order = ilog2(nr_pages); 2472 data->page_order = ilog2(nr_pages);
2419 data->nr_pages = 1; 2473 data->nr_pages = 1;
2420 2474
2421 return data; 2475 return data;
@@ -2429,6 +2483,11 @@ fail:
2429 2483
2430#endif 2484#endif
2431 2485
2486static unsigned long perf_data_size(struct perf_mmap_data *data)
2487{
2488 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2489}
2490
2432static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2491static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2433{ 2492{
2434 struct perf_event *event = vma->vm_file->private_data; 2493 struct perf_event *event = vma->vm_file->private_data;
@@ -2469,8 +2528,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2469{ 2528{
2470 long max_size = perf_data_size(data); 2529 long max_size = perf_data_size(data);
2471 2530
2472 atomic_set(&data->lock, -1);
2473
2474 if (event->attr.watermark) { 2531 if (event->attr.watermark) {
2475 data->watermark = min_t(long, max_size, 2532 data->watermark = min_t(long, max_size,
2476 event->attr.wakeup_watermark); 2533 event->attr.wakeup_watermark);
@@ -2543,6 +2600,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2543 long user_extra, extra; 2600 long user_extra, extra;
2544 int ret = 0; 2601 int ret = 0;
2545 2602
2603 /*
2604 * Don't allow mmap() of inherited per-task counters. This would
2605 * create a performance issue due to all children writing to the
2606 * same buffer.
2607 */
2608 if (event->cpu == -1 && event->attr.inherit)
2609 return -EINVAL;
2610
2546 if (!(vma->vm_flags & VM_SHARED)) 2611 if (!(vma->vm_flags & VM_SHARED))
2547 return -EINVAL; 2612 return -EINVAL;
2548 2613
@@ -2642,6 +2707,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2642} 2707}
2643 2708
2644static const struct file_operations perf_fops = { 2709static const struct file_operations perf_fops = {
2710 .llseek = no_llseek,
2645 .release = perf_release, 2711 .release = perf_release,
2646 .read = perf_read, 2712 .read = perf_read,
2647 .poll = perf_poll, 2713 .poll = perf_poll,
@@ -2792,6 +2858,27 @@ void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int ski
2792 2858
2793 2859
2794/* 2860/*
2861 * We assume there is only KVM supporting the callbacks.
2862 * Later on, we might change it to a list if there is
2863 * another virtualization implementation supporting the callbacks.
2864 */
2865struct perf_guest_info_callbacks *perf_guest_cbs;
2866
2867int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2868{
2869 perf_guest_cbs = cbs;
2870 return 0;
2871}
2872EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2873
2874int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2875{
2876 perf_guest_cbs = NULL;
2877 return 0;
2878}
2879EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2880
2881/*
2795 * Output 2882 * Output
2796 */ 2883 */
2797static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, 2884static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
@@ -2826,120 +2913,80 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2826} 2913}
2827 2914
2828/* 2915/*
2829 * Curious locking construct.
2830 *
2831 * We need to ensure a later event_id doesn't publish a head when a former 2916 * We need to ensure a later event_id doesn't publish a head when a former
2832 * event_id isn't done writing. However since we need to deal with NMIs we 2917 * event isn't done writing. However since we need to deal with NMIs we
2833 * cannot fully serialize things. 2918 * cannot fully serialize things.
2834 * 2919 *
2835 * What we do is serialize between CPUs so we only have to deal with NMI
2836 * nesting on a single CPU.
2837 *
2838 * We only publish the head (and generate a wakeup) when the outer-most 2920 * We only publish the head (and generate a wakeup) when the outer-most
2839 * event_id completes. 2921 * event completes.
2840 */ 2922 */
2841static void perf_output_lock(struct perf_output_handle *handle) 2923static void perf_output_get_handle(struct perf_output_handle *handle)
2842{ 2924{
2843 struct perf_mmap_data *data = handle->data; 2925 struct perf_mmap_data *data = handle->data;
2844 int cur, cpu = get_cpu();
2845 2926
2846 handle->locked = 0; 2927 preempt_disable();
2847 2928 local_inc(&data->nest);
2848 for (;;) { 2929 handle->wakeup = local_read(&data->wakeup);
2849 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2850 if (cur == -1) {
2851 handle->locked = 1;
2852 break;
2853 }
2854 if (cur == cpu)
2855 break;
2856
2857 cpu_relax();
2858 }
2859} 2930}
2860 2931
2861static void perf_output_unlock(struct perf_output_handle *handle) 2932static void perf_output_put_handle(struct perf_output_handle *handle)
2862{ 2933{
2863 struct perf_mmap_data *data = handle->data; 2934 struct perf_mmap_data *data = handle->data;
2864 unsigned long head; 2935 unsigned long head;
2865 int cpu;
2866
2867 data->done_head = data->head;
2868
2869 if (!handle->locked)
2870 goto out;
2871 2936
2872again: 2937again:
2873 /* 2938 head = local_read(&data->head);
2874 * The xchg implies a full barrier that ensures all writes are done
2875 * before we publish the new head, matched by a rmb() in userspace when
2876 * reading this position.
2877 */
2878 while ((head = atomic_long_xchg(&data->done_head, 0)))
2879 data->user_page->data_head = head;
2880 2939
2881 /* 2940 /*
2882 * NMI can happen here, which means we can miss a done_head update. 2941 * IRQ/NMI can happen here, which means we can miss a head update.
2883 */ 2942 */
2884 2943
2885 cpu = atomic_xchg(&data->lock, -1); 2944 if (!local_dec_and_test(&data->nest))
2886 WARN_ON_ONCE(cpu != smp_processor_id()); 2945 goto out;
2887 2946
2888 /* 2947 /*
2889 * Therefore we have to validate we did not indeed do so. 2948 * Publish the known good head. Rely on the full barrier implied
2949 * by atomic_dec_and_test() to order the data->head read and this
2950 * write.
2890 */ 2951 */
2891 if (unlikely(atomic_long_read(&data->done_head))) { 2952 data->user_page->data_head = head;
2892 /*
2893 * Since we had it locked, we can lock it again.
2894 */
2895 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2896 cpu_relax();
2897 2953
2954 /*
2955 * Now check if we missed an update; rely on the (compiler)
2956 * barrier in atomic_dec_and_test() to re-read data->head.
2957 */
2958 if (unlikely(head != local_read(&data->head))) {
2959 local_inc(&data->nest);
2898 goto again; 2960 goto again;
2899 } 2961 }
2900 2962
2901 if (atomic_xchg(&data->wakeup, 0)) 2963 if (handle->wakeup != local_read(&data->wakeup))
2902 perf_output_wakeup(handle); 2964 perf_output_wakeup(handle);
2903out: 2965
2904 put_cpu(); 2966 out:
2967 preempt_enable();
2905} 2968}
2906 2969
2907void perf_output_copy(struct perf_output_handle *handle, 2970__always_inline void perf_output_copy(struct perf_output_handle *handle,
2908 const void *buf, unsigned int len) 2971 const void *buf, unsigned int len)
2909{ 2972{
2910 unsigned int pages_mask;
2911 unsigned long offset;
2912 unsigned int size;
2913 void **pages;
2914
2915 offset = handle->offset;
2916 pages_mask = handle->data->nr_pages - 1;
2917 pages = handle->data->data_pages;
2918
2919 do { 2973 do {
2920 unsigned long page_offset; 2974 unsigned long size = min_t(unsigned long, handle->size, len);
2921 unsigned long page_size;
2922 int nr;
2923 2975
2924 nr = (offset >> PAGE_SHIFT) & pages_mask; 2976 memcpy(handle->addr, buf, size);
2925 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2926 page_offset = offset & (page_size - 1);
2927 size = min_t(unsigned int, page_size - page_offset, len);
2928 2977
2929 memcpy(pages[nr] + page_offset, buf, size); 2978 len -= size;
2979 handle->addr += size;
2980 handle->size -= size;
2981 if (!handle->size) {
2982 struct perf_mmap_data *data = handle->data;
2930 2983
2931 len -= size; 2984 handle->page++;
2932 buf += size; 2985 handle->page &= data->nr_pages - 1;
2933 offset += size; 2986 handle->addr = data->data_pages[handle->page];
2987 handle->size = PAGE_SIZE << page_order(data);
2988 }
2934 } while (len); 2989 } while (len);
2935
2936 handle->offset = offset;
2937
2938 /*
2939 * Check we didn't copy past our reservation window, taking the
2940 * possible unsigned int wrap into account.
2941 */
2942 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2943} 2990}
2944 2991
2945int perf_output_begin(struct perf_output_handle *handle, 2992int perf_output_begin(struct perf_output_handle *handle,
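The hunk above drops the cross-cpu output lock in favour of a per-buffer nest counter: every writer bumps the counter, only the outermost writer publishes data_head, and it re-checks the head after the decrement so bytes written by a nested IRQ/NMI writer are not left unpublished. The sketch below reproduces that publish-and-recheck loop with C11 atomics in place of local_t; it is an illustration of the scheme, not the kernel implementation.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long head;            /* write cursor */
static _Atomic unsigned long published_head;  /* what a reader may consume */
static _Atomic int nest;

static void get_handle(void)
{
        atomic_fetch_add(&nest, 1);
}

static void put_handle(void)
{
        unsigned long h;

again:
        h = atomic_load(&head);

        if (atomic_fetch_sub(&nest, 1) != 1)
                return;                       /* not the outermost writer */

        atomic_store(&published_head, h);     /* publish the known-good head */

        /* A nested writer may have advanced head after we sampled it. */
        if (h != atomic_load(&head)) {
                atomic_fetch_add(&nest, 1);
                goto again;
        }
}

int main(void)
{
        get_handle();
        atomic_fetch_add(&head, 64);          /* write one 64-byte record */
        put_handle();
        printf("published %lu bytes\n", atomic_load(&published_head));
        return 0;
}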
@@ -2977,13 +3024,13 @@ int perf_output_begin(struct perf_output_handle *handle,
2977 handle->sample = sample; 3024 handle->sample = sample;
2978 3025
2979 if (!data->nr_pages) 3026 if (!data->nr_pages)
2980 goto fail; 3027 goto out;
2981 3028
2982 have_lost = atomic_read(&data->lost); 3029 have_lost = local_read(&data->lost);
2983 if (have_lost) 3030 if (have_lost)
2984 size += sizeof(lost_event); 3031 size += sizeof(lost_event);
2985 3032
2986 perf_output_lock(handle); 3033 perf_output_get_handle(handle);
2987 3034
2988 do { 3035 do {
2989 /* 3036 /*
@@ -2993,24 +3040,28 @@ int perf_output_begin(struct perf_output_handle *handle,
2993 */ 3040 */
2994 tail = ACCESS_ONCE(data->user_page->data_tail); 3041 tail = ACCESS_ONCE(data->user_page->data_tail);
2995 smp_rmb(); 3042 smp_rmb();
2996 offset = head = atomic_long_read(&data->head); 3043 offset = head = local_read(&data->head);
2997 head += size; 3044 head += size;
2998 if (unlikely(!perf_output_space(data, tail, offset, head))) 3045 if (unlikely(!perf_output_space(data, tail, offset, head)))
2999 goto fail; 3046 goto fail;
3000 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3047 } while (local_cmpxchg(&data->head, offset, head) != offset);
3001 3048
3002 handle->offset = offset; 3049 if (head - local_read(&data->wakeup) > data->watermark)
3003 handle->head = head; 3050 local_add(data->watermark, &data->wakeup);
3004 3051
3005 if (head - tail > data->watermark) 3052 handle->page = offset >> (PAGE_SHIFT + page_order(data));
3006 atomic_set(&data->wakeup, 1); 3053 handle->page &= data->nr_pages - 1;
3054 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3055 handle->addr = data->data_pages[handle->page];
3056 handle->addr += handle->size;
3057 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
3007 3058
3008 if (have_lost) { 3059 if (have_lost) {
3009 lost_event.header.type = PERF_RECORD_LOST; 3060 lost_event.header.type = PERF_RECORD_LOST;
3010 lost_event.header.misc = 0; 3061 lost_event.header.misc = 0;
3011 lost_event.header.size = sizeof(lost_event); 3062 lost_event.header.size = sizeof(lost_event);
3012 lost_event.id = event->id; 3063 lost_event.id = event->id;
3013 lost_event.lost = atomic_xchg(&data->lost, 0); 3064 lost_event.lost = local_xchg(&data->lost, 0);
3014 3065
3015 perf_output_put(handle, lost_event); 3066 perf_output_put(handle, lost_event);
3016 } 3067 }
@@ -3018,8 +3069,8 @@ int perf_output_begin(struct perf_output_handle *handle,
3018 return 0; 3069 return 0;
3019 3070
3020fail: 3071fail:
3021 atomic_inc(&data->lost); 3072 local_inc(&data->lost);
3022 perf_output_unlock(handle); 3073 perf_output_put_handle(handle);
3023out: 3074out:
3024 rcu_read_unlock(); 3075 rcu_read_unlock();
3025 3076
@@ -3034,14 +3085,14 @@ void perf_output_end(struct perf_output_handle *handle)
3034 int wakeup_events = event->attr.wakeup_events; 3085 int wakeup_events = event->attr.wakeup_events;
3035 3086
3036 if (handle->sample && wakeup_events) { 3087 if (handle->sample && wakeup_events) {
3037 int events = atomic_inc_return(&data->events); 3088 int events = local_inc_return(&data->events);
3038 if (events >= wakeup_events) { 3089 if (events >= wakeup_events) {
3039 atomic_sub(wakeup_events, &data->events); 3090 local_sub(wakeup_events, &data->events);
3040 atomic_set(&data->wakeup, 1); 3091 local_inc(&data->wakeup);
3041 } 3092 }
3042 } 3093 }
3043 3094
3044 perf_output_unlock(handle); 3095 perf_output_put_handle(handle);
3045 rcu_read_unlock(); 3096 rcu_read_unlock();
3046} 3097}
3047 3098
@@ -3377,22 +3428,13 @@ static void perf_event_task_output(struct perf_event *event,
3377{ 3428{
3378 struct perf_output_handle handle; 3429 struct perf_output_handle handle;
3379 struct task_struct *task = task_event->task; 3430 struct task_struct *task = task_event->task;
3380 unsigned long flags;
3381 int size, ret; 3431 int size, ret;
3382 3432
3383 /*
3384 * If this CPU attempts to acquire an rq lock held by a CPU spinning
3385 * in perf_output_lock() from interrupt context, it's game over.
3386 */
3387 local_irq_save(flags);
3388
3389 size = task_event->event_id.header.size; 3433 size = task_event->event_id.header.size;
3390 ret = perf_output_begin(&handle, event, size, 0, 0); 3434 ret = perf_output_begin(&handle, event, size, 0, 0);
3391 3435
3392 if (ret) { 3436 if (ret)
3393 local_irq_restore(flags);
3394 return; 3437 return;
3395 }
3396 3438
3397 task_event->event_id.pid = perf_event_pid(event, task); 3439 task_event->event_id.pid = perf_event_pid(event, task);
3398 task_event->event_id.ppid = perf_event_pid(event, current); 3440 task_event->event_id.ppid = perf_event_pid(event, current);
@@ -3403,7 +3445,6 @@ static void perf_event_task_output(struct perf_event *event,
3403 perf_output_put(&handle, task_event->event_id); 3445 perf_output_put(&handle, task_event->event_id);
3404 3446
3405 perf_output_end(&handle); 3447 perf_output_end(&handle);
3406 local_irq_restore(flags);
3407} 3448}
3408 3449
3409static int perf_event_task_match(struct perf_event *event) 3450static int perf_event_task_match(struct perf_event *event)
@@ -3743,7 +3784,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3743 .event_id = { 3784 .event_id = {
3744 .header = { 3785 .header = {
3745 .type = PERF_RECORD_MMAP, 3786 .type = PERF_RECORD_MMAP,
3746 .misc = 0, 3787 .misc = PERF_RECORD_MISC_USER,
3747 /* .size */ 3788 /* .size */
3748 }, 3789 },
3749 /* .pid */ 3790 /* .pid */
@@ -3961,39 +4002,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3961 perf_swevent_overflow(event, 0, nmi, data, regs); 4002 perf_swevent_overflow(event, 0, nmi, data, regs);
3962} 4003}
3963 4004
3964static int perf_swevent_is_counting(struct perf_event *event)
3965{
3966 /*
3967 * The event is active, we're good!
3968 */
3969 if (event->state == PERF_EVENT_STATE_ACTIVE)
3970 return 1;
3971
3972 /*
3973 * The event is off/error, not counting.
3974 */
3975 if (event->state != PERF_EVENT_STATE_INACTIVE)
3976 return 0;
3977
3978 /*
3979 * The event is inactive, if the context is active
3980 * we're part of a group that didn't make it on the 'pmu',
3981 * not counting.
3982 */
3983 if (event->ctx->is_active)
3984 return 0;
3985
3986 /*
3987 * We're inactive and the context is too, this means the
3988 * task is scheduled out, we're counting events that happen
3989 * to us, like migration events.
3990 */
3991 return 1;
3992}
3993
3994static int perf_tp_event_match(struct perf_event *event,
3995 struct perf_sample_data *data);
3996
3997static int perf_exclude_event(struct perf_event *event, 4005static int perf_exclude_event(struct perf_event *event,
3998 struct pt_regs *regs) 4006 struct pt_regs *regs)
3999{ 4007{
@@ -4014,12 +4022,6 @@ static int perf_swevent_match(struct perf_event *event,
4014 struct perf_sample_data *data, 4022 struct perf_sample_data *data,
4015 struct pt_regs *regs) 4023 struct pt_regs *regs)
4016{ 4024{
4017 if (event->cpu != -1 && event->cpu != smp_processor_id())
4018 return 0;
4019
4020 if (!perf_swevent_is_counting(event))
4021 return 0;
4022
4023 if (event->attr.type != type) 4025 if (event->attr.type != type)
4024 return 0; 4026 return 0;
4025 4027
@@ -4029,30 +4031,88 @@ static int perf_swevent_match(struct perf_event *event,
4029 if (perf_exclude_event(event, regs)) 4031 if (perf_exclude_event(event, regs))
4030 return 0; 4032 return 0;
4031 4033
4032 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4033 !perf_tp_event_match(event, data))
4034 return 0;
4035
4036 return 1; 4034 return 1;
4037} 4035}
4038 4036
4039static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4037static inline u64 swevent_hash(u64 type, u32 event_id)
4040 enum perf_type_id type,
4041 u32 event_id, u64 nr, int nmi,
4042 struct perf_sample_data *data,
4043 struct pt_regs *regs)
4044{ 4038{
4039 u64 val = event_id | (type << 32);
4040
4041 return hash_64(val, SWEVENT_HLIST_BITS);
4042}
4043
4044static inline struct hlist_head *
4045__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4046{
4047 u64 hash = swevent_hash(type, event_id);
4048
4049 return &hlist->heads[hash];
4050}
4051
4052/* For the read side: events when they trigger */
4053static inline struct hlist_head *
4054find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4055{
4056 struct swevent_hlist *hlist;
4057
4058 hlist = rcu_dereference(ctx->swevent_hlist);
4059 if (!hlist)
4060 return NULL;
4061
4062 return __find_swevent_head(hlist, type, event_id);
4063}
4064
4065/* For the event head insertion and removal in the hlist */
4066static inline struct hlist_head *
4067find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4068{
4069 struct swevent_hlist *hlist;
4070 u32 event_id = event->attr.config;
4071 u64 type = event->attr.type;
4072
4073 /*
4074 * Event scheduling is always serialized against hlist allocation
4075 * and release, which makes the protected version suitable here.
4076 * The context lock guarantees that.
4077 */
4078 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4079 lockdep_is_held(&event->ctx->lock));
4080 if (!hlist)
4081 return NULL;
4082
4083 return __find_swevent_head(hlist, type, event_id);
4084}
4085
4086static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4087 u64 nr, int nmi,
4088 struct perf_sample_data *data,
4089 struct pt_regs *regs)
4090{
4091 struct perf_cpu_context *cpuctx;
4045 struct perf_event *event; 4092 struct perf_event *event;
4093 struct hlist_node *node;
4094 struct hlist_head *head;
4046 4095
4047 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4096 cpuctx = &__get_cpu_var(perf_cpu_context);
4097
4098 rcu_read_lock();
4099
4100 head = find_swevent_head_rcu(cpuctx, type, event_id);
4101
4102 if (!head)
4103 goto end;
4104
4105 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4048 if (perf_swevent_match(event, type, event_id, data, regs)) 4106 if (perf_swevent_match(event, type, event_id, data, regs))
4049 perf_swevent_add(event, nr, nmi, data, regs); 4107 perf_swevent_add(event, nr, nmi, data, regs);
4050 } 4108 }
4109end:
4110 rcu_read_unlock();
4051} 4111}
4052 4112
4053int perf_swevent_get_recursion_context(void) 4113int perf_swevent_get_recursion_context(void)
4054{ 4114{
4055 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4115 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4056 int rctx; 4116 int rctx;
4057 4117
4058 if (in_nmi()) 4118 if (in_nmi())
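The software-event rework above stops walking every event in the context on each trigger and instead hashes (type, config) into a per-cpu hlist bucket, so do_perf_sw_event() only visits events that can possibly match. The sketch below shows just the bucket selection; the multiplicative constant mimics a 64-bit golden-ratio hash such as hash_64() and the table width is arbitrary.

#include <stdint.h>
#include <stdio.h>

#define HLIST_BITS 8
#define HLIST_SIZE (1u << HLIST_BITS)

static uint32_t swevent_bucket(uint64_t type, uint32_t event_id)
{
        uint64_t key = event_id | (type << 32);   /* same folding as above */

        /* golden-ratio multiply, keep the top HLIST_BITS bits */
        return (uint32_t)((key * 0x61C8864680B583EBull) >> (64 - HLIST_BITS));
}

int main(void)
{
        /* e.g. PERF_TYPE_SOFTWARE (1) / PERF_COUNT_SW_PAGE_FAULTS (2) */
        printf("bucket %u of %u\n", swevent_bucket(1, 2), HLIST_SIZE);
        return 0;
}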
@@ -4064,10 +4124,8 @@ int perf_swevent_get_recursion_context(void)
4064 else 4124 else
4065 rctx = 0; 4125 rctx = 0;
4066 4126
4067 if (cpuctx->recursion[rctx]) { 4127 if (cpuctx->recursion[rctx])
4068 put_cpu_var(perf_cpu_context);
4069 return -1; 4128 return -1;
4070 }
4071 4129
4072 cpuctx->recursion[rctx]++; 4130 cpuctx->recursion[rctx]++;
4073 barrier(); 4131 barrier();
@@ -4081,31 +4139,9 @@ void perf_swevent_put_recursion_context(int rctx)
4081 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4139 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4082 barrier(); 4140 barrier();
4083 cpuctx->recursion[rctx]--; 4141 cpuctx->recursion[rctx]--;
4084 put_cpu_var(perf_cpu_context);
4085} 4142}
4086EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4143EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4087 4144
4088static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4089 u64 nr, int nmi,
4090 struct perf_sample_data *data,
4091 struct pt_regs *regs)
4092{
4093 struct perf_cpu_context *cpuctx;
4094 struct perf_event_context *ctx;
4095
4096 cpuctx = &__get_cpu_var(perf_cpu_context);
4097 rcu_read_lock();
4098 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4099 nr, nmi, data, regs);
4100 /*
4101 * doesn't really matter which of the child contexts the
4102 * events ends up in.
4103 */
4104 ctx = rcu_dereference(current->perf_event_ctxp);
4105 if (ctx)
4106 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4107 rcu_read_unlock();
4108}
4109 4145
4110void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4146void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4111 struct pt_regs *regs, u64 addr) 4147 struct pt_regs *regs, u64 addr)
@@ -4113,6 +4149,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4113 struct perf_sample_data data; 4149 struct perf_sample_data data;
4114 int rctx; 4150 int rctx;
4115 4151
4152 preempt_disable_notrace();
4116 rctx = perf_swevent_get_recursion_context(); 4153 rctx = perf_swevent_get_recursion_context();
4117 if (rctx < 0) 4154 if (rctx < 0)
4118 return; 4155 return;
@@ -4122,6 +4159,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4122 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4159 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4123 4160
4124 perf_swevent_put_recursion_context(rctx); 4161 perf_swevent_put_recursion_context(rctx);
4162 preempt_enable_notrace();
4125} 4163}
4126 4164
4127static void perf_swevent_read(struct perf_event *event) 4165static void perf_swevent_read(struct perf_event *event)
@@ -4131,16 +4169,28 @@ static void perf_swevent_read(struct perf_event *event)
4131static int perf_swevent_enable(struct perf_event *event) 4169static int perf_swevent_enable(struct perf_event *event)
4132{ 4170{
4133 struct hw_perf_event *hwc = &event->hw; 4171 struct hw_perf_event *hwc = &event->hw;
4172 struct perf_cpu_context *cpuctx;
4173 struct hlist_head *head;
4174
4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4134 4176
4135 if (hwc->sample_period) { 4177 if (hwc->sample_period) {
4136 hwc->last_period = hwc->sample_period; 4178 hwc->last_period = hwc->sample_period;
4137 perf_swevent_set_period(event); 4179 perf_swevent_set_period(event);
4138 } 4180 }
4181
4182 head = find_swevent_head(cpuctx, event);
4183 if (WARN_ON_ONCE(!head))
4184 return -EINVAL;
4185
4186 hlist_add_head_rcu(&event->hlist_entry, head);
4187
4139 return 0; 4188 return 0;
4140} 4189}
4141 4190
4142static void perf_swevent_disable(struct perf_event *event) 4191static void perf_swevent_disable(struct perf_event *event)
4143{ 4192{
4193 hlist_del_rcu(&event->hlist_entry);
4144} 4194}
4145 4195
4146static const struct pmu perf_ops_generic = { 4196static const struct pmu perf_ops_generic = {
@@ -4168,15 +4218,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4168 perf_sample_data_init(&data, 0); 4218 perf_sample_data_init(&data, 0);
4169 data.period = event->hw.last_period; 4219 data.period = event->hw.last_period;
4170 regs = get_irq_regs(); 4220 regs = get_irq_regs();
4171 /*
4172 * In case we exclude kernel IPs or are somehow not in interrupt
4173 * context, provide the next best thing, the user IP.
4174 */
4175 if ((event->attr.exclude_kernel || !regs) &&
4176 !event->attr.exclude_user)
4177 regs = task_pt_regs(current);
4178 4221
4179 if (regs) { 4222 if (regs && !perf_exclude_event(event, regs)) {
4180 if (!(event->attr.exclude_idle && current->pid == 0)) 4223 if (!(event->attr.exclude_idle && current->pid == 0))
4181 if (perf_event_overflow(event, 0, &data, regs)) 4224 if (perf_event_overflow(event, 0, &data, regs))
4182 ret = HRTIMER_NORESTART; 4225 ret = HRTIMER_NORESTART;
@@ -4324,27 +4367,122 @@ static const struct pmu perf_ops_task_clock = {
4324 .read = task_clock_perf_event_read, 4367 .read = task_clock_perf_event_read,
4325}; 4368};
4326 4369
4327#ifdef CONFIG_EVENT_TRACING 4370/* Deref the hlist from the update side */
4371static inline struct swevent_hlist *
4372swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4373{
4374 return rcu_dereference_protected(cpuctx->swevent_hlist,
4375 lockdep_is_held(&cpuctx->hlist_mutex));
4376}
4328 4377
4329void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4378static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4330 int entry_size, struct pt_regs *regs)
4331{ 4379{
4332 struct perf_sample_data data; 4380 struct swevent_hlist *hlist;
4333 struct perf_raw_record raw = {
4334 .size = entry_size,
4335 .data = record,
4336 };
4337 4381
4338 perf_sample_data_init(&data, addr); 4382 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4339 data.raw = &raw; 4383 kfree(hlist);
4384}
4340 4385
4341 /* Trace events already protected against recursion */ 4386static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4342 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4387{
4343 &data, regs); 4388 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4389
4390 if (!hlist)
4391 return;
4392
4393 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4394 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4344} 4395}
4345EXPORT_SYMBOL_GPL(perf_tp_event);
4346 4396
4347static int perf_tp_event_match(struct perf_event *event, 4397static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4398{
4399 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4400
4401 mutex_lock(&cpuctx->hlist_mutex);
4402
4403 if (!--cpuctx->hlist_refcount)
4404 swevent_hlist_release(cpuctx);
4405
4406 mutex_unlock(&cpuctx->hlist_mutex);
4407}
4408
4409static void swevent_hlist_put(struct perf_event *event)
4410{
4411 int cpu;
4412
4413 if (event->cpu != -1) {
4414 swevent_hlist_put_cpu(event, event->cpu);
4415 return;
4416 }
4417
4418 for_each_possible_cpu(cpu)
4419 swevent_hlist_put_cpu(event, cpu);
4420}
4421
4422static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4423{
4424 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4425 int err = 0;
4426
4427 mutex_lock(&cpuctx->hlist_mutex);
4428
4429 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4430 struct swevent_hlist *hlist;
4431
4432 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4433 if (!hlist) {
4434 err = -ENOMEM;
4435 goto exit;
4436 }
4437 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4438 }
4439 cpuctx->hlist_refcount++;
4440 exit:
4441 mutex_unlock(&cpuctx->hlist_mutex);
4442
4443 return err;
4444}
4445
4446static int swevent_hlist_get(struct perf_event *event)
4447{
4448 int err;
4449 int cpu, failed_cpu;
4450
4451 if (event->cpu != -1)
4452 return swevent_hlist_get_cpu(event, event->cpu);
4453
4454 get_online_cpus();
4455 for_each_possible_cpu(cpu) {
4456 err = swevent_hlist_get_cpu(event, cpu);
4457 if (err) {
4458 failed_cpu = cpu;
4459 goto fail;
4460 }
4461 }
4462 put_online_cpus();
4463
4464 return 0;
4465 fail:
4466 for_each_possible_cpu(cpu) {
4467 if (cpu == failed_cpu)
4468 break;
4469 swevent_hlist_put_cpu(event, cpu);
4470 }
4471
4472 put_online_cpus();
4473 return err;
4474}
4475
4476#ifdef CONFIG_EVENT_TRACING
4477
4478static const struct pmu perf_ops_tracepoint = {
4479 .enable = perf_trace_enable,
4480 .disable = perf_trace_disable,
4481 .read = perf_swevent_read,
4482 .unthrottle = perf_swevent_unthrottle,
4483};
4484
4485static int perf_tp_filter_match(struct perf_event *event,
4348 struct perf_sample_data *data) 4486 struct perf_sample_data *data)
4349{ 4487{
4350 void *record = data->raw->data; 4488 void *record = data->raw->data;
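swevent_hlist_get() above acquires the per-cpu hlist refcount on every possible cpu and, when an allocation fails partway, walks the cpus again and releases only the ones it had already taken (the failed_cpu loop). A user-space sketch of that acquire-all-or-roll-back pattern follows; the counter array stands in for the per-cpu refcounts and the forced failure is only there to exercise the rollback.

#include <stdio.h>

#define NR_CPUS 4

static int refcount[NR_CPUS];

static int get_cpu_resource(int cpu)
{
        if (cpu == 2)                   /* simulate kzalloc() failing here */
                return -1;
        refcount[cpu]++;
        return 0;
}

static void put_cpu_resource(int cpu)
{
        refcount[cpu]--;
}

static int get_all(void)
{
        int cpu, failed_cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (get_cpu_resource(cpu)) {
                        failed_cpu = cpu;
                        goto fail;
                }
        }
        return 0;

fail:
        for (cpu = 0; cpu < failed_cpu; cpu++)  /* undo what was acquired */
                put_cpu_resource(cpu);
        return -1;
}

int main(void)
{
        int ret = get_all();

        printf("get_all: %d, refcounts: %d %d %d %d\n", ret,
               refcount[0], refcount[1], refcount[2], refcount[3]);
        return 0;
}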
@@ -4354,13 +4492,55 @@ static int perf_tp_event_match(struct perf_event *event,
4354 return 0; 4492 return 0;
4355} 4493}
4356 4494
4495static int perf_tp_event_match(struct perf_event *event,
4496 struct perf_sample_data *data,
4497 struct pt_regs *regs)
4498{
4499 /*
4500 * All tracepoints are from kernel-space.
4501 */
4502 if (event->attr.exclude_kernel)
4503 return 0;
4504
4505 if (!perf_tp_filter_match(event, data))
4506 return 0;
4507
4508 return 1;
4509}
4510
4511void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4512 struct pt_regs *regs, struct hlist_head *head)
4513{
4514 struct perf_sample_data data;
4515 struct perf_event *event;
4516 struct hlist_node *node;
4517
4518 struct perf_raw_record raw = {
4519 .size = entry_size,
4520 .data = record,
4521 };
4522
4523 perf_sample_data_init(&data, addr);
4524 data.raw = &raw;
4525
4526 rcu_read_lock();
4527 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4528 if (perf_tp_event_match(event, &data, regs))
4529 perf_swevent_add(event, count, 1, &data, regs);
4530 }
4531 rcu_read_unlock();
4532}
4533EXPORT_SYMBOL_GPL(perf_tp_event);
4534
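The rewritten perf_tp_event() above is the lookup side of those hash lists: it walks the bucket it is handed under rcu_read_lock() only, which is what allows swevent_hlist_release() to swap the table out concurrently. A reader-side sketch with hypothetical names, using the four-argument hlist_for_each_entry_rcu() of this kernel generation:

#include <linux/rculist.h>
#include <linux/types.h>

struct foo_event {
	struct hlist_node node;
	u32 id;
};

static void foo_handle(struct foo_event *e)
{
	/* act on a matching entry; placeholder for the real work */
}

static void foo_deliver(struct hlist_head *head, u32 id)
{
	struct foo_event *e;
	struct hlist_node *pos;

	rcu_read_lock();
	hlist_for_each_entry_rcu(e, pos, head, node) {
		if (e->id == id)
			foo_handle(e);
	}
	rcu_read_unlock();
}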
4357static void tp_perf_event_destroy(struct perf_event *event) 4535static void tp_perf_event_destroy(struct perf_event *event)
4358{ 4536{
4359 perf_trace_disable(event->attr.config); 4537 perf_trace_destroy(event);
4360} 4538}
4361 4539
4362static const struct pmu *tp_perf_event_init(struct perf_event *event) 4540static const struct pmu *tp_perf_event_init(struct perf_event *event)
4363{ 4541{
4542 int err;
4543
4364 /* 4544 /*
4365 * Raw tracepoint data is a severe data leak, only allow root to 4545 * Raw tracepoint data is a severe data leak, only allow root to
4366 * have these. 4546 * have these.
@@ -4370,12 +4550,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4370 !capable(CAP_SYS_ADMIN)) 4550 !capable(CAP_SYS_ADMIN))
4371 return ERR_PTR(-EPERM); 4551 return ERR_PTR(-EPERM);
4372 4552
4373 if (perf_trace_enable(event->attr.config)) 4553 err = perf_trace_init(event);
4554 if (err)
4374 return NULL; 4555 return NULL;
4375 4556
4376 event->destroy = tp_perf_event_destroy; 4557 event->destroy = tp_perf_event_destroy;
4377 4558
4378 return &perf_ops_generic; 4559 return &perf_ops_tracepoint;
4379} 4560}
4380 4561
4381static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4562static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4403,12 +4584,6 @@ static void perf_event_free_filter(struct perf_event *event)
4403 4584
4404#else 4585#else
4405 4586
4406static int perf_tp_event_match(struct perf_event *event,
4407 struct perf_sample_data *data)
4408{
4409 return 1;
4410}
4411
4412static const struct pmu *tp_perf_event_init(struct perf_event *event) 4587static const struct pmu *tp_perf_event_init(struct perf_event *event)
4413{ 4588{
4414 return NULL; 4589 return NULL;
@@ -4474,6 +4649,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4474 WARN_ON(event->parent); 4649 WARN_ON(event->parent);
4475 4650
4476 atomic_dec(&perf_swevent_enabled[event_id]); 4651 atomic_dec(&perf_swevent_enabled[event_id]);
4652 swevent_hlist_put(event);
4477} 4653}
4478 4654
4479static const struct pmu *sw_perf_event_init(struct perf_event *event) 4655static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4512,6 +4688,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4512 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4688 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4513 case PERF_COUNT_SW_EMULATION_FAULTS: 4689 case PERF_COUNT_SW_EMULATION_FAULTS:
4514 if (!event->parent) { 4690 if (!event->parent) {
4691 int err;
4692
4693 err = swevent_hlist_get(event);
4694 if (err)
4695 return ERR_PTR(err);
4696
4515 atomic_inc(&perf_swevent_enabled[event_id]); 4697 atomic_inc(&perf_swevent_enabled[event_id]);
4516 event->destroy = sw_perf_event_destroy; 4698 event->destroy = sw_perf_event_destroy;
4517 } 4699 }
@@ -4738,6 +4920,13 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
4738 int fput_needed = 0; 4920 int fput_needed = 0;
4739 int ret = -EINVAL; 4921 int ret = -EINVAL;
4740 4922
4923 /*
4924 * Don't allow output of inherited per-task events. This would
4925 * create performance issues due to cross cpu access.
4926 */
4927 if (event->cpu == -1 && event->attr.inherit)
4928 return -EINVAL;
4929
4741 if (!output_fd) 4930 if (!output_fd)
4742 goto set; 4931 goto set;
4743 4932
@@ -4758,6 +4947,18 @@ static int perf_event_set_output(struct perf_event *event, int output_fd)
4758 if (event->data) 4947 if (event->data)
4759 goto out; 4948 goto out;
4760 4949
4950 /*
4951 * Don't allow cross-cpu buffers
4952 */
4953 if (output_event->cpu != event->cpu)
4954 goto out;
4955
4956 /*
4957 * If its not a per-cpu buffer, it must be the same task.
4958 */
4959 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4960 goto out;
4961
4761 atomic_long_inc(&output_file->f_count); 4962 atomic_long_inc(&output_file->f_count);
4762 4963
4763set: 4964set:
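The added checks above tighten perf_event_set_output(): inherited per-task events may not redirect at all, the two events must be bound to the same CPU, and per-task (cpu == -1) buffers must belong to the same context. A hedged userspace sketch of the allowed case, assuming the raw syscall wrapper and the PERF_EVENT_IOC_SET_OUTPUT ioctl from this kernel's linux/perf_event.h are available:

#include <linux/perf_event.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int open_counter(__u64 config, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_SOFTWARE;
	attr.config = config;
	attr.sample_period = 1000;

	/* pid == -1, fixed cpu: both events end up on the same CPU */
	return syscall(__NR_perf_event_open, &attr, -1, cpu, -1, 0);
}

int main(void)
{
	int leader = open_counter(PERF_COUNT_SW_CONTEXT_SWITCHES, 0);
	int other  = open_counter(PERF_COUNT_SW_PAGE_FAULTS, 0);

	if (leader < 0 || other < 0)
		return 1;
	/* would fail with EINVAL if the two events were on different CPUs */
	if (ioctl(other, PERF_EVENT_IOC_SET_OUTPUT, leader) < 0)
		return 1;
	/* the leader's ring buffer would then be mmap()ed and read as usual */
	return 0;
}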
@@ -4798,8 +4999,8 @@ SYSCALL_DEFINE5(perf_event_open,
4798 struct perf_event_context *ctx; 4999 struct perf_event_context *ctx;
4799 struct file *event_file = NULL; 5000 struct file *event_file = NULL;
4800 struct file *group_file = NULL; 5001 struct file *group_file = NULL;
5002 int event_fd;
4801 int fput_needed = 0; 5003 int fput_needed = 0;
4802 int fput_needed2 = 0;
4803 int err; 5004 int err;
4804 5005
4805 /* for future expandability... */ 5006 /* for future expandability... */
@@ -4820,12 +5021,18 @@ SYSCALL_DEFINE5(perf_event_open,
4820 return -EINVAL; 5021 return -EINVAL;
4821 } 5022 }
4822 5023
5024 event_fd = get_unused_fd_flags(O_RDWR);
5025 if (event_fd < 0)
5026 return event_fd;
5027
4823 /* 5028 /*
4824 * Get the target context (task or percpu): 5029 * Get the target context (task or percpu):
4825 */ 5030 */
4826 ctx = find_get_context(pid, cpu); 5031 ctx = find_get_context(pid, cpu);
4827 if (IS_ERR(ctx)) 5032 if (IS_ERR(ctx)) {
4828 return PTR_ERR(ctx); 5033 err = PTR_ERR(ctx);
5034 goto err_fd;
5035 }
4829 5036
4830 /* 5037 /*
4831 * Look up the group leader (we will attach this event to it): 5038 * Look up the group leader (we will attach this event to it):
@@ -4865,13 +5072,11 @@ SYSCALL_DEFINE5(perf_event_open,
4865 if (IS_ERR(event)) 5072 if (IS_ERR(event))
4866 goto err_put_context; 5073 goto err_put_context;
4867 5074
4868 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5075 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
4869 if (err < 0) 5076 if (IS_ERR(event_file)) {
4870 goto err_free_put_context; 5077 err = PTR_ERR(event_file);
4871
4872 event_file = fget_light(err, &fput_needed2);
4873 if (!event_file)
4874 goto err_free_put_context; 5078 goto err_free_put_context;
5079 }
4875 5080
4876 if (flags & PERF_FLAG_FD_OUTPUT) { 5081 if (flags & PERF_FLAG_FD_OUTPUT) {
4877 err = perf_event_set_output(event, group_fd); 5082 err = perf_event_set_output(event, group_fd);
@@ -4892,19 +5097,19 @@ SYSCALL_DEFINE5(perf_event_open,
4892 list_add_tail(&event->owner_entry, &current->perf_event_list); 5097 list_add_tail(&event->owner_entry, &current->perf_event_list);
4893 mutex_unlock(&current->perf_event_mutex); 5098 mutex_unlock(&current->perf_event_mutex);
4894 5099
4895err_fput_free_put_context: 5100 fput_light(group_file, fput_needed);
4896 fput_light(event_file, fput_needed2); 5101 fd_install(event_fd, event_file);
5102 return event_fd;
4897 5103
5104err_fput_free_put_context:
5105 fput(event_file);
4898err_free_put_context: 5106err_free_put_context:
4899 if (err < 0) 5107 free_event(event);
4900 free_event(event);
4901
4902err_put_context: 5108err_put_context:
4903 if (err < 0)
4904 put_ctx(ctx);
4905
4906 fput_light(group_file, fput_needed); 5109 fput_light(group_file, fput_needed);
4907 5110 put_ctx(ctx);
5111err_fd:
5112 put_unused_fd(event_fd);
4908 return err; 5113 return err;
4909} 5114}
4910 5115
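The perf_event_open() rework above drops the anon_inode_getfd() + fget_light() dance in favour of the reserve-then-install pattern: take an fd number first, build the struct file, and only fd_install() it after the last failure point, so a half-initialised event can never be reached through the fd table and the error paths unwind cleanly. A condensed sketch of that pattern (example_fops and priv are placeholders):

#include <linux/anon_inodes.h>
#include <linux/err.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/fs.h>

static int example_create_fd(const struct file_operations *example_fops,
			     void *priv)
{
	struct file *file;
	int fd;

	fd = get_unused_fd_flags(O_RDWR);	/* reserve a number, nothing visible yet */
	if (fd < 0)
		return fd;

	file = anon_inode_getfile("[example]", example_fops, priv, O_RDWR);
	if (IS_ERR(file)) {
		put_unused_fd(fd);		/* nothing installed, just release the number */
		return PTR_ERR(file);
	}

	/*
	 * ...any setup that can still fail goes here, unwinding with
	 * fput(file) + put_unused_fd(fd) on error...
	 */

	fd_install(fd, file);			/* publication point: the fd is now live */
	return fd;
}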
@@ -5176,7 +5381,7 @@ void perf_event_exit_task(struct task_struct *child)
5176 * 5381 *
5177 * But since its the parent context it won't be the same instance. 5382 * But since its the parent context it won't be the same instance.
5178 */ 5383 */
5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5384 mutex_lock(&child_ctx->mutex);
5180 5385
5181again: 5386again:
5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5387 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5384,6 +5589,7 @@ static void __init perf_event_init_all_cpus(void)
5384 5589
5385 for_each_possible_cpu(cpu) { 5590 for_each_possible_cpu(cpu) {
5386 cpuctx = &per_cpu(perf_cpu_context, cpu); 5591 cpuctx = &per_cpu(perf_cpu_context, cpu);
5592 mutex_init(&cpuctx->hlist_mutex);
5387 __perf_event_init_context(&cpuctx->ctx, NULL); 5593 __perf_event_init_context(&cpuctx->ctx, NULL);
5388 } 5594 }
5389} 5595}
@@ -5397,6 +5603,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5397 spin_lock(&perf_resource_lock); 5603 spin_lock(&perf_resource_lock);
5398 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5604 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5399 spin_unlock(&perf_resource_lock); 5605 spin_unlock(&perf_resource_lock);
5606
5607 mutex_lock(&cpuctx->hlist_mutex);
5608 if (cpuctx->hlist_refcount > 0) {
5609 struct swevent_hlist *hlist;
5610
5611 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5612 WARN_ON_ONCE(!hlist);
5613 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5614 }
5615 mutex_unlock(&cpuctx->hlist_mutex);
5400} 5616}
5401 5617
5402#ifdef CONFIG_HOTPLUG_CPU 5618#ifdef CONFIG_HOTPLUG_CPU
@@ -5416,6 +5632,10 @@ static void perf_event_exit_cpu(int cpu)
5416 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5632 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5417 struct perf_event_context *ctx = &cpuctx->ctx; 5633 struct perf_event_context *ctx = &cpuctx->ctx;
5418 5634
5635 mutex_lock(&cpuctx->hlist_mutex);
5636 swevent_hlist_release(cpuctx);
5637 mutex_unlock(&cpuctx->hlist_mutex);
5638
5419 mutex_lock(&ctx->mutex); 5639 mutex_lock(&ctx->mutex);
5420 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5640 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5421 mutex_unlock(&ctx->mutex); 5641 mutex_unlock(&ctx->mutex);
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -513,6 +513,13 @@ void __init pidhash_init(void)
513 513
514void __init pidmap_init(void) 514void __init pidmap_init(void)
515{ 515{
516 /* bump default and minimum pid_max based on number of cpus */
517 pid_max = min(pid_max_max, max_t(int, pid_max,
518 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
519 pid_max_min = max_t(int, pid_max_min,
520 PIDS_PER_CPU_MIN * num_possible_cpus());
521 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
522
516 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 523 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
517 /* Reserve PID 0. We never call free_pidmap(0) */ 524 /* Reserve PID 0. We never call free_pidmap(0) */
518 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
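Worked numbers for the pidmap_init() change above, assuming the per-CPU constants added elsewhere in this series are PIDS_PER_CPU_DEFAULT == 1024 and PIDS_PER_CPU_MIN == 8, and that pid_max defaults to 32768, pid_max_min to 301 and pid_max_max to PID_MAX_LIMIT (4194304 on 64-bit builds):

/*
 * 8-CPU box:    pid_max     = min(4194304, max(32768, 1024 *    8)) =   32768
 *               pid_max_min = max(301, 8 *    8)                    =     301
 * 4096-CPU box: pid_max     = min(4194304, max(32768, 1024 * 4096)) = 4194304
 *               pid_max_min = max(301, 8 * 4096)                    =   32768
 *
 * Small machines keep the historical 32768 default, very large ones get
 * room for roughly 1024 tasks per CPU, and the sysctl floor rises so the
 * administrator cannot shrink pid_max below about 8 per CPU.
 */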
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..f42d3f737a33 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,25 +44,25 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51struct pm_qos_request_list {
50 struct list_head list; 52 struct list_head list;
51 union { 53 union {
52 s32 value; 54 s32 value;
53 s32 usec; 55 s32 usec;
54 s32 kbps; 56 s32 kbps;
55 }; 57 };
56 char *name; 58 int pm_qos_class;
57}; 59};
58 60
59static s32 max_compare(s32 v1, s32 v2); 61static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2); 62static s32 min_compare(s32 v1, s32 v2);
61 63
62struct pm_qos_object { 64struct pm_qos_object {
63 struct requirement_list requirements; 65 struct pm_qos_request_list requests;
64 struct blocking_notifier_head *notifiers; 66 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 67 struct miscdevice pm_qos_power_miscdev;
66 char *name; 68 char *name;
@@ -72,7 +74,7 @@ struct pm_qos_object {
72static struct pm_qos_object null_pm_qos; 74static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 76static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 77 .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)},
76 .notifiers = &cpu_dma_lat_notifier, 78 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 79 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 80 .default_value = 2000 * USEC_PER_SEC,
@@ -82,7 +84,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
82 84
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 86static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 87 .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)},
86 .notifiers = &network_lat_notifier, 88 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 89 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 90 .default_value = 2000 * USEC_PER_SEC,
@@ -93,8 +95,7 @@ static struct pm_qos_object network_lat_pm_qos = {
93 95
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 97static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 98 .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)},
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
@@ -135,31 +136,34 @@ static s32 min_compare(s32 v1, s32 v2)
135} 136}
136 137
137 138
138static void update_target(int target) 139static void update_target(int pm_qos_class)
139{ 140{
140 s32 extreme_value; 141 s32 extreme_value;
141 struct requirement_list *node; 142 struct pm_qos_request_list *node;
142 unsigned long flags; 143 unsigned long flags;
143 int call_notifier = 0; 144 int call_notifier = 0;
144 145
145 spin_lock_irqsave(&pm_qos_lock, flags); 146 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 147 extreme_value = pm_qos_array[pm_qos_class]->default_value;
147 list_for_each_entry(node, 148 list_for_each_entry(node,
148 &pm_qos_array[target]->requirements.list, list) { 149 &pm_qos_array[pm_qos_class]->requests.list, list) {
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[pm_qos_class]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 153 if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) !=
154 extreme_value) {
153 call_notifier = 1; 155 call_notifier = 1;
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 156 atomic_set(&pm_qos_array[pm_qos_class]->target_value,
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 157 extreme_value);
156 atomic_read(&pm_qos_array[target]->target_value)); 158 pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class,
159 atomic_read(&pm_qos_array[pm_qos_class]->target_value));
157 } 160 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 161 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 162
160 if (call_notifier) 163 if (call_notifier)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 164 blocking_notifier_call_chain(
162 (unsigned long) extreme_value, NULL); 165 pm_qos_array[pm_qos_class]->notifiers,
166 (unsigned long) extreme_value, NULL);
163} 167}
164 168
165static int register_pm_qos_misc(struct pm_qos_object *qos) 169static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +189,112 @@ static int find_pm_qos_object_by_minor(int minor)
185} 189}
186 190
187/** 191/**
188 * pm_qos_requirement - returns current system wide qos expectation 192 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 193 * @pm_qos_class: identification of which qos value is requested
190 * 194 *
191 * This function returns the current target value in an atomic manner. 195 * This function returns the current target value in an atomic manner.
192 */ 196 */
193int pm_qos_requirement(int pm_qos_class) 197int pm_qos_request(int pm_qos_class)
194{ 198{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 199 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196} 200}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 201EXPORT_SYMBOL_GPL(pm_qos_request);
198 202
199/** 203/**
200 * pm_qos_add_requirement - inserts new qos request into the list 204 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 205 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 206 * @value: defines the qos request
204 * 207 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 208 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 209 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 210 * for the pm_qos_class of parameters, and returns the pm_qos_request list
211 * element as a handle for use in updating and removal. Call needs to save
212 * this handle for later use.
208 */ 213 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 214struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value)
210{ 215{
211 struct requirement_list *dep; 216 struct pm_qos_request_list *dep;
212 unsigned long flags; 217 unsigned long flags;
213 218
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 219 dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL);
215 if (dep) { 220 if (dep) {
216 if (value == PM_QOS_DEFAULT_VALUE) 221 if (value == PM_QOS_DEFAULT_VALUE)
217 dep->value = pm_qos_array[pm_qos_class]->default_value; 222 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else 223 else
219 dep->value = value; 224 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL); 225 dep->pm_qos_class = pm_qos_class;
221 if (!dep->name)
222 goto cleanup;
223 226
224 spin_lock_irqsave(&pm_qos_lock, flags); 227 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list, 228 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list); 229 &pm_qos_array[pm_qos_class]->requests.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags); 230 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class); 231 update_target(pm_qos_class);
229
230 return 0;
231 } 232 }
232 233
233cleanup: 234 return dep;
234 kfree(dep);
235 return -ENOMEM;
236} 235}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 236EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 237
239/** 238/**
240 * pm_qos_update_requirement - modifies an existing qos request 239 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 240 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 241 * @value: defines the qos request
244 * 242 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 243 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 244 * with updating the target pm_qos_class value.
247 * 245 *
248 * If the named request isn't in the list then no change is made. 246 * Attempts are made to make this code callable on hot code paths.
249 */ 247 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 248void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
249 s32 new_value)
251{ 250{
252 unsigned long flags; 251 unsigned long flags;
253 struct requirement_list *node;
254 int pending_update = 0; 252 int pending_update = 0;
253 s32 temp;
255 254
256 spin_lock_irqsave(&pm_qos_lock, flags); 255 if (pm_qos_req) { /*guard against callers passing in null */
257 list_for_each_entry(node, 256 spin_lock_irqsave(&pm_qos_lock, flags);
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 257 if (new_value == PM_QOS_DEFAULT_VALUE)
259 if (strcmp(node->name, name) == 0) { 258 temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value;
260 if (new_value == PM_QOS_DEFAULT_VALUE) 259 else
261 node->value = 260 temp = new_value;
262 pm_qos_array[pm_qos_class]->default_value; 261
263 else 262 if (temp != pm_qos_req->value) {
264 node->value = new_value;
265 pending_update = 1; 263 pending_update = 1;
266 break; 264 pm_qos_req->value = temp;
267 } 265 }
266 spin_unlock_irqrestore(&pm_qos_lock, flags);
267 if (pending_update)
268 update_target(pm_qos_req->pm_qos_class);
268 } 269 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272
273 return 0;
274} 270}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 271EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 272
277/** 273/**
278 * pm_qos_remove_requirement - modifies an existing qos request 274 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 275 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 276 *
282 * Will remove named qos request from pm_qos_class list of parameters and 277 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 278 * recompute the current target value for the pm_qos_class. Call this
279 * on slow code paths.
284 */ 280 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 281void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 282{
287 unsigned long flags; 283 unsigned long flags;
288 struct requirement_list *node; 284 int qos_class;
289 int pending_update = 0;
290 285
286 if (pm_qos_req == NULL)
287 return;
288 /* silent return to keep pcm code cleaner */
289
290 qos_class = pm_qos_req->pm_qos_class;
291 spin_lock_irqsave(&pm_qos_lock, flags); 291 spin_lock_irqsave(&pm_qos_lock, flags);
292 list_for_each_entry(node, 292 list_del(&pm_qos_req->list);
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 293 kfree(pm_qos_req);
294 if (strcmp(node->name, name) == 0) {
295 kfree(node->name);
296 list_del(&node->list);
297 kfree(node);
298 pending_update = 1;
299 break;
300 }
301 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 294 spin_unlock_irqrestore(&pm_qos_lock, flags);
303 if (pending_update) 295 update_target(qos_class);
304 update_target(pm_qos_class);
305} 296}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 297EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 298
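Taken together, the pm_qos conversions above replace the string-keyed requirement API with an opaque per-request handle. A sketch of how a driver would use the new calls (the foo_* wrapper is hypothetical; PM_QOS_CPU_DMA_LATENCY and PM_QOS_DEFAULT_VALUE come from pm_qos_params.h):

#include <linux/pm_qos_params.h>

static struct pm_qos_request_list *foo_qos_req;

static int foo_start(void)
{
	/* register with no constraint yet */
	foo_qos_req = pm_qos_add_request(PM_QOS_CPU_DMA_LATENCY,
					 PM_QOS_DEFAULT_VALUE);
	if (!foo_qos_req)
		return -ENOMEM;
	return 0;
}

static void foo_busy(void)
{
	/* keep CPU wakeup latency at or below 50 usec while busy */
	pm_qos_update_request(foo_qos_req, 50);
}

static void foo_idle(void)
{
	pm_qos_update_request(foo_qos_req, PM_QOS_DEFAULT_VALUE);
}

static void foo_stop(void)
{
	pm_qos_remove_request(foo_qos_req);
	foo_qos_req = NULL;
}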
308/** 299/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 300 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +304,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 304 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 305 * upon changes to the pm_qos_class target value.
315 */ 306 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 307int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 308{
318 int retval; 309 int retval;
319 310
@@ -343,21 +334,16 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 334}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 335EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 336
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 337static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 338{
350 int ret;
351 long pm_qos_class; 339 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 340
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 341 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 342 if (pm_qos_class >= 0) {
356 filp->private_data = (void *)pm_qos_class; 343 filp->private_data = (void *) pm_qos_add_request(pm_qos_class,
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 344 PM_QOS_DEFAULT_VALUE);
358 ret = pm_qos_add_requirement(pm_qos_class, name, 345
359 PM_QOS_DEFAULT_VALUE); 346 if (filp->private_data)
360 if (ret >= 0)
361 return 0; 347 return 0;
362 } 348 }
363 return -EPERM; 349 return -EPERM;
@@ -365,32 +351,40 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 351
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 352static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 353{
368 int pm_qos_class; 354 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 355
371 pm_qos_class = (long)filp->private_data; 356 req = (struct pm_qos_request_list *)filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 357 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name);
374 358
375 return 0; 359 return 0;
376} 360}
377 361
362
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 363static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 364 size_t count, loff_t *f_pos)
380{ 365{
381 s32 value; 366 s32 value;
382 int pm_qos_class; 367 int x;
383 char name[PID_NAME_LEN]; 368 char ascii_value[11];
384 369 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 370
386 if (count != sizeof(s32)) 371 if (count == sizeof(s32)) {
372 if (copy_from_user(&value, buf, sizeof(s32)))
373 return -EFAULT;
374 } else if (count == 11) { /* len('0x12345678/0') */
375 if (copy_from_user(ascii_value, buf, 11))
376 return -EFAULT;
377 x = sscanf(ascii_value, "%x", &value);
378 if (x != 1)
379 return -EINVAL;
380 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
381 } else
387 return -EINVAL; 382 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 383
393 return sizeof(s32); 384 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
385 pm_qos_update_request(pm_qos_req, value);
386
387 return count;
394} 388}
395 389
396 390
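On the character-device side, pm_qos_power_write() now accepts either the original raw 32-bit value or an 11-byte "0x%08x"-style hex string, and the request handle stored in filp->private_data lives exactly as long as the open file. A userspace sketch of the binary form (the 20 usec target is illustrative):

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

int main(void)
{
	int32_t usec = 20;			/* target: <= 20 usec wakeup latency */
	int fd = open("/dev/cpu_dma_latency", O_WRONLY);

	if (fd < 0)
		return 1;
	if (write(fd, &usec, sizeof(usec)) != sizeof(usec))
		return 1;
	/* ...latency-sensitive work; the request is dropped when fd is closed... */
	close(fd);
	return 0;
}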
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index bc7704b3a443..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
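With the simplification above, every RLIMIT_CPU update funnels through set_process_cpu_timer() under siglock instead of being skipped when a higher expiration was already cached. Userspace reaches this path through setrlimit(); a small sketch, assuming the usual sys_setrlimit() -> update_rlimit_cpu() call chain for RLIMIT_CPU:

#include <sys/resource.h>

static int cap_cpu_seconds(rlim_t secs)
{
	struct rlimit rl;

	if (getrlimit(RLIMIT_CPU, &rl))
		return -1;
	rl.rlim_cur = secs;			/* lower the soft limit only */
	return setrlimit(RLIMIT_CPU, &rl);	/* ends up in update_rlimit_cpu() above */
}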
@@ -364,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
364 } 363 }
365 } else { 364 } else {
366 read_lock(&tasklist_lock); 365 read_lock(&tasklist_lock);
367 if (thread_group_leader(p) && p->signal) { 366 if (thread_group_leader(p) && p->sighand) {
368 error = 367 error =
369 cpu_clock_sample_group(which_clock, 368 cpu_clock_sample_group(which_clock,
370 p, &rtn); 369 p, &rtn);
@@ -440,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
440 439
441 if (likely(p != NULL)) { 440 if (likely(p != NULL)) {
442 read_lock(&tasklist_lock); 441 read_lock(&tasklist_lock);
443 if (unlikely(p->signal == NULL)) { 442 if (unlikely(p->sighand == NULL)) {
444 /* 443 /*
445 * We raced with the reaping of the task. 444 * We raced with the reaping of the task.
446 * The deletion should have cleared us off the list. 445 * The deletion should have cleared us off the list.
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
657 607
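What the rewritten arm_timer() maintains is an expiration cache: one "earliest expiry" slot per clock, shared with the itimer and RLIMIT_CPU code for process-wide timers, so the tick-time fast path can compare current usage against three cached values instead of walking the timer lists. A reduced sketch of that bookkeeping, independent of the kernel's cputime_t helpers:

struct expiry_cache {
	unsigned long prof;	/* 0 means "nothing armed" */
	unsigned long virt;
	unsigned long sched;
};

/* Called whenever a timer becomes the head of its list. */
static void cache_new_head(struct expiry_cache *c, int which,
			   unsigned long expires)
{
	unsigned long *slot;

	switch (which) {
	case 0:  slot = &c->prof;  break;
	case 1:  slot = &c->virt;  break;
	default: slot = &c->sched; break;
	}
	/* first timer for this clock, or earlier than the cached one */
	if (*slot == 0 || *slot > expires)
		*slot = expires;
}

check_process_timers() later rewrites the three slots from the timers that remain and, once they are all zero, calls stop_process_timers(), which is why fastpath_timer_check() further down can key off cputimer.running alone.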
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User don't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -736,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
736 read_lock(&tasklist_lock); 691 read_lock(&tasklist_lock);
737 /* 692 /*
738 * We need the tasklist_lock to protect against reaping that 693 * We need the tasklist_lock to protect against reaping that
739 * clears p->signal. If p has just been reaped, we can no 694 * clears p->sighand. If p has just been reaped, we can no
740 * longer get any information about it at all. 695 * longer get any information about it at all.
741 */ 696 */
742 if (unlikely(p->signal == NULL)) { 697 if (unlikely(p->sighand == NULL)) {
743 read_unlock(&tasklist_lock); 698 read_unlock(&tasklist_lock);
744 put_task_struct(p); 699 put_task_struct(p);
745 timer->it.cpu.task = NULL; 700 timer->it.cpu.task = NULL;
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
@@ -908,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
908 clear_dead = p->exit_state; 863 clear_dead = p->exit_state;
909 } else { 864 } else {
910 read_lock(&tasklist_lock); 865 read_lock(&tasklist_lock);
911 if (unlikely(p->signal == NULL)) { 866 if (unlikely(p->sighand == NULL)) {
912 /* 867 /*
913 * The process has been reaped. 868 * The process has been reaped.
914 * We can't even collect a sample any more. 869 * We can't even collect a sample any more.
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -1066,16 +1002,9 @@ static void stop_process_timers(struct signal_struct *sig)
1066 struct thread_group_cputimer *cputimer = &sig->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1003 unsigned long flags;
1068 1004
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1006 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
1075
1076 sig->cputime_expires.prof_exp = cputime_zero;
1077 sig->cputime_expires.virt_exp = cputime_zero;
1078 sig->cputime_expires.sched_exp = 0;
1079} 1008}
1080 1009
1081static u32 onecputick; 1010static u32 onecputick;
@@ -1112,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1112 } 1041 }
1113} 1042}
1114 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1115/* 1061/*
1116 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1117 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1129,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk,
1129 unsigned long soft; 1075 unsigned long soft;
1130 1076
1131 /* 1077 /*
1132 * Don't sample the current process CPU clocks if there are no timers.
1133 */
1134 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1135 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1136 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1137 list_empty(&timers[CPUCLOCK_VIRT]) &&
1138 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1139 list_empty(&timers[CPUCLOCK_SCHED])) {
1140 stop_process_timers(sig);
1141 return;
1142 }
1143
1144 /*
1145 * Collect the current process totals. 1078 * Collect the current process totals.
1146 */ 1079 */
1147 thread_group_cputimer(tsk, &cputime); 1080 thread_group_cputimer(tsk, &cputime);
@@ -1230,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk,
1230 } 1163 }
1231 } 1164 }
1232 1165
1233 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1234 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1235 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1236 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1237 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1238 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1239 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1240 sig->cputime_expires.virt_exp = virt_expires;
1241 if (sched_expires != 0 &&
1242 (sig->cputime_expires.sched_exp == 0 ||
1243 sig->cputime_expires.sched_exp > sched_expires))
1244 sig->cputime_expires.sched_exp = sched_expires;
1245} 1171}
1246 1172
1247/* 1173/*
@@ -1270,9 +1196,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1270 goto out; 1196 goto out;
1271 } 1197 }
1272 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1273 } else { 1200 } else {
1274 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1275 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->sighand == NULL)) {
1276 /* 1203 /*
1277 * The process has been reaped. 1204 * The process has been reaped.
1278 * We can't even collect a sample any more. 1205 * We can't even collect a sample any more.
@@ -1290,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1290 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1291 goto out_unlock; 1218 goto out_unlock;
1292 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1293 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1294 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1295 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1298,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1298 /* 1226 /*
1299 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1300 */ 1228 */
1301 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1302 1232
1303out_unlock: 1233out_unlock:
1304 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1310,23 +1240,6 @@ out:
1310} 1240}
1311 1241
1312/** 1242/**
1313 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1314 *
1315 * @cputime: The struct to compare.
1316 *
1317 * Checks @cputime to see if all fields are zero. Returns true if all fields
1318 * are zero, false if any field is nonzero.
1319 */
1320static inline int task_cputime_zero(const struct task_cputime *cputime)
1321{
1322 if (cputime_eq(cputime->utime, cputime_zero) &&
1323 cputime_eq(cputime->stime, cputime_zero) &&
1324 cputime->sum_exec_runtime == 0)
1325 return 1;
1326 return 0;
1327}
1328
1329/**
1330 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1331 * 1244 *
1332 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1382,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1382 } 1295 }
1383 1296
1384 sig = tsk->signal; 1297 sig = tsk->signal;
1385 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1386 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1387 1300
1388 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1390,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1390 return 1; 1303 return 1;
1391 } 1304 }
1392 1305
1393 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1394} 1307}
1395 1308
1396/* 1309/*
@@ -1419,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1419 * put them on the firing list. 1332 * put them on the firing list.
1420 */ 1333 */
1421 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1422 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1423 1341
1424 /* 1342 /*
1425 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1456,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1456} 1374}
1457 1375
1458/* 1376/*
1459 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1460 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1461 * The *newval argument is relative and we update it to be absolute, *oldval
1462 * is absolute and we update it to be relative.
1463 */ 1379 */
1464void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1465 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1466{ 1382{
1467 union cpu_time_count now; 1383 union cpu_time_count now;
1468 struct list_head *head;
1469 1384
1470 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1471 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1472 1387
1473 if (oldval) { 1388 if (oldval) {
1389 /*
1390 * We are setting itimer. The *oldval is absolute and we update
1391 * it to be relative, *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1474 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1475 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1476 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1483,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1483 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1484 return; 1404 return;
1485 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1486
1487 /*
1488 * If the RLIMIT_CPU timer will expire before the
1489 * ITIMER_PROF timer, we have nothing else to do.
1490 */
1491 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1492 < cputime_to_secs(*newval))
1493 return;
1494 } 1406 }
1495 1407
1496 /* 1408 /*
1497 * Check whether there are any process timers already set to fire 1409 * Update expiration cache if we are the earliest timer, or eventually
1498 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit is earlier than prof_exp cpu timer expire.
1499 */ 1411 */
1500 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1501 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1502 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1503 struct cpu_timer_list, entry)->expires.cpu,
1504 *newval)) {
1505 switch (clock_idx) {
1506 case CPUCLOCK_PROF:
1507 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1508 break; 1416 break;
1509 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1510 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1511 break; 1420 break;
1512 }
1513 } 1421 }
1514} 1422}
1515 1423
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id, 563 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) { 564 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 590 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 591 new_timer->sigq->info.si_code = SI_TIMER;
599 592
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error)
595 goto out;
596
600 spin_lock_irq(&current->sighand->siglock); 597 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 598 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 599 list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..524e058dcf06 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12 block_io.o
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 13obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
 20 * @sector: physical sector of the page.
21 * @page: page we're reading or writing.
 22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
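
The new helpers above are meant to be used in pairs: callers queue any number of pages
on a single bio chain and then collect all completions (and the first I/O error) with
one hib_wait_on_bio_chain() call, or pass a NULL chain to get synchronous behaviour. A
kernel-context sketch of the asynchronous pattern, not standalone code; page_addr[] and
swap_off[] are assumed to be supplied by the caller and hib_resume_bdev to be open
already:

        static int write_pages_async(void **page_addr, pgoff_t *swap_off,
                                     unsigned int nr_pages)
        {
                struct bio *bio_chain = NULL;
                unsigned int i;
                int error = 0;

                /* Queue everything without waiting in between. */
                for (i = 0; i < nr_pages && !error; i++)
                        error = hib_bio_write_page(swap_off[i], page_addr[i],
                                                   &bio_chain);

                /* One wait collects every completion on the chain. */
                if (hib_wait_on_bio_chain(&bio_chain) && !error)
                        error = -EIO;

                return error;
        }
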
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
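
With offset, cur_offset, prev and buf_offset gone from snapshot_handle, each successful
snapshot_read_next()/snapshot_write_next() call now deals in exactly one page, and
data_of() simply points at the handle's buffer. A kernel-context sketch of a consumer
written against that simplified contract; store_page() is a hypothetical sink provided
by the caller:

        static int drain_snapshot(struct snapshot_handle *handle,
                                  int (*store_page)(void *buf))
        {
                int ret;

                memset(handle, 0, sizeof(*handle));
                for (;;) {
                        ret = snapshot_read_next(handle);
                        if (ret <= 0)
                                return ret;     /* 0: end of image, <0: error */

                        /* Exactly one page is valid at data_of(*handle). */
                        ret = store_page(data_of(*handle));
                        if (ret)
                                return ret;
                }
        }
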
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index be861c26dda7..25ce010e9f8b 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1604,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1604 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1605 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1606 * 1606 *
1607 * The @count parameter should contain the number of bytes the caller
1608 * wants to read from the snapshot. It must not be zero.
1609 *
1610 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1611 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1612 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1613 * may be smaller than @count, but this only happens if the read would
1614 * cross a page boundary otherwise.
1615 * 1610 *
1616 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1617 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1619,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1619 * any more. 1614 * any more.
1620 */ 1615 */
1621 1616
1622int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1623{ 1618{
1624 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1625 return 0; 1620 return 0;
@@ -1630,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1630 if (!buffer) 1625 if (!buffer)
1631 return -ENOMEM; 1626 return -ENOMEM;
1632 } 1627 }
1633 if (!handle->offset) { 1628 if (!handle->cur) {
1634 int error; 1629 int error;
1635 1630
1636 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1639,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1639 handle->buffer = buffer; 1634 handle->buffer = buffer;
1640 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1641 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1642 } 1637 } else if (handle->cur <= nr_meta_pages) {
1643 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1644 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1645 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1646 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1647 } else {
1648 struct page *page;
1649 1642
1650 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1651 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1652 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1653 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1654 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1655 */ 1648 */
1656 void *kaddr; 1649 void *kaddr;
1657 1650
1658 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1659 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1660 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1661 handle->buffer = buffer; 1654 handle->buffer = buffer;
1662 } else { 1655 } else {
1663 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1664 }
1665 } 1657 }
1666 handle->prev = handle->cur;
1667 }
1668 handle->buf_offset = handle->cur_offset;
1669 if (handle->cur_offset + count >= PAGE_SIZE) {
1670 count = PAGE_SIZE - handle->cur_offset;
1671 handle->cur_offset = 0;
1672 handle->cur++;
1673 } else {
1674 handle->cur_offset += count;
1675 } 1658 }
1676 handle->offset += count; 1659 handle->cur++;
1677 return count; 1660 return PAGE_SIZE;
1678} 1661}
1679 1662
1680/** 1663/**
@@ -2133,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2133 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2134 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2135 * 2118 *
2136 * The @count parameter should contain the number of bytes the caller
2137 * wants to write to the image. It must not be zero.
2138 *
2139 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2140 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2141 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2142 * may be smaller than @count, but this only happens if the write would
2143 * cross a page boundary otherwise.
2144 * 2122 *
2145 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2146 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2148,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2148 * any more. 2126 * any more.
2149 */ 2127 */
2150 2128
2151int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2152{ 2130{
2153 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2154 int error = 0; 2132 int error = 0;
2155 2133
2156 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2157 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2158 return 0; 2136 return 0;
2159 2137
2160 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2161 if (!buffer) 2141 if (!buffer)
2162 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2163 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2166,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2166 return -ENOMEM; 2146 return -ENOMEM;
2167 2147
2168 handle->buffer = buffer; 2148 handle->buffer = buffer;
2169 } 2149 } else if (handle->cur == 1) {
2170 handle->sync_read = 1; 2150 error = load_header(buffer);
2171 if (handle->prev < handle->cur) { 2151 if (error)
2172 if (handle->prev == 0) { 2152 return error;
2173 error = load_header(buffer);
2174 if (error)
2175 return error;
2176 2153
2177 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2178 if (error) 2155 if (error)
2179 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2180 2162
2181 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2182 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2183 if (error) 2165 if (error)
2184 return error; 2166 return error;
2185 2167
2186 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2187 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2188 if (error) 2170 restore_pblist = NULL;
2189 return error;
2190
2191 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2192 memory_bm_position_reset(&orig_bm);
2193 restore_pblist = NULL;
2194 handle->buffer = get_buffer(&orig_bm, &ca);
2195 handle->sync_read = 0;
2196 if (IS_ERR(handle->buffer))
2197 return PTR_ERR(handle->buffer);
2198 }
2199 } else {
2200 copy_last_highmem_page();
2201 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2202 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2203 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2204 if (handle->buffer != buffer)
2205 handle->sync_read = 0;
2206 } 2175 }
2207 handle->prev = handle->cur;
2208 }
2209 handle->buf_offset = handle->cur_offset;
2210 if (handle->cur_offset + count >= PAGE_SIZE) {
2211 count = PAGE_SIZE - handle->cur_offset;
2212 handle->cur_offset = 0;
2213 handle->cur++;
2214 } else { 2176 } else {
2215 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2216 } 2183 }
2217 handle->offset += count; 2184 handle->cur++;
2218 return count; 2185 return PAGE_SIZE;
2219} 2186}
2220 2187
2221/** 2188/**
@@ -2230,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2230{ 2197{
2231 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2232 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2233 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2234 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2235 free_highmem_data(); 2202 free_highmem_data();
2236 } 2203 }
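
After the rewrite above, handle->cur is the only cursor: page 0 is the image header, the
next nr_meta_pages pages carry the packed pfn bitmaps, and everything beyond that is
copied page data, with the end-of-stream test simply comparing cur against
nr_meta_pages + nr_copy_pages. A self-contained toy model of that phase selection
(plain integers, not the kernel state):

        #include <stdio.h>

        static const char *phase(unsigned int cur, unsigned int nr_meta,
                                 unsigned int nr_copy)
        {
                if (cur > nr_meta + nr_copy)
                        return "end of stream";
                if (cur == 0)
                        return "header";
                if (cur <= nr_meta)
                        return "pfn metadata";
                return "data page";
        }

        int main(void)
        {
                unsigned int nr_meta = 2, nr_copy = 3, cur;

                for (cur = 0; cur <= nr_meta + nr_copy + 1; cur++)
                        printf("cur=%u -> %s\n",
                               cur, phase(cur, nr_meta, nr_copy));
                return 0;
        }
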
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 66824d71983a..b0bb21778391 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -29,6 +29,40 @@
29 29
30#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
31 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
35 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
 56 * a file-like way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
32struct swsusp_header { 66struct swsusp_header {
33 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
34 sector_t image; 68 sector_t image;
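
The swap map described in the comment above is effectively an on-disk singly linked
list: each map page records the sectors of MAP_PAGE_ENTRIES data pages plus the sector
of the next map page, and a zero next_swap ends the chain (as swap_read_page() below
relies on). A self-contained toy model of walking that layout, with a tiny entry count
and ordinary integers standing in for sector_t:

        #include <stdio.h>

        #define MAP_PAGE_ENTRIES 4  /* stand-in for PAGE_SIZE/sizeof(sector_t) - 1 */

        struct swap_map_page {
                unsigned long long entries[MAP_PAGE_ENTRIES];
                unsigned long long next_swap;
        };

        int main(void)
        {
                struct swap_map_page maps[2] = {
                        { { 10, 11, 12, 13 }, 2 },  /* next map page at sector 2 */
                        { { 20, 21,  0,  0 }, 0 },  /* zero terminates the chain */
                };
                unsigned int m, k;

                for (m = 0; m < 2; m++) {
                        for (k = 0; k < MAP_PAGE_ENTRIES && maps[m].entries[k]; k++)
                                printf("data page %u.%u is at sector %llu\n",
                                       m, k, maps[m].entries[k]);
                        printf("next map page at sector %llu\n",
                               maps[m].next_swap);
                }
                return 0;
        }
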
@@ -145,110 +179,24 @@ int swsusp_swap_in_use(void)
145 */ 179 */
146 180
147static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
148static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
149
150/**
151 * submit - submit BIO request.
152 * @rw: READ or WRITE.
153 * @off physical offset of page.
154 * @page: page we're reading or writing.
155 * @bio_chain: list of pending biod (for async reading)
156 *
157 * Straight from the textbook - allocate and initialize the bio.
158 * If we're reading, make sure the page is marked as dirty.
159 * Then submit it and, if @bio_chain == NULL, wait.
160 */
161static int submit(int rw, pgoff_t page_off, struct page *page,
162 struct bio **bio_chain)
163{
164 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
165 struct bio *bio;
166
167 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
168 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
169 bio->bi_bdev = resume_bdev;
170 bio->bi_end_io = end_swap_bio_read;
171
172 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
173 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
174 page_off);
175 bio_put(bio);
176 return -EFAULT;
177 }
178
179 lock_page(page);
180 bio_get(bio);
181
182 if (bio_chain == NULL) {
183 submit_bio(bio_rw, bio);
184 wait_on_page_locked(page);
185 if (rw == READ)
186 bio_set_pages_dirty(bio);
187 bio_put(bio);
188 } else {
189 if (rw == READ)
190 get_page(page); /* These pages are freed later */
191 bio->bi_private = *bio_chain;
192 *bio_chain = bio;
193 submit_bio(bio_rw, bio);
194 }
195 return 0;
196}
197
198static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
199{
200 return submit(READ, page_off, virt_to_page(addr), bio_chain);
201}
202
203static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
204{
205 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
206}
207
208static int wait_on_bio_chain(struct bio **bio_chain)
209{
210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235 183
236/* 184/*
237 * Saving part 185 * Saving part
238 */ 186 */
239 187
240static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
241{ 189{
242 int error; 190 int error;
243 191
244 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
245 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
246 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
247 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
248 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
249 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
250 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
251 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
252 swsusp_header, NULL); 200 swsusp_header, NULL);
253 } else { 201 } else {
254 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -260,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
260/** 208/**
261 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
262 * and get its index (if so) 210 * and get its index (if so)
211 *
212 * This is called before saving image
263 */ 213 */
264 214static int swsusp_swap_check(void)
265static int swsusp_swap_check(void) /* This is called before saving image */
266{ 215{
267 int res; 216 int res;
268 217
269 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
270 &resume_bdev); 219 &hib_resume_bdev);
271 if (res < 0) 220 if (res < 0)
272 return res; 221 return res;
273 222
274 root_swap = res; 223 root_swap = res;
275 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
276 if (res) 225 if (res)
277 return res; 226 return res;
278 227
279 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
280 if (res < 0) 229 if (res < 0)
281 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
282 231
283 return res; 232 return res;
284} 233}
@@ -309,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
309 } else { 258 } else {
310 src = buf; 259 src = buf;
311 } 260 }
312 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
313} 262}
314 263
315/*
316 * The swap map is a data structure used for keeping track of each page
317 * written to a swap partition. It consists of many swap_map_page
318 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
319 * These structures are stored on the swap and linked together with the
320 * help of the .next_swap member.
321 *
322 * The swap map is created during suspend. The swap map pages are
323 * allocated and populated one at a time, so we only need one memory
324 * page to set up the entire structure.
325 *
326 * During resume we also only need to use one swap_map_page structure
327 * at a time.
328 */
329
330#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
331
332struct swap_map_page {
333 sector_t entries[MAP_PAGE_ENTRIES];
334 sector_t next_swap;
335};
336
337/**
338 * The swap_map_handle structure is used for handling swap in
339 * a file-alike way
340 */
341
342struct swap_map_handle {
343 struct swap_map_page *cur;
344 sector_t cur_swap;
345 unsigned int k;
346};
347
348static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
349{ 265{
350 if (handle->cur) 266 if (handle->cur)
@@ -354,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
354 270
355static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
356{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
357 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
358 if (!handle->cur) 283 if (!handle->cur) {
359 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
360 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
361 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
362 release_swap_writer(handle); 289 ret = -ENOSPC;
363 return -ENOSPC; 290 goto err_rel;
364 } 291 }
365 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
366 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
367} 300}
368 301
369static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
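
get_swap_writer() above now owns the whole acquisition sequence (swap check, map page,
first swap block) and unwinds it with the usual goto-label pattern: each failure jumps
to the label that releases exactly what has been acquired so far, in reverse order. A
minimal self-contained sketch of that pattern, with ordinary allocations standing in
for the swap resources:

        #include <errno.h>
        #include <stdlib.h>

        static int acquire_pair(void **a, void **b)
        {
                int ret;

                *a = malloc(32);
                if (!*a) {
                        ret = -ENOMEM;
                        goto err_out;
                }

                *b = malloc(32);
                if (!*b) {
                        ret = -ENOMEM;
                        goto err_free_a;        /* undo only the first step */
                }

                return 0;

        err_free_a:
                free(*a);
                *a = NULL;
        err_out:
                return ret;
        }

        int main(void)
        {
                void *a, *b;

                if (acquire_pair(&a, &b))
                        return 1;
                free(b);
                free(a);
                return 0;
        }
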
@@ -380,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
380 return error; 313 return error;
381 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
382 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
383 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
384 if (error) 317 if (error)
385 goto out; 318 goto out;
386 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -406,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
406 return -EINVAL; 339 return -EINVAL;
407} 340}
408 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
409/** 360/**
410 * save_image - save the suspend image data 361 * save_image - save the suspend image data
411 */ 362 */
@@ -431,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
431 bio = NULL; 382 bio = NULL;
432 do_gettimeofday(&start); 383 do_gettimeofday(&start);
433 while (1) { 384 while (1) {
434 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
435 if (ret <= 0) 386 if (ret <= 0)
436 break; 387 break;
437 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -441,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
441 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
442 nr_pages++; 393 nr_pages++;
443 } 394 }
444 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
445 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
446 if (!ret) 397 if (!ret)
447 ret = err2; 398 ret = err2;
@@ -483,50 +434,34 @@ int swsusp_write(unsigned int flags)
483 struct swap_map_handle handle; 434 struct swap_map_handle handle;
484 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
485 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
486 int error; 438 int error;
487 439
488 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
489 if (error) { 442 if (error) {
490 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
491 "swapon -a.\n");
492 return error; 444 return error;
493 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
494 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
495 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
496 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
497 if (error >= 0) 454 if (error >= 0)
498 error = -EFAULT; 455 error = -EFAULT;
499 456
500 goto out; 457 goto out_finish;
501 } 458 }
502 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
503 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
504 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
505 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
506 goto out; 463out_finish:
507 } 464 error = swap_writer_finish(&handle, flags, error);
508 error = get_swap_writer(&handle);
509 if (!error) {
510 sector_t start = handle.cur_swap;
511
512 error = swap_write_page(&handle, header, NULL);
513 if (!error)
514 error = save_image(&handle, &snapshot,
515 header->pages - 1);
516
517 if (!error) {
518 flush_swap_writer(&handle);
519 printk(KERN_INFO "PM: S");
520 error = mark_swapfiles(start, flags);
521 printk("|\n");
522 }
523 }
524 if (error)
525 free_all_swap_pages(root_swap);
526
527 release_swap_writer(&handle);
528 out:
529 swsusp_close(FMODE_WRITE);
530 return error; 465 return error;
531} 466}
532 467
@@ -542,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
542 handle->cur = NULL; 477 handle->cur = NULL;
543} 478}
544 479
545static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
546{ 482{
547 int error; 483 int error;
548 484
549 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
550 return -EINVAL; 488 return -EINVAL;
551 489
552 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
553 if (!handle->cur) 491 if (!handle->cur)
554 return -ENOMEM; 492 return -ENOMEM;
555 493
556 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
557 if (error) { 495 if (error) {
558 release_swap_reader(handle); 496 release_swap_reader(handle);
559 return error; 497 return error;
@@ -573,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
573 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
574 if (!offset) 512 if (!offset)
575 return -EFAULT; 513 return -EFAULT;
576 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
577 if (error) 515 if (error)
578 return error; 516 return error;
579 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
580 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
581 handle->k = 0; 519 handle->k = 0;
582 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
583 if (!offset) 521 if (!offset)
584 release_swap_reader(handle); 522 release_swap_reader(handle);
585 else if (!error) 523 else if (!error)
586 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
587 } 525 }
588 return error; 526 return error;
589} 527}
590 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
591/** 536/**
592 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
593 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -615,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
615 bio = NULL; 560 bio = NULL;
616 do_gettimeofday(&start); 561 do_gettimeofday(&start);
617 for ( ; ; ) { 562 for ( ; ; ) {
618 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
619 if (error <= 0) 564 if (error <= 0)
620 break; 565 break;
621 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
622 if (error) 567 if (error)
623 break; 568 break;
624 if (snapshot->sync_read) 569 if (snapshot->sync_read)
625 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
626 if (error) 571 if (error)
627 break; 572 break;
628 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
629 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
630 nr_pages++; 575 nr_pages++;
631 } 576 }
632 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
633 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
634 if (!error) 579 if (!error)
635 error = err2; 580 error = err2;
@@ -657,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
657 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
658 struct swsusp_info *header; 603 struct swsusp_info *header;
659 604
660 *flags_p = swsusp_header->flags;
661
662 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
663 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
664 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
665 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
666 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
667 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
668 if (!error) 613 if (!error)
669 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
670 if (!error) 615 if (!error)
671 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
672 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
673 618end:
674 if (!error) 619 if (!error)
675 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
676 else 621 else
@@ -686,11 +631,11 @@ int swsusp_check(void)
686{ 631{
687 int error; 632 int error;
688 633
689 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
690 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
691 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
692 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
693 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
694 swsusp_header, NULL); 639 swsusp_header, NULL);
695 if (error) 640 if (error)
696 goto put; 641 goto put;
@@ -698,7 +643,7 @@ int swsusp_check(void)
698 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
699 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
700 /* Reset swap signature now */ 645 /* Reset swap signature now */
701 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
702 swsusp_header, NULL); 647 swsusp_header, NULL);
703 } else { 648 } else {
704 error = -EINVAL; 649 error = -EINVAL;
@@ -706,11 +651,11 @@ int swsusp_check(void)
706 651
707put: 652put:
708 if (error) 653 if (error)
709 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
710 else 655 else
711 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
712 } else { 657 } else {
713 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
714 } 659 }
715 660
716 if (error) 661 if (error)
@@ -725,12 +670,12 @@ put:
725 670
726void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
727{ 672{
728 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
729 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
730 return; 675 return;
731 } 676 }
732 677
733 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
734} 679}
735 680
736static int swsusp_header_init(void) 681static int swsusp_header_init(void)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a8c96212bc1b..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
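
Both file operations above now keep the backing snapshot handle page-aligned:
pg_offp = *offp & ~PAGE_MASK is the position within the current page, a new page is
pulled from the handle only when that offset is zero, and partial requests are served
out of the remainder of the page via simple_read_from_buffer()/simple_write_to_buffer().
A self-contained sketch of just the offset bookkeeping:

        #include <stdio.h>

        #define PAGE_SIZE 4096UL
        #define PAGE_MASK (~(PAGE_SIZE - 1))

        int main(void)
        {
                unsigned long long offsets[] = { 0, 100, 4096, 5000 };
                unsigned int i;

                for (i = 0; i < 4; i++) {
                        unsigned long long offp = offsets[i];
                        unsigned long pg_offp = offp & ~PAGE_MASK;

                        if (!pg_offp)
                                printf("offset %llu: page boundary, fetch next page\n",
                                       offp);
                        else
                                printf("offset %llu: serve %lu bytes from current page\n",
                                       offp, PAGE_SIZE - pg_offp);
                }
                return 0;
        }
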
diff --git a/kernel/printk.c b/kernel/printk.c
index 75077ad0b537..444b770c9595 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
38#include <linux/syslog.h> 39#include <linux/syslog.h>
@@ -413,6 +414,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 414 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
414} 415}
415 416
417#ifdef CONFIG_KGDB_KDB
418/* kdb dmesg command needs access to the syslog buffer. do_syslog()
419 * uses locks so it cannot be used during debugging. Just tell kdb
420 * where the start and end of the physical and logical logs are. This
421 * is equivalent to do_syslog(3).
422 */
423void kdb_syslog_data(char *syslog_data[4])
424{
425 syslog_data[0] = log_buf;
426 syslog_data[1] = log_buf + log_buf_len;
427 syslog_data[2] = log_buf + log_end -
428 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
429 syslog_data[3] = log_buf + log_end;
430}
431#endif /* CONFIG_KGDB_KDB */
432
416/* 433/*
417 * Call the console drivers on a range of log_buf 434 * Call the console drivers on a range of log_buf
418 */ 435 */
@@ -586,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...)
586 va_list args; 603 va_list args;
587 int r; 604 int r;
588 605
606#ifdef CONFIG_KGDB_KDB
607 if (unlikely(kdb_trap_printk)) {
608 va_start(args, fmt);
609 r = vkdb_printf(fmt, args);
610 va_end(args);
611 return r;
612 }
613#endif
589 va_start(args, fmt); 614 va_start(args, fmt);
590 r = vprintk(fmt, args); 615 r = vprintk(fmt, args);
591 va_end(args); 616 va_end(args);
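
The kdb hooks above do two things: kdb_syslog_data() exports the physical bounds of
log_buf together with the logical window of the most recent logged_chars characters
(capped at log_buf_len), and printk() is diverted into vkdb_printf() whenever
kdb_trap_printk is set, so output produced while the debugger is active goes through
kdb's own I/O path. A self-contained toy model of the logical-window arithmetic on a
ring indexed by a monotonically increasing end counter:

        #include <stdio.h>

        int main(void)
        {
                unsigned long log_buf_len = 8;   /* ring size */
                unsigned long log_end = 11;      /* 11 chars produced so far */
                unsigned long logged_chars = 11;
                unsigned long kept = logged_chars < log_buf_len ?
                                        logged_chars : log_buf_len;

                /* Mirrors syslog_data[2]/[3]: only the last 'kept' chars
                 * are still available, ending at log_end. */
                printf("logical log: indices [%lu, %lu), i.e. %lu chars\n",
                       log_end - kept, log_end, kept);
                return 0;
        }
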
diff --git a/kernel/profile.c b/kernel/profile.c
index dfadc5b729f1..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
365 switch (action) { 365 switch (action) {
366 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
367 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
368 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
369 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
370 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
371 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
372 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
373 0); 373 0);
374 if (!page) 374 if (!page)
375 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
377 } 377 }
378 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -388,7 +388,7 @@ out_free:
388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
389 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
390 __free_page(page); 390 __free_page(page);
391 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
392 case CPU_ONLINE: 392 case CPU_ONLINE:
393 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
394 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -567,7 +567,7 @@ static int create_hash_tables(void)
567 int cpu; 567 int cpu;
568 568
569 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
570 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
571 struct page *page; 571 struct page *page;
572 572
573 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
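
Returning notifier_from_errno(-ENOMEM) instead of the bare NOTIFY_BAD lets the notifier
core hand the real errno back to the hotplug caller. A kernel-context sketch of the
same pattern in a hotplug callback, not standalone code; my_percpu_buf is a
hypothetical per-cpu pointer assumed to be declared elsewhere with DEFINE_PER_CPU:

        static int __cpuinit my_cpu_callback(struct notifier_block *nb,
                                             unsigned long action, void *hcpu)
        {
                unsigned int cpu = (unsigned long)hcpu;

                switch (action) {
                case CPU_UP_PREPARE:
                case CPU_UP_PREPARE_FROZEN:
                        per_cpu(my_percpu_buf, cpu) =
                                kzalloc(PAGE_SIZE, GFP_KERNEL);
                        if (!per_cpu(my_percpu_buf, cpu))
                                return notifier_from_errno(-ENOMEM);
                        break;
                case CPU_UP_CANCELED:
                case CPU_UP_CANCELED_FROZEN:
                        kfree(per_cpu(my_percpu_buf, cpu));
                        per_cpu(my_percpu_buf, cpu) = NULL;
                        break;
                }
                return NOTIFY_OK;
        }
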
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -596,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
596 ret = ptrace_detach(child, data); 594 ret = ptrace_detach(child, data);
597 break; 595 break;
598 596
597#ifdef CONFIG_BINFMT_ELF_FDPIC
598 case PTRACE_GETFDPIC: {
599 struct mm_struct *mm = get_task_mm(child);
600 unsigned long tmp = 0;
601
602 ret = -ESRCH;
603 if (!mm)
604 break;
605
606 switch (addr) {
607 case PTRACE_GETFDPIC_EXEC:
608 tmp = mm->context.exec_fdpic_loadmap;
609 break;
610 case PTRACE_GETFDPIC_INTERP:
611 tmp = mm->context.interp_fdpic_loadmap;
612 break;
613 default:
614 break;
615 }
616 mmput(mm);
617
618 ret = put_user(tmp, (unsigned long __user *) data);
619 break;
620 }
621#endif
622
599#ifdef PTRACE_SINGLESTEP 623#ifdef PTRACE_SINGLESTEP
600 case PTRACE_SINGLESTEP: 624 case PTRACE_SINGLESTEP:
601#endif 625#endif
@@ -666,10 +690,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 690 struct task_struct *child;
667 long ret; 691 long ret;
668 692
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 693 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 694 ret = ptrace_traceme();
675 if (!ret) 695 if (!ret)
@@ -703,7 +723,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 723 out_put_task_struct:
704 put_task_struct(child); 724 put_task_struct(child);
705 out: 725 out:
706 unlock_kernel();
707 return ret; 726 return ret;
708} 727}
709 728
@@ -813,10 +832,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 832 struct task_struct *child;
814 long ret; 833 long ret;
815 834
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 835 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 836 ret = ptrace_traceme();
822 goto out; 837 goto out;
@@ -846,7 +861,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 861 out_put_task_struct:
847 put_task_struct(child); 862 put_task_struct(child);
848 out: 863 out:
849 unlock_kernel();
850 return ret; 864 return ret;
851} 865}
852#endif /* CONFIG_COMPAT */ 866#endif /* CONFIG_COMPAT */
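
The new PTRACE_GETFDPIC request above writes the address of the tracee's FDPIC loadmap
(for the executable or its interpreter, selected by the addr argument) into the word
pointed to by data. A user-space sketch of the call for FDPIC targets; the fallback
request numbers below are assumptions for the case where the libc headers do not
provide them, and pid must already be traced and stopped:

        #include <stdio.h>
        #include <sys/ptrace.h>
        #include <sys/types.h>

        #ifndef PTRACE_GETFDPIC
        #define PTRACE_GETFDPIC         31
        #define PTRACE_GETFDPIC_EXEC    0
        #define PTRACE_GETFDPIC_INTERP  1
        #endif

        static int print_exec_loadmap(pid_t pid)
        {
                unsigned long loadmap = 0;

                if (ptrace(PTRACE_GETFDPIC, pid,
                           (void *)PTRACE_GETFDPIC_EXEC, &loadmap) < 0) {
                        perror("PTRACE_GETFDPIC");
                        return -1;
                }
                printf("exec loadmap at %#lx\n", loadmap);
                return 0;
        }
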
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 49d808e833b0..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,6 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h>
48#include <linux/hardirq.h> 47#include <linux/hardirq.h>
49 48
50#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -64,9 +63,6 @@ struct lockdep_map rcu_sched_lock_map =
64EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
65#endif 64#endif
66 65
67int rcu_scheduler_active __read_mostly;
68EXPORT_SYMBOL_GPL(rcu_scheduler_active);
69
70#ifdef CONFIG_DEBUG_LOCK_ALLOC 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
71 67
72int debug_lockdep_rcu_enabled(void) 68int debug_lockdep_rcu_enabled(void)
@@ -97,21 +93,6 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
97#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
98 94
99/* 95/*
100 * This function is invoked towards the end of the scheduler's initialization
101 * process. Before this is called, the idle task might contain
102 * RCU read-side critical sections (during which time, this idle
103 * task is booting the system). After this function is called, the
104 * idle tasks are prohibited from containing RCU read-side critical
105 * sections.
106 */
107void rcu_scheduler_starting(void)
108{
109 WARN_ON(num_online_cpus() != 1);
110 WARN_ON(nr_context_switches() > 0);
111 rcu_scheduler_active = 1;
112}
113
114/*
115 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
116 * grace period has elapsed. 97 * grace period has elapsed.
117 */ 98 */
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
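
The init_rcu_head_on_stack()/destroy_rcu_head_on_stack() pairs added to the barrier
functions above bracket an rcu_head that lives on the stack, apparently so the debug
tracking of rcu_head objects can cope with heads that are neither static nor heap
allocated. The pattern, isolated into a kernel-context sketch rather than standalone
code:

        static void wait_one_grace_period(void)
        {
                struct rcu_synchronize rcu;

                init_rcu_head_on_stack(&rcu.head);     /* announce the on-stack head */
                init_completion(&rcu.completion);
                call_rcu(&rcu.head, wakeme_after_rcu); /* wakes us after a grace period */
                wait_for_completion(&rcu.completion);  /* sleep until then */
                destroy_rcu_head_on_stack(&rcu.head);  /* retract it before returning */
        }
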
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 672 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 673 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 674 .stats = NULL,
673 .irq_capable = 1, 675 .irq_capable = 1,
674 .name = "sched_expedited" 676 .name = "sched_expedited"
675}; 677};
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
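rcu_sched_qs() no longer reaches into preemptible RCU. Paths that merely observe an RCU-sched quiescent state (for example the scheduling-clock path above) keep calling it, while anything that amounts to a real context switch goes through the new rcu_note_context_switch() wrapper, which the schedule() hunk at the end of this diff switches to. Illustrative only, with hypothetical example_* wrappers:

	/* Sketch of the intended split between the two entry points. */
	static void example_tick_path(int cpu)
	{
		/* Tick context: only RCU-sched and RCU-bh care. */
		rcu_sched_qs(cpu);
	}

	static void example_context_switch_path(int cpu)
	{
		/* A genuine context switch: preemptible RCU must see it too. */
		rcu_note_context_switch(cpu);
	}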
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
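The new panic notifier sets rcu_cpu_stall_panicking so that check_cpu_stall() goes quiet once a panic is in flight, keeping late stall warnings from interleaving with the real panic output; check_cpu_stall_init() registers it from rcu_init() at the end of this file. The generic shape of the pattern, with hypothetical my_* names:

	#include <linux/kernel.h>
	#include <linux/notifier.h>

	static int my_quiet;	/* consulted by this subsystem's diagnostics */

	static int my_panic_cb(struct notifier_block *nb, unsigned long ev, void *ptr)
	{
		my_quiet = 1;		/* stop emitting diagnostics from now on */
		return NOTIFY_DONE;
	}

	static struct notifier_block my_panic_nb = {
		.notifier_call = my_panic_cb,
	};

	static void __init my_panic_hook_init(void)
	{
		atomic_notifier_chain_register(&panic_notifier_list, &my_panic_nb);
	}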
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
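__rcu_pending() now separates two cases that used to be lumped together. If RCU still wants a quiescent state this CPU has not passed, the new branch nudges the non-preemptible flavors with set_need_resched() once the force_quiescent_state() deadline is upon us; if the CPU has passed a quiescent state that simply has not been reported, the new n_rp_report_qs counter is bumped and the function returns 1 so the caller raises RCU_SOFTIRQ. The ULONG_CMP_LT() test is a wrap-safe "earlier than" over jiffies; as best I recall the rcutree.h macro of this era, a minimal userspace demo looks like this:

	#include <limits.h>
	#include <stdio.h>

	/* Wrap-safe "a happened before b" for unsigned long time stamps,
	 * matching (to the best of my memory) the kernel's ULONG_CMP_LT(). */
	#define ULONG_CMP_LT(a, b)	(ULONG_MAX / 2 < (a) - (b))

	int main(void)
	{
		unsigned long before_wrap = ULONG_MAX - 5;	/* just before jiffies wraps */
		unsigned long after_wrap  = 10;			/* just after the wrap */

		printf("plain <   : %d\n", before_wrap < after_wrap);			/* 0: wrong */
		printf("wrap-safe : %d\n", ULONG_CMP_LT(before_wrap, after_wrap));	/* 1: right */
		return 0;
	}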
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
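The initialization rework moves the CPU-to-leaf binding out of the RCU_INIT_FLAVOR() macro and into rcu_init_one(): the macro now only fills rsp->rda[] with the per-CPU rcu_data pointers, and rcu_init_one() walks the lowest rcu_node level, advancing to the next leaf whenever the CPU index passes ->grphi, then calls rcu_boot_init_percpu_data(). The boot-time Kconfig chatter deleted from rcu_init() reappears in rcu_bootup_announce_oddness() in rcutree_plugin.h below. A toy userspace model of the leaf walk (the ranges are made up):

	#include <stdio.h>

	struct leaf { int grplo, grphi; };	/* each leaf covers a contiguous CPU range */

	int main(void)
	{
		struct leaf leaves[] = { { 0, 15 }, { 16, 31 }, { 32, 47 } };
		struct leaf *rnp = leaves;
		int cpu;

		for (cpu = 0; cpu < 48; cpu++) {
			while (cpu > rnp->grphi)	/* advance to the covering leaf */
				rnp++;
			printf("cpu %2d -> leaf [%2d,%2d]\n", cpu, rnp->grplo, rnp->grphi);
		}
		return 0;
	}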
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
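rcu_preempt_qs() now clears RCU_READ_UNLOCK_NEED_QS in current->rcu_read_unlock_special itself, so its three callers drop their own clearing and the function must be entered with irqs disabled; the context-switch path above therefore moves the call inside the local_irq_save()/local_irq_restore() pair. Restated as a sketch of the calling convention, not a literal call site:

	static void example_report_preempt_qs(void)
	{
		unsigned long flags;

		/* rcu_preempt_qs() writes current->rcu_read_unlock_special,
		 * so irqs must stay off across the call. */
		local_irq_save(flags);
		rcu_preempt_qs(smp_processor_id());
		local_irq_restore(flags);
	}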
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
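rcu_needs_cpu() stops consulting nohz_cpu_mask and instead samples every other online CPU's dynticks counters directly. The counters follow an odd/even protocol (incremented on each transition into and out of dyntick-idle), so an odd snapshot of either ->dynticks or ->dynticks_nmi means that CPU is currently non-idle, this CPU is therefore not the last one awake, and the quick-check holdoff path can be taken. The test, isolated as a sketch:

	/* Odd snapshot == the sampled CPU is not in dyntick-idle. */
	static int example_cpu_is_nonidle(int snap, int snap_nmi)
	{
		return ((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0);
	}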
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1245 1245
1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1247 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1248 1250
1249 /* 1251 /*
1250 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1255 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1256 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1257 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1258 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1259 1261
1260 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1261 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
1289 } 1291 }
1290 } 1292 }
1291 1293
1294 ret = 0;
1292 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1293 return 0; 1296 goto out;
1294 1297
1295 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1296 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1297 return ret; 1300 goto out;
1298 1301
1299 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1300 ret += padding; 1303 ret += padding;
1301 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1302 return ret; 1307 return ret;
1303} 1308}
1304 1309
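The relay hunks adapt subbuf_splice_actor() to dynamically sized pipes: the on-stack arrays shrink to PIPE_DEF_BUFFERS defaults, splice_grow_spd() enlarges the splice_pipe_desc when pipe->buffers is larger, the page count is clamped to pipe->buffers instead of the old PIPE_BUFFERS constant, and every exit path funnels through splice_shrink_spd(); the hotplug callback also switches from NOTIFY_BAD to notifier_from_errno(-ENOMEM) so the notifier core reports a real errno. The general grow/use/shrink shape, as a sketch with the page-filling step elided and the ops pointer left to the caller:

	#include <linux/splice.h>

	static ssize_t example_splice_out(struct pipe_inode_info *pipe,
					  const struct pipe_buf_operations *ops)
	{
		struct page *pages[PIPE_DEF_BUFFERS];
		struct partial_page partial[PIPE_DEF_BUFFERS];
		struct splice_pipe_desc spd = {
			.pages		= pages,
			.partial	= partial,
			.nr_pages	= 0,
			.flags		= 0,
			.ops		= ops,
		};
		ssize_t ret;

		if (splice_grow_spd(pipe, &spd))	/* pipe->buffers may exceed the default */
			return -ENOMEM;

		/* ... fill spd.pages[]/spd.partial[], at most pipe->buffers entries ... */

		ret = splice_to_pipe(pipe, &spd);
		splice_shrink_spd(pipe, &spd);		/* undo the grow on every path */
		return ret;
	}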
diff --git a/kernel/resource.c b/kernel/resource.c
index 9c358e263534..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -681,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
681 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
682 */ 683 */
683 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
684/** 687/**
685 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
686 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -693,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
693 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
694 const char *name, int flags) 697 const char *name, int flags)
695{ 698{
699 DECLARE_WAITQUEUE(wait, current);
696 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
697 701
698 if (!res) 702 if (!res)
@@ -717,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
717 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
718 continue; 722 continue;
719 } 723 }
720 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
721 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
722 kfree(res); 734 kfree(res);
723 res = NULL; 735 res = NULL;
@@ -791,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
791 break; 803 break;
792 *p = res->sibling; 804 *p = res->sibling;
793 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
794 kfree(res); 808 kfree(res);
795 return; 809 return;
796 } 810 }
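__request_region() gains sleeping semantics for multiplexed ranges: when both the existing busy region and the new request carry IORESOURCE_MUXED, the requester parks on muxed_resource_wait in TASK_UNINTERRUPTIBLE and retries after __release_region() wakes the queue, so legacy Super-I/O style hardware can share an index/data port pair instead of one driver losing the request outright. A hedged usage sketch, assuming the companion request_muxed_region() helper added elsewhere in this series; the two-port layout and the "example-sio" name are invented:

	#include <linux/ioport.h>
	#include <asm/io.h>

	static u8 example_read_muxed_reg(unsigned long base, u8 reg)
	{
		u8 val;

		/* May sleep until the current IORESOURCE_MUXED owner releases
		 * the range, so never call this from atomic context. */
		if (!request_muxed_region(base, 2, "example-sio"))
			return 0xff;

		outb(reg, base);		/* index port */
		val = inb(base + 1);		/* data port */

		release_region(base, 2);	/* wakes the next MUXED waiter */
		return val;
	}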
diff --git a/kernel/sched.c b/kernel/sched.c
index 4d051c7517fd..d48408142503 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -503,8 +503,11 @@ struct rq {
503 #define CPU_LOAD_IDX_MAX 5 503 #define CPU_LOAD_IDX_MAX 5
504 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 504 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
505#ifdef CONFIG_NO_HZ 505#ifdef CONFIG_NO_HZ
506 u64 nohz_stamp;
506 unsigned char in_nohz_recently; 507 unsigned char in_nohz_recently;
507#endif 508#endif
509 unsigned int skip_clock_update;
510
508 /* capture load from *all* tasks on this cpu: */ 511 /* capture load from *all* tasks on this cpu: */
509 struct load_weight load; 512 struct load_weight load;
510 unsigned long nr_load_updates; 513 unsigned long nr_load_updates;
@@ -546,15 +549,13 @@ struct rq {
546 int post_schedule; 549 int post_schedule;
547 int active_balance; 550 int active_balance;
548 int push_cpu; 551 int push_cpu;
552 struct cpu_stop_work active_balance_work;
549 /* cpu of this runqueue: */ 553 /* cpu of this runqueue: */
550 int cpu; 554 int cpu;
551 int online; 555 int online;
552 556
553 unsigned long avg_load_per_task; 557 unsigned long avg_load_per_task;
554 558
555 struct task_struct *migration_thread;
556 struct list_head migration_queue;
557
558 u64 rt_avg; 559 u64 rt_avg;
559 u64 age_stamp; 560 u64 age_stamp;
560 u64 idle_stamp; 561 u64 idle_stamp;
@@ -602,6 +603,13 @@ static inline
602void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 603void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
603{ 604{
604 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 605 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
606
607 /*
608 * A queue event has occurred, and we're going to schedule. In
609 * this case, we can save a useless back to back clock update.
610 */
611 if (test_tsk_need_resched(p))
612 rq->skip_clock_update = 1;
605} 613}
606 614
607static inline int cpu_of(struct rq *rq) 615static inline int cpu_of(struct rq *rq)
@@ -636,7 +644,8 @@ static inline int cpu_of(struct rq *rq)
636 644
637inline void update_rq_clock(struct rq *rq) 645inline void update_rq_clock(struct rq *rq)
638{ 646{
639 rq->clock = sched_clock_cpu(cpu_of(rq)); 647 if (!rq->skip_clock_update)
648 rq->clock = sched_clock_cpu(cpu_of(rq));
640} 649}
641 650
642/* 651/*
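The new rq->skip_clock_update flag short-circuits back-to-back clock updates: enqueue_task()/dequeue_task() now refresh the clock themselves (see the hunks below), and when that queue event leads check_preempt_curr() to find the running task already marked for reschedule, the flag is set so update_rq_clock() skips the sched_clock_cpu() read that schedule() would otherwise repeat; put_prev_task() clears the flag for the next round. A toy restatement of the handshake:

	/* Toy model: a flag set when a reschedule is already pending lets the
	 * very next clock refresh be elided. */
	struct toy_rq {
		unsigned long long	clock;
		unsigned int		skip_clock_update;
	};

	static void toy_update_rq_clock(struct toy_rq *rq, unsigned long long now)
	{
		if (!rq->skip_clock_update)
			rq->clock = now;
	}

	static void toy_note_pending_resched(struct toy_rq *rq)
	{
		rq->skip_clock_update = 1;	/* where check_preempt_curr() sees need_resched */
	}

	static void toy_put_prev_task(struct toy_rq *rq)
	{
		rq->skip_clock_update = 0;	/* re-arm updates for the next task */
	}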
@@ -914,16 +923,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
914#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 923#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
915 924
916/* 925/*
917 * Check whether the task is waking, we use this to synchronize against 926 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
918 * ttwu() so that task_cpu() reports a stable number. 927 * against ttwu().
919 *
920 * We need to make an exception for PF_STARTING tasks because the fork
921 * path might require task_rq_lock() to work, eg. it can call
922 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
923 */ 928 */
924static inline int task_is_waking(struct task_struct *p) 929static inline int task_is_waking(struct task_struct *p)
925{ 930{
926 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 931 return unlikely(p->state == TASK_WAKING);
927} 932}
928 933
929/* 934/*
@@ -936,11 +941,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
936 struct rq *rq; 941 struct rq *rq;
937 942
938 for (;;) { 943 for (;;) {
939 while (task_is_waking(p))
940 cpu_relax();
941 rq = task_rq(p); 944 rq = task_rq(p);
942 raw_spin_lock(&rq->lock); 945 raw_spin_lock(&rq->lock);
943 if (likely(rq == task_rq(p) && !task_is_waking(p))) 946 if (likely(rq == task_rq(p)))
944 return rq; 947 return rq;
945 raw_spin_unlock(&rq->lock); 948 raw_spin_unlock(&rq->lock);
946 } 949 }
@@ -957,25 +960,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
957 struct rq *rq; 960 struct rq *rq;
958 961
959 for (;;) { 962 for (;;) {
960 while (task_is_waking(p))
961 cpu_relax();
962 local_irq_save(*flags); 963 local_irq_save(*flags);
963 rq = task_rq(p); 964 rq = task_rq(p);
964 raw_spin_lock(&rq->lock); 965 raw_spin_lock(&rq->lock);
965 if (likely(rq == task_rq(p) && !task_is_waking(p))) 966 if (likely(rq == task_rq(p)))
966 return rq; 967 return rq;
967 raw_spin_unlock_irqrestore(&rq->lock, *flags); 968 raw_spin_unlock_irqrestore(&rq->lock, *flags);
968 } 969 }
969} 970}
970 971
971void task_rq_unlock_wait(struct task_struct *p)
972{
973 struct rq *rq = task_rq(p);
974
975 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
976 raw_spin_unlock_wait(&rq->lock);
977}
978
979static void __task_rq_unlock(struct rq *rq) 972static void __task_rq_unlock(struct rq *rq)
980 __releases(rq->lock) 973 __releases(rq->lock)
981{ 974{
@@ -1239,6 +1232,17 @@ void wake_up_idle_cpu(int cpu)
1239 if (!tsk_is_polling(rq->idle)) 1232 if (!tsk_is_polling(rq->idle))
1240 smp_send_reschedule(cpu); 1233 smp_send_reschedule(cpu);
1241} 1234}
1235
1236int nohz_ratelimit(int cpu)
1237{
1238 struct rq *rq = cpu_rq(cpu);
1239 u64 diff = rq->clock - rq->nohz_stamp;
1240
1241 rq->nohz_stamp = rq->clock;
1242
1243 return diff < (NSEC_PER_SEC / HZ) >> 1;
1244}
1245
1242#endif /* CONFIG_NO_HZ */ 1246#endif /* CONFIG_NO_HZ */
1243 1247
1244static u64 sched_avg_period(void) 1248static u64 sched_avg_period(void)
@@ -1781,8 +1785,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1781 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1785 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1782 } 1786 }
1783 } 1787 }
1784 update_rq_clock(rq1);
1785 update_rq_clock(rq2);
1786} 1788}
1787 1789
1788/* 1790/*
@@ -1813,7 +1815,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1813} 1815}
1814#endif 1816#endif
1815 1817
1816static void calc_load_account_active(struct rq *this_rq); 1818static void calc_load_account_idle(struct rq *this_rq);
1817static void update_sysctl(void); 1819static void update_sysctl(void);
1818static int get_update_sysctl_factor(void); 1820static int get_update_sysctl_factor(void);
1819 1821
@@ -1870,62 +1872,43 @@ static void set_load_weight(struct task_struct *p)
1870 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1872 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1871} 1873}
1872 1874
1873static void update_avg(u64 *avg, u64 sample) 1875static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1874{
1875 s64 diff = sample - *avg;
1876 *avg += diff >> 3;
1877}
1878
1879static void
1880enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1881{ 1876{
1882 if (wakeup) 1877 update_rq_clock(rq);
1883 p->se.start_runtime = p->se.sum_exec_runtime;
1884
1885 sched_info_queued(p); 1878 sched_info_queued(p);
1886 p->sched_class->enqueue_task(rq, p, wakeup, head); 1879 p->sched_class->enqueue_task(rq, p, flags);
1887 p->se.on_rq = 1; 1880 p->se.on_rq = 1;
1888} 1881}
1889 1882
1890static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1883static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1891{ 1884{
1892 if (sleep) { 1885 update_rq_clock(rq);
1893 if (p->se.last_wakeup) {
1894 update_avg(&p->se.avg_overlap,
1895 p->se.sum_exec_runtime - p->se.last_wakeup);
1896 p->se.last_wakeup = 0;
1897 } else {
1898 update_avg(&p->se.avg_wakeup,
1899 sysctl_sched_wakeup_granularity);
1900 }
1901 }
1902
1903 sched_info_dequeued(p); 1886 sched_info_dequeued(p);
1904 p->sched_class->dequeue_task(rq, p, sleep); 1887 p->sched_class->dequeue_task(rq, p, flags);
1905 p->se.on_rq = 0; 1888 p->se.on_rq = 0;
1906} 1889}
1907 1890
1908/* 1891/*
1909 * activate_task - move a task to the runqueue. 1892 * activate_task - move a task to the runqueue.
1910 */ 1893 */
1911static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1894static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1912{ 1895{
1913 if (task_contributes_to_load(p)) 1896 if (task_contributes_to_load(p))
1914 rq->nr_uninterruptible--; 1897 rq->nr_uninterruptible--;
1915 1898
1916 enqueue_task(rq, p, wakeup, false); 1899 enqueue_task(rq, p, flags);
1917 inc_nr_running(rq); 1900 inc_nr_running(rq);
1918} 1901}
1919 1902
1920/* 1903/*
1921 * deactivate_task - remove a task from the runqueue. 1904 * deactivate_task - remove a task from the runqueue.
1922 */ 1905 */
1923static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1906static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1924{ 1907{
1925 if (task_contributes_to_load(p)) 1908 if (task_contributes_to_load(p))
1926 rq->nr_uninterruptible++; 1909 rq->nr_uninterruptible++;
1927 1910
1928 dequeue_task(rq, p, sleep); 1911 dequeue_task(rq, p, flags);
1929 dec_nr_running(rq); 1912 dec_nr_running(rq);
1930} 1913}
1931 1914
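enqueue_task()/dequeue_task() switch from ad-hoc wakeup/head/sleep booleans to a flags word (ENQUEUE_WAKEUP, ENQUEUE_WAKING and DEQUEUE_SLEEP appear in the hunks below), call update_rq_clock() themselves, and shed the avg_overlap/avg_wakeup bookkeeping that no longer has users. The update_avg() helper survives and is re-added in the SMP section below; it is a fixed-point exponential moving average with weight 1/8, which this userspace demo illustrates:

	#include <stdio.h>

	/* avg += (sample - avg) / 8; the kernel spells the division "diff >> 3". */
	static void update_avg(unsigned long long *avg, unsigned long long sample)
	{
		long long diff = (long long)sample - (long long)*avg;

		*avg += diff / 8;
	}

	int main(void)
	{
		unsigned long long avg = 0;
		unsigned long long samples[] = { 800, 800, 800, 100, 100 };
		unsigned int i;

		for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
			update_avg(&avg, samples[i]);
			printf("sample %4llu -> avg %llu\n", samples[i], avg);
		}
		return 0;
	}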
@@ -2054,21 +2037,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2054 __set_task_cpu(p, new_cpu); 2037 __set_task_cpu(p, new_cpu);
2055} 2038}
2056 2039
2057struct migration_req { 2040struct migration_arg {
2058 struct list_head list;
2059
2060 struct task_struct *task; 2041 struct task_struct *task;
2061 int dest_cpu; 2042 int dest_cpu;
2062
2063 struct completion done;
2064}; 2043};
2065 2044
2045static int migration_cpu_stop(void *data);
2046
2066/* 2047/*
2067 * The task's runqueue lock must be held. 2048 * The task's runqueue lock must be held.
2068 * Returns true if you have to wait for migration thread. 2049 * Returns true if you have to wait for migration thread.
2069 */ 2050 */
2070static int 2051static bool migrate_task(struct task_struct *p, int dest_cpu)
2071migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2072{ 2052{
2073 struct rq *rq = task_rq(p); 2053 struct rq *rq = task_rq(p);
2074 2054
@@ -2076,58 +2056,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2076 * If the task is not on a runqueue (and not running), then 2056 * If the task is not on a runqueue (and not running), then
2077 * the next wake-up will properly place the task. 2057 * the next wake-up will properly place the task.
2078 */ 2058 */
2079 if (!p->se.on_rq && !task_running(rq, p)) 2059 return p->se.on_rq || task_running(rq, p);
2080 return 0;
2081
2082 init_completion(&req->done);
2083 req->task = p;
2084 req->dest_cpu = dest_cpu;
2085 list_add(&req->list, &rq->migration_queue);
2086
2087 return 1;
2088}
2089
2090/*
2091 * wait_task_context_switch - wait for a thread to complete at least one
2092 * context switch.
2093 *
2094 * @p must not be current.
2095 */
2096void wait_task_context_switch(struct task_struct *p)
2097{
2098 unsigned long nvcsw, nivcsw, flags;
2099 int running;
2100 struct rq *rq;
2101
2102 nvcsw = p->nvcsw;
2103 nivcsw = p->nivcsw;
2104 for (;;) {
2105 /*
2106 * The runqueue is assigned before the actual context
2107 * switch. We need to take the runqueue lock.
2108 *
2109 * We could check initially without the lock but it is
2110 * very likely that we need to take the lock in every
2111 * iteration.
2112 */
2113 rq = task_rq_lock(p, &flags);
2114 running = task_running(rq, p);
2115 task_rq_unlock(rq, &flags);
2116
2117 if (likely(!running))
2118 break;
2119 /*
2120 * The switch count is incremented before the actual
2121 * context switch. We thus wait for two switches to be
2122 * sure at least one completed.
2123 */
2124 if ((p->nvcsw - nvcsw) > 1)
2125 break;
2126 if ((p->nivcsw - nivcsw) > 1)
2127 break;
2128
2129 cpu_relax();
2130 }
2131} 2060}
2132 2061
2133/* 2062/*
@@ -2185,7 +2114,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2185 * just go back and repeat. 2114 * just go back and repeat.
2186 */ 2115 */
2187 rq = task_rq_lock(p, &flags); 2116 rq = task_rq_lock(p, &flags);
2188 trace_sched_wait_task(rq, p); 2117 trace_sched_wait_task(p);
2189 running = task_running(rq, p); 2118 running = task_running(rq, p);
2190 on_rq = p->se.on_rq; 2119 on_rq = p->se.on_rq;
2191 ncsw = 0; 2120 ncsw = 0;
@@ -2283,6 +2212,9 @@ void task_oncpu_function_call(struct task_struct *p,
2283} 2212}
2284 2213
2285#ifdef CONFIG_SMP 2214#ifdef CONFIG_SMP
2215/*
2216 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2217 */
2286static int select_fallback_rq(int cpu, struct task_struct *p) 2218static int select_fallback_rq(int cpu, struct task_struct *p)
2287{ 2219{
2288 int dest_cpu; 2220 int dest_cpu;
@@ -2299,12 +2231,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2299 return dest_cpu; 2231 return dest_cpu;
2300 2232
2301 /* No more Mr. Nice Guy. */ 2233 /* No more Mr. Nice Guy. */
2302 if (dest_cpu >= nr_cpu_ids) { 2234 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2303 rcu_read_lock(); 2235 dest_cpu = cpuset_cpus_allowed_fallback(p);
2304 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2305 rcu_read_unlock();
2306 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2307
2308 /* 2236 /*
2309 * Don't tell them about moving exiting tasks or 2237 * Don't tell them about moving exiting tasks or
2310 * kernel threads (both mm NULL), since they never 2238 * kernel threads (both mm NULL), since they never
@@ -2321,17 +2249,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2321} 2249}
2322 2250
2323/* 2251/*
2324 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2252 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2325 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2326 * by:
2327 *
2328 * exec: is unstable, retry loop
2329 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2330 */ 2253 */
2331static inline 2254static inline
2332int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2255int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2333{ 2256{
2334 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2257 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2335 2258
2336 /* 2259 /*
2337 * In order not to call set_task_cpu() on a blocking task we need 2260 * In order not to call set_task_cpu() on a blocking task we need
@@ -2349,6 +2272,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2349 2272
2350 return cpu; 2273 return cpu;
2351} 2274}
2275
2276static void update_avg(u64 *avg, u64 sample)
2277{
2278 s64 diff = sample - *avg;
2279 *avg += diff >> 3;
2280}
2352#endif 2281#endif
2353 2282
2354/*** 2283/***
@@ -2370,16 +2299,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2370{ 2299{
2371 int cpu, orig_cpu, this_cpu, success = 0; 2300 int cpu, orig_cpu, this_cpu, success = 0;
2372 unsigned long flags; 2301 unsigned long flags;
2302 unsigned long en_flags = ENQUEUE_WAKEUP;
2373 struct rq *rq; 2303 struct rq *rq;
2374 2304
2375 if (!sched_feat(SYNC_WAKEUPS))
2376 wake_flags &= ~WF_SYNC;
2377
2378 this_cpu = get_cpu(); 2305 this_cpu = get_cpu();
2379 2306
2380 smp_wmb(); 2307 smp_wmb();
2381 rq = task_rq_lock(p, &flags); 2308 rq = task_rq_lock(p, &flags);
2382 update_rq_clock(rq);
2383 if (!(p->state & state)) 2309 if (!(p->state & state))
2384 goto out; 2310 goto out;
2385 2311
@@ -2399,28 +2325,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2399 * 2325 *
2400 * First fix up the nr_uninterruptible count: 2326 * First fix up the nr_uninterruptible count:
2401 */ 2327 */
2402 if (task_contributes_to_load(p)) 2328 if (task_contributes_to_load(p)) {
2403 rq->nr_uninterruptible--; 2329 if (likely(cpu_online(orig_cpu)))
2330 rq->nr_uninterruptible--;
2331 else
2332 this_rq()->nr_uninterruptible--;
2333 }
2404 p->state = TASK_WAKING; 2334 p->state = TASK_WAKING;
2405 2335
2406 if (p->sched_class->task_waking) 2336 if (p->sched_class->task_waking) {
2407 p->sched_class->task_waking(rq, p); 2337 p->sched_class->task_waking(rq, p);
2338 en_flags |= ENQUEUE_WAKING;
2339 }
2408 2340
2409 __task_rq_unlock(rq); 2341 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2410 2342 if (cpu != orig_cpu)
2411 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2412 if (cpu != orig_cpu) {
2413 /*
2414 * Since we migrate the task without holding any rq->lock,
2415 * we need to be careful with task_rq_lock(), since that
2416 * might end up locking an invalid rq.
2417 */
2418 set_task_cpu(p, cpu); 2343 set_task_cpu(p, cpu);
2419 } 2344 __task_rq_unlock(rq);
2420 2345
2421 rq = cpu_rq(cpu); 2346 rq = cpu_rq(cpu);
2422 raw_spin_lock(&rq->lock); 2347 raw_spin_lock(&rq->lock);
2423 update_rq_clock(rq);
2424 2348
2425 /* 2349 /*
2426 * We migrated the task without holding either rq->lock, however 2350 * We migrated the task without holding either rq->lock, however
@@ -2448,36 +2372,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2448 2372
2449out_activate: 2373out_activate:
2450#endif /* CONFIG_SMP */ 2374#endif /* CONFIG_SMP */
2451 schedstat_inc(p, se.nr_wakeups); 2375 schedstat_inc(p, se.statistics.nr_wakeups);
2452 if (wake_flags & WF_SYNC) 2376 if (wake_flags & WF_SYNC)
2453 schedstat_inc(p, se.nr_wakeups_sync); 2377 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2454 if (orig_cpu != cpu) 2378 if (orig_cpu != cpu)
2455 schedstat_inc(p, se.nr_wakeups_migrate); 2379 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2456 if (cpu == this_cpu) 2380 if (cpu == this_cpu)
2457 schedstat_inc(p, se.nr_wakeups_local); 2381 schedstat_inc(p, se.statistics.nr_wakeups_local);
2458 else 2382 else
2459 schedstat_inc(p, se.nr_wakeups_remote); 2383 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2460 activate_task(rq, p, 1); 2384 activate_task(rq, p, en_flags);
2461 success = 1; 2385 success = 1;
2462 2386
2463 /*
2464 * Only attribute actual wakeups done by this task.
2465 */
2466 if (!in_interrupt()) {
2467 struct sched_entity *se = &current->se;
2468 u64 sample = se->sum_exec_runtime;
2469
2470 if (se->last_wakeup)
2471 sample -= se->last_wakeup;
2472 else
2473 sample -= se->start_runtime;
2474 update_avg(&se->avg_wakeup, sample);
2475
2476 se->last_wakeup = se->sum_exec_runtime;
2477 }
2478
2479out_running: 2387out_running:
2480 trace_sched_wakeup(rq, p, success); 2388 trace_sched_wakeup(p, success);
2481 check_preempt_curr(rq, p, wake_flags); 2389 check_preempt_curr(rq, p, wake_flags);
2482 2390
2483 p->state = TASK_RUNNING; 2391 p->state = TASK_RUNNING;
@@ -2537,42 +2445,9 @@ static void __sched_fork(struct task_struct *p)
2537 p->se.sum_exec_runtime = 0; 2445 p->se.sum_exec_runtime = 0;
2538 p->se.prev_sum_exec_runtime = 0; 2446 p->se.prev_sum_exec_runtime = 0;
2539 p->se.nr_migrations = 0; 2447 p->se.nr_migrations = 0;
2540 p->se.last_wakeup = 0;
2541 p->se.avg_overlap = 0;
2542 p->se.start_runtime = 0;
2543 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2544 2448
2545#ifdef CONFIG_SCHEDSTATS 2449#ifdef CONFIG_SCHEDSTATS
2546 p->se.wait_start = 0; 2450 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2547 p->se.wait_max = 0;
2548 p->se.wait_count = 0;
2549 p->se.wait_sum = 0;
2550
2551 p->se.sleep_start = 0;
2552 p->se.sleep_max = 0;
2553 p->se.sum_sleep_runtime = 0;
2554
2555 p->se.block_start = 0;
2556 p->se.block_max = 0;
2557 p->se.exec_max = 0;
2558 p->se.slice_max = 0;
2559
2560 p->se.nr_migrations_cold = 0;
2561 p->se.nr_failed_migrations_affine = 0;
2562 p->se.nr_failed_migrations_running = 0;
2563 p->se.nr_failed_migrations_hot = 0;
2564 p->se.nr_forced_migrations = 0;
2565
2566 p->se.nr_wakeups = 0;
2567 p->se.nr_wakeups_sync = 0;
2568 p->se.nr_wakeups_migrate = 0;
2569 p->se.nr_wakeups_local = 0;
2570 p->se.nr_wakeups_remote = 0;
2571 p->se.nr_wakeups_affine = 0;
2572 p->se.nr_wakeups_affine_attempts = 0;
2573 p->se.nr_wakeups_passive = 0;
2574 p->se.nr_wakeups_idle = 0;
2575
2576#endif 2451#endif
2577 2452
2578 INIT_LIST_HEAD(&p->rt.run_list); 2453 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2593,11 +2468,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2593 2468
2594 __sched_fork(p); 2469 __sched_fork(p);
2595 /* 2470 /*
2596 * We mark the process as waking here. This guarantees that 2471 * We mark the process as running here. This guarantees that
2597 * nobody will actually run it, and a signal or other external 2472 * nobody will actually run it, and a signal or other external
2598 * event cannot wake it up and insert it on the runqueue either. 2473 * event cannot wake it up and insert it on the runqueue either.
2599 */ 2474 */
2600 p->state = TASK_WAKING; 2475 p->state = TASK_RUNNING;
2601 2476
2602 /* 2477 /*
2603 * Revert to default priority/policy on fork if requested. 2478 * Revert to default priority/policy on fork if requested.
@@ -2664,31 +2539,27 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2664 int cpu __maybe_unused = get_cpu(); 2539 int cpu __maybe_unused = get_cpu();
2665 2540
2666#ifdef CONFIG_SMP 2541#ifdef CONFIG_SMP
2542 rq = task_rq_lock(p, &flags);
2543 p->state = TASK_WAKING;
2544
2667 /* 2545 /*
2668 * Fork balancing, do it here and not earlier because: 2546 * Fork balancing, do it here and not earlier because:
2669 * - cpus_allowed can change in the fork path 2547 * - cpus_allowed can change in the fork path
2670 * - any previously selected cpu might disappear through hotplug 2548 * - any previously selected cpu might disappear through hotplug
2671 * 2549 *
2672 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2550 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2673 * ->cpus_allowed is stable, we have preemption disabled, meaning 2551 * without people poking at ->cpus_allowed.
2674 * cpu_online_mask is stable.
2675 */ 2552 */
2676 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2553 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2677 set_task_cpu(p, cpu); 2554 set_task_cpu(p, cpu);
2678#endif
2679
2680 /*
2681 * Since the task is not on the rq and we still have TASK_WAKING set
2682 * nobody else will migrate this task.
2683 */
2684 rq = cpu_rq(cpu);
2685 raw_spin_lock_irqsave(&rq->lock, flags);
2686 2555
2687 BUG_ON(p->state != TASK_WAKING);
2688 p->state = TASK_RUNNING; 2556 p->state = TASK_RUNNING;
2689 update_rq_clock(rq); 2557 task_rq_unlock(rq, &flags);
2558#endif
2559
2560 rq = task_rq_lock(p, &flags);
2690 activate_task(rq, p, 0); 2561 activate_task(rq, p, 0);
2691 trace_sched_wakeup_new(rq, p, 1); 2562 trace_sched_wakeup_new(p, 1);
2692 check_preempt_curr(rq, p, WF_FORK); 2563 check_preempt_curr(rq, p, WF_FORK);
2693#ifdef CONFIG_SMP 2564#ifdef CONFIG_SMP
2694 if (p->sched_class->task_woken) 2565 if (p->sched_class->task_woken)
@@ -2908,7 +2779,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2908 struct mm_struct *mm, *oldmm; 2779 struct mm_struct *mm, *oldmm;
2909 2780
2910 prepare_task_switch(rq, prev, next); 2781 prepare_task_switch(rq, prev, next);
2911 trace_sched_switch(rq, prev, next); 2782 trace_sched_switch(prev, next);
2912 mm = next->mm; 2783 mm = next->mm;
2913 oldmm = prev->active_mm; 2784 oldmm = prev->active_mm;
2914 /* 2785 /*
@@ -3025,6 +2896,61 @@ static unsigned long calc_load_update;
3025unsigned long avenrun[3]; 2896unsigned long avenrun[3];
3026EXPORT_SYMBOL(avenrun); 2897EXPORT_SYMBOL(avenrun);
3027 2898
2899static long calc_load_fold_active(struct rq *this_rq)
2900{
2901 long nr_active, delta = 0;
2902
2903 nr_active = this_rq->nr_running;
2904 nr_active += (long) this_rq->nr_uninterruptible;
2905
2906 if (nr_active != this_rq->calc_load_active) {
2907 delta = nr_active - this_rq->calc_load_active;
2908 this_rq->calc_load_active = nr_active;
2909 }
2910
2911 return delta;
2912}
2913
2914#ifdef CONFIG_NO_HZ
2915/*
2916 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2917 *
2918 * When making the ILB scale, we should try to pull this in as well.
2919 */
2920static atomic_long_t calc_load_tasks_idle;
2921
2922static void calc_load_account_idle(struct rq *this_rq)
2923{
2924 long delta;
2925
2926 delta = calc_load_fold_active(this_rq);
2927 if (delta)
2928 atomic_long_add(delta, &calc_load_tasks_idle);
2929}
2930
2931static long calc_load_fold_idle(void)
2932{
2933 long delta = 0;
2934
2935 /*
2936 * Its got a race, we don't care...
2937 */
2938 if (atomic_long_read(&calc_load_tasks_idle))
2939 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2940
2941 return delta;
2942}
2943#else
2944static void calc_load_account_idle(struct rq *this_rq)
2945{
2946}
2947
2948static inline long calc_load_fold_idle(void)
2949{
2950 return 0;
2951}
2952#endif
2953
3028/** 2954/**
3029 * get_avenrun - get the load average array 2955 * get_avenrun - get the load average array
3030 * @loads: pointer to dest load array 2956 * @loads: pointer to dest load array
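Load-average accounting is decoupled from CPUs going nohz-idle: calc_load_fold_active() computes this runqueue's delta of runnable plus uninterruptible tasks, the NO_HZ path parks that delta in calc_load_tasks_idle via calc_load_account_idle() (invoked when a CPU enters the idle class; that hunk lives in sched_idletask.c, also touched by this series), and the periodic calc_load_account_active(), now driven solely from update_cpu_load() and rate-limited by calc_load_update, drains it through calc_load_fold_idle(). Nothing is lost or double-counted as CPUs flip in and out of nohz idle. A userspace toy of the fold/drain pair using C11 atomics:

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_long tasks_idle_fold;	/* deltas parked by idling CPUs */
	static atomic_long tasks_global;	/* what the load average samples */

	static void toy_account_idle(long delta)
	{
		if (delta)
			atomic_fetch_add(&tasks_idle_fold, delta);
	}

	static void toy_account_periodic(long active_delta)
	{
		long delta = active_delta;

		delta += atomic_exchange(&tasks_idle_fold, 0);	/* drain the idle fold */
		if (delta)
			atomic_fetch_add(&tasks_global, delta);
	}

	int main(void)
	{
		toy_account_idle(3);		/* a CPU went idle carrying +3 */
		toy_account_periodic(-1);	/* next LOAD_FREQ update on a busy CPU */
		printf("tasks_global = %ld\n", atomic_load(&tasks_global));	/* prints 2 */
		return 0;
	}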
@@ -3071,20 +2997,22 @@ void calc_global_load(void)
3071} 2997}
3072 2998
3073/* 2999/*
3074 * Either called from update_cpu_load() or from a cpu going idle 3000 * Called from update_cpu_load() to periodically update this CPU's
3001 * active count.
3075 */ 3002 */
3076static void calc_load_account_active(struct rq *this_rq) 3003static void calc_load_account_active(struct rq *this_rq)
3077{ 3004{
3078 long nr_active, delta; 3005 long delta;
3079 3006
3080 nr_active = this_rq->nr_running; 3007 if (time_before(jiffies, this_rq->calc_load_update))
3081 nr_active += (long) this_rq->nr_uninterruptible; 3008 return;
3082 3009
3083 if (nr_active != this_rq->calc_load_active) { 3010 delta = calc_load_fold_active(this_rq);
3084 delta = nr_active - this_rq->calc_load_active; 3011 delta += calc_load_fold_idle();
3085 this_rq->calc_load_active = nr_active; 3012 if (delta)
3086 atomic_long_add(delta, &calc_load_tasks); 3013 atomic_long_add(delta, &calc_load_tasks);
3087 } 3014
3015 this_rq->calc_load_update += LOAD_FREQ;
3088} 3016}
3089 3017
3090/* 3018/*
@@ -3116,10 +3044,7 @@ static void update_cpu_load(struct rq *this_rq)
3116 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3044 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3117 } 3045 }
3118 3046
3119 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3047 calc_load_account_active(this_rq);
3120 this_rq->calc_load_update += LOAD_FREQ;
3121 calc_load_account_active(this_rq);
3122 }
3123} 3048}
3124 3049
3125#ifdef CONFIG_SMP 3050#ifdef CONFIG_SMP
@@ -3131,44 +3056,27 @@ static void update_cpu_load(struct rq *this_rq)
3131void sched_exec(void) 3056void sched_exec(void)
3132{ 3057{
3133 struct task_struct *p = current; 3058 struct task_struct *p = current;
3134 struct migration_req req;
3135 int dest_cpu, this_cpu;
3136 unsigned long flags; 3059 unsigned long flags;
3137 struct rq *rq; 3060 struct rq *rq;
3138 3061 int dest_cpu;
3139again:
3140 this_cpu = get_cpu();
3141 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3142 if (dest_cpu == this_cpu) {
3143 put_cpu();
3144 return;
3145 }
3146 3062
3147 rq = task_rq_lock(p, &flags); 3063 rq = task_rq_lock(p, &flags);
3148 put_cpu(); 3064 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3065 if (dest_cpu == smp_processor_id())
3066 goto unlock;
3149 3067
3150 /* 3068 /*
3151 * select_task_rq() can race against ->cpus_allowed 3069 * select_task_rq() can race against ->cpus_allowed
3152 */ 3070 */
3153 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3071 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3154 || unlikely(!cpu_active(dest_cpu))) { 3072 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3155 task_rq_unlock(rq, &flags); 3073 struct migration_arg arg = { p, dest_cpu };
3156 goto again;
3157 }
3158 3074
3159 /* force the process onto the specified CPU */
3160 if (migrate_task(p, dest_cpu, &req)) {
3161 /* Need to wait for migration thread (might exit: take ref). */
3162 struct task_struct *mt = rq->migration_thread;
3163
3164 get_task_struct(mt);
3165 task_rq_unlock(rq, &flags); 3075 task_rq_unlock(rq, &flags);
3166 wake_up_process(mt); 3076 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3167 put_task_struct(mt);
3168 wait_for_completion(&req.done);
3169
3170 return; 3077 return;
3171 } 3078 }
3079unlock:
3172 task_rq_unlock(rq, &flags); 3080 task_rq_unlock(rq, &flags);
3173} 3081}
3174 3082
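The rewritten sched_exec() no longer queues a migration_req for a per-CPU migration kthread; it picks the destination under the runqueue lock, and if a move is needed it drops the lock and lets stop_one_cpu() run migration_cpu_stop() synchronously. The sketch below models only that control flow in user space; the function and struct names are invented for illustration and are not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

struct exec_move {
        int task_id;
        int dest_cpu;
};

/* Stand-in for stop_one_cpu(): runs the callback after the lock is dropped. */
static int run_stopper(int cpu, int (*fn)(void *), void *arg)
{
        printf("stopper runs on cpu %d\n", cpu);
        return fn(arg);
}

static int do_move(void *data)
{
        struct exec_move *mv = data;

        printf("task %d moved to cpu %d\n", mv->task_id, mv->dest_cpu);
        return 0;
}

/* Decide under the "rq lock", then delegate; only the shape matches. */
static void exec_balance(int this_cpu, int task_id, int dest_cpu,
                         bool dest_allowed, bool dest_active)
{
        /* task_rq_lock() would be taken here */
        if (dest_cpu != this_cpu && dest_allowed && dest_active) {
                struct exec_move mv = { task_id, dest_cpu };

                /* task_rq_unlock() before blocking on the stopper */
                run_stopper(this_cpu, do_move, &mv);
                return;
        }
        /* task_rq_unlock() on the no-op path */
}

int main(void)
{
        exec_balance(0, 42, 3, true, true);  /* migrates             */
        exec_balance(0, 43, 0, true, true);  /* already on this cpu  */
        return 0;
}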
@@ -3640,23 +3548,9 @@ static inline void schedule_debug(struct task_struct *prev)
3640 3548
3641static void put_prev_task(struct rq *rq, struct task_struct *prev) 3549static void put_prev_task(struct rq *rq, struct task_struct *prev)
3642{ 3550{
3643 if (prev->state == TASK_RUNNING) { 3551 if (prev->se.on_rq)
3644 u64 runtime = prev->se.sum_exec_runtime; 3552 update_rq_clock(rq);
3645 3553 rq->skip_clock_update = 0;
3646 runtime -= prev->se.prev_sum_exec_runtime;
3647 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3648
3649 /*
3650 * In order to avoid avg_overlap growing stale when we are
3651 * indeed overlapping and hence not getting put to sleep, grow
3652 * the avg_overlap on preemption.
3653 *
3654 * We use the average preemption runtime because that
3655 * correlates to the amount of cache footprint a task can
3656 * build up.
3657 */
3658 update_avg(&prev->se.avg_overlap, runtime);
3659 }
3660 prev->sched_class->put_prev_task(rq, prev); 3554 prev->sched_class->put_prev_task(rq, prev);
3661} 3555}
3662 3556
@@ -3706,7 +3600,7 @@ need_resched:
3706 preempt_disable(); 3600 preempt_disable();
3707 cpu = smp_processor_id(); 3601 cpu = smp_processor_id();
3708 rq = cpu_rq(cpu); 3602 rq = cpu_rq(cpu);
3709 rcu_sched_qs(cpu); 3603 rcu_note_context_switch(cpu);
3710 prev = rq->curr; 3604 prev = rq->curr;
3711 switch_count = &prev->nivcsw; 3605 switch_count = &prev->nivcsw;
3712 3606
@@ -3719,14 +3613,13 @@ need_resched_nonpreemptible:
3719 hrtick_clear(rq); 3613 hrtick_clear(rq);
3720 3614
3721 raw_spin_lock_irq(&rq->lock); 3615 raw_spin_lock_irq(&rq->lock);
3722 update_rq_clock(rq);
3723 clear_tsk_need_resched(prev); 3616 clear_tsk_need_resched(prev);
3724 3617
3725 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3618 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3726 if (unlikely(signal_pending_state(prev->state, prev))) 3619 if (unlikely(signal_pending_state(prev->state, prev)))
3727 prev->state = TASK_RUNNING; 3620 prev->state = TASK_RUNNING;
3728 else 3621 else
3729 deactivate_task(rq, prev, 1); 3622 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3730 switch_count = &prev->nvcsw; 3623 switch_count = &prev->nvcsw;
3731 } 3624 }
3732 3625
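The deactivate path in schedule() keeps the same sleep-versus-signal decision; the dequeue is now tagged with DEQUEUE_SLEEP rather than a bare 1. The fragment below is a rough model of that branch only; the flag value, the state encoding and the helper name are made up, and only the shape of the decision matches.

#include <stdbool.h>
#include <stdio.h>

#define DEQUEUE_SLEEP 1  /* illustrative value, not taken from the kernel */

enum fake_state { RUNNING = 0, INTERRUPTIBLE = 1, UNINTERRUPTIBLE = 2 };

struct fake_task { enum fake_state state; bool signal_pending; };

/* Returns the dequeue flags to use, or -1 if the task stays runnable. */
static int sleep_decision(struct fake_task *prev, bool preempted)
{
        if (prev->state == RUNNING || preempted)
                return -1;                      /* involuntary switch      */
        if (prev->state == INTERRUPTIBLE && prev->signal_pending) {
                prev->state = RUNNING;          /* signal aborts the sleep */
                return -1;
        }
        return DEQUEUE_SLEEP;                   /* voluntary sleep         */
}

int main(void)
{
        struct fake_task a = { INTERRUPTIBLE, true };
        struct fake_task b = { UNINTERRUPTIBLE, false };

        printf("a: %d  b: %d\n", sleep_decision(&a, false),
               sleep_decision(&b, false));
        return 0;
}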
@@ -3950,6 +3843,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3950{ 3843{
3951 __wake_up_common(q, mode, 1, 0, NULL); 3844 __wake_up_common(q, mode, 1, 0, NULL);
3952} 3845}
3846EXPORT_SYMBOL_GPL(__wake_up_locked);
3953 3847
3954void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3848void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3955{ 3849{
@@ -4049,8 +3943,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4049 if (!x->done) { 3943 if (!x->done) {
4050 DECLARE_WAITQUEUE(wait, current); 3944 DECLARE_WAITQUEUE(wait, current);
4051 3945
4052 wait.flags |= WQ_FLAG_EXCLUSIVE; 3946 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4053 __add_wait_queue_tail(&x->wait, &wait);
4054 do { 3947 do {
4055 if (signal_pending_state(state, current)) { 3948 if (signal_pending_state(state, current)) {
4056 timeout = -ERESTARTSYS; 3949 timeout = -ERESTARTSYS;
@@ -4293,7 +4186,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4293 BUG_ON(prio < 0 || prio > MAX_PRIO); 4186 BUG_ON(prio < 0 || prio > MAX_PRIO);
4294 4187
4295 rq = task_rq_lock(p, &flags); 4188 rq = task_rq_lock(p, &flags);
4296 update_rq_clock(rq);
4297 4189
4298 oldprio = p->prio; 4190 oldprio = p->prio;
4299 prev_class = p->sched_class; 4191 prev_class = p->sched_class;
@@ -4314,7 +4206,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4314 if (running) 4206 if (running)
4315 p->sched_class->set_curr_task(rq); 4207 p->sched_class->set_curr_task(rq);
4316 if (on_rq) { 4208 if (on_rq) {
4317 enqueue_task(rq, p, 0, oldprio < prio); 4209 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4318 4210
4319 check_class_changed(rq, p, prev_class, oldprio, running); 4211 check_class_changed(rq, p, prev_class, oldprio, running);
4320 } 4212 }
@@ -4336,7 +4228,6 @@ void set_user_nice(struct task_struct *p, long nice)
4336 * the task might be in the middle of scheduling on another CPU. 4228 * the task might be in the middle of scheduling on another CPU.
4337 */ 4229 */
4338 rq = task_rq_lock(p, &flags); 4230 rq = task_rq_lock(p, &flags);
4339 update_rq_clock(rq);
4340 /* 4231 /*
4341 * The RT priorities are set via sched_setscheduler(), but we still 4232 * The RT priorities are set via sched_setscheduler(), but we still
4342 * allow the 'normal' nice value to be set - but as expected 4233 * allow the 'normal' nice value to be set - but as expected
@@ -4358,7 +4249,7 @@ void set_user_nice(struct task_struct *p, long nice)
4358 delta = p->prio - old_prio; 4249 delta = p->prio - old_prio;
4359 4250
4360 if (on_rq) { 4251 if (on_rq) {
4361 enqueue_task(rq, p, 0, false); 4252 enqueue_task(rq, p, 0);
4362 /* 4253 /*
4363 * If the task increased its priority or is running and 4254 * If the task increased its priority or is running and
4364 * lowered its priority, then reschedule its CPU: 4255 * lowered its priority, then reschedule its CPU:
@@ -4619,7 +4510,6 @@ recheck:
4619 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4510 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4620 goto recheck; 4511 goto recheck;
4621 } 4512 }
4622 update_rq_clock(rq);
4623 on_rq = p->se.on_rq; 4513 on_rq = p->se.on_rq;
4624 running = task_current(rq, p); 4514 running = task_current(rq, p);
4625 if (on_rq) 4515 if (on_rq)
@@ -5356,17 +5246,15 @@ static inline void sched_init_granularity(void)
5356/* 5246/*
5357 * This is how migration works: 5247 * This is how migration works:
5358 * 5248 *
5359 * 1) we queue a struct migration_req structure in the source CPU's 5249 * 1) we invoke migration_cpu_stop() on the target CPU using
5360 * runqueue and wake up that CPU's migration thread. 5250 * stop_one_cpu().
5361 * 2) we down() the locked semaphore => thread blocks. 5251 * 2) stopper starts to run (implicitly forcing the migrated thread
5362 * 3) migration thread wakes up (implicitly it forces the migrated 5252 * off the CPU)
5363 * thread off the CPU) 5253 * 3) it checks whether the migrated task is still in the wrong runqueue.
 5364 * 4) it gets the migration request and checks whether the migrated 5254 * 4) if it's in the wrong runqueue then the stopper removes
5365 * task is still in the wrong runqueue.
5366 * 5) if it's in the wrong runqueue then the migration thread removes
5367 * it and puts it into the right queue. 5255 * it and puts it into the right queue.
5368 * 6) migration thread up()s the semaphore. 5256 * 5) stopper completes and stop_one_cpu() returns and the migration
5369 * 7) we wake up and the migration is done. 5257 * is done.
5370 */ 5258 */
5371 5259
5372/* 5260/*
@@ -5380,12 +5268,23 @@ static inline void sched_init_granularity(void)
5380 */ 5268 */
5381int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5269int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5382{ 5270{
5383 struct migration_req req;
5384 unsigned long flags; 5271 unsigned long flags;
5385 struct rq *rq; 5272 struct rq *rq;
5273 unsigned int dest_cpu;
5386 int ret = 0; 5274 int ret = 0;
5387 5275
5276 /*
5277 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5278 * drop the rq->lock and still rely on ->cpus_allowed.
5279 */
5280again:
5281 while (task_is_waking(p))
5282 cpu_relax();
5388 rq = task_rq_lock(p, &flags); 5283 rq = task_rq_lock(p, &flags);
5284 if (task_is_waking(p)) {
5285 task_rq_unlock(rq, &flags);
5286 goto again;
5287 }
5389 5288
5390 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5289 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5391 ret = -EINVAL; 5290 ret = -EINVAL;
@@ -5409,15 +5308,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5409 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5308 if (cpumask_test_cpu(task_cpu(p), new_mask))
5410 goto out; 5309 goto out;
5411 5310
5412 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5311 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5312 if (migrate_task(p, dest_cpu)) {
5313 struct migration_arg arg = { p, dest_cpu };
5413 /* Need help from migration thread: drop lock and wait. */ 5314 /* Need help from migration thread: drop lock and wait. */
5414 struct task_struct *mt = rq->migration_thread;
5415
5416 get_task_struct(mt);
5417 task_rq_unlock(rq, &flags); 5315 task_rq_unlock(rq, &flags);
5418 wake_up_process(mt); 5316 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5419 put_task_struct(mt);
5420 wait_for_completion(&req.done);
5421 tlb_migrate_finish(p->mm); 5317 tlb_migrate_finish(p->mm);
5422 return 0; 5318 return 0;
5423 } 5319 }
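set_cpus_allowed_ptr() now serializes against the transient TASK_WAKING state by spinning outside the lock, taking the runqueue lock, and rechecking, retrying if the task went back to waking in between. That check, lock, recheck, retry shape can be shown with ordinary user-space primitives; everything below is illustrative and none of it is the kernel's locking.

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_bool waking;              /* stand-in for task_is_waking() */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static void change_affinity(void)
{
again:
        while (atomic_load(&waking))    /* wait outside the lock         */
                sched_yield();          /* cpu_relax() in the original   */

        pthread_mutex_lock(&rq_lock);
        if (atomic_load(&waking)) {     /* raced: it went waking again   */
                pthread_mutex_unlock(&rq_lock);
                goto again;
        }
        /* the mask can be updated safely here */
        puts("affinity updated");
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        change_affinity();              /* trivially succeeds: never waking */
        return 0;
}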
@@ -5475,98 +5371,49 @@ fail:
5475 return ret; 5371 return ret;
5476} 5372}
5477 5373
5478#define RCU_MIGRATION_IDLE 0
5479#define RCU_MIGRATION_NEED_QS 1
5480#define RCU_MIGRATION_GOT_QS 2
5481#define RCU_MIGRATION_MUST_SYNC 3
5482
5483/* 5374/*
5484 * migration_thread - this is a highprio system thread that performs 5375 * migration_cpu_stop - this will be executed by a highprio stopper thread
5485 * thread migration by bumping thread off CPU then 'pushing' onto 5376 * and performs thread migration by bumping thread off CPU then
5486 * another runqueue. 5377 * 'pushing' onto another runqueue.
5487 */ 5378 */
5488static int migration_thread(void *data) 5379static int migration_cpu_stop(void *data)
5489{
5490 int badcpu;
5491 int cpu = (long)data;
5492 struct rq *rq;
5493
5494 rq = cpu_rq(cpu);
5495 BUG_ON(rq->migration_thread != current);
5496
5497 set_current_state(TASK_INTERRUPTIBLE);
5498 while (!kthread_should_stop()) {
5499 struct migration_req *req;
5500 struct list_head *head;
5501
5502 raw_spin_lock_irq(&rq->lock);
5503
5504 if (cpu_is_offline(cpu)) {
5505 raw_spin_unlock_irq(&rq->lock);
5506 break;
5507 }
5508
5509 if (rq->active_balance) {
5510 active_load_balance(rq, cpu);
5511 rq->active_balance = 0;
5512 }
5513
5514 head = &rq->migration_queue;
5515
5516 if (list_empty(head)) {
5517 raw_spin_unlock_irq(&rq->lock);
5518 schedule();
5519 set_current_state(TASK_INTERRUPTIBLE);
5520 continue;
5521 }
5522 req = list_entry(head->next, struct migration_req, list);
5523 list_del_init(head->next);
5524
5525 if (req->task != NULL) {
5526 raw_spin_unlock(&rq->lock);
5527 __migrate_task(req->task, cpu, req->dest_cpu);
5528 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5529 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5530 raw_spin_unlock(&rq->lock);
5531 } else {
5532 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5533 raw_spin_unlock(&rq->lock);
5534 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5535 }
5536 local_irq_enable();
5537
5538 complete(&req->done);
5539 }
5540 __set_current_state(TASK_RUNNING);
5541
5542 return 0;
5543}
5544
5545#ifdef CONFIG_HOTPLUG_CPU
5546
5547static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5548{ 5380{
5549 int ret; 5381 struct migration_arg *arg = data;
5550 5382
5383 /*
5384 * The original target cpu might have gone down and we might
5385 * be on another cpu but it doesn't matter.
5386 */
5551 local_irq_disable(); 5387 local_irq_disable();
5552 ret = __migrate_task(p, src_cpu, dest_cpu); 5388 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5553 local_irq_enable(); 5389 local_irq_enable();
5554 return ret; 5390 return 0;
5555} 5391}
5556 5392
5393#ifdef CONFIG_HOTPLUG_CPU
5557/* 5394/*
5558 * Figure out where task on dead CPU should go, use force if necessary. 5395 * Figure out where task on dead CPU should go, use force if necessary.
5559 */ 5396 */
5560static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5397void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5561{ 5398{
5562 int dest_cpu; 5399 struct rq *rq = cpu_rq(dead_cpu);
5400 int needs_cpu, uninitialized_var(dest_cpu);
5401 unsigned long flags;
5563 5402
5564again: 5403 local_irq_save(flags);
5565 dest_cpu = select_fallback_rq(dead_cpu, p);
5566 5404
5567 /* It can have affinity changed while we were choosing. */ 5405 raw_spin_lock(&rq->lock);
5568 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5406 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5569 goto again; 5407 if (needs_cpu)
5408 dest_cpu = select_fallback_rq(dead_cpu, p);
5409 raw_spin_unlock(&rq->lock);
5410 /*
5411 * It can only fail if we race with set_cpus_allowed(),
 5412 * in which case the racer should migrate the task anyway.
5413 */
5414 if (needs_cpu)
5415 __migrate_task(p, dead_cpu, dest_cpu);
5416 local_irq_restore(flags);
5570} 5417}
5571 5418
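move_task_off_dead_cpu() now takes the runqueue lock itself, confirms the task is still parked on the dead CPU and not mid-wakeup, and only then asks select_fallback_rq() for a destination. The helper below sketches one plausible fallback policy (first CPU that is both allowed and online); it is a guess at the idea for illustration, not the kernel's select_fallback_rq(), which also handles cpuset constraints and last-resort fallbacks.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

/* Pick the first CPU that is both allowed and online, or -1 if none. */
static int pick_fallback_cpu(const bool allowed[NR_CPUS],
                             const bool online[NR_CPUS], int dead_cpu)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (cpu != dead_cpu && allowed[cpu] && online[cpu])
                        return cpu;
        return -1;      /* caller would then widen the mask */
}

int main(void)
{
        bool allowed[NR_CPUS] = { true, false, true, true };
        bool online[NR_CPUS]  = { false, true, true, true };

        /* CPU 0 just died; the task was bound to {0, 2, 3} */
        printf("fallback cpu: %d\n", pick_fallback_cpu(allowed, online, 0));
        return 0;
}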
5572/* 5419/*
@@ -5630,7 +5477,6 @@ void sched_idle_next(void)
5630 5477
5631 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5478 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5632 5479
5633 update_rq_clock(rq);
5634 activate_task(rq, p, 0); 5480 activate_task(rq, p, 0);
5635 5481
5636 raw_spin_unlock_irqrestore(&rq->lock, flags); 5482 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5685,7 +5531,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5685 for ( ; ; ) { 5531 for ( ; ; ) {
5686 if (!rq->nr_running) 5532 if (!rq->nr_running)
5687 break; 5533 break;
5688 update_rq_clock(rq);
5689 next = pick_next_task(rq); 5534 next = pick_next_task(rq);
5690 if (!next) 5535 if (!next)
5691 break; 5536 break;
@@ -5908,35 +5753,20 @@ static void set_rq_offline(struct rq *rq)
5908static int __cpuinit 5753static int __cpuinit
5909migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5754migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5910{ 5755{
5911 struct task_struct *p;
5912 int cpu = (long)hcpu; 5756 int cpu = (long)hcpu;
5913 unsigned long flags; 5757 unsigned long flags;
5914 struct rq *rq; 5758 struct rq *rq = cpu_rq(cpu);
5915 5759
5916 switch (action) { 5760 switch (action) {
5917 5761
5918 case CPU_UP_PREPARE: 5762 case CPU_UP_PREPARE:
5919 case CPU_UP_PREPARE_FROZEN: 5763 case CPU_UP_PREPARE_FROZEN:
5920 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5921 if (IS_ERR(p))
5922 return NOTIFY_BAD;
5923 kthread_bind(p, cpu);
5924 /* Must be high prio: stop_machine expects to yield to it. */
5925 rq = task_rq_lock(p, &flags);
5926 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5927 task_rq_unlock(rq, &flags);
5928 get_task_struct(p);
5929 cpu_rq(cpu)->migration_thread = p;
5930 rq->calc_load_update = calc_load_update; 5764 rq->calc_load_update = calc_load_update;
5931 break; 5765 break;
5932 5766
5933 case CPU_ONLINE: 5767 case CPU_ONLINE:
5934 case CPU_ONLINE_FROZEN: 5768 case CPU_ONLINE_FROZEN:
5935 /* Strictly unnecessary, as first user will wake it. */
5936 wake_up_process(cpu_rq(cpu)->migration_thread);
5937
5938 /* Update our root-domain */ 5769 /* Update our root-domain */
5939 rq = cpu_rq(cpu);
5940 raw_spin_lock_irqsave(&rq->lock, flags); 5770 raw_spin_lock_irqsave(&rq->lock, flags);
5941 if (rq->rd) { 5771 if (rq->rd) {
5942 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5772 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5947,61 +5777,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5947 break; 5777 break;
5948 5778
5949#ifdef CONFIG_HOTPLUG_CPU 5779#ifdef CONFIG_HOTPLUG_CPU
5950 case CPU_UP_CANCELED:
5951 case CPU_UP_CANCELED_FROZEN:
5952 if (!cpu_rq(cpu)->migration_thread)
5953 break;
5954 /* Unbind it from offline cpu so it can run. Fall thru. */
5955 kthread_bind(cpu_rq(cpu)->migration_thread,
5956 cpumask_any(cpu_online_mask));
5957 kthread_stop(cpu_rq(cpu)->migration_thread);
5958 put_task_struct(cpu_rq(cpu)->migration_thread);
5959 cpu_rq(cpu)->migration_thread = NULL;
5960 break;
5961
5962 case CPU_DEAD: 5780 case CPU_DEAD:
5963 case CPU_DEAD_FROZEN: 5781 case CPU_DEAD_FROZEN:
5964 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5965 migrate_live_tasks(cpu); 5782 migrate_live_tasks(cpu);
5966 rq = cpu_rq(cpu);
5967 kthread_stop(rq->migration_thread);
5968 put_task_struct(rq->migration_thread);
5969 rq->migration_thread = NULL;
5970 /* Idle task back to normal (off runqueue, low prio) */ 5783 /* Idle task back to normal (off runqueue, low prio) */
5971 raw_spin_lock_irq(&rq->lock); 5784 raw_spin_lock_irq(&rq->lock);
5972 update_rq_clock(rq);
5973 deactivate_task(rq, rq->idle, 0); 5785 deactivate_task(rq, rq->idle, 0);
5974 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5786 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5975 rq->idle->sched_class = &idle_sched_class; 5787 rq->idle->sched_class = &idle_sched_class;
5976 migrate_dead_tasks(cpu); 5788 migrate_dead_tasks(cpu);
5977 raw_spin_unlock_irq(&rq->lock); 5789 raw_spin_unlock_irq(&rq->lock);
5978 cpuset_unlock();
5979 migrate_nr_uninterruptible(rq); 5790 migrate_nr_uninterruptible(rq);
5980 BUG_ON(rq->nr_running != 0); 5791 BUG_ON(rq->nr_running != 0);
5981 calc_global_load_remove(rq); 5792 calc_global_load_remove(rq);
5982 /*
5983 * No need to migrate the tasks: it was best-effort if
5984 * they didn't take sched_hotcpu_mutex. Just wake up
5985 * the requestors.
5986 */
5987 raw_spin_lock_irq(&rq->lock);
5988 while (!list_empty(&rq->migration_queue)) {
5989 struct migration_req *req;
5990
5991 req = list_entry(rq->migration_queue.next,
5992 struct migration_req, list);
5993 list_del_init(&req->list);
5994 raw_spin_unlock_irq(&rq->lock);
5995 complete(&req->done);
5996 raw_spin_lock_irq(&rq->lock);
5997 }
5998 raw_spin_unlock_irq(&rq->lock);
5999 break; 5793 break;
6000 5794
6001 case CPU_DYING: 5795 case CPU_DYING:
6002 case CPU_DYING_FROZEN: 5796 case CPU_DYING_FROZEN:
6003 /* Update our root-domain */ 5797 /* Update our root-domain */
6004 rq = cpu_rq(cpu);
6005 raw_spin_lock_irqsave(&rq->lock, flags); 5798 raw_spin_lock_irqsave(&rq->lock, flags);
6006 if (rq->rd) { 5799 if (rq->rd) {
6007 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5800 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6332,6 +6125,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6332 struct rq *rq = cpu_rq(cpu); 6125 struct rq *rq = cpu_rq(cpu);
6333 struct sched_domain *tmp; 6126 struct sched_domain *tmp;
6334 6127
6128 for (tmp = sd; tmp; tmp = tmp->parent)
6129 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6130
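Caching cpumask_weight() of each domain into sd->span_weight at attach time turns repeated popcounts over the span mask into a plain integer read on hot paths (select_task_rq_fair() below compares weights this way). A tiny illustration of the same caching, with a hypothetical domain struct and a hand-rolled popcount:

#include <stdio.h>

struct fake_domain {
        unsigned long span;             /* bitmask of CPUs in the domain */
        unsigned int span_weight;       /* cached popcount of ->span     */
        struct fake_domain *parent;
};

static unsigned int popcount(unsigned long x)
{
        unsigned int n = 0;

        for (; x; x &= x - 1)           /* clear the lowest set bit      */
                n++;
        return n;
}

/* Done once when the domain tree is attached, as in the hunk above. */
static void cache_span_weights(struct fake_domain *sd)
{
        for (struct fake_domain *tmp = sd; tmp; tmp = tmp->parent)
                tmp->span_weight = popcount(tmp->span);
}

int main(void)
{
        struct fake_domain node = { 0xffUL, 0, NULL };   /* 8 CPUs */
        struct fake_domain core = { 0x03UL, 0, &node };  /* 2 CPUs */

        cache_span_weights(&core);
        printf("core=%u node=%u\n", core.span_weight, node.span_weight);
        return 0;
}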
6335 /* Remove the sched domains which do not contribute to scheduling. */ 6131 /* Remove the sched domains which do not contribute to scheduling. */
6336 for (tmp = sd; tmp; ) { 6132 for (tmp = sd; tmp; ) {
6337 struct sched_domain *parent = tmp->parent; 6133 struct sched_domain *parent = tmp->parent;
@@ -7815,10 +7611,8 @@ void __init sched_init(void)
7815 rq->push_cpu = 0; 7611 rq->push_cpu = 0;
7816 rq->cpu = i; 7612 rq->cpu = i;
7817 rq->online = 0; 7613 rq->online = 0;
7818 rq->migration_thread = NULL;
7819 rq->idle_stamp = 0; 7614 rq->idle_stamp = 0;
7820 rq->avg_idle = 2*sysctl_sched_migration_cost; 7615 rq->avg_idle = 2*sysctl_sched_migration_cost;
7821 INIT_LIST_HEAD(&rq->migration_queue);
7822 rq_attach_root(rq, &def_root_domain); 7616 rq_attach_root(rq, &def_root_domain);
7823#endif 7617#endif
7824 init_rq_hrtick(rq); 7618 init_rq_hrtick(rq);
@@ -7919,7 +7713,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7919{ 7713{
7920 int on_rq; 7714 int on_rq;
7921 7715
7922 update_rq_clock(rq);
7923 on_rq = p->se.on_rq; 7716 on_rq = p->se.on_rq;
7924 if (on_rq) 7717 if (on_rq)
7925 deactivate_task(rq, p, 0); 7718 deactivate_task(rq, p, 0);
@@ -7946,9 +7739,9 @@ void normalize_rt_tasks(void)
7946 7739
7947 p->se.exec_start = 0; 7740 p->se.exec_start = 0;
7948#ifdef CONFIG_SCHEDSTATS 7741#ifdef CONFIG_SCHEDSTATS
7949 p->se.wait_start = 0; 7742 p->se.statistics.wait_start = 0;
7950 p->se.sleep_start = 0; 7743 p->se.statistics.sleep_start = 0;
7951 p->se.block_start = 0; 7744 p->se.statistics.block_start = 0;
7952#endif 7745#endif
7953 7746
7954 if (!rt_task(p)) { 7747 if (!rt_task(p)) {
@@ -7975,9 +7768,9 @@ void normalize_rt_tasks(void)
7975 7768
7976#endif /* CONFIG_MAGIC_SYSRQ */ 7769#endif /* CONFIG_MAGIC_SYSRQ */
7977 7770
7978#ifdef CONFIG_IA64 7771#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7979/* 7772/*
7980 * These functions are only useful for the IA64 MCA handling. 7773 * These functions are only useful for the IA64 MCA handling, or kdb.
7981 * 7774 *
7982 * They can only be called when the whole system has been 7775 * They can only be called when the whole system has been
7983 * stopped - every CPU needs to be quiescent, and no scheduling 7776 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -7997,6 +7790,9 @@ struct task_struct *curr_task(int cpu)
7997 return cpu_curr(cpu); 7790 return cpu_curr(cpu);
7998} 7791}
7999 7792
7793#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7794
7795#ifdef CONFIG_IA64
8000/** 7796/**
8001 * set_curr_task - set the current task for a given cpu. 7797 * set_curr_task - set the current task for a given cpu.
8002 * @cpu: the processor in question. 7798 * @cpu: the processor in question.
@@ -8281,8 +8077,6 @@ void sched_move_task(struct task_struct *tsk)
8281 8077
8282 rq = task_rq_lock(tsk, &flags); 8078 rq = task_rq_lock(tsk, &flags);
8283 8079
8284 update_rq_clock(rq);
8285
8286 running = task_current(rq, tsk); 8080 running = task_current(rq, tsk);
8287 on_rq = tsk->se.on_rq; 8081 on_rq = tsk->se.on_rq;
8288 8082
@@ -8301,7 +8095,7 @@ void sched_move_task(struct task_struct *tsk)
8301 if (unlikely(running)) 8095 if (unlikely(running))
8302 tsk->sched_class->set_curr_task(rq); 8096 tsk->sched_class->set_curr_task(rq);
8303 if (on_rq) 8097 if (on_rq)
8304 enqueue_task(rq, tsk, 0, false); 8098 enqueue_task(rq, tsk, 0);
8305 8099
8306 task_rq_unlock(rq, &flags); 8100 task_rq_unlock(rq, &flags);
8307} 8101}
@@ -9115,43 +8909,32 @@ struct cgroup_subsys cpuacct_subsys = {
9115 8909
9116#ifndef CONFIG_SMP 8910#ifndef CONFIG_SMP
9117 8911
9118int rcu_expedited_torture_stats(char *page)
9119{
9120 return 0;
9121}
9122EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9123
9124void synchronize_sched_expedited(void) 8912void synchronize_sched_expedited(void)
9125{ 8913{
8914 barrier();
9126} 8915}
9127EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8916EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9128 8917
9129#else /* #ifndef CONFIG_SMP */ 8918#else /* #ifndef CONFIG_SMP */
9130 8919
9131static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8920static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9132static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9133 8921
9134#define RCU_EXPEDITED_STATE_POST -2 8922static int synchronize_sched_expedited_cpu_stop(void *data)
9135#define RCU_EXPEDITED_STATE_IDLE -1
9136
9137static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9138
9139int rcu_expedited_torture_stats(char *page)
9140{ 8923{
9141 int cnt = 0; 8924 /*
9142 int cpu; 8925 * There must be a full memory barrier on each affected CPU
9143 8926 * between the time that try_stop_cpus() is called and the
9144 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8927 * time that it returns.
9145 for_each_online_cpu(cpu) { 8928 *
9146 cnt += sprintf(&page[cnt], " %d:%d", 8929 * In the current initial implementation of cpu_stop, the
9147 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8930 * above condition is already met when the control reaches
9148 } 8931 * this point and the following smp_mb() is not strictly
9149 cnt += sprintf(&page[cnt], "\n"); 8932 * necessary. Do smp_mb() anyway for documentation and
9150 return cnt; 8933 * robustness against future implementation changes.
8934 */
8935 smp_mb(); /* See above comment block. */
8936 return 0;
9151} 8937}
9152EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9153
9154static long synchronize_sched_expedited_count;
9155 8938
9156/* 8939/*
9157 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8940 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9165,18 +8948,14 @@ static long synchronize_sched_expedited_count;
9165 */ 8948 */
9166void synchronize_sched_expedited(void) 8949void synchronize_sched_expedited(void)
9167{ 8950{
9168 int cpu; 8951 int snap, trycount = 0;
9169 unsigned long flags;
9170 bool need_full_sync = 0;
9171 struct rq *rq;
9172 struct migration_req *req;
9173 long snap;
9174 int trycount = 0;
9175 8952
9176 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8953 smp_mb(); /* ensure prior mod happens before capturing snap. */
9177 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8954 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9178 get_online_cpus(); 8955 get_online_cpus();
9179 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8956 while (try_stop_cpus(cpu_online_mask,
8957 synchronize_sched_expedited_cpu_stop,
8958 NULL) == -EAGAIN) {
9180 put_online_cpus(); 8959 put_online_cpus();
9181 if (trycount++ < 10) 8960 if (trycount++ < 10)
9182 udelay(trycount * num_online_cpus()); 8961 udelay(trycount * num_online_cpus());
@@ -9184,41 +8963,15 @@ void synchronize_sched_expedited(void)
9184 synchronize_sched(); 8963 synchronize_sched();
9185 return; 8964 return;
9186 } 8965 }
9187 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8966 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9188 smp_mb(); /* ensure test happens before caller kfree */ 8967 smp_mb(); /* ensure test happens before caller kfree */
9189 return; 8968 return;
9190 } 8969 }
9191 get_online_cpus(); 8970 get_online_cpus();
9192 } 8971 }
9193 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8972 atomic_inc(&synchronize_sched_expedited_count);
9194 for_each_online_cpu(cpu) { 8973 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9195 rq = cpu_rq(cpu);
9196 req = &per_cpu(rcu_migration_req, cpu);
9197 init_completion(&req->done);
9198 req->task = NULL;
9199 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9200 raw_spin_lock_irqsave(&rq->lock, flags);
9201 list_add(&req->list, &rq->migration_queue);
9202 raw_spin_unlock_irqrestore(&rq->lock, flags);
9203 wake_up_process(rq->migration_thread);
9204 }
9205 for_each_online_cpu(cpu) {
9206 rcu_expedited_state = cpu;
9207 req = &per_cpu(rcu_migration_req, cpu);
9208 rq = cpu_rq(cpu);
9209 wait_for_completion(&req->done);
9210 raw_spin_lock_irqsave(&rq->lock, flags);
9211 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9212 need_full_sync = 1;
9213 req->dest_cpu = RCU_MIGRATION_IDLE;
9214 raw_spin_unlock_irqrestore(&rq->lock, flags);
9215 }
9216 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9217 synchronize_sched_expedited_count++;
9218 mutex_unlock(&rcu_sched_expedited_mutex);
9219 put_online_cpus(); 8974 put_online_cpus();
9220 if (need_full_sync)
9221 synchronize_sched();
9222} 8975}
9223EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8976EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9224 8977
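The rewritten synchronize_sched_expedited() relies on two things: try_stop_cpus() runs a stopper on every online CPU, which forces each of them through a context switch and hence a quiescent state, and a global counter lets a caller that keeps losing the -EAGAIN race piggyback on an expedited pass that completed entirely after its snapshot. The snapshot test can be shown in isolation; the code below is a user-space model with invented names, not the kernel's RCU code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int expedited_count;      /* bumped once per completed pass */

/* Strict '>' as in the hunk: a pass must have both started and finished
 * after our snapshot, not merely finished, before we may skip our own. */
static bool someone_else_finished(int snap)
{
        return atomic_load(&expedited_count) - snap > 0;
}

int main(void)
{
        int snap = atomic_load(&expedited_count) + 1;

        printf("skip? %d\n", someone_else_finished(snap));  /* 0 */
        atomic_fetch_add(&expedited_count, 1);  /* a pass already in flight ends */
        printf("skip? %d\n", someone_else_finished(snap));  /* still 0 */
        atomic_fetch_add(&expedited_count, 1);  /* a later, full pass ends */
        printf("skip? %d\n", someone_else_finished(snap));  /* 1 */
        return 0;
}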
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 19be00ba6123..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -175,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
175 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
176 176
177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
178#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
179 {
180 uid_t uid = cfs_rq->tg->uid;
181 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
182 }
183#else 178#else
184 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
185#endif 180#endif
@@ -386,15 +381,9 @@ __initcall(init_sched_debug_procfs);
386void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
387{ 382{
388 unsigned long nr_switches; 383 unsigned long nr_switches;
389 unsigned long flags;
390 int num_threads = 1;
391
392 if (lock_task_sighand(p, &flags)) {
393 num_threads = atomic_read(&p->signal->count);
394 unlock_task_sighand(p, &flags);
395 }
396 384
397 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
398 SEQ_printf(m, 387 SEQ_printf(m,
399 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
400#define __P(F) \ 389#define __P(F) \
@@ -409,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
409 PN(se.exec_start); 398 PN(se.exec_start);
410 PN(se.vruntime); 399 PN(se.vruntime);
411 PN(se.sum_exec_runtime); 400 PN(se.sum_exec_runtime);
412 PN(se.avg_overlap);
413 PN(se.avg_wakeup);
414 401
415 nr_switches = p->nvcsw + p->nivcsw; 402 nr_switches = p->nvcsw + p->nivcsw;
416 403
417#ifdef CONFIG_SCHEDSTATS 404#ifdef CONFIG_SCHEDSTATS
418 PN(se.wait_start); 405 PN(se.statistics.wait_start);
419 PN(se.sleep_start); 406 PN(se.statistics.sleep_start);
420 PN(se.block_start); 407 PN(se.statistics.block_start);
421 PN(se.sleep_max); 408 PN(se.statistics.sleep_max);
422 PN(se.block_max); 409 PN(se.statistics.block_max);
423 PN(se.exec_max); 410 PN(se.statistics.exec_max);
424 PN(se.slice_max); 411 PN(se.statistics.slice_max);
425 PN(se.wait_max); 412 PN(se.statistics.wait_max);
426 PN(se.wait_sum); 413 PN(se.statistics.wait_sum);
427 P(se.wait_count); 414 P(se.statistics.wait_count);
428 PN(se.iowait_sum); 415 PN(se.statistics.iowait_sum);
429 P(se.iowait_count); 416 P(se.statistics.iowait_count);
430 P(sched_info.bkl_count); 417 P(sched_info.bkl_count);
431 P(se.nr_migrations); 418 P(se.nr_migrations);
432 P(se.nr_migrations_cold); 419 P(se.statistics.nr_migrations_cold);
433 P(se.nr_failed_migrations_affine); 420 P(se.statistics.nr_failed_migrations_affine);
434 P(se.nr_failed_migrations_running); 421 P(se.statistics.nr_failed_migrations_running);
435 P(se.nr_failed_migrations_hot); 422 P(se.statistics.nr_failed_migrations_hot);
436 P(se.nr_forced_migrations); 423 P(se.statistics.nr_forced_migrations);
437 P(se.nr_wakeups); 424 P(se.statistics.nr_wakeups);
438 P(se.nr_wakeups_sync); 425 P(se.statistics.nr_wakeups_sync);
439 P(se.nr_wakeups_migrate); 426 P(se.statistics.nr_wakeups_migrate);
440 P(se.nr_wakeups_local); 427 P(se.statistics.nr_wakeups_local);
441 P(se.nr_wakeups_remote); 428 P(se.statistics.nr_wakeups_remote);
442 P(se.nr_wakeups_affine); 429 P(se.statistics.nr_wakeups_affine);
443 P(se.nr_wakeups_affine_attempts); 430 P(se.statistics.nr_wakeups_affine_attempts);
444 P(se.nr_wakeups_passive); 431 P(se.statistics.nr_wakeups_passive);
445 P(se.nr_wakeups_idle); 432 P(se.statistics.nr_wakeups_idle);
446 433
447 { 434 {
448 u64 avg_atom, avg_per_cpu; 435 u64 avg_atom, avg_per_cpu;
@@ -493,31 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
493void proc_sched_set_task(struct task_struct *p) 480void proc_sched_set_task(struct task_struct *p)
494{ 481{
495#ifdef CONFIG_SCHEDSTATS 482#ifdef CONFIG_SCHEDSTATS
496 p->se.wait_max = 0; 483 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
497 p->se.wait_sum = 0;
498 p->se.wait_count = 0;
499 p->se.iowait_sum = 0;
500 p->se.iowait_count = 0;
501 p->se.sleep_max = 0;
502 p->se.sum_sleep_runtime = 0;
503 p->se.block_max = 0;
504 p->se.exec_max = 0;
505 p->se.slice_max = 0;
506 p->se.nr_migrations = 0;
507 p->se.nr_migrations_cold = 0;
508 p->se.nr_failed_migrations_affine = 0;
509 p->se.nr_failed_migrations_running = 0;
510 p->se.nr_failed_migrations_hot = 0;
511 p->se.nr_forced_migrations = 0;
512 p->se.nr_wakeups = 0;
513 p->se.nr_wakeups_sync = 0;
514 p->se.nr_wakeups_migrate = 0;
515 p->se.nr_wakeups_local = 0;
516 p->se.nr_wakeups_remote = 0;
517 p->se.nr_wakeups_affine = 0;
518 p->se.nr_wakeups_affine_attempts = 0;
519 p->se.nr_wakeups_passive = 0;
520 p->se.nr_wakeups_idle = 0;
521 p->sched_info.bkl_count = 0;
522#endif 484#endif
523} 485}
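Grouping the schedstat fields into se.statistics is what lets the long per-field zeroing above collapse into a single memset(). A stripped-down illustration of the same refactor; the field names are abbreviated and this is not the real sched_entity layout.

#include <stdio.h>
#include <string.h>

struct sched_statistics {       /* only cleared-together fields live here */
        unsigned long long wait_max, wait_sum, sleep_max, block_max;
        unsigned long wait_count, iowait_count;
};

struct fake_sched_entity {
        unsigned long long sum_exec_runtime;  /* must survive a stats reset */
        struct sched_statistics statistics;
};

static void reset_stats(struct fake_sched_entity *se)
{
        /* one memset replaces a per-field list of assignments */
        memset(&se->statistics, 0, sizeof(se->statistics));
}

int main(void)
{
        struct fake_sched_entity se = { .sum_exec_runtime = 123,
                                        .statistics = { .wait_max = 7 } };

        reset_stats(&se);
        printf("runtime=%llu wait_max=%llu\n",
               se.sum_exec_runtime, se.statistics.wait_max);
        return 0;
}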
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..217e4a9393e4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
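The retuned defaults keep the documented invariant: sched_nr_latency equals sysctl_sched_latency / sysctl_sched_min_granularity, which with the new values is 6 ms / 2 ms = 3. Once a runqueue holds more than that many tasks, the scheduling period stretches so each still gets at least the minimum granularity, roughly as in the sketch below (a simplification of __sched_period(), which this hunk does not show).

#include <stdio.h>

static const unsigned long long latency_ns       = 6000000ULL; /* 6 ms */
static const unsigned long long min_gran_ns      = 2000000ULL; /* 2 ms */
static const unsigned long      sched_nr_latency = 3;          /* 6 / 2 */

/* Period that all runnable tasks share one slice of; stretches under load. */
static unsigned long long period(unsigned long nr_running)
{
        if (nr_running > sched_nr_latency)
                return nr_running * min_gran_ns;
        return latency_ns;
}

int main(void)
{
        for (unsigned long n = 1; n <= 5; n++)
                printf("nr_running=%lu period=%llu ns\n", n, period(n));
        return 0;
}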
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
777 * through callig update_curr(). 765 * through callig update_curr().
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,7 +1222,6 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
@@ -1255,18 +1236,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1236 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1237 this_load = target_load(this_cpu, idx);
1257 1238
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1239 /*
1271 * If sync wakeup then subtract the (maximum possible) 1240 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1241 * effect of the currently running task from the load
@@ -1306,7 +1275,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1275 if (sync && balanced)
1307 return 1; 1276 return 1;
1308 1277
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1278 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1279 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1280
1312 if (balanced || 1281 if (balanced ||
@@ -1318,7 +1287,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1287 * there is no bad imbalance.
1319 */ 1288 */
1320 schedstat_inc(sd, ttwu_move_affine); 1289 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1290 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1291
1323 return 1; 1292 return 1;
1324 } 1293 }
@@ -1406,29 +1375,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1375/*
1407 * Try and locate an idle CPU in the sched_domain. 1376 * Try and locate an idle CPU in the sched_domain.
1408 */ 1377 */
1409static int 1378static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1379{
1412 int cpu = smp_processor_id(); 1380 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1381 int prev_cpu = task_cpu(p);
1382 struct sched_domain *sd;
1414 int i; 1383 int i;
1415 1384
1416 /* 1385 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1386 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1387 * already idle, then it is the right target.
1419 * always a better target than the current cpu.
1420 */ 1388 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1389 if (target == cpu && idle_cpu(cpu))
1390 return cpu;
1391
1392 /*
1393 * If the task is going to be woken-up on the cpu where it previously
 1394 * ran and if it is currently idle, then it is the right target.
1395 */
1396 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1397 return prev_cpu;
1423 1398
1424 /* 1399 /*
 1425 * Otherwise, iterate the domain and find an elegible idle cpu. 1400 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1401 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1402 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1403 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1404 break;
1405
1406 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1407 if (idle_cpu(i)) {
1408 target = i;
1409 break;
1410 }
1431 } 1411 }
1412
1413 /*
1414 * Lets stop looking for an idle sibling when we reached
1415 * the domain that spans the current cpu and prev_cpu.
1416 */
1417 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1418 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1419 break;
1432 } 1420 }
1433 1421
1434 return target; 1422 return target;
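The new select_idle_sibling() walks the domain hierarchy outward from the target, only considers cache-sharing levels, takes the first idle CPU it finds, and stops once a level already spans both the waking CPU and prev_cpu. The model below mimics that walk with fixed arrays standing in for sched domains and idle state; it leaves out the cpus_allowed filtering and the early "target is already idle" returns, and none of its names are kernel APIs.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4
#define NR_LEVELS 2

/* level 0: cache-sharing pairs; level 1: whole package (no cache sharing) */
static const bool span[NR_LEVELS][NR_CPUS][NR_CPUS] = {
        { {1,1,0,0}, {1,1,0,0}, {0,0,1,1}, {0,0,1,1} },
        { {1,1,1,1}, {1,1,1,1}, {1,1,1,1}, {1,1,1,1} },
};
static const bool shares_cache[NR_LEVELS] = { true, false };

static int pick_idle_sibling(int target, int cpu, int prev_cpu,
                             const bool idle[NR_CPUS])
{
        for (int lvl = 0; lvl < NR_LEVELS; lvl++) {
                if (!shares_cache[lvl])
                        break;          /* too far for a cheap wakeup */
                for (int i = 0; i < NR_CPUS; i++) {
                        if (span[lvl][target][i] && idle[i]) {
                                target = i;
                                break;
                        }
                }
                /* stop once this level spans both cpu and prev_cpu */
                if (span[lvl][target][cpu] && span[lvl][target][prev_cpu])
                        break;
        }
        return target;
}

int main(void)
{
        bool idle[NR_CPUS] = { false, true, false, false };

        /* waking on cpu 0, previously ran on cpu 2: sibling 1 is idle */
        printf("target=%d\n", pick_idle_sibling(0, 0, 2, idle));
        return 0;
}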
@@ -1445,7 +1433,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1433 *
1446 * preempt must be disabled. 1434 * preempt must be disabled.
1447 */ 1435 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1436static int
1437select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1438{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1439 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1440 int cpu = smp_processor_id();
@@ -1456,8 +1445,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1445 int sync = wake_flags & WF_SYNC;
1457 1446
1458 if (sd_flag & SD_BALANCE_WAKE) { 1447 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1448 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1449 want_affine = 1;
1462 new_cpu = prev_cpu; 1450 new_cpu = prev_cpu;
1463 } 1451 }
@@ -1491,34 +1479,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1479 }
1492 1480
1493 /* 1481 /*
1494 * While iterating the domains looking for a spanning 1482 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1483 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1484 */
1498 if (want_affine) { 1485 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1486 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1487 affine_sd = tmp;
1501 /* 1488 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1489 }
1523 1490
1524 if (!want_sd && !want_affine) 1491 if (!want_sd && !want_affine)
@@ -1531,22 +1498,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1498 sd = tmp;
1532 } 1499 }
1533 1500
1501#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1502 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1503 /*
1536 * Pick the largest domain to update shares over 1504 * Pick the largest domain to update shares over
1537 */ 1505 */
1538 tmp = sd; 1506 tmp = sd;
1539 if (affine_sd && (!tmp || 1507 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1508 tmp = affine_sd;
1543 1509
1544 if (tmp) 1510 if (tmp) {
1511 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1512 update_shares(tmp);
1513 raw_spin_lock(&rq->lock);
1514 }
1546 } 1515 }
1516#endif
1547 1517
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1518 if (affine_sd) {
1549 return cpu; 1519 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1520 return select_idle_sibling(p, cpu);
1521 else
1522 return select_idle_sibling(p, prev_cpu);
1523 }
1550 1524
1551 while (sd) { 1525 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1526 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1550,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1550
1577 /* Now try balancing at a lower domain level of new_cpu */ 1551 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1552 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1553 weight = sd->span_weight;
1580 sd = NULL; 1554 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1555 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1556 if (weight <= tmp->span_weight)
1583 break; 1557 break;
1584 if (tmp->flags & sd_flag) 1558 if (tmp->flags & sd_flag)
1585 sd = tmp; 1559 sd = tmp;
@@ -1591,63 +1565,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1565}
1592#endif /* CONFIG_SMP */ 1566#endif /* CONFIG_SMP */
1593 1567
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1568static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1569wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1570{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1571 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1572
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1573 /*
1631	 * Since it's curr running now, convert the gran from real-time	1574	 * Since it's curr running now, convert the gran from real-time
1632	 * to virtual-time in its units.	1575	 * to virtual-time in its units.
1576 *
1577 * By using 'se' instead of 'curr' we penalize light tasks, so
1578 * they get preempted easier. That is, if 'se' < 'curr' then
1579 * the resulting gran will be larger, therefore penalizing the
1580 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1581 * be smaller, again penalizing the lighter task.
1582 *
1583 * This is especially important for buddies when the leftmost
1584 * task is higher priority than the buddy.
1633 */ 1585 */
1634 if (sched_feat(ASYM_GRAN)) { 1586 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1587 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1588
1652 return gran; 1589 return gran;
1653} 1590}
@@ -1705,7 +1642,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1642 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1643 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1644 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1645 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1646
1711 if (unlikely(rt_prio(p->prio))) 1647 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1674,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1674 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1675 goto preempt;
1740 1676
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1677 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1678 return;
1751 1679
@@ -1844,13 +1772,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1772 * 3) are cache-hot on their current CPU.
1845 */ 1773 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1774 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1775 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1776 return 0;
1849 } 1777 }
1850 *all_pinned = 0; 1778 *all_pinned = 0;
1851 1779
1852 if (task_running(rq, p)) { 1780 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1781 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1782 return 0;
1855 } 1783 }
1856 1784
@@ -1866,14 +1794,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1794#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1795 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1796 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1797 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1798 }
1871#endif 1799#endif
1872 return 1; 1800 return 1;
1873 } 1801 }
1874 1802
1875 if (tsk_cache_hot) { 1803 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1804 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1805 return 0;
1878 } 1806 }
1879 return 1; 1807 return 1;
@@ -2311,7 +2239,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2239
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2240unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2241{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2242 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2243 unsigned long smt_gain = sd->smt_gain;
2316 2244
2317 smt_gain /= weight; 2245 smt_gain /= weight;
@@ -2344,7 +2272,7 @@ unsigned long scale_rt_power(int cpu)
2344 2272
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2273static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2274{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2275 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2276 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2277 struct sched_group *sdg = sd->groups;
2350 2278
@@ -2870,6 +2798,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2798 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2799}
2872 2800
2801static int active_load_balance_cpu_stop(void *data);
2802
2873/* 2803/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2804 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2805 * tasks if there is an imbalance.
@@ -2959,8 +2889,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2889 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2890 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2891
2962 /* don't kick the migration_thread, if the curr 2892 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2893 * if the curr task on busiest cpu can't be
2894 * moved to this_cpu
2964 */ 2895 */
2965 if (!cpumask_test_cpu(this_cpu, 2896 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2897 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2901,22 @@ redo:
2970 goto out_one_pinned; 2901 goto out_one_pinned;
2971 } 2902 }
2972 2903
2904 /*
2905 * ->active_balance synchronizes accesses to
2906 * ->active_balance_work. Once set, it's cleared
2907 * only after active load balance is finished.
2908 */
2973 if (!busiest->active_balance) { 2909 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2910 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2911 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2912 active_balance = 1;
2977 } 2913 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2914 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2915
2979 if (active_balance) 2916 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2917 stop_one_cpu_nowait(cpu_of(busiest),
2918 active_load_balance_cpu_stop, busiest,
2919 &busiest->active_balance_work);
2981 2920
2982 /* 2921 /*
2983 * We've kicked active balancing, reset the failure 2922 * We've kicked active balancing, reset the failure
@@ -3084,24 +3023,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3023}
3085 3024
3086/* 3025/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3026 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3027 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3028 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3029 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3030 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3031static int active_load_balance_cpu_stop(void *data)
3095{ 3032{
3033 struct rq *busiest_rq = data;
3034 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3035 int target_cpu = busiest_rq->push_cpu;
3036 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3037 struct sched_domain *sd;
3098 struct rq *target_rq; 3038
3039 raw_spin_lock_irq(&busiest_rq->lock);
3040
3041 /* make sure the requested cpu hasn't gone down in the meantime */
3042 if (unlikely(busiest_cpu != smp_processor_id() ||
3043 !busiest_rq->active_balance))
3044 goto out_unlock;
3099 3045
3100 /* Is there any task to move? */ 3046 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3047 if (busiest_rq->nr_running <= 1)
3102 return; 3048 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3049
3106 /* 3050 /*
3107 * This condition is "impossible", if it occurs 3051 * This condition is "impossible", if it occurs
@@ -3112,8 +3056,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3056
3113 /* move a task from busiest_rq to target_rq */ 3057 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3058 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3059
3118 /* Search for an sd spanning us and the target CPU. */ 3060 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3061 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3074,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3074 schedstat_inc(sd, alb_failed);
3133 } 3075 }
3134 double_unlock_balance(busiest_rq, target_rq); 3076 double_unlock_balance(busiest_rq, target_rq);
3077out_unlock:
3078 busiest_rq->active_balance = 0;
3079 raw_spin_unlock_irq(&busiest_rq->lock);
3080 return 0;
3135} 3081}
3136 3082
3137#ifdef CONFIG_NO_HZ 3083#ifdef CONFIG_NO_HZ
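
The net effect of the select_task_rq_fair() hunks above is a much shorter wakeup path: wake_affine() only chooses between the waking CPU and the task's previous CPU, and select_idle_sibling() then refines whichever of the two won. A minimal sketch of that decision order, modelled outside the kernel; wake_affine_ok() and idle_sibling_of() are stand-ins for the real wake_affine() and select_idle_sibling(), whose task and sched_domain arguments are omitted here.

static int place_woken_task(int waking_cpu, int prev_cpu,
			    int (*wake_affine_ok)(int waker, int prev),
			    int (*idle_sibling_of)(int cpu))
{
	int target = prev_cpu;

	/* prefer the waker's CPU only when the affine check agrees */
	if (waking_cpu == prev_cpu || wake_affine_ok(waking_cpu, prev_cpu))
		target = waking_cpu;

	/* then look for an idle sibling in target's cache-sharing domain */
	return idle_sibling_of(target);
}
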
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since its likely going to consume data we 29 * wakeup-preemption), since its likely going to consume data we
85 * touched, increases cache locality. 30 * touched, increases cache locality.
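
For context on why simply deleting lines here is the whole change: sched_features.h is an X-macro file that kernel/sched.c includes twice, once to build an enum of bit positions and once to build the default feature bitmask that sched_feat() tests. Roughly as below; this is a sketch of the pattern, not the exact kernel definitions.

/* First expansion: one enum constant per feature. */
#define SCHED_FEAT(name, enabled)	__SCHED_FEAT_##name,
enum {
#include "sched_features.h"
	__SCHED_FEAT_NR,
};
#undef SCHED_FEAT

/* Second expansion: OR the enabled features into the default mask. */
#define SCHED_FEAT(name, enabled)	((1UL << __SCHED_FEAT_##name) * enabled) |
static const unsigned long sysctl_sched_features =
#include "sched_features.h"
	0;
#undef SCHED_FEAT

/* sched_feat(AFFINE_WAKEUPS) etc. then reduce to a simple bit test: */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
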
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11	return task_cpu(p); /* IDLE tasks are never migrated */	12	return task_cpu(p); /* IDLE tasks are never migrated */
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
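
The signature change above is part of a tree-wide conversion: sched_class::select_task_rq() now receives the runqueue from its caller, so policy code (see the matching sched_rt.c hunk below, which drops its task_rq(p) lookup) gets the rq for free. A sketch of the hook shape, with the surrounding sched_class structure abridged and renamed to make clear it is only an illustration:

struct rq;			/* per-cpu runqueue, kernel/sched.c */
struct task_struct;

/* old hook shape:
 *	int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
 * new hook shape: */
struct sched_class_sketch {
	int (*select_task_rq)(struct rq *rq, struct task_struct *p,
			      int sd_flag, int flags);
};
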
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
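
The enqueue/dequeue conversion above replaces ad-hoc boolean parameters (wakeup, head) with a single flags word, so new conditions can be added without touching every sched_class. A hedged sketch of the pattern; the flag values and the helper struct below are illustrative stand-ins, not the kernel definitions.

#define ENQUEUE_WAKEUP	0x01		/* task is waking up, not just moving */
#define ENQUEUE_HEAD	0x02		/* queue at the head of the prio list */

struct rt_se_sketch { unsigned long timeout; };

static void enqueue_rt_entity_sketch(struct rt_se_sketch *rt_se, int head)
{
	/* head != 0: add to the front of the per-priority list */
	(void)rt_se;
	(void)head;
}

static void enqueue_task_rt_sketch(struct rt_se_sketch *rt_se, int flags)
{
	if (flags & ENQUEUE_WAKEUP)	/* was: if (wakeup) */
		rt_se->timeout = 0;

	enqueue_rt_entity_sketch(rt_se, flags & ENQUEUE_HEAD);	/* was: bool head */
}
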
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd7fe073c55..906ae5a1779c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -642,7 +642,7 @@ static inline bool si_fromuser(const struct siginfo *info)
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
644{ 644{
645 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
646 struct pid *sid; 646 struct pid *sid;
647 int error; 647 int error;
648 648
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 656 if (error)
657 return error; 657 return error;
658 658
659 cred = current_cred();
659 tcred = __task_cred(t); 660 tcred = __task_cred(t);
660 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
661 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
662 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
663 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1083/* 1085/*
1084 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1085 */ 1087 */
1086void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1087{ 1089{
1088 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1089 1092
1090 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1091 1094
1092 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1093 /* 1096 count++;
1094 * Don't bother with already dead threads 1097
1095 */ 1098 /* Don't bother with already dead threads */
1096 if (t->exit_state) 1099 if (t->exit_state)
1097 continue; 1100 continue;
1098
1099 /* SIGKILL will be handled before any pending SIGSTOP */
1100 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1101 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1102 } 1103 }
1104
1105 return count;
1103} 1106}
1104 1107
1105struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -2735,3 +2738,43 @@ void __init signals_init(void)
2735{ 2738{
2736 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2739 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2737} 2740}
2741
2742#ifdef CONFIG_KGDB_KDB
2743#include <linux/kdb.h>
2744/*
2745 * kdb_send_sig_info - Allows kdb to send signals without exposing
2746 * signal internals. This function checks if the required locks are
2747 * available before calling the main signal code, to avoid kdb
2748 * deadlocks.
2749 */
2750void
2751kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2752{
2753 static struct task_struct *kdb_prev_t;
2754 int sig, new_t;
2755 if (!spin_trylock(&t->sighand->siglock)) {
2756 kdb_printf("Can't do kill command now.\n"
2757 "The sigmask lock is held somewhere else in "
2758 "kernel, try again later\n");
2759 return;
2760 }
2761 spin_unlock(&t->sighand->siglock);
2762 new_t = kdb_prev_t != t;
2763 kdb_prev_t = t;
2764 if (t->state != TASK_RUNNING && new_t) {
2765 kdb_printf("Process is not RUNNING, sending a signal from "
2766 "kdb risks deadlock\n"
2767 "on the run queue locks. "
2768 "The signal has _not_ been sent.\n"
2769 "Reissue the kill command if you want to risk "
2770 "the deadlock.\n");
2771 return;
2772 }
2773 sig = info->si_signo;
2774 if (send_sig_info(sig, info, t))
2775 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2776 sig, t->pid);
2777 else
2778 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2779}
2780#endif /* CONFIG_KGDB_KDB */
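
zap_other_threads() now reports how many other threads are in the group (dead or alive), which lets a caller that holds siglock know how many exits it still has to wait for; de_thread() in fs/exec.c is the likely consumer. A simplified, illustrative caller, not kernel code:

#include <linux/sched.h>
#include <linux/signal.h>

static int kill_other_threads_sketch(struct task_struct *leader)
{
	unsigned long flags;
	int nr_others;

	if (!lock_task_sighand(leader, &flags))
		return 0;			/* group already dying */
	nr_others = zap_other_threads(leader);	/* queues SIGKILL, returns count */
	unlock_task_sighand(leader, &flags);

	/* the caller can now sleep until nr_others threads have exited */
	return nr_others;
}
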
diff --git a/kernel/smp.c b/kernel/smp.c
index 3fc697336183..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
52 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
54 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
55 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
56 break; 56 break;
57 57
58#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
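
The one-liner above is part of a broader cleanup: CPU notifiers stop returning the opaque NOTIFY_BAD and instead encode the real errno with notifier_from_errno(), which the hotplug core can convert back via notifier_to_errno(). A hedged sketch of the pattern; the allocation and callback below are illustrative, not an existing kernel user.

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/notifier.h>
#include <linux/slab.h>

static void *example_buf;

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_UP_PREPARE:
		example_buf = kzalloc(64, GFP_KERNEL);
		if (!example_buf)
			return notifier_from_errno(-ENOMEM);	/* was NOTIFY_BAD */
		break;
	case CPU_UP_CANCELED:
	case CPU_DEAD:
		kfree(example_buf);
		example_buf = NULL;
		break;
	}
	return NOTIFY_OK;
}
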
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..825e1126008f 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
811 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
812 } 812 }
813 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
814 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..b4e7431e7c78 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
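
Beyond stop_machine itself, the new cpu_stop primitives are usable directly (the scheduler's active_load_balance conversion above is the first such user). A minimal usage sketch with a hypothetical per-cpu flush as the payload; the callback runs on the target CPU at the highest priority with preemption disabled and must not sleep, while the caller may sleep.

#include <linux/percpu.h>
#include <linux/stop_machine.h>

static DEFINE_PER_CPU(int, pending_items);

/* runs on the stopped CPU; interrupts stay enabled, sleeping is forbidden */
static int flush_pending_cpu_stop(void *arg)
{
	__get_cpu_var(pending_items) = 0;
	return 0;
}

/* may sleep; returns -ENOENT if @cpu was offline, otherwise fn's result */
static int flush_pending_on(unsigned int cpu)
{
	return stop_one_cpu(cpu, flush_pending_cpu_stop, NULL);
}
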
diff --git a/kernel/sys.c b/kernel/sys.c
index 7cb426a58965..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -492,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
492 return -ENOMEM; 492 return -ENOMEM;
493 old = current_cred(); 493 old = current_cred();
494 494
495 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
496 if (retval)
497 goto error;
498
499 retval = -EPERM; 495 retval = -EPERM;
500 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
501 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -543,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
543 return -ENOMEM; 539 return -ENOMEM;
544 old = current_cred(); 540 old = current_cred();
545 541
546 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
547 if (retval)
548 goto error;
549
550 retval = -EPERM; 542 retval = -EPERM;
551 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
552 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -610,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
610 return -ENOMEM; 602 return -ENOMEM;
611 old = current_cred(); 603 old = current_cred();
612 604
613 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
614 if (retval)
615 goto error;
616
617 retval = -EPERM; 605 retval = -EPERM;
618 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
619 new->uid = ruid; 607 new->uid = ruid;
@@ -675,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
675 return -ENOMEM; 663 return -ENOMEM;
676 old = current_cred(); 664 old = current_cred();
677 665
678 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
679 if (retval)
680 goto error;
681
682 retval = -EPERM; 666 retval = -EPERM;
683 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
684 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -719,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
719 if (!new) 703 if (!new)
720 return -ENOMEM; 704 return -ENOMEM;
721 705
722 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
723 if (retval)
724 goto error;
725 old = current_cred(); 706 old = current_cred();
726 707
727 retval = -EPERM; 708 retval = -EPERM;
@@ -788,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
788 return -ENOMEM; 769 return -ENOMEM;
789 old = current_cred(); 770 old = current_cred();
790 771
791 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
792 if (retval)
793 goto error;
794
795 retval = -EPERM; 772 retval = -EPERM;
796 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
797 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -851,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
851 old = current_cred(); 828 old = current_cred();
852 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
853 830
854 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
855 goto error;
856
857 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
858 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
859 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -864,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
864 } 838 }
865 } 839 }
866 840
867error:
868 abort_creds(new); 841 abort_creds(new);
869 return old_fsuid; 842 return old_fsuid;
870 843
@@ -888,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
888 old = current_cred(); 861 old = current_cred();
889 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
890 863
891 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
892 goto error;
893
894 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
895 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
896 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -900,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
900 } 870 }
901 } 871 }
902 872
903error:
904 abort_creds(new); 873 abort_creds(new);
905 return old_fsgid; 874 return old_fsgid;
906 875
@@ -1663,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1663 1632
1664char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1665 1634
1666static void argv_cleanup(char **argv, char **envp) 1635static void argv_cleanup(struct subprocess_info *info)
1667{ 1636{
1668 argv_free(argv); 1637 argv_free(info->argv);
1669} 1638}
1670 1639
1671/** 1640/**
@@ -1699,7 +1668,7 @@ int orderly_poweroff(bool force)
1699 goto out; 1668 goto out;
1700 } 1669 }
1701 1670
1702 call_usermodehelper_setcleanup(info, argv_cleanup); 1671 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1703 1672
1704 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1705 1674
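
The orderly_poweroff() hunk reflects the kmod API change: cleanup callbacks registered through call_usermodehelper_setfns() now receive the subprocess_info itself rather than argv/envp. A hedged usage sketch; the helper path is made up, and the call_usermodehelper_setup() signature shown here is an assumption that may differ between kernel versions.

#include <linux/gfp.h>
#include <linux/kmod.h>
#include <linux/string.h>

static char *example_envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

static void example_cleanup(struct subprocess_info *info)
{
	argv_free(info->argv);		/* the old callback got argv/envp directly */
}

static int run_example_helper(void)
{
	char **argv = argv_split(GFP_KERNEL, "/sbin/example-helper --now", NULL);
	struct subprocess_info *info;

	if (!argv)
		return -ENOMEM;

	info = call_usermodehelper_setup(argv[0], argv, example_envp, GFP_KERNEL);
	if (!info) {
		argv_free(argv);
		return -ENOMEM;
	}

	/* init = NULL, cleanup frees argv, no private data */
	call_usermodehelper_setfns(info, NULL, example_cleanup, NULL);
	return call_usermodehelper_exec(info, UMH_NO_WAIT);
}
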
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..997080f00e0b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
40#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
41#include <linux/initrd.h> 42#include <linux/initrd.h>
42#include <linux/key.h> 43#include <linux/key.h>
@@ -52,6 +53,7 @@
52#include <linux/slow-work.h> 53#include <linux/slow-work.h>
53#include <linux/perf_event.h> 54#include <linux/perf_event.h>
54#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
55 57
56#include <asm/uaccess.h> 58#include <asm/uaccess.h>
57#include <asm/processor.h> 59#include <asm/processor.h>
@@ -163,6 +165,27 @@ static int proc_taint(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
164#endif 166#endif
165 167
168#ifdef CONFIG_MAGIC_SYSRQ
 169static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
170
171static int sysrq_sysctl_handler(ctl_table *table, int write,
172 void __user *buffer, size_t *lenp,
173 loff_t *ppos)
174{
175 int error;
176
177 error = proc_dointvec(table, write, buffer, lenp, ppos);
178 if (error)
179 return error;
180
181 if (write)
182 sysrq_toggle_support(__sysrq_enabled);
183
184 return 0;
185}
186
187#endif
188
166static struct ctl_table root_table[]; 189static struct ctl_table root_table[];
167static struct ctl_table_root sysctl_table_root; 190static struct ctl_table_root sysctl_table_root;
168static struct ctl_table_header root_table_header = { 191static struct ctl_table_header root_table_header = {
@@ -240,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
240static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
241#endif 264#endif
242 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
243static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
244 { 272 {
245 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -567,7 +595,7 @@ static struct ctl_table kern_table[] = {
567 .data = &__sysrq_enabled, 595 .data = &__sysrq_enabled,
568 .maxlen = sizeof (int), 596 .maxlen = sizeof (int),
569 .mode = 0644, 597 .mode = 0644,
570 .proc_handler = proc_dointvec, 598 .proc_handler = sysrq_sysctl_handler,
571 }, 599 },
572#endif 600#endif
573#ifdef CONFIG_PROC_SYSCTL 601#ifdef CONFIG_PROC_SYSCTL
@@ -621,7 +649,7 @@ static struct ctl_table kern_table[] = {
621#endif 649#endif
622 { 650 {
623 .procname = "userprocess_debug", 651 .procname = "userprocess_debug",
624 .data = &sysctl_userprocess_debug, 652 .data = &show_unhandled_signals,
625 .maxlen = sizeof(int), 653 .maxlen = sizeof(int),
626 .mode = 0644, 654 .mode = 0644,
627 .proc_handler = proc_dointvec, 655 .proc_handler = proc_dointvec,
@@ -1099,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1099 .mode = 0644, 1127 .mode = 0644,
1100 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1101 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1102 { 1149 {
1103 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1104 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
@@ -1423,6 +1470,14 @@ static struct ctl_table fs_table[] = {
1423 .child = binfmt_misc_table, 1470 .child = binfmt_misc_table,
1424 }, 1471 },
1425#endif 1472#endif
1473 {
1474 .procname = "pipe-max-pages",
1475 .data = &pipe_max_pages,
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &proc_dointvec_minmax,
1479 .extra1 = &two,
1480 },
1426/* 1481/*
1427 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1483 * Documentation/sysctl/ctl_unnumbered.txt
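
Both new entries follow the same clamped-integer idiom: proc_dointvec_minmax() with extra1/extra2 pointing at the allowed minimum and maximum (pipe-max-pages only sets a lower bound). A sketch of the idiom with a hypothetical knob, not one added by this patch:

#include <linux/sysctl.h>

static int example_knob = 4;
static int example_knob_min = 1;
static int example_knob_max = 64;

static struct ctl_table example_table[] = {
	{
		.procname	= "example-knob",
		.data		= &example_knob,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &example_knob_min,	/* reject writes below 1 */
		.extra2		= &example_knob_max,	/* reject writes above 64 */
	},
	{ }			/* table terminator */
};
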
@@ -1431,7 +1486,8 @@ static struct ctl_table fs_table[] = {
1431}; 1486};
1432 1487
1433static struct ctl_table debug_table[] = { 1488static struct ctl_table debug_table[] = {
1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1489#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1490 defined(CONFIG_S390)
1435 { 1491 {
1436 .procname = "exception-trace", 1492 .procname = "exception-trace",
1437 .data = &show_unhandled_signals, 1493 .data = &show_unhandled_signals,
@@ -2040,8 +2096,132 @@ int proc_dostring(struct ctl_table *table, int write,
2040 buffer, lenp, ppos); 2096 buffer, lenp, ppos);
2041} 2097}
2042 2098
2099static size_t proc_skip_spaces(char **buf)
2100{
2101 size_t ret;
2102 char *tmp = skip_spaces(*buf);
2103 ret = tmp - *buf;
2104 *buf = tmp;
2105 return ret;
2106}
2107
2108static void proc_skip_char(char **buf, size_t *size, const char v)
2109{
2110 while (*size) {
2111 if (**buf != v)
2112 break;
2113 (*size)--;
2114 (*buf)++;
2115 }
2116}
2117
2118#define TMPBUFLEN 22
2119/**
2120 * proc_get_long - reads an ASCII formatted integer from a user buffer
2121 *
2122 * @buf: a kernel buffer
2123 * @size: size of the kernel buffer
2124 * @val: this is where the number will be stored
2125 * @neg: set to %TRUE if number is negative
2126 * @perm_tr: a vector which contains the allowed trailers
2127 * @perm_tr_len: size of the perm_tr vector
2128 * @tr: pointer to store the trailer character
2129 *
2130 * In case of success %0 is returned and @buf and @size are updated with
2131 * the amount of bytes read. If @tr is non-NULL and a trailing
2132 * character exists (size is non-zero after returning from this
2133 * function), @tr is updated with the trailing character.
2134 */
2135static int proc_get_long(char **buf, size_t *size,
2136 unsigned long *val, bool *neg,
2137 const char *perm_tr, unsigned perm_tr_len, char *tr)
2138{
2139 int len;
2140 char *p, tmp[TMPBUFLEN];
2141
2142 if (!*size)
2143 return -EINVAL;
2144
2145 len = *size;
2146 if (len > TMPBUFLEN - 1)
2147 len = TMPBUFLEN - 1;
2148
2149 memcpy(tmp, *buf, len);
2150
2151 tmp[len] = 0;
2152 p = tmp;
2153 if (*p == '-' && *size > 1) {
2154 *neg = true;
2155 p++;
2156 } else
2157 *neg = false;
2158 if (!isdigit(*p))
2159 return -EINVAL;
2160
2161 *val = simple_strtoul(p, &p, 0);
2162
2163 len = p - tmp;
2164
2165	/* We don't know if the next char is whitespace, thus we may accept
2166	 * invalid integers (e.g. 1234...a) or two integers instead of one
2167	 * (e.g. 123...1). So let's not allow such large numbers. */
2168 if (len == TMPBUFLEN - 1)
2169 return -EINVAL;
2170
2171 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2172 return -EINVAL;
2173
2174 if (tr && (len < *size))
2175 *tr = *p;
2176
2177 *buf += len;
2178 *size -= len;
2179
2180 return 0;
2181}
2182
2183/**
2184 * proc_put_long - converts an integer to a decimal ASCII formatted string
2185 *
2186 * @buf: the user buffer
2187 * @size: the size of the user buffer
2188 * @val: the integer to be converted
2189 * @neg: sign of the number, %TRUE for negative
2190 *
2191 * In case of success %0 is returned and @buf and @size are updated with
2192 * the number of bytes written.
2193 */
2194static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2195 bool neg)
2196{
2197 int len;
2198 char tmp[TMPBUFLEN], *p = tmp;
2199
2200 sprintf(p, "%s%lu", neg ? "-" : "", val);
2201 len = strlen(tmp);
2202 if (len > *size)
2203 len = *size;
2204 if (copy_to_user(*buf, tmp, len))
2205 return -EFAULT;
2206 *size -= len;
2207 *buf += len;
2208 return 0;
2209}
2210#undef TMPBUFLEN
2211
2212static int proc_put_char(void __user **buf, size_t *size, char c)
2213{
2214 if (*size) {
2215 char __user **buffer = (char __user **)buf;
2216 if (put_user(c, *buffer))
2217 return -EFAULT;
2218 (*size)--, (*buffer)++;
2219 *buf = *buffer;
2220 }
2221 return 0;
2222}
2043 2223
2044static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2224static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2045 int *valp, 2225 int *valp,
2046 int write, void *data) 2226 int write, void *data)
2047{ 2227{
@@ -2050,33 +2230,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2050 } else { 2230 } else {
2051 int val = *valp; 2231 int val = *valp;
2052 if (val < 0) { 2232 if (val < 0) {
2053 *negp = -1; 2233 *negp = true;
2054 *lvalp = (unsigned long)-val; 2234 *lvalp = (unsigned long)-val;
2055 } else { 2235 } else {
2056 *negp = 0; 2236 *negp = false;
2057 *lvalp = (unsigned long)val; 2237 *lvalp = (unsigned long)val;
2058 } 2238 }
2059 } 2239 }
2060 return 0; 2240 return 0;
2061} 2241}
2062 2242
2243static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2244
2063static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2245static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2064 int write, void __user *buffer, 2246 int write, void __user *buffer,
2065 size_t *lenp, loff_t *ppos, 2247 size_t *lenp, loff_t *ppos,
2066 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2248 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2067 int write, void *data), 2249 int write, void *data),
2068 void *data) 2250 void *data)
2069{ 2251{
2070#define TMPBUFLEN 21 2252 int *i, vleft, first = 1, err = 0;
2071 int *i, vleft, first = 1, neg; 2253 unsigned long page = 0;
2072 unsigned long lval; 2254 size_t left;
2073 size_t left, len; 2255 char *kbuf;
2074 2256
2075 char buf[TMPBUFLEN], *p; 2257 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2076 char __user *s = buffer;
2077
2078 if (!tbl_data || !table->maxlen || !*lenp ||
2079 (*ppos && !write)) {
2080 *lenp = 0; 2258 *lenp = 0;
2081 return 0; 2259 return 0;
2082 } 2260 }
@@ -2088,89 +2266,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2088 if (!conv) 2266 if (!conv)
2089 conv = do_proc_dointvec_conv; 2267 conv = do_proc_dointvec_conv;
2090 2268
2269 if (write) {
2270 if (left > PAGE_SIZE - 1)
2271 left = PAGE_SIZE - 1;
2272 page = __get_free_page(GFP_TEMPORARY);
2273 kbuf = (char *) page;
2274 if (!kbuf)
2275 return -ENOMEM;
2276 if (copy_from_user(kbuf, buffer, left)) {
2277 err = -EFAULT;
2278 goto free;
2279 }
2280 kbuf[left] = 0;
2281 }
2282
2091 for (; left && vleft--; i++, first=0) { 2283 for (; left && vleft--; i++, first=0) {
2284 unsigned long lval;
2285 bool neg;
2286
2092 if (write) { 2287 if (write) {
2093 while (left) { 2288 left -= proc_skip_spaces(&kbuf);
2094 char c; 2289
2095 if (get_user(c, s))
2096 return -EFAULT;
2097 if (!isspace(c))
2098 break;
2099 left--;
2100 s++;
2101 }
2102 if (!left) 2290 if (!left)
2103 break; 2291 break;
2104 neg = 0; 2292 err = proc_get_long(&kbuf, &left, &lval, &neg,
2105 len = left; 2293 proc_wspace_sep,
2106 if (len > sizeof(buf) - 1) 2294 sizeof(proc_wspace_sep), NULL);
2107 len = sizeof(buf) - 1; 2295 if (err)
2108 if (copy_from_user(buf, s, len))
2109 return -EFAULT;
2110 buf[len] = 0;
2111 p = buf;
2112 if (*p == '-' && left > 1) {
2113 neg = 1;
2114 p++;
2115 }
2116 if (*p < '0' || *p > '9')
2117 break; 2296 break;
2118 2297 if (conv(&neg, &lval, i, 1, data)) {
2119 lval = simple_strtoul(p, &p, 0); 2298 err = -EINVAL;
2120
2121 len = p-buf;
2122 if ((len < left) && *p && !isspace(*p))
2123 break;
2124 s += len;
2125 left -= len;
2126
2127 if (conv(&neg, &lval, i, 1, data))
2128 break; 2299 break;
2300 }
2129 } else { 2301 } else {
2130 p = buf; 2302 if (conv(&neg, &lval, i, 0, data)) {
2303 err = -EINVAL;
2304 break;
2305 }
2131 if (!first) 2306 if (!first)
2132 *p++ = '\t'; 2307 err = proc_put_char(&buffer, &left, '\t');
2133 2308 if (err)
2134 if (conv(&neg, &lval, i, 0, data)) 2309 break;
2310 err = proc_put_long(&buffer, &left, lval, neg);
2311 if (err)
2135 break; 2312 break;
2136
2137 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2138 len = strlen(buf);
2139 if (len > left)
2140 len = left;
2141 if(copy_to_user(s, buf, len))
2142 return -EFAULT;
2143 left -= len;
2144 s += len;
2145 } 2313 }
2146 } 2314 }
2147 2315
2148 if (!write && !first && left) { 2316 if (!write && !first && left && !err)
2149 if(put_user('\n', s)) 2317 err = proc_put_char(&buffer, &left, '\n');
2150 return -EFAULT; 2318 if (write && !err && left)
2151 left--, s++; 2319 left -= proc_skip_spaces(&kbuf);
2152 } 2320free:
2153 if (write) { 2321 if (write) {
2154 while (left) { 2322 free_page(page);
2155 char c; 2323 if (first)
2156 if (get_user(c, s++)) 2324 return err ? : -EINVAL;
2157 return -EFAULT;
2158 if (!isspace(c))
2159 break;
2160 left--;
2161 }
2162 } 2325 }
2163 if (write && first)
2164 return -EINVAL;
2165 *lenp -= left; 2326 *lenp -= left;
2166 *ppos += *lenp; 2327 *ppos += *lenp;
2167 return 0; 2328 return err;
2168#undef TMPBUFLEN
2169} 2329}
2170 2330
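From user space the accepted format is unchanged by the rewrite: whitespace-separated decimal values, optionally negative, now parsed out of a single page-sized kernel copy. A quick illustration against an existing multi-value integer sysctl (any run of spaces, tabs or newlines works as a separator):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/printk", "w");

		if (!f)
			return 1;
		/* four integer fields, split on whitespace */
		fprintf(f, "4 4 1 7\n");
		return fclose(f) ? 1 : 0;
	}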
2171static int do_proc_dointvec(struct ctl_table *table, int write, 2331static int do_proc_dointvec(struct ctl_table *table, int write,
2172 void __user *buffer, size_t *lenp, loff_t *ppos, 2332 void __user *buffer, size_t *lenp, loff_t *ppos,
2173 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2333 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2174 int write, void *data), 2334 int write, void *data),
2175 void *data) 2335 void *data)
2176{ 2336{
@@ -2238,8 +2398,8 @@ struct do_proc_dointvec_minmax_conv_param {
2238 int *max; 2398 int *max;
2239}; 2399};
2240 2400
2241static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2401static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2242 int *valp, 2402 int *valp,
2243 int write, void *data) 2403 int write, void *data)
2244{ 2404{
2245 struct do_proc_dointvec_minmax_conv_param *param = data; 2405 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,10 +2412,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2252 } else { 2412 } else {
2253 int val = *valp; 2413 int val = *valp;
2254 if (val < 0) { 2414 if (val < 0) {
2255 *negp = -1; 2415 *negp = true;
2256 *lvalp = (unsigned long)-val; 2416 *lvalp = (unsigned long)-val;
2257 } else { 2417 } else {
2258 *negp = 0; 2418 *negp = false;
2259 *lvalp = (unsigned long)val; 2419 *lvalp = (unsigned long)val;
2260 } 2420 }
2261 } 2421 }
@@ -2295,102 +2455,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2295 unsigned long convmul, 2455 unsigned long convmul,
2296 unsigned long convdiv) 2456 unsigned long convdiv)
2297{ 2457{
2298#define TMPBUFLEN 21 2458 unsigned long *i, *min, *max;
2299 unsigned long *i, *min, *max, val; 2459 int vleft, first = 1, err = 0;
2300 int vleft, first=1, neg; 2460 unsigned long page = 0;
2301 size_t len, left; 2461 size_t left;
2302 char buf[TMPBUFLEN], *p; 2462 char *kbuf;
2303 char __user *s = buffer; 2463
2304 2464 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2305 if (!data || !table->maxlen || !*lenp ||
2306 (*ppos && !write)) {
2307 *lenp = 0; 2465 *lenp = 0;
2308 return 0; 2466 return 0;
2309 } 2467 }
2310 2468
2311 i = (unsigned long *) data; 2469 i = (unsigned long *) data;
2312 min = (unsigned long *) table->extra1; 2470 min = (unsigned long *) table->extra1;
2313 max = (unsigned long *) table->extra2; 2471 max = (unsigned long *) table->extra2;
2314 vleft = table->maxlen / sizeof(unsigned long); 2472 vleft = table->maxlen / sizeof(unsigned long);
2315 left = *lenp; 2473 left = *lenp;
2316 2474
2475 if (write) {
2476 if (left > PAGE_SIZE - 1)
2477 left = PAGE_SIZE - 1;
2478 page = __get_free_page(GFP_TEMPORARY);
2479 kbuf = (char *) page;
2480 if (!kbuf)
2481 return -ENOMEM;
2482 if (copy_from_user(kbuf, buffer, left)) {
2483 err = -EFAULT;
2484 goto free;
2485 }
2486 kbuf[left] = 0;
2487 }
2488
2317 for (; left && vleft--; i++, min++, max++, first=0) { 2489 for (; left && vleft--; i++, min++, max++, first=0) {
2490 unsigned long val;
2491
2318 if (write) { 2492 if (write) {
2319 while (left) { 2493 bool neg;
2320 char c; 2494
2321 if (get_user(c, s)) 2495 left -= proc_skip_spaces(&kbuf);
2322 return -EFAULT; 2496
2323 if (!isspace(c)) 2497 err = proc_get_long(&kbuf, &left, &val, &neg,
2324 break; 2498 proc_wspace_sep,
2325 left--; 2499 sizeof(proc_wspace_sep), NULL);
2326 s++; 2500 if (err)
2327 }
2328 if (!left)
2329 break;
2330 neg = 0;
2331 len = left;
2332 if (len > TMPBUFLEN-1)
2333 len = TMPBUFLEN-1;
2334 if (copy_from_user(buf, s, len))
2335 return -EFAULT;
2336 buf[len] = 0;
2337 p = buf;
2338 if (*p == '-' && left > 1) {
2339 neg = 1;
2340 p++;
2341 }
2342 if (*p < '0' || *p > '9')
2343 break;
2344 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2345 len = p-buf;
2346 if ((len < left) && *p && !isspace(*p))
2347 break; 2501 break;
2348 if (neg) 2502 if (neg)
2349 val = -val;
2350 s += len;
2351 left -= len;
2352
2353 if(neg)
2354 continue; 2503 continue;
2355 if ((min && val < *min) || (max && val > *max)) 2504 if ((min && val < *min) || (max && val > *max))
2356 continue; 2505 continue;
2357 *i = val; 2506 *i = val;
2358 } else { 2507 } else {
2359 p = buf; 2508 val = convdiv * (*i) / convmul;
2360 if (!first) 2509 if (!first)
2361 *p++ = '\t'; 2510 err = proc_put_char(&buffer, &left, '\t');
2362 sprintf(p, "%lu", convdiv * (*i) / convmul); 2511 err = proc_put_long(&buffer, &left, val, false);
2363 len = strlen(buf); 2512 if (err)
2364 if (len > left) 2513 break;
2365 len = left;
2366 if(copy_to_user(s, buf, len))
2367 return -EFAULT;
2368 left -= len;
2369 s += len;
2370 } 2514 }
2371 } 2515 }
2372 2516
2373 if (!write && !first && left) { 2517 if (!write && !first && left && !err)
2374 if(put_user('\n', s)) 2518 err = proc_put_char(&buffer, &left, '\n');
2375 return -EFAULT; 2519 if (write && !err)
2376 left--, s++; 2520 left -= proc_skip_spaces(&kbuf);
2377 } 2521free:
2378 if (write) { 2522 if (write) {
2379 while (left) { 2523 free_page(page);
2380 char c; 2524 if (first)
2381 if (get_user(c, s++)) 2525 return err ? : -EINVAL;
2382 return -EFAULT;
2383 if (!isspace(c))
2384 break;
2385 left--;
2386 }
2387 } 2526 }
2388 if (write && first)
2389 return -EINVAL;
2390 *lenp -= left; 2527 *lenp -= left;
2391 *ppos += *lenp; 2528 *ppos += *lenp;
2392 return 0; 2529 return err;
2393#undef TMPBUFLEN
2394} 2530}
2395 2531
2396static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2532static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2587,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2451} 2587}
2452 2588
2453 2589
2454static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2590static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2455 int *valp, 2591 int *valp,
2456 int write, void *data) 2592 int write, void *data)
2457{ 2593{
@@ -2463,10 +2599,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2463 int val = *valp; 2599 int val = *valp;
2464 unsigned long lval; 2600 unsigned long lval;
2465 if (val < 0) { 2601 if (val < 0) {
2466 *negp = -1; 2602 *negp = true;
2467 lval = (unsigned long)-val; 2603 lval = (unsigned long)-val;
2468 } else { 2604 } else {
2469 *negp = 0; 2605 *negp = false;
2470 lval = (unsigned long)val; 2606 lval = (unsigned long)val;
2471 } 2607 }
2472 *lvalp = lval / HZ; 2608 *lvalp = lval / HZ;
@@ -2474,7 +2610,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2474 return 0; 2610 return 0;
2475} 2611}
2476 2612
2477static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2613static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2478 int *valp, 2614 int *valp,
2479 int write, void *data) 2615 int write, void *data)
2480{ 2616{
@@ -2486,10 +2622,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2486 int val = *valp; 2622 int val = *valp;
2487 unsigned long lval; 2623 unsigned long lval;
2488 if (val < 0) { 2624 if (val < 0) {
2489 *negp = -1; 2625 *negp = true;
2490 lval = (unsigned long)-val; 2626 lval = (unsigned long)-val;
2491 } else { 2627 } else {
2492 *negp = 0; 2628 *negp = false;
2493 lval = (unsigned long)val; 2629 lval = (unsigned long)val;
2494 } 2630 }
2495 *lvalp = jiffies_to_clock_t(lval); 2631 *lvalp = jiffies_to_clock_t(lval);
@@ -2497,7 +2633,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2497 return 0; 2633 return 0;
2498} 2634}
2499 2635
2500static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2636static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2501 int *valp, 2637 int *valp,
2502 int write, void *data) 2638 int write, void *data)
2503{ 2639{
@@ -2507,10 +2643,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2507 int val = *valp; 2643 int val = *valp;
2508 unsigned long lval; 2644 unsigned long lval;
2509 if (val < 0) { 2645 if (val < 0) {
2510 *negp = -1; 2646 *negp = true;
2511 lval = (unsigned long)-val; 2647 lval = (unsigned long)-val;
2512 } else { 2648 } else {
2513 *negp = 0; 2649 *negp = false;
2514 lval = (unsigned long)val; 2650 lval = (unsigned long)val;
2515 } 2651 }
2516 *lvalp = jiffies_to_msecs(lval); 2652 *lvalp = jiffies_to_msecs(lval);
@@ -2607,6 +2743,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2607 return 0; 2743 return 0;
2608} 2744}
2609 2745
2746/**
2747 * proc_do_large_bitmap - read/write from/to a large bitmap
2748 * @table: the sysctl table
2749 * @write: %TRUE if this is a write to the sysctl file
2750 * @buffer: the user buffer
2751 * @lenp: the size of the user buffer
2752 * @ppos: file position
2753 *
2754 * The bitmap is stored at table->data and the bitmap length (in bits)
2755 * in table->maxlen.
2756 *
2757 * We use a comma-separated range format (e.g. 1,3-4,10-10) so that
2758 * large bitmaps may be represented in a compact manner. Writing into
2759 * the file will clear the bitmap then update it with the given input.
2760 *
2761 * Returns 0 on success.
2762 */
2763int proc_do_large_bitmap(struct ctl_table *table, int write,
2764 void __user *buffer, size_t *lenp, loff_t *ppos)
2765{
2766 int err = 0;
2767 bool first = 1;
2768 size_t left = *lenp;
2769 unsigned long bitmap_len = table->maxlen;
2770 unsigned long *bitmap = (unsigned long *) table->data;
2771 unsigned long *tmp_bitmap = NULL;
2772 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2773
2774 if (!bitmap_len || !left || (*ppos && !write)) {
2775 *lenp = 0;
2776 return 0;
2777 }
2778
2779 if (write) {
2780 unsigned long page = 0;
2781 char *kbuf;
2782
2783 if (left > PAGE_SIZE - 1)
2784 left = PAGE_SIZE - 1;
2785
2786 page = __get_free_page(GFP_TEMPORARY);
2787 kbuf = (char *) page;
2788 if (!kbuf)
2789 return -ENOMEM;
2790 if (copy_from_user(kbuf, buffer, left)) {
2791 free_page(page);
2792 return -EFAULT;
2793 }
2794 kbuf[left] = 0;
2795
2796 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2797 GFP_KERNEL);
2798 if (!tmp_bitmap) {
2799 free_page(page);
2800 return -ENOMEM;
2801 }
2802 proc_skip_char(&kbuf, &left, '\n');
2803 while (!err && left) {
2804 unsigned long val_a, val_b;
2805 bool neg;
2806
2807 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2808 sizeof(tr_a), &c);
2809 if (err)
2810 break;
2811 if (val_a >= bitmap_len || neg) {
2812 err = -EINVAL;
2813 break;
2814 }
2815
2816 val_b = val_a;
2817 if (left) {
2818 kbuf++;
2819 left--;
2820 }
2821
2822 if (c == '-') {
2823 err = proc_get_long(&kbuf, &left, &val_b,
2824 &neg, tr_b, sizeof(tr_b),
2825 &c);
2826 if (err)
2827 break;
2828 if (val_b >= bitmap_len || neg ||
2829 val_a > val_b) {
2830 err = -EINVAL;
2831 break;
2832 }
2833 if (left) {
2834 kbuf++;
2835 left--;
2836 }
2837 }
2838
2839 while (val_a <= val_b)
2840 set_bit(val_a++, tmp_bitmap);
2841
2842 first = 0;
2843 proc_skip_char(&kbuf, &left, '\n');
2844 }
2845 free_page(page);
2846 } else {
2847 unsigned long bit_a, bit_b = 0;
2848
2849 while (left) {
2850 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2851 if (bit_a >= bitmap_len)
2852 break;
2853 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2854 bit_a + 1) - 1;
2855
2856 if (!first) {
2857 err = proc_put_char(&buffer, &left, ',');
2858 if (err)
2859 break;
2860 }
2861 err = proc_put_long(&buffer, &left, bit_a, false);
2862 if (err)
2863 break;
2864 if (bit_a != bit_b) {
2865 err = proc_put_char(&buffer, &left, '-');
2866 if (err)
2867 break;
2868 err = proc_put_long(&buffer, &left, bit_b, false);
2869 if (err)
2870 break;
2871 }
2872
2873 first = 0; bit_b++;
2874 }
2875 if (!err)
2876 err = proc_put_char(&buffer, &left, '\n');
2877 }
2878
2879 if (!err) {
2880 if (write) {
2881 if (*ppos)
2882 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2883 else
2884 memcpy(bitmap, tmp_bitmap,
2885 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2886 }
2887 kfree(tmp_bitmap);
2888 *lenp -= left;
2889 *ppos += *lenp;
2890 return 0;
2891 } else {
2892 kfree(tmp_bitmap);
2893 return err;
2894 }
2895}
2896
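To make the range syntax above concrete, here is a user-space sketch. The sysctl path is only an example of the kind of bitmap-backed entry this handler targets; nothing in this hunk wires one up, so treat the path as hypothetical.

	#include <stdio.h>

	int main(void)
	{
		/* hypothetical bitmap-backed sysctl using proc_do_large_bitmap() */
		FILE *f = fopen("/proc/sys/net/ipv4/ip_local_reserved_ports", "w");

		if (!f)
			return 1;
		/* sets bits 1, 3-4 and 8080-8090; a fresh write replaces the bitmap */
		fprintf(f, "1,3-4,8080-8090\n");
		return fclose(f) ? 1 : 0;
	}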
2610#else /* CONFIG_PROC_FS */ 2897#else /* CONFIG_PROC_FS */
2611 2898
2612int proc_dostring(struct ctl_table *table, int write, 2899int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 59030570f5ca..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
16#include <linux/slab.h> 17#include <linux/slab.h>
17 18
18#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
@@ -224,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
224 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 225 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
226 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 227 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
227 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
228 {} 228 {}
229}; 229};
230 230
@@ -1125,11 +1125,6 @@ out:
1125 return result; 1125 return result;
1126} 1126}
1127 1127
1128static unsigned hex_value(int ch)
1129{
1130 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1131}
1132
1133static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1134 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1135{ 1130{
@@ -1157,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1157 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1158 goto out; 1153 goto out;
1159 1154
1160 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1161 str += 2; 1157 str += 2;
1162 if (*str == '-') 1158 if (*str == '-')
1163 str++; 1159 str++;
diff --git a/kernel/time.c b/kernel/time.c
index 656dccfe1cbb..848b1c2ab09a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -132,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
132 */ 132 */
133static inline void warp_clock(void) 133static inline void warp_clock(void)
134{ 134{
135 write_seqlock_irq(&xtime_lock); 135 struct timespec adjust;
136 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136
137 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 adjust = current_kernel_time();
138 update_xtime_cache(0); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
139 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
140 clock_was_set();
141} 140}
142 141
143/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (i.e. clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
644 * @cs: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz() helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
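A rough sketch of the intended call path from a driver's point of view, assuming clocksource_register_hz() is the helper the comment above refers to; the device, its 10 MHz rate and the read callback are invented for illustration.

	static cycle_t example_cs_read(struct clocksource *cs)
	{
		/* a real driver would read a free-running hardware counter here */
		return (cycle_t)jiffies;
	}

	static struct clocksource example_cs = {
		.name	= "example-timer",
		.rating	= 200,
		.read	= example_cs_read,
		.mask	= CLOCKSOURCE_MASK(32),
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init example_cs_init(void)
	{
		/* mult/shift are derived from the frequency by the core */
		return clocksource_register_hz(&example_cs, 10000000);
	}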
675
628/** 676/**
629 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..1d7b9bc1c034 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,14 +150,32 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per-CPU idle time statistics counters
155 */
156static void
157update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu() > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
@@ -165,20 +183,32 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now)
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/**
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
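A hedged sketch of a consumer of the two accessors above (say, a cpufreq governor); example_report_cpu() is a made-up name, and the -1 return covers the !tick_nohz_enabled case.

	static void example_report_cpu(int cpu)
	{
		u64 wall, idle_us, iowait_us;

		idle_us = get_cpu_idle_time_us(cpu, &wall);
		iowait_us = get_cpu_iowait_time_us(cpu, NULL);

		if (idle_us == -1ULL || iowait_us == -1ULL) {
			/* NOHZ disabled: fall back to jiffy-based accounting */
			return;
		}
		printk(KERN_INFO "cpu%d: idle %llu us (incl. iowait %llu us) at %llu us\n",
		       cpu, (unsigned long long)idle_us,
		       (unsigned long long)iowait_us, (unsigned long long)wall);
	}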
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -262,6 +315,9 @@ void tick_nohz_stop_sched_tick(int inidle)
262 goto end; 315 goto end;
263 } 316 }
264 317
318 if (nohz_ratelimit(cpu))
319 goto end;
320
265 ts->idle_calls++; 321 ts->idle_calls++;
266 /* Read jiffies and the time when jiffies were updated last */ 322 /* Read jiffies and the time when jiffies were updated last */
267 do { 323 do {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 39f6177fafac..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -788,7 +777,6 @@ void update_wall_time(void)
788{ 777{
789 struct clocksource *clock; 778 struct clocksource *clock;
790 cycle_t offset; 779 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 780 int shift = 0, maxshift;
793 781
794 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -847,7 +835,9 @@ void update_wall_time(void)
847 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
848 } 836 }
849 837
850 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
851 * add the remainder to the error difference. 841 * add the remainder to the error difference.
852 */ 842 */
853 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -855,8 +845,15 @@ void update_wall_time(void)
855 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
856 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
857 847
858 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
859 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
860 857
861 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
862 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -896,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
896 893
897unsigned long get_seconds(void) 894unsigned long get_seconds(void)
898{ 895{
899 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
900} 897}
901EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
902 899
903struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
904{ 901{
905 return xtime_cache; 902 return xtime;
906} 903}
907 904
908struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -913,7 +910,7 @@ struct timespec current_kernel_time(void)
913 do { 910 do {
914 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
915 912
916 now = xtime_cache; 913 now = xtime;
917 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
918 915
919 return now; 916 return now;
@@ -928,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
928 do { 925 do {
929 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
930 927
931 now = xtime_cache; 928 now = xtime;
932 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
933 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
934 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1a4a7dd78777..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
diff --git a/kernel/timer.c b/kernel/timer.c
index aeb6a54f2771..2454172a80d3 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -319,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
319} 319}
320EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
321 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of slack, in jiffies, that a certain timer is
327 * allowed. With the slack set, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
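A minimal sketch of the intended use, with invented names: a coarse housekeeping timer that can tolerate roughly 100 ms of rounding gets its slack set once, before it is armed.

	static void example_work_fn(unsigned long data)
	{
		/* periodic, non-time-critical work */
	}

	static DEFINE_TIMER(example_timer, example_work_fn, 0, 0);

	static void example_arm(void)
	{
		set_timer_slack(&example_timer, msecs_to_jiffies(100));
		mod_timer(&example_timer, jiffies + msecs_to_jiffies(1000));
	}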
322 340
323static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
324 struct timer_list *timer) 342 struct timer_list *timer)
@@ -550,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
550{ 568{
551 timer->entry.next = NULL; 569 timer->entry.next = NULL;
552 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
553#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
554 timer->start_site = NULL; 573 timer->start_site = NULL;
555 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -715,6 +734,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
715} 734}
716EXPORT_SYMBOL(mod_timer_pending); 735EXPORT_SYMBOL(mod_timer_pending);
717 736
737/*
738 * Decide where to put the timer while taking the slack into account
739 *
740 * Algorithm:
741 * 1) calculate the maximum (absolute) time
742 * 2) find the highest bit in which expires and the new maximum differ
743 * 3) use this bit to make a mask
744 * 4) use the mask to round the maximum time down, so that all lower
745 * bits are zero
746 */
747static inline
748unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
749{
750 unsigned long expires_limit, mask;
751 int bit;
752
753 expires_limit = expires;
754
755 if (timer->slack >= 0) {
756 expires_limit = expires + timer->slack;
757 } else {
758 unsigned long now = jiffies;
759
760		/* No slack if already expired, else automatic slack of 0.4% */
761 if (time_after(expires, now))
762 expires_limit = expires + (expires - now)/256;
763 }
764 mask = expires ^ expires_limit;
765 if (mask == 0)
766 return expires;
767
768 bit = find_last_bit(&mask, BITS_PER_LONG);
769
770 mask = (1 << bit) - 1;
771
772 expires_limit = expires_limit & ~(mask);
773
774 return expires_limit;
775}
776
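To see the rounding in numbers, here is a stand-alone user-space restatement of the same arithmetic (find_last_bit() swapped for a GCC builtin, jiffies wrap ignored; purely illustrative, not the kernel code): with expires 10000 ticks away and automatic slack, the computed limit 1010039 differs from 1010000 only in its low bits, so the low five bits are cleared and the timer rounds to 1010016.

	#include <stdio.h>

	static unsigned long apply_slack_demo(unsigned long expires, unsigned long now,
					      long slack)
	{
		unsigned long expires_limit, mask;
		int bit;

		if (slack >= 0)
			expires_limit = expires + slack;
		else if (expires > now)		/* auto slack: ~0.4% of the delay */
			expires_limit = expires + (expires - now) / 256;
		else
			expires_limit = expires;

		mask = expires ^ expires_limit;
		if (mask == 0)
			return expires;

		/* index of the highest bit in which expires and the limit differ */
		bit = 8 * sizeof(long) - 1 - __builtin_clzl(mask);
		mask = (1UL << bit) - 1;

		return expires_limit & ~mask;
	}

	int main(void)
	{
		/* prints 1010016: auto slack of 39 ticks, low five bits dropped */
		printf("%lu\n", apply_slack_demo(1010000, 1000000, -1));
		return 0;
	}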
718/** 777/**
719 * mod_timer - modify a timer's timeout 778 * mod_timer - modify a timer's timeout
720 * @timer: the timer to be modified 779 * @timer: the timer to be modified
@@ -745,6 +804,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
745 if (timer_pending(timer) && timer->expires == expires) 804 if (timer_pending(timer) && timer->expires == expires)
746 return 1; 805 return 1;
747 806
807 expires = apply_slack(timer, expires);
808
748 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 809 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
749} 810}
750EXPORT_SYMBOL(mod_timer); 811EXPORT_SYMBOL(mod_timer);
@@ -955,6 +1016,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
955 return index; 1016 return index;
956} 1017}
957 1018
1019static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1020 unsigned long data)
1021{
1022 int preempt_count = preempt_count();
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * It is permissible to free the timer from inside the
1027 * function that is called from it, this we need to take into
1028 * account for lockdep too. To avoid bogus "held lock freed"
1029 * warnings as well as problems when looking into
1030 * timer->lockdep_map, make a copy and use that here.
1031 */
1032 struct lockdep_map lockdep_map = timer->lockdep_map;
1033#endif
1034 /*
1035 * Couple the lock chain with the lock chain at
1036 * del_timer_sync() by acquiring the lock_map around the fn()
1037 * call here and in del_timer_sync().
1038 */
1039 lock_map_acquire(&lockdep_map);
1040
1041 trace_timer_expire_entry(timer);
1042 fn(data);
1043 trace_timer_expire_exit(timer);
1044
1045 lock_map_release(&lockdep_map);
1046
1047 if (preempt_count != preempt_count()) {
1048 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1049 fn, preempt_count, preempt_count());
1050 /*
1051 * Restore the preempt count. That gives us a decent
1052 * chance to survive and extract information. If the
1053 * callback kept a lock held, bad luck, but not worse
1054 * than the BUG() we had.
1055 */
1056 preempt_count() = preempt_count;
1057 }
1058}
1059
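As an illustration of what the preempt-count check above catches (a deliberately broken example, not part of the patch): a callback that returns with a spinlock still held leaves preempt_count() elevated and trips the WARN_ONCE.

	static DEFINE_SPINLOCK(example_lock);

	static void leaky_timer_fn(unsigned long data)
	{
		spin_lock(&example_lock);
		if (data)
			return;		/* lock still held: preempt count leaks */
		spin_unlock(&example_lock);
	}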
958#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1060#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
959 1061
960/** 1062/**
@@ -998,45 +1100,7 @@ static inline void __run_timers(struct tvec_base *base)
998 detach_timer(timer, 1); 1100 detach_timer(timer, 1);
999 1101
1000 spin_unlock_irq(&base->lock); 1102 spin_unlock_irq(&base->lock);
1001 { 1103 call_timer_fn(timer, fn, data);
1002 int preempt_count = preempt_count();
1003
1004#ifdef CONFIG_LOCKDEP
1005 /*
1006 * It is permissible to free the timer from
1007 * inside the function that is called from
1008 * it, this we need to take into account for
1009 * lockdep too. To avoid bogus "held lock
1010 * freed" warnings as well as problems when
1011 * looking into timer->lockdep_map, make a
1012 * copy and use that here.
1013 */
1014 struct lockdep_map lockdep_map =
1015 timer->lockdep_map;
1016#endif
1017 /*
1018 * Couple the lock chain with the lock chain at
1019 * del_timer_sync() by acquiring the lock_map
1020 * around the fn() call here and in
1021 * del_timer_sync().
1022 */
1023 lock_map_acquire(&lockdep_map);
1024
1025 trace_timer_expire_entry(timer);
1026 fn(data);
1027 trace_timer_expire_exit(timer);
1028
1029 lock_map_release(&lockdep_map);
1030
1031 if (preempt_count != preempt_count()) {
1032 printk(KERN_ERR "huh, entered %p "
1033 "with preempt_count %08x, exited"
1034 " with %08x?\n",
1035 fn, preempt_count,
1036 preempt_count());
1037 BUG();
1038 }
1039 }
1040 spin_lock_irq(&base->lock); 1104 spin_lock_irq(&base->lock);
1041 } 1105 }
1042 } 1106 }
@@ -1620,11 +1684,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1620 unsigned long action, void *hcpu) 1684 unsigned long action, void *hcpu)
1621{ 1685{
1622 long cpu = (long)hcpu; 1686 long cpu = (long)hcpu;
1687 int err;
1688
1623 switch(action) { 1689 switch(action) {
1624 case CPU_UP_PREPARE: 1690 case CPU_UP_PREPARE:
1625 case CPU_UP_PREPARE_FROZEN: 1691 case CPU_UP_PREPARE_FROZEN:
1626 if (init_timers_cpu(cpu) < 0) 1692 err = init_timers_cpu(cpu);
1627 return NOTIFY_BAD; 1693 if (err < 0)
1694 return notifier_from_errno(err);
1628 break; 1695 break;
1629#ifdef CONFIG_HOTPLUG_CPU 1696#ifdef CONFIG_HOTPLUG_CPU
1630 case CPU_DEAD: 1697 case CPU_DEAD:
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -374,14 +371,6 @@ config STACK_TRACER
374 371
375 Say N if unsure. 372 Say N if unsure.
376 373
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE 374config KMEMTRACE
386 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
387 select GENERIC_TRACER 376 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b3bc91a3f510..36ea2b65dcdc 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
675 } 675 }
676} 676}
677 677
678static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
679{ 680{
680 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
681} 682}
682 683
683static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
684{ 686{
685 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
686} 688}
687 689
688static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
689{ 692{
690 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
691} 694}
692 695
693static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
694 struct request *rq) 698 struct request *rq)
695{ 699{
696 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
697} 701}
698 702
699static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
700 struct request *rq) 705 struct request *rq)
701{ 706{
702 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
724 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
725} 730}
726 731
727static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
728{ 734{
729 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
730} 736}
731 737
732static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
733{ 740{
734 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
735} 742}
736 743
737static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
738 struct bio *bio) 746 struct bio *bio)
739{ 747{
740 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
741} 749}
742 750
743static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
744 struct bio *bio) 753 struct bio *bio)
745{ 754{
746 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
747} 756}
748 757
749static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
750{ 760{
751 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
752} 762}
753 763
754static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
755 struct bio *bio, int rw) 766 struct bio *bio, int rw)
756{ 767{
757 if (bio) 768 if (bio)
@@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
765} 776}
766 777
767 778
768static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
769 struct bio *bio, int rw) 781 struct bio *bio, int rw)
770{ 782{
771 if (bio) 783 if (bio)
@@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
779 } 791 }
780} 792}
781 793
782static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
783{ 795{
784 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
785 797
@@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
787 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
788} 800}
789 801
790static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
791{ 803{
792 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
793 805
@@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
800 } 812 }
801} 813}
802 814
803static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
804{ 816{
805 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
806 818
@@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
813 } 825 }
814} 826}
815 827
816static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
817 unsigned int pdu) 830 unsigned int pdu)
818{ 831{
819 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -839,8 +852,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
839 * it spans a stripe (or similar). Add a trace for that action. 852 * it spans a stripe (or similar). Add a trace for that action.
840 * 853 *
841 **/ 854 **/
842static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 855static void blk_add_trace_remap(void *ignore,
843 dev_t dev, sector_t from) 856 struct request_queue *q, struct bio *bio,
857 dev_t dev, sector_t from)
844{ 858{
845 struct blk_trace *bt = q->blk_trace; 859 struct blk_trace *bt = q->blk_trace;
846 struct blk_io_trace_remap r; 860 struct blk_io_trace_remap r;
@@ -869,7 +883,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
869 * Add a trace for that action. 883 * Add a trace for that action.
870 * 884 *
871 **/ 885 **/
872static void blk_add_trace_rq_remap(struct request_queue *q, 886static void blk_add_trace_rq_remap(void *ignore,
887 struct request_queue *q,
873 struct request *rq, dev_t dev, 888 struct request *rq, dev_t dev,
874 sector_t from) 889 sector_t from)
875{ 890{
@@ -921,64 +936,64 @@ static void blk_register_tracepoints(void)
921{ 936{
922 int ret; 937 int ret;
923 938
924 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 939 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
925 WARN_ON(ret); 940 WARN_ON(ret);
926 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 941 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
927 WARN_ON(ret); 942 WARN_ON(ret);
928 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 943 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
929 WARN_ON(ret); 944 WARN_ON(ret);
930 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 945 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
931 WARN_ON(ret); 946 WARN_ON(ret);
932 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 947 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
933 WARN_ON(ret); 948 WARN_ON(ret);
934 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 949 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
935 WARN_ON(ret); 950 WARN_ON(ret);
936 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 951 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
937 WARN_ON(ret); 952 WARN_ON(ret);
938 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 953 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
939 WARN_ON(ret); 954 WARN_ON(ret);
940 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 955 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
941 WARN_ON(ret); 956 WARN_ON(ret);
942 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 957 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
943 WARN_ON(ret); 958 WARN_ON(ret);
944 ret = register_trace_block_getrq(blk_add_trace_getrq); 959 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
945 WARN_ON(ret); 960 WARN_ON(ret);
946 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 961 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
947 WARN_ON(ret); 962 WARN_ON(ret);
948 ret = register_trace_block_plug(blk_add_trace_plug); 963 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
949 WARN_ON(ret); 964 WARN_ON(ret);
950 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 965 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
951 WARN_ON(ret); 966 WARN_ON(ret);
952 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 967 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
953 WARN_ON(ret); 968 WARN_ON(ret);
954 ret = register_trace_block_split(blk_add_trace_split); 969 ret = register_trace_block_split(blk_add_trace_split, NULL);
955 WARN_ON(ret); 970 WARN_ON(ret);
956 ret = register_trace_block_remap(blk_add_trace_remap); 971 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
957 WARN_ON(ret); 972 WARN_ON(ret);
958 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 973 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
959 WARN_ON(ret); 974 WARN_ON(ret);
960} 975}
961 976
962static void blk_unregister_tracepoints(void) 977static void blk_unregister_tracepoints(void)
963{ 978{
964 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 979 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
965 unregister_trace_block_remap(blk_add_trace_remap); 980 unregister_trace_block_remap(blk_add_trace_remap, NULL);
966 unregister_trace_block_split(blk_add_trace_split); 981 unregister_trace_block_split(blk_add_trace_split, NULL);
967 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 982 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
968 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 983 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
969 unregister_trace_block_plug(blk_add_trace_plug); 984 unregister_trace_block_plug(blk_add_trace_plug, NULL);
970 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 985 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
971 unregister_trace_block_getrq(blk_add_trace_getrq); 986 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
972 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 987 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
973 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 988 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
974 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 989 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
975 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 990 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
976 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 991 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
977 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 992 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
978 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 993 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
979 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 994 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
980 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 995 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
981 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 996 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
982 997
983 tracepoint_synchronize_unregister(); 998 tracepoint_synchronize_unregister();
984} 999}
@@ -1321,7 +1336,7 @@ out:
1321} 1336}
1322 1337
1323static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1338static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1324 int flags) 1339 int flags, struct trace_event *event)
1325{ 1340{
1326 return print_one_line(iter, false); 1341 return print_one_line(iter, false);
1327} 1342}
@@ -1343,7 +1358,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1343} 1358}
1344 1359
1345static enum print_line_t 1360static enum print_line_t
1346blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1361blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1362 struct trace_event *event)
1347{ 1363{
1348 return blk_trace_synthesize_old_trace(iter) ? 1364 return blk_trace_synthesize_old_trace(iter) ?
1349 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1365 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1381,12 +1397,16 @@ static struct tracer blk_tracer __read_mostly = {
1381 .set_flag = blk_tracer_set_flag, 1397 .set_flag = blk_tracer_set_flag,
1382}; 1398};
1383 1399
1384static struct trace_event trace_blk_event = { 1400static struct trace_event_functions trace_blk_event_funcs = {
1385 .type = TRACE_BLK,
1386 .trace = blk_trace_event_print, 1401 .trace = blk_trace_event_print,
1387 .binary = blk_trace_event_print_binary, 1402 .binary = blk_trace_event_print_binary,
1388}; 1403};
1389 1404
1405static struct trace_event trace_blk_event = {
1406 .type = TRACE_BLK,
1407 .funcs = &trace_blk_event_funcs,
1408};
1409
1390static int __init init_blk_tracer(void) 1410static int __init init_blk_tracer(void)
1391{ 1411{
1392 if (!register_ftrace_event(&trace_blk_event)) { 1412 if (!register_ftrace_event(&trace_blk_event)) {
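The hunk above is the other half of the series' event-output rework: the print callbacks move from struct trace_event into a shared struct trace_event_functions, trace_event keeps only the type plus a .funcs pointer, and every callback now also receives the struct trace_event it belongs to. A sketch of registering a custom output event under the new layout; MY_TRACE_TYPE and the my_* names are placeholders, not symbols from the patch.

/* Sketch, assuming the post-patch split between trace_event and
 * trace_event_functions. */
#include <linux/ftrace_event.h>

static enum print_line_t my_event_print(struct trace_iterator *iter,
					int flags, struct trace_event *event)
{
	/* "event" identifies which registered event is being printed,
	 * so one set of callbacks can serve several event types. */
	return trace_seq_printf(&iter->seq, "my event on cpu %d\n", iter->cpu) ?
		TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
}

static struct trace_event_functions my_event_funcs = {
	.trace	= my_event_print,
};

static struct trace_event my_event = {
	.type	= MY_TRACE_TYPE,	/* placeholder trace type */
	.funcs	= &my_event_funcs,
};

static int __init my_event_init(void)
{
	if (!register_ftrace_event(&my_event))
		return -ENODEV;
	return 0;
}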
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2404b59b3097..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -264,6 +264,7 @@ struct ftrace_profile {
264 unsigned long counter; 264 unsigned long counter;
265#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
266 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
267#endif 268#endif
268}; 269};
269 270
@@ -366,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
366{ 367{
367#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
368 seq_printf(m, " Function " 369 seq_printf(m, " Function "
369 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
370 " -------- " 371 " -------- "
371 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
372#else 373#else
373 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
374 " -------- ---\n"); 375 " -------- ---\n");
@@ -384,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
384 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
385 static struct trace_seq s; 386 static struct trace_seq s;
386 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
387#endif 389#endif
388 390
389 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -394,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
394 avg = rec->time; 396 avg = rec->time;
395 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
396 398
399 /* Sample standard deviation (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
405 * Divide only 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
397 mutex_lock(&mutex); 411 mutex_lock(&mutex);
398 trace_seq_init(&s); 412 trace_seq_init(&s);
399 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
400 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
401 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
402 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
403 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
404#endif 420#endif
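The profiler hunks above keep a running sum of times and of squared times per function so the variance can be derived without storing individual samples, using the identity sum((t - avg)^2) = sum(t^2) - n*avg^2. Below is a standalone illustration of that computation, including the single /1000 step that matches the patch's ns^2 -> us^2 scaling; the function name and sample values are made up for the example.

/* Illustrative only: mirrors the identity used by the patch,
 * variance = (sum_sq - n * avg * avg) / (n - 1). */
#include <stdio.h>

static unsigned long long stat_variance(unsigned long long sum,
					unsigned long long sum_sq,
					unsigned long long n)
{
	unsigned long long avg, var;

	if (n <= 1)
		return 0;

	avg = sum / n;
	var = sum_sq - n * avg * avg;
	/* The patch divides by 1000 once here; trace_print_graph_duration
	 * divides by 1000 again, giving ns^2 -> us^2 overall. */
	return var / ((n - 1) * 1000);
}

int main(void)
{
	/* Three calls taking 1000ns, 2000ns and 3000ns. */
	unsigned long long sum = 6000, sum_sq = 14000000ULL, n = 3;

	printf("s^2 = %llu\n", stat_variance(sum, sum_sq, n));
	return 0;
}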
@@ -650,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
650 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
651 goto out; 667 goto out;
652 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
653 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
654 674
655 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -668,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
668 } 688 }
669 689
670 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
671 if (rec) 691 if (rec) {
672 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
673 695
674 out: 696 out:
675 local_irq_restore(flags); 697 local_irq_restore(flags);
@@ -3212,8 +3234,8 @@ free:
3212} 3234}
3213 3235
3214static void 3236static void
3215ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3216 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3217{ 3239{
3218 unsigned long long timestamp; 3240 unsigned long long timestamp;
3219 int index; 3241 int index;
@@ -3267,7 +3289,7 @@ static int start_graph_tracing(void)
3267 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3268 3290
3269 if (!ret) { 3291 if (!ret) {
3270 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3271 if (ret) 3293 if (ret)
3272 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3273 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3339,11 +3361,11 @@ void unregister_ftrace_graph(void)
3339 goto out; 3361 goto out;
3340 3362
3341 ftrace_graph_active--; 3363 ftrace_graph_active--;
3342 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3343 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3344 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3345 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3346 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3347 3369
3348 out: 3370 out:
3349 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41ca394feb22..7f6059c5aa94 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -319,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
319#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
320#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
321 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
322struct buffer_data_page { 327struct buffer_data_page {
323 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
324 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
@@ -338,6 +343,7 @@ struct buffer_page {
338 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
339 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
340 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
341 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
342}; 348};
343 349
@@ -417,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
417 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
418 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
419 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
420 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
421 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
422 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -440,6 +452,8 @@ struct ring_buffer_per_cpu {
440 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
441 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
442 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
443 local_t commit_overrun; 457 local_t commit_overrun;
444 local_t overrun; 458 local_t overrun;
445 local_t entries; 459 local_t entries;
@@ -1762,6 +1776,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1762 kmemcheck_annotate_bitfield(event, bitfield); 1776 kmemcheck_annotate_bitfield(event, bitfield);
1763 1777
1764 /* 1778 /*
1779 * Save the original length to the meta data.
1780 * This will be used by the reader to add lost event
1781 * counter.
1782 */
1783 tail_page->real_end = tail;
1784
1785 /*
1765 * If this event is bigger than the minimum size, then 1786 * If this event is bigger than the minimum size, then
1766 * we need to be careful that we don't subtract the 1787 * we need to be careful that we don't subtract the
1767 * write counter enough to allow another writer to slip 1788 * write counter enough to allow another writer to slip
@@ -1979,17 +2000,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1979 u64 *ts, u64 *delta) 2000 u64 *ts, u64 *delta)
1980{ 2001{
1981 struct ring_buffer_event *event; 2002 struct ring_buffer_event *event;
1982 static int once;
1983 int ret; 2003 int ret;
1984 2004
1985 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2005 WARN_ONCE(*delta > (1ULL << 59),
1986 printk(KERN_WARNING "Delta way too big! %llu" 2006 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1987 " ts=%llu write stamp = %llu\n", 2007 (unsigned long long)*delta,
1988 (unsigned long long)*delta, 2008 (unsigned long long)*ts,
1989 (unsigned long long)*ts, 2009 (unsigned long long)cpu_buffer->write_stamp);
1990 (unsigned long long)cpu_buffer->write_stamp);
1991 WARN_ON(1);
1992 }
1993 2010
1994 /* 2011 /*
1995 * The delta is too big, we to add a 2012 * The delta is too big, we to add a
@@ -2838,6 +2855,7 @@ static struct buffer_page *
2838rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2855rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2839{ 2856{
2840 struct buffer_page *reader = NULL; 2857 struct buffer_page *reader = NULL;
2858 unsigned long overwrite;
2841 unsigned long flags; 2859 unsigned long flags;
2842 int nr_loops = 0; 2860 int nr_loops = 0;
2843 int ret; 2861 int ret;
@@ -2879,6 +2897,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2879 local_set(&cpu_buffer->reader_page->write, 0); 2897 local_set(&cpu_buffer->reader_page->write, 0);
2880 local_set(&cpu_buffer->reader_page->entries, 0); 2898 local_set(&cpu_buffer->reader_page->entries, 0);
2881 local_set(&cpu_buffer->reader_page->page->commit, 0); 2899 local_set(&cpu_buffer->reader_page->page->commit, 0);
2900 cpu_buffer->reader_page->real_end = 0;
2882 2901
2883 spin: 2902 spin:
2884 /* 2903 /*
@@ -2899,6 +2918,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2899 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2918 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2900 2919
2901 /* 2920 /*
2921 * We want to make sure we read the overruns after we set up our
2922 * pointers to the next object. The writer side does a
2923 * cmpxchg to cross pages which acts as the mb on the writer
2924 * side. Note, the reader will constantly fail the swap
2925 * while the writer is updating the pointers, so this
2926 * guarantees that the overwrite recorded here is the one we
2927 * want to compare with the last_overrun.
2928 */
2929 smp_mb();
2930 overwrite = local_read(&(cpu_buffer->overrun));
2931
2932 /*
2902 * Here's the tricky part. 2933 * Here's the tricky part.
2903 * 2934 *
2904 * We need to move the pointer past the header page. 2935 * We need to move the pointer past the header page.
@@ -2929,6 +2960,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2929 cpu_buffer->reader_page = reader; 2960 cpu_buffer->reader_page = reader;
2930 rb_reset_reader_page(cpu_buffer); 2961 rb_reset_reader_page(cpu_buffer);
2931 2962
2963 if (overwrite != cpu_buffer->last_overrun) {
2964 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2965 cpu_buffer->last_overrun = overwrite;
2966 }
2967
2932 goto again; 2968 goto again;
2933 2969
2934 out: 2970 out:

@@ -3005,8 +3041,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3005 rb_advance_iter(iter); 3041 rb_advance_iter(iter);
3006} 3042}
3007 3043
3044static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3045{
3046 return cpu_buffer->lost_events;
3047}
3048
3008static struct ring_buffer_event * 3049static struct ring_buffer_event *
3009rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3050rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3051 unsigned long *lost_events)
3010{ 3052{
3011 struct ring_buffer_event *event; 3053 struct ring_buffer_event *event;
3012 struct buffer_page *reader; 3054 struct buffer_page *reader;
@@ -3058,6 +3100,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3058 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3100 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3059 cpu_buffer->cpu, ts); 3101 cpu_buffer->cpu, ts);
3060 } 3102 }
3103 if (lost_events)
3104 *lost_events = rb_lost_events(cpu_buffer);
3061 return event; 3105 return event;
3062 3106
3063 default: 3107 default:
@@ -3168,12 +3212,14 @@ static inline int rb_ok_to_lock(void)
3168 * @buffer: The ring buffer to read 3212 * @buffer: The ring buffer to read
3169 * @cpu: The cpu to peak at 3213 * @cpu: The cpu to peak at
3170 * @ts: The timestamp counter of this event. 3214 * @ts: The timestamp counter of this event.
3215 * @lost_events: a variable to store if events were lost (may be NULL)
3171 * 3216 *
3172 * This will return the event that will be read next, but does 3217 * This will return the event that will be read next, but does
3173 * not consume the data. 3218 * not consume the data.
3174 */ 3219 */
3175struct ring_buffer_event * 3220struct ring_buffer_event *
3176ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3221ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3222 unsigned long *lost_events)
3177{ 3223{
3178 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3224 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3179 struct ring_buffer_event *event; 3225 struct ring_buffer_event *event;
@@ -3188,7 +3234,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3188 local_irq_save(flags); 3234 local_irq_save(flags);
3189 if (dolock) 3235 if (dolock)
3190 spin_lock(&cpu_buffer->reader_lock); 3236 spin_lock(&cpu_buffer->reader_lock);
3191 event = rb_buffer_peek(cpu_buffer, ts); 3237 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3192 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3238 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3193 rb_advance_reader(cpu_buffer); 3239 rb_advance_reader(cpu_buffer);
3194 if (dolock) 3240 if (dolock)
@@ -3230,13 +3276,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3230/** 3276/**
3231 * ring_buffer_consume - return an event and consume it 3277 * ring_buffer_consume - return an event and consume it
3232 * @buffer: The ring buffer to get the next event from 3278 * @buffer: The ring buffer to get the next event from
3279 * @cpu: the cpu to read the buffer from
3280 * @ts: a variable to store the timestamp (may be NULL)
3281 * @lost_events: a variable to store if events were lost (may be NULL)
3233 * 3282 *
3234 * Returns the next event in the ring buffer, and that event is consumed. 3283 * Returns the next event in the ring buffer, and that event is consumed.
3235 * Meaning, that sequential reads will keep returning a different event, 3284 * Meaning, that sequential reads will keep returning a different event,
3236 * and eventually empty the ring buffer if the producer is slower. 3285 * and eventually empty the ring buffer if the producer is slower.
3237 */ 3286 */
3238struct ring_buffer_event * 3287struct ring_buffer_event *
3239ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3288ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3289 unsigned long *lost_events)
3240{ 3290{
3241 struct ring_buffer_per_cpu *cpu_buffer; 3291 struct ring_buffer_per_cpu *cpu_buffer;
3242 struct ring_buffer_event *event = NULL; 3292 struct ring_buffer_event *event = NULL;
@@ -3257,9 +3307,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3257 if (dolock) 3307 if (dolock)
3258 spin_lock(&cpu_buffer->reader_lock); 3308 spin_lock(&cpu_buffer->reader_lock);
3259 3309
3260 event = rb_buffer_peek(cpu_buffer, ts); 3310 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3261 if (event) 3311 if (event) {
3312 cpu_buffer->lost_events = 0;
3262 rb_advance_reader(cpu_buffer); 3313 rb_advance_reader(cpu_buffer);
3314 }
3263 3315
3264 if (dolock) 3316 if (dolock)
3265 spin_unlock(&cpu_buffer->reader_lock); 3317 spin_unlock(&cpu_buffer->reader_lock);
@@ -3276,23 +3328,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3276EXPORT_SYMBOL_GPL(ring_buffer_consume); 3328EXPORT_SYMBOL_GPL(ring_buffer_consume);
3277 3329
3278/** 3330/**
3279 * ring_buffer_read_start - start a non consuming read of the buffer 3331 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3280 * @buffer: The ring buffer to read from 3332 * @buffer: The ring buffer to read from
3281 * @cpu: The cpu buffer to iterate over 3333 * @cpu: The cpu buffer to iterate over
3282 * 3334 *
3283 * This starts up an iteration through the buffer. It also disables 3335 * This performs the initial preparations necessary to iterate
3284 * the recording to the buffer until the reading is finished. 3336 * through the buffer. Memory is allocated, buffer recording
3285 * This prevents the reading from being corrupted. This is not 3337 * is disabled, and the iterator pointer is returned to the caller.
3286 * a consuming read, so a producer is not expected.
3287 * 3338 *
3288 * Must be paired with ring_buffer_finish. 3339 * Disabling buffer recording prevents the reading from being
3340 * corrupted. This is not a consuming read, so a producer is not
3341 * expected.
3342 *
3343 * After a sequence of ring_buffer_read_prepare calls, the user is
3344 * expected to make at least one call to ring_buffer_prepare_sync.
3345 * Afterwards, ring_buffer_read_start is invoked to get things going
3346 * for real.
3347 *
3348 * This overall must be paired with ring_buffer_finish.
3289 */ 3349 */
3290struct ring_buffer_iter * 3350struct ring_buffer_iter *
3291ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3351ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3292{ 3352{
3293 struct ring_buffer_per_cpu *cpu_buffer; 3353 struct ring_buffer_per_cpu *cpu_buffer;
3294 struct ring_buffer_iter *iter; 3354 struct ring_buffer_iter *iter;
3295 unsigned long flags;
3296 3355
3297 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3356 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3298 return NULL; 3357 return NULL;
@@ -3306,15 +3365,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3306 iter->cpu_buffer = cpu_buffer; 3365 iter->cpu_buffer = cpu_buffer;
3307 3366
3308 atomic_inc(&cpu_buffer->record_disabled); 3367 atomic_inc(&cpu_buffer->record_disabled);
3368
3369 return iter;
3370}
3371EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3372
3373/**
3374 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3375 *
3376 * All previously invoked ring_buffer_read_prepare calls to prepare
3377 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3378 * calls on those iterators are allowed.
3379 */
3380void
3381ring_buffer_read_prepare_sync(void)
3382{
3309 synchronize_sched(); 3383 synchronize_sched();
3384}
3385EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3386
3387/**
3388 * ring_buffer_read_start - start a non consuming read of the buffer
3389 * @iter: The iterator returned by ring_buffer_read_prepare
3390 *
3391 * This finalizes the startup of an iteration through the buffer.
3392 * The iterator comes from a call to ring_buffer_read_prepare and
3393 * an intervening ring_buffer_read_prepare_sync must have been
3394 * performed.
3395 *
3396 * Must be paired with ring_buffer_finish.
3397 */
3398void
3399ring_buffer_read_start(struct ring_buffer_iter *iter)
3400{
3401 struct ring_buffer_per_cpu *cpu_buffer;
3402 unsigned long flags;
3403
3404 if (!iter)
3405 return;
3406
3407 cpu_buffer = iter->cpu_buffer;
3310 3408
3311 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3409 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3312 arch_spin_lock(&cpu_buffer->lock); 3410 arch_spin_lock(&cpu_buffer->lock);
3313 rb_iter_reset(iter); 3411 rb_iter_reset(iter);
3314 arch_spin_unlock(&cpu_buffer->lock); 3412 arch_spin_unlock(&cpu_buffer->lock);
3315 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3413 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3316
3317 return iter;
3318} 3414}
3319EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3415EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3320 3416
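With the hunk above, starting a non-consuming read becomes a three-step protocol: prepare an iterator per CPU, issue one synchronize call for the whole batch, then start each iterator, so the expensive synchronize_sched() is paid once rather than once per CPU. A sketch of the intended call sequence (error handling trimmed; the caller-provided buffer and iterator array are assumed, and the same pattern appears in the __tracing_open hunk later in this diff).

/* Sketch of the prepare/sync/start protocol introduced by the patch. */
#include <linux/ring_buffer.h>

static void open_all_cpu_iters(struct ring_buffer *buffer,
			       struct ring_buffer_iter **iters)
{
	int cpu;

	/* 1) Allocate iterators and disable recording; cheap, per cpu. */
	for_each_online_cpu(cpu)
		iters[cpu] = ring_buffer_read_prepare(buffer, cpu);

	/* 2) One ring_buffer_read_prepare_sync() covers every prepared
	 *    iterator (it is just a synchronize_sched()). */
	ring_buffer_read_prepare_sync();

	/* 3) Now it is safe to reset each iterator to its reader page. */
	for_each_online_cpu(cpu)
		ring_buffer_read_start(iters[cpu]);

	/* ... read via ring_buffer_iter_peek()/ring_buffer_read(), then
	 * pair every iterator with ring_buffer_read_finish(). */
}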
@@ -3408,6 +3504,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3408 cpu_buffer->write_stamp = 0; 3504 cpu_buffer->write_stamp = 0;
3409 cpu_buffer->read_stamp = 0; 3505 cpu_buffer->read_stamp = 0;
3410 3506
3507 cpu_buffer->lost_events = 0;
3508 cpu_buffer->last_overrun = 0;
3509
3411 rb_head_page_activate(cpu_buffer); 3510 rb_head_page_activate(cpu_buffer);
3412} 3511}
3413 3512
@@ -3683,6 +3782,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3683 struct ring_buffer_event *event; 3782 struct ring_buffer_event *event;
3684 struct buffer_data_page *bpage; 3783 struct buffer_data_page *bpage;
3685 struct buffer_page *reader; 3784 struct buffer_page *reader;
3785 unsigned long missed_events;
3686 unsigned long flags; 3786 unsigned long flags;
3687 unsigned int commit; 3787 unsigned int commit;
3688 unsigned int read; 3788 unsigned int read;
@@ -3719,6 +3819,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3719 read = reader->read; 3819 read = reader->read;
3720 commit = rb_page_commit(reader); 3820 commit = rb_page_commit(reader);
3721 3821
3822 /* Check if any events were dropped */
3823 missed_events = cpu_buffer->lost_events;
3824
3722 /* 3825 /*
3723 * If this page has been partially read or 3826 * If this page has been partially read or
3724 * if len is not big enough to read the rest of the page or 3827 * if len is not big enough to read the rest of the page or
@@ -3779,9 +3882,35 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3779 local_set(&reader->entries, 0); 3882 local_set(&reader->entries, 0);
3780 reader->read = 0; 3883 reader->read = 0;
3781 *data_page = bpage; 3884 *data_page = bpage;
3885
3886 /*
3887 * Use the real_end for the data size,
3888 * This gives us a chance to store the lost events
3889 * on the page.
3890 */
3891 if (reader->real_end)
3892 local_set(&bpage->commit, reader->real_end);
3782 } 3893 }
3783 ret = read; 3894 ret = read;
3784 3895
3896 cpu_buffer->lost_events = 0;
3897 /*
3898 * Set a flag in the commit field if we lost events
3899 */
3900 if (missed_events) {
3901 commit = local_read(&bpage->commit);
3902
3903 /* If there is room at the end of the page to save the
3904 * missed events, then record it there.
3905 */
3906 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3907 memcpy(&bpage->data[commit], &missed_events,
3908 sizeof(missed_events));
3909 local_add(RB_MISSED_STORED, &bpage->commit);
3910 }
3911 local_add(RB_MISSED_EVENTS, &bpage->commit);
3912 }
3913
3785 out_unlock: 3914 out_unlock:
3786 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3915 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3787 3916
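When ring_buffer_read_page() hands a page to the reader, the hunk above appends any missed-event count after the data and advertises it through the two high bits of the commit field (RB_MISSED_EVENTS, plus RB_MISSED_STORED when the count itself fit on the page). A reader-side decode sketch under that layout; it assumes it sits where the RB_MISSED_* definitions and struct buffer_data_page from this file are visible.

/* Sketch of how a consumer of ring_buffer_read_page() could decode the
 * flag bits the patch adds to the commit field. */
static unsigned long page_missed_events(struct buffer_data_page *bpage)
{
	unsigned long commit = local_read(&bpage->commit);
	unsigned long data_len = commit & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);
	unsigned long missed = 0;

	if (!(commit & RB_MISSED_EVENTS))
		return 0;			/* nothing was dropped */

	if (commit & RB_MISSED_STORED)
		/* exact count was stored right after the event data */
		memcpy(&missed, &bpage->data[data_len], sizeof(missed));
	else
		missed = ULONG_MAX;		/* dropped, but count unknown */

	return missed;
}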
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 44f916a04065..55e48511d7c8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
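After the hunk above, ftrace_dump_on_oops is no longer a plain boolean: the bare option selects DUMP_ALL, "=orig_cpu" selects DUMP_ORIG (only the CPU that triggered the oops), and any other value is rejected. Illustrative boot-parameter usage, not taken from the patch itself:

	ftrace_dump_on_oops            # dump every CPU's buffer on an oops (DUMP_ALL)
	ftrace_dump_on_oops=orig_cpu   # dump only the oopsing CPU's buffer (DUMP_ORIG)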
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1914 } 1936 }
1915 1937
1916 if (event) 1938 if (event)
1917 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1918 1940
1919 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1920 goto partial; 1942 goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1940 1962
1941 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1942 if (event) 1964 if (event)
1943 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1944 1966
1945 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1946 goto partial; 1968 goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1967 1989
1968 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1969 if (event) { 1991 if (event) {
1970 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1971 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1972 return ret; 1994 return ret;
1973 } 1995 }
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1992 } 2014 }
1993 2015
1994 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1996} 2019}
1997 2020
1998static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1999{ 2022{
2000 int cpu; 2023 int cpu;
2001 2024
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2053{
2031 enum print_line_t ret; 2054 enum print_line_t ret;
2032 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
2033 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
2059} 2086}
2060 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
2061static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
2062{ 2106{
2063 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2114 }
2071 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
2074 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
2075 if (trace_empty(iter)) 2119
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
2085 /* 2121 /*
2086 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2202
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2173 } 2212 }
2174 } else { 2213 } else {
2175 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2179 } 2220 }
2180 2221
@@ -3269,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3269 size_t len, 3310 size_t len,
3270 unsigned int flags) 3311 unsigned int flags)
3271{ 3312{
3272 struct page *pages[PIPE_BUFFERS]; 3313 struct page *pages_def[PIPE_DEF_BUFFERS];
3273 struct partial_page partial[PIPE_BUFFERS]; 3314 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3274 struct trace_iterator *iter = filp->private_data; 3315 struct trace_iterator *iter = filp->private_data;
3275 struct splice_pipe_desc spd = { 3316 struct splice_pipe_desc spd = {
3276 .pages = pages, 3317 .pages = pages_def,
3277 .partial = partial, 3318 .partial = partial_def,
3278 .nr_pages = 0, /* This gets updated below. */ 3319 .nr_pages = 0, /* This gets updated below. */
3279 .flags = flags, 3320 .flags = flags,
3280 .ops = &tracing_pipe_buf_ops, 3321 .ops = &tracing_pipe_buf_ops,
@@ -3285,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3285 size_t rem; 3326 size_t rem;
3286 unsigned int i; 3327 unsigned int i;
3287 3328
3329 if (splice_grow_spd(pipe, &spd))
3330 return -ENOMEM;
3331
3288 /* copy the tracer to avoid using a global lock all around */ 3332 /* copy the tracer to avoid using a global lock all around */
3289 mutex_lock(&trace_types_lock); 3333 mutex_lock(&trace_types_lock);
3290 if (unlikely(old_tracer != current_trace && current_trace)) { 3334 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3315,23 +3359,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3315 trace_access_lock(iter->cpu_file); 3359 trace_access_lock(iter->cpu_file);
3316 3360
3317 /* Fill as many pages as possible. */ 3361 /* Fill as many pages as possible. */
3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3362 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3319 pages[i] = alloc_page(GFP_KERNEL); 3363 spd.pages[i] = alloc_page(GFP_KERNEL);
3320 if (!pages[i]) 3364 if (!spd.pages[i])
3321 break; 3365 break;
3322 3366
3323 rem = tracing_fill_pipe_page(rem, iter); 3367 rem = tracing_fill_pipe_page(rem, iter);
3324 3368
3325 /* Copy the data into the page, so we can start over. */ 3369 /* Copy the data into the page, so we can start over. */
3326 ret = trace_seq_to_buffer(&iter->seq, 3370 ret = trace_seq_to_buffer(&iter->seq,
3327 page_address(pages[i]), 3371 page_address(spd.pages[i]),
3328 iter->seq.len); 3372 iter->seq.len);
3329 if (ret < 0) { 3373 if (ret < 0) {
3330 __free_page(pages[i]); 3374 __free_page(spd.pages[i]);
3331 break; 3375 break;
3332 } 3376 }
3333 partial[i].offset = 0; 3377 spd.partial[i].offset = 0;
3334 partial[i].len = iter->seq.len; 3378 spd.partial[i].len = iter->seq.len;
3335 3379
3336 trace_seq_init(&iter->seq); 3380 trace_seq_init(&iter->seq);
3337 } 3381 }
@@ -3342,12 +3386,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3342 3386
3343 spd.nr_pages = i; 3387 spd.nr_pages = i;
3344 3388
3345 return splice_to_pipe(pipe, &spd); 3389 ret = splice_to_pipe(pipe, &spd);
3390out:
3391 splice_shrink_spd(pipe, &spd);
3392 return ret;
3346 3393
3347out_err: 3394out_err:
3348 mutex_unlock(&iter->mutex); 3395 mutex_unlock(&iter->mutex);
3349 3396 goto out;
3350 return ret;
3351} 3397}
3352 3398
3353static ssize_t 3399static ssize_t
@@ -3746,11 +3792,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3746 unsigned int flags) 3792 unsigned int flags)
3747{ 3793{
3748 struct ftrace_buffer_info *info = file->private_data; 3794 struct ftrace_buffer_info *info = file->private_data;
3749 struct partial_page partial[PIPE_BUFFERS]; 3795 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3750 struct page *pages[PIPE_BUFFERS]; 3796 struct page *pages_def[PIPE_DEF_BUFFERS];
3751 struct splice_pipe_desc spd = { 3797 struct splice_pipe_desc spd = {
3752 .pages = pages, 3798 .pages = pages_def,
3753 .partial = partial, 3799 .partial = partial_def,
3754 .flags = flags, 3800 .flags = flags,
3755 .ops = &buffer_pipe_buf_ops, 3801 .ops = &buffer_pipe_buf_ops,
3756 .spd_release = buffer_spd_release, 3802 .spd_release = buffer_spd_release,
@@ -3759,22 +3805,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3759 int entries, size, i; 3805 int entries, size, i;
3760 size_t ret; 3806 size_t ret;
3761 3807
3808 if (splice_grow_spd(pipe, &spd))
3809 return -ENOMEM;
3810
3762 if (*ppos & (PAGE_SIZE - 1)) { 3811 if (*ppos & (PAGE_SIZE - 1)) {
3763 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3812 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3764 return -EINVAL; 3813 ret = -EINVAL;
3814 goto out;
3765 } 3815 }
3766 3816
3767 if (len & (PAGE_SIZE - 1)) { 3817 if (len & (PAGE_SIZE - 1)) {
3768 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3818 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3769 if (len < PAGE_SIZE) 3819 if (len < PAGE_SIZE) {
3770 return -EINVAL; 3820 ret = -EINVAL;
3821 goto out;
3822 }
3771 len &= PAGE_MASK; 3823 len &= PAGE_MASK;
3772 } 3824 }
3773 3825
3774 trace_access_lock(info->cpu); 3826 trace_access_lock(info->cpu);
3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3827 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3776 3828
3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3829 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3778 struct page *page; 3830 struct page *page;
3779 int r; 3831 int r;
3780 3832
@@ -3829,11 +3881,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3829 else 3881 else
3830 ret = 0; 3882 ret = 0;
3831 /* TODO: block */ 3883 /* TODO: block */
3832 return ret; 3884 goto out;
3833 } 3885 }
3834 3886
3835 ret = splice_to_pipe(pipe, &spd); 3887 ret = splice_to_pipe(pipe, &spd);
3836 3888 splice_shrink_spd(pipe, &spd);
3889out:
3837 return ret; 3890 return ret;
3838} 3891}
3839 3892
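The two splice hunks above adapt to pipes whose depth is no longer fixed at PIPE_BUFFERS: the on-stack arrays are sized at PIPE_DEF_BUFFERS, splice_grow_spd() swaps in larger arrays when the pipe was grown, loops are bounded by pipe->buffers, and every path must reach splice_shrink_spd() after splice_to_pipe(). A condensed sketch of that pattern; everything except the splice helpers (my_splice_read, my_fill_page, the omitted ops) is illustrative.

/* Sketch of the grow/shrink pattern the patch applies to both splice paths. */
#include <linux/splice.h>

static ssize_t my_splice_read(struct pipe_inode_info *pipe, size_t len,
			      unsigned int flags)
{
	struct page *pages_def[PIPE_DEF_BUFFERS];
	struct partial_page partial_def[PIPE_DEF_BUFFERS];
	struct splice_pipe_desc spd = {
		.pages		= pages_def,
		.partial	= partial_def,
		.flags		= flags,
		/* .ops / .spd_release omitted; a real caller fills them in */
	};
	ssize_t ret;
	int i;

	/* Re-point spd.pages/spd.partial at larger arrays if the pipe
	 * holds more than PIPE_DEF_BUFFERS buffers. */
	if (splice_grow_spd(pipe, &spd))
		return -ENOMEM;

	/* Always index through spd.pages/spd.partial, never the _def
	 * arrays, and bound the loop by pipe->buffers. */
	for (i = 0; i < pipe->buffers && len; i++)
		len -= my_fill_page(&spd, i, len);	/* placeholder copy step */

	spd.nr_pages = i;
	ret = splice_to_pipe(pipe, &spd);
	splice_shrink_spd(pipe, &spd);	/* release any grown arrays */
	return ret;
}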
@@ -4324,7 +4377,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4377 unsigned long event, void *unused)
4325{ 4378{
4326 if (ftrace_dump_on_oops) 4379 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4380 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4381 return NOTIFY_OK;
4329} 4382}
4330 4383
@@ -4341,7 +4394,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4394 switch (val) {
4342 case DIE_OOPS: 4395 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4396 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4397 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4398 break;
4346 default: 4399 default:
4347 break; 4400 break;
@@ -4382,7 +4435,8 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4435 trace_seq_init(s);
4383} 4436}
4384 4437
4385static void __ftrace_dump(bool disable_tracing) 4438static void
4439__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4440{
4387 static arch_spinlock_t ftrace_dump_lock = 4441 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4442 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4469,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4469 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4470 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4471
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4472 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4473 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4474 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4475
4476 switch (oops_dump_mode) {
4477 case DUMP_ALL:
4478 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4479 break;
4480 case DUMP_ORIG:
4481 iter.cpu_file = raw_smp_processor_id();
4482 break;
4483 case DUMP_NONE:
4484 goto out_enable;
4485 default:
4486 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4487 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4488 }
4489
4490 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4491
4425 /* 4492 /*
4426 * We need to stop all tracing on all CPUS to read the 4493 * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4526,7 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4526 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4527 printk(KERN_TRACE "---------------------------------\n");
4461 4528
4529 out_enable:
4462 /* Re-enable tracing if requested */ 4530 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4531 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4532 trace_flags |= old_userobj;
@@ -4475,9 +4543,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4543}
4476 4544
4477/* By default: disable tracing after the dump */ 4545/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4546void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4547{
4480 __ftrace_dump(true); 4548 __ftrace_dump(true, oops_dump_mode);
4481} 4549}
4482 4550
4483__init static int tracer_alloc_buffers(void) 4551__init static int tracer_alloc_buffers(void)
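Since ftrace_dump() now takes an enum ftrace_dump_mode, external callers pick between dumping every CPU and only the current one; on large machines DUMP_ORIG keeps a crash path from spewing every CPU's buffer, as the panic and die notifiers above now do by forwarding ftrace_dump_on_oops. A hypothetical debug hook showing the updated call (my_debug_check is illustrative):

/* Sketch only: demonstrates the new ftrace_dump(enum ftrace_dump_mode)
 * signature from a made-up debugging hook. */
#include <linux/kernel.h>
#include <linux/ftrace.h>
#include <linux/cpumask.h>

static void my_debug_check(bool fatal)
{
	if (!fatal)
		return;

	if (num_online_cpus() > 8)
		ftrace_dump(DUMP_ORIG);	/* just the CPU that hit the problem */
	else
		ftrace_dump(DUMP_ALL);	/* small box: dump everything */
}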
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -416,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
416void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
417 int pc); 406 int pc);
418#else 407#else
419static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
420 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
421{ 410{
422} 411}
423 412
424static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
425 unsigned long flags, int pc) 414 unsigned long flags, int pc)
426{ 415{
427} 416}
@@ -467,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
467 struct trace_array *tr); 456 struct trace_array *tr);
468extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
469 struct trace_array *tr); 458 struct trace_array *tr);
470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr); 460 struct trace_array *tr);
474#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -491,9 +478,29 @@ extern int trace_clock_id;
491 478
492/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
493#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
494extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
495extern enum print_line_t 493extern enum print_line_t
496trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
497 504
498#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
499/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -524,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
524#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
525#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
526static inline enum print_line_t 533static inline enum print_line_t
527print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
528{ 535{
529 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
530} 537}
@@ -771,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
771 struct trace_seq *s); 778 struct trace_seq *s);
772extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
773 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
774static inline int 784static inline int
775filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
776 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
777 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
778{ 788{
779 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
780 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
781 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
782 return 1; 792 return 1;
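The trace.h hunk exports the graph-output entry points (print_graph_function_flags(), print_graph_headers_flags(), graph_trace_open/close()) together with the TRACE_GRAPH_PRINT_* flag bits, so other tracers can reuse the function-graph output with their own column selection. A sketch of how a tracer might wire this up; the flag mask and function names are illustrative, not taken from this patch:

#include "trace.h"

#define EXAMPLE_GRAPH_FLAGS \
        (TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_PROC | \
         TRACE_GRAPH_PRINT_DURATION)

/* Per-entry output: delegate to the graph printer with our own flags. */
static enum print_line_t example_print_line(struct trace_iterator *iter)
{
        return print_graph_function_flags(iter, EXAMPLE_GRAPH_FLAGS);
}

/* Matching header so the columns line up with the flags above. */
static void example_print_header(struct seq_file *s)
{
        print_graph_headers_flags(s, EXAMPLE_GRAPH_FLAGS);
}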
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
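The trace_branch.c change is representative of the whole series: output callbacks move out of struct trace_event into a shared struct trace_event_functions, and each callback now receives the struct trace_event it was registered with. A minimal sketch of an event converted to the new shape (names are hypothetical; register_ftrace_event() assigns .type when it is left at zero):

#include "trace.h"
#include "trace_output.h"

static enum print_line_t example_trace_output(struct trace_iterator *iter,
                                              int flags,
                                              struct trace_event *event)
{
        if (!trace_seq_printf(&iter->seq, "example: type=%d\n", event->type))
                return TRACE_TYPE_PARTIAL_LINE;

        return TRACE_TYPE_HANDLED;
}

static struct trace_event_functions example_funcs = {
        .trace  = example_trace_output,
};

static struct trace_event example_event = {
        /* .type left 0: register_ftrace_event() picks a free id. */
        .funcs  = &example_funcs,
};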
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 0565bb42566f..cb6f365016e4 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,13 +9,9 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16 13
17static char *perf_trace_buf; 14static char *perf_trace_buf[4];
18static char *perf_trace_buf_nmi;
19 15
20/* 16/*
21 * Force it to be aligned to unsigned long to avoid misaligned accesses 17 * Force it to be aligned to unsigned long to avoid misaligned accesses
@@ -27,57 +23,82 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
27/* Count the events in use (per event id, not per instance) */ 23/* Count the events in use (per event id, not per instance) */
28static int total_ref_count; 24static int total_ref_count;
29 25
30static int perf_trace_event_enable(struct ftrace_event_call *event) 26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
31{ 28{
32 char *buf; 29 struct hlist_head *list;
33 int ret = -ENOMEM; 30 int ret = -ENOMEM;
31 int cpu;
34 32
35 if (event->perf_refcount++ > 0) 33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
36 return 0; 35 return 0;
37 36
38 if (!total_ref_count) { 37 list = alloc_percpu(struct hlist_head);
39 buf = (char *)alloc_percpu(perf_trace_t); 38 if (!list)
40 if (!buf) 39 goto fail;
41 goto fail_buf;
42 40
43 rcu_assign_pointer(perf_trace_buf, buf); 41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
44 43
45 buf = (char *)alloc_percpu(perf_trace_t); 44 tp_event->perf_events = list;
46 if (!buf)
47 goto fail_buf_nmi;
48 45
49 rcu_assign_pointer(perf_trace_buf_nmi, buf); 46 if (!total_ref_count) {
50 } 47 char *buf;
48 int i;
51 49
52 ret = event->perf_event_enable(event); 50 for (i = 0; i < 4; i++) {
53 if (!ret) { 51 buf = (char *)alloc_percpu(perf_trace_t);
54 total_ref_count++; 52 if (!buf)
55 return 0; 53 goto fail;
54
55 perf_trace_buf[i] = buf;
56 }
56 } 57 }
57 58
58fail_buf_nmi: 59 if (tp_event->class->reg)
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret)
67 goto fail;
68
69 total_ref_count++;
70 return 0;
71
72fail:
59 if (!total_ref_count) { 73 if (!total_ref_count) {
60 free_percpu(perf_trace_buf_nmi); 74 int i;
61 free_percpu(perf_trace_buf); 75
62 perf_trace_buf_nmi = NULL; 76 for (i = 0; i < 4; i++) {
63 perf_trace_buf = NULL; 77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
64 } 85 }
65fail_buf:
66 event->perf_refcount--;
67 86
68 return ret; 87 return ret;
69} 88}
70 89
71int perf_trace_enable(int event_id) 90int perf_trace_init(struct perf_event *p_event)
72{ 91{
73 struct ftrace_event_call *event; 92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
74 int ret = -EINVAL; 94 int ret = -EINVAL;
75 95
76 mutex_lock(&event_mutex); 96 mutex_lock(&event_mutex);
77 list_for_each_entry(event, &ftrace_events, list) { 97 list_for_each_entry(tp_event, &ftrace_events, list) {
78 if (event->id == event_id && event->perf_event_enable && 98 if (tp_event->event.type == event_id &&
79 try_module_get(event->mod)) { 99 tp_event->class && tp_event->class->perf_probe &&
80 ret = perf_trace_event_enable(event); 100 try_module_get(tp_event->mod)) {
101 ret = perf_trace_event_init(tp_event, p_event);
81 break; 102 break;
82 } 103 }
83 } 104 }
@@ -86,90 +107,78 @@ int perf_trace_enable(int event_id)
86 return ret; 107 return ret;
87} 108}
88 109
89static void perf_trace_event_disable(struct ftrace_event_call *event) 110int perf_trace_enable(struct perf_event *p_event)
90{ 111{
91 char *buf, *nmi_buf; 112 struct ftrace_event_call *tp_event = p_event->tp_event;
92 113 struct hlist_head *list;
93 if (--event->perf_refcount > 0)
94 return;
95
96 event->perf_event_disable(event);
97 114
98 if (!--total_ref_count) { 115 list = tp_event->perf_events;
99 buf = perf_trace_buf; 116 if (WARN_ON_ONCE(!list))
100 rcu_assign_pointer(perf_trace_buf, NULL); 117 return -EINVAL;
101 118
102 nmi_buf = perf_trace_buf_nmi; 119 list = per_cpu_ptr(list, smp_processor_id());
103 rcu_assign_pointer(perf_trace_buf_nmi, NULL); 120 hlist_add_head_rcu(&p_event->hlist_entry, list);
104 121
105 /* 122 return 0;
106 * Ensure every events in profiling have finished before 123}
107 * releasing the buffers
108 */
109 synchronize_sched();
110 124
111 free_percpu(buf); 125void perf_trace_disable(struct perf_event *p_event)
112 free_percpu(nmi_buf); 126{
113 } 127 hlist_del_rcu(&p_event->hlist_entry);
114} 128}
115 129
116void perf_trace_disable(int event_id) 130void perf_trace_destroy(struct perf_event *p_event)
117{ 131{
118 struct ftrace_event_call *event; 132 struct ftrace_event_call *tp_event = p_event->tp_event;
133 int i;
119 134
120 mutex_lock(&event_mutex); 135 if (--tp_event->perf_refcount > 0)
121 list_for_each_entry(event, &ftrace_events, list) { 136 return;
122 if (event->id == event_id) { 137
123 perf_trace_event_disable(event); 138 if (tp_event->class->reg)
124 module_put(event->mod); 139 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
125 break; 140 else
141 tracepoint_probe_unregister(tp_event->name,
142 tp_event->class->perf_probe,
143 tp_event);
144
145 free_percpu(tp_event->perf_events);
146 tp_event->perf_events = NULL;
147
148 if (!--total_ref_count) {
149 for (i = 0; i < 4; i++) {
150 free_percpu(perf_trace_buf[i]);
151 perf_trace_buf[i] = NULL;
126 } 152 }
127 } 153 }
128 mutex_unlock(&event_mutex);
129} 154}
130 155
131__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 156__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
132 int *rctxp, unsigned long *irq_flags) 157 struct pt_regs *regs, int *rctxp)
133{ 158{
134 struct trace_entry *entry; 159 struct trace_entry *entry;
135 char *trace_buf, *raw_data; 160 unsigned long flags;
136 int pc, cpu; 161 char *raw_data;
162 int pc;
137 163
138 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); 164 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
139 165
140 pc = preempt_count(); 166 pc = preempt_count();
141 167
142 /* Protect the per cpu buffer, begin the rcu read side */
143 local_irq_save(*irq_flags);
144
145 *rctxp = perf_swevent_get_recursion_context(); 168 *rctxp = perf_swevent_get_recursion_context();
146 if (*rctxp < 0) 169 if (*rctxp < 0)
147 goto err_recursion; 170 return NULL;
148
149 cpu = smp_processor_id();
150
151 if (in_nmi())
152 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
153 else
154 trace_buf = rcu_dereference_sched(perf_trace_buf);
155
156 if (!trace_buf)
157 goto err;
158 171
159 raw_data = per_cpu_ptr(trace_buf, cpu); 172 raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id());
160 173
161 /* zero the dead bytes from align to not leak stack to user */ 174 /* zero the dead bytes from align to not leak stack to user */
162 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); 175 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
163 176
164 entry = (struct trace_entry *)raw_data; 177 entry = (struct trace_entry *)raw_data;
165 tracing_generic_entry_update(entry, *irq_flags, pc); 178 local_save_flags(flags);
179 tracing_generic_entry_update(entry, flags, pc);
166 entry->type = type; 180 entry->type = type;
167 181
168 return raw_data; 182 return raw_data;
169err:
170 perf_swevent_put_recursion_context(*rctxp);
171err_recursion:
172 local_irq_restore(*irq_flags);
173 return NULL;
174} 183}
175EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 184EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
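After this rework, perf_trace_buf_prepare() no longer disables interrupts or keeps a separate NMI buffer: the recursion context returned in *rctxp selects one of the four perf_trace_buf[] per-cpu buffers, and registered perf events hang off the per-cpu hlist in tp_event->perf_events. A rough probe-side sketch under those assumptions; the entry layout and helper name are hypothetical, and the actual submit path (which also releases the recursion context) is outside this hunk, so only the release is shown:

#include <linux/perf_event.h>
#include "trace.h"

struct example_entry {
        struct trace_entry      ent;
        unsigned long           value;  /* hypothetical payload */
};

static void example_perf_probe(struct ftrace_event_call *call,
                               struct pt_regs *regs, unsigned long value)
{
        struct example_entry *entry;
        int size, rctx;

        /* Callers pass a u64-aligned size; the helper zeroes the padding. */
        size = ALIGN(sizeof(*entry), sizeof(u64));

        entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
        if (!entry)
                return;         /* recursion context unavailable */

        entry->value = value;

        /*
         * A real probe would now deliver (entry, size, regs) to the
         * per-cpu list in call->perf_events; here we only release the
         * recursion context that perf_trace_buf_prepare() took.
         */
        perf_swevent_put_recursion_context(rctx);
}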
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c697c7043349..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
29 29
30LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
31 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
32int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
33 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
34 int filter_type) 42 int filter_type)
35{ 43{
36 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
37 49
38 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
39 if (!field) 51 if (!field)
@@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
56 field->size = size; 68 field->size = size;
57 field->is_signed = is_signed; 69 field->is_signed = is_signed;
58 70
59 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
60 73
61 return 0; 74 return 0;
62 75
@@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
94void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
95{ 108{
96 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
97 111
98 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
99 list_del(&field->link); 114 list_del(&field->link);
100 kfree(field->type); 115 kfree(field->type);
101 kfree(field->name); 116 kfree(field->name);
@@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
107{ 122{
108 int id; 123 int id;
109 124
110 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
111 if (!id) 126 if (!id)
112 return -ENODEV; 127 return -ENODEV;
113 call->id = id;
114 INIT_LIST_HEAD(&call->fields);
115 128
116 return 0; 129 return 0;
117} 130}
@@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
124 137
125 switch (enable) { 138 switch (enable) {
126 case 0: 139 case 0:
127 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
128 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
129 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
130 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
131 } 149 }
132 break; 150 break;
133 case 1: 151 case 1:
134 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
135 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
136 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
137 if (ret) { 160 if (ret) {
138 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
139 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
140 "%s\n", call->name); 163 "%s\n", call->name);
141 break; 164 break;
142 } 165 }
143 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
144 } 167 }
145 break; 168 break;
146 } 169 }
@@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
171 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
172 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
173 196
174 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
175 continue; 199 continue;
176 200
177 if (match && 201 if (match &&
178 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
179 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
180 continue; 204 continue;
181 205
182 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
183 continue; 207 continue;
184 208
185 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
297 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
298 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
299 */ 323 */
300 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
301 return call; 325 return call;
302 } 326 }
303 327
@@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
328 (*pos)++; 352 (*pos)++;
329 353
330 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
331 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
332 return call; 356 return call;
333 } 357 }
334 358
@@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
355{ 379{
356 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
357 381
358 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
359 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
360 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
361 385
362 return 0; 386 return 0;
@@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
387 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
388 char *buf; 412 char *buf;
389 413
390 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
391 buf = "1\n"; 415 buf = "1\n";
392 else 416 else
393 buf = "0\n"; 417 buf = "0\n";
@@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
450 474
451 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
452 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
453 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
454 continue; 479 continue;
455 480
456 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
457 continue; 482 continue;
458 483
459 /* 484 /*
@@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
461 * or if all events or cleared, or if we have 486 * or if all events or cleared, or if we have
462 * a mixture. 487 * a mixture.
463 */ 488 */
464 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
465 490
466 /* 491 /*
467 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
525{ 550{
526 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
527 struct ftrace_event_field *field; 552 struct ftrace_event_field *field;
553 struct list_head *head;
528 struct trace_seq *s; 554 struct trace_seq *s;
529 int common_field_count = 5; 555 int common_field_count = 5;
530 char *buf; 556 char *buf;
@@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
540 trace_seq_init(s); 566 trace_seq_init(s);
541 567
542 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
543 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
544 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
545 571
546 list_for_each_entry_reverse(field, &call->fields, link) { 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
547 /* 574 /*
548 * Smartly shows the array type(except dynamic array). 575 * Smartly shows the array type(except dynamic array).
549 * Normal: 576 * Normal:
@@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
613 return -ENOMEM; 640 return -ENOMEM;
614 641
615 trace_seq_init(s); 642 trace_seq_init(s);
616 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
617 644
618 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
619 s->buffer, s->len); 646 s->buffer, s->len);
@@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
919 const struct file_operations *filter, 946 const struct file_operations *filter,
920 const struct file_operations *format) 947 const struct file_operations *format)
921{ 948{
949 struct list_head *head;
922 int ret; 950 int ret;
923 951
924 /* 952 /*
925 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
926 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
927 */ 955 */
928 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
929 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
930 958
931 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
932 if (!call->dir) { 960 if (!call->dir) {
@@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
935 return -1; 963 return -1;
936 } 964 }
937 965
938 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
939 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
940 enable); 968 enable);
941 969
942 if (call->id && call->perf_event_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
943 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
944 id); 973 id);
974#endif
945 975
946 if (call->define_fields) { 976 if (call->class->define_fields) {
947 ret = trace_define_common_fields(call); 977 /*
948 if (!ret) 978 * Other events may have the same class. Only update
949 ret = call->define_fields(call); 979 * the fields if they are not already defined.
950 if (ret < 0) { 980 */
951 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
952 " events/%s\n", call->name); 982 if (list_empty(head)) {
953 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
954 } 991 }
955 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
956 filter); 993 filter);
@@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
970 if (!call->name) 1007 if (!call->name)
971 return -EINVAL; 1008 return -EINVAL;
972 1009
973 if (call->raw_init) { 1010 if (call->class->raw_init) {
974 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
975 if (ret < 0) { 1012 if (ret < 0) {
976 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
977 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1035static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1036{ 1073{
1037 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1038 if (call->event) 1075 if (call->event.funcs)
1039 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1040 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1041 list_del(&call->list); 1078 list_del(&call->list);
1042 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1043 destroy_preds(call); 1080 destroy_preds(call);
1044 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1045} 1082}
1046 1083
1047/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1132 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1133 if (!call->name) 1170 if (!call->name)
1134 continue; 1171 continue;
1135 if (call->raw_init) { 1172 if (call->class->raw_init) {
1136 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1137 if (ret < 0) { 1174 if (ret < 0) {
1138 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1139 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1286,8 +1323,8 @@ static __init int event_trace_init(void)
1286 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1287 if (!call->name) 1324 if (!call->name)
1288 continue; 1325 continue;
1289 if (call->raw_init) { 1326 if (call->class->raw_init) {
1290 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1291 if (ret < 0) { 1328 if (ret < 0) {
1292 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1293 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1388 1425
1389 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1390 1427
1391 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1392 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1393 continue; 1430 continue;
1394 1431
1395/* 1432/*
@@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1399 * syscalls as we test. 1436 * syscalls as we test.
1400 */ 1437 */
1401#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1402 if (call->system && 1439 if (call->class->system &&
1403 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1404 continue; 1441 continue;
1405#endif 1442#endif
1406 1443
@@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1410 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1411 * it and the self test should not be on. 1448 * it and the self test should not be on.
1412 */ 1449 */
1413 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1414 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1415 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1416 continue; 1453 continue;
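Throughout trace_events.c the per-call members (id, system, regfunc, enabled, fields) are replaced by call->event.type, call->class->system, the class probe/reg callbacks, the TRACE_EVENT_FL_* bits in call->flags, and per-class field lists reached through trace_get_fields(). A small illustrative walker (not part of the patch) using only the new accessors:

#include "trace.h"

/* Print one line per event, plus its fields, via the reworked accessors. */
static void example_dump_event(struct ftrace_event_call *call)
{
        struct ftrace_event_field *field;
        struct list_head *head;

        pr_info("%s:%s id=%d%s%s\n",
                call->class->system, call->name, call->event.type,
                (call->flags & TRACE_EVENT_FL_ENABLED)  ? " enabled"  : "",
                (call->flags & TRACE_EVENT_FL_FILTERED) ? " filtered" : "");

        /* Fields may be shared by every event of the same class. */
        head = trace_get_fields(call);
        list_for_each_entry(field, head, link)
                pr_info("\tfield: %s %s;\n", field->type, field->name);
}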
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 88c0b6dbd7fe..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -500,8 +500,10 @@ static struct ftrace_event_field *
500find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
501{ 501{
502 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
503 504
504 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
505 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
506 return field; 508 return field;
507 } 509 }
@@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
545 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
546 int i; 548 int i;
547 549
548 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
549 filter->n_preds = 0; 551 filter->n_preds = 0;
550 552
551 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
572{ 574{
573 __free_preds(call->filter); 575 __free_preds(call->filter);
574 call->filter = NULL; 576 call->filter = NULL;
575 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
576} 578}
577 579
578static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
611 if (call->filter) 613 if (call->filter)
612 return 0; 614 return 0;
613 615
614 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
615 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
616 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
617 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
625 int err; 627 int err;
626 628
627 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
628 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
629 continue; 631 continue;
630 632
631 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
632 continue; 634 continue;
633 635
634 err = init_preds(call); 636 err = init_preds(call);
@@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
644 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
645 647
646 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
647 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
648 continue; 650 continue;
649 651
650 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
651 continue; 653 continue;
652 654
653 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1249 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1250 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1251 1253
1252 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1253 continue; 1255 continue;
1254 1256
1255 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1256 continue; 1258 continue;
1257 1259
1258 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1266 if (err) 1268 if (err)
1267 filter_disable_preds(call); 1269 filter_disable_preds(call);
1268 else { 1270 else {
1269 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1270 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1271 } 1273 }
1272 fail = false; 1274 fail = false;
@@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1315 if (err) 1317 if (err)
1316 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1317 else 1319 else
1318 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1319out: 1321out:
1320 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1321 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1393,12 +1395,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1393 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1394 1396
1395 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1396 if (call->id == event_id) 1398 if (call->event.type == event_id)
1397 break; 1399 break;
1398 } 1400 }
1399 1401
1400 err = -EINVAL; 1402 err = -EINVAL;
1401 if (!call) 1403 if (&call->list == &ftrace_events)
1402 goto out_unlock; 1404 goto out_unlock;
1403 1405
1404 err = -EEXIST; 1406 err = -EEXIST;
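With filter_active folded into call->flags as TRACE_EVENT_FL_FILTERED, the fast-path check lives entirely in filter_check_discard() (see the trace.h hunk earlier). A sketch of how an event commit path consults it; the surrounding helper is hypothetical, and trace_buffer_unlock_commit() is used only as a representative commit call from this tree:

#include "trace.h"

static void example_commit(struct ftrace_event_call *call,
                           struct ring_buffer *buffer,
                           struct ring_buffer_event *event,
                           void *entry, unsigned long irq_flags, int pc)
{
        /*
         * filter_check_discard() is cheap while TRACE_EVENT_FL_FILTERED is
         * clear; when set, it runs the predicates and discards the event
         * on a mismatch, returning nonzero.
         */
        if (!filter_check_discard(call, entry, buffer, event))
                trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
}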
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{ 129{
130 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
131 return 0; 131 return 0;
132} 132}
133 133
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 154
155#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 167 .name = #call, \
162 .id = type, \ 168 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 170 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 171}; \
168 172
169#include "trace_entries.h" 173#include "trace_entries.h"
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 9aed1a5cf553..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -40,7 +40,7 @@ struct fgraph_data {
40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
41#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
42#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
43#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
44 44
45static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
46 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -179,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
179 return ret; 179 return ret;
180} 180}
181 181
182static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
183 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
184 unsigned long flags, 184 unsigned long flags,
185 int pc) 185 int pc)
@@ -246,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
246 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
247} 247}
248 248
249static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
250 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
251 unsigned long flags, 251 unsigned long flags,
252 int pc) 252 int pc)
@@ -490,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
490 * We need to consume the current entry to see 490 * We need to consume the current entry to see
491 * the next one. 491 * the next one.
492 */ 492 */
493 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
494 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
495 NULL); 496 NULL, NULL);
496 } 497 }
497 498
498 if (!event) 499 if (!event)
@@ -526,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
526 527
527/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
528static int 529static int
529print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
530{ 532{
531 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
532 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
533 return 1; 535 return 1;
534 536
535 /* Non nested entry or return */ 537 /* Non nested entry or return */
536 if (duration == -1) 538 if (duration == -1)
537 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
538 540
539 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
540 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
541 if (duration > 100000ULL) 543 if (duration > 100000ULL)
542 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -562,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
562 564
563static enum print_line_t 565static enum print_line_t
564print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
565 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
566{ 568{
567 int ret; 569 int ret;
568 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -572,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
572 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
573 575
574 /* Absolute time */ 576 /* Absolute time */
575 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
576 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
577 if (!ret) 579 if (!ret)
578 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
579 } 581 }
580 582
581 /* Cpu */ 583 /* Cpu */
582 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
583 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
584 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
585 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
586 } 588 }
587 589
588 /* Proc */ 590 /* Proc */
589 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
590 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
591 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
592 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -596,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
596 } 598 }
597 599
598 /* No overhead */ 600 /* No overhead */
599 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
600 if (!ret) 602 if (!ret)
601 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
602 604
@@ -609,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
609 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
610 612
611 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
612 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
613 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
614 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
615 617
@@ -679,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
679static enum print_line_t 681static enum print_line_t
680print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
681 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
682 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
683{ 686{
684 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
685 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -711,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
711 } 714 }
712 715
713 /* Overhead */ 716 /* Overhead */
714 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
715 if (!ret) 718 if (!ret)
716 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
717 720
718 /* Duration */ 721 /* Duration */
719 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
720 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
721 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
722 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -739,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
739static enum print_line_t 742static enum print_line_t
740print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
741 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
742 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
743{ 746{
744 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
745 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -759,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
759 } 762 }
760 763
761 /* No overhead */ 764 /* No overhead */
762 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
763 if (!ret) 766 if (!ret)
764 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
765 768
766 /* No time */ 769 /* No time */
767 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
768 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
769 if (!ret) 772 if (!ret)
770 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -790,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
790 793
791static enum print_line_t 794static enum print_line_t
792print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
793 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
794{ 797{
795 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
796 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -803,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
803 806
804 if (type) { 807 if (type) {
805 /* Interrupt */ 808 /* Interrupt */
806 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
807 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
808 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
809 } 812 }
810 813
811 /* Absolute time */ 814 /* Absolute time */
812 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
813 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
814 if (!ret) 817 if (!ret)
815 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
816 } 819 }
817 820
818 /* Cpu */ 821 /* Cpu */
819 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
820 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
821 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
822 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
823 } 826 }
824 827
825 /* Proc */ 828 /* Proc */
826 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
827 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
828 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
829 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -845,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
845 848
846static enum print_line_t 849static enum print_line_t
847print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
848 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
849{ 852{
850 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
851 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -853,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
853 static enum print_line_t ret; 856 static enum print_line_t ret;
854 int cpu = iter->cpu; 857 int cpu = iter->cpu;
855 858
856 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
857 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
858 861
859 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
860 if (leaf_ret) 863 if (leaf_ret)
861 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
862 else 865 else
863 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
864 867
865 if (data) { 868 if (data) {
866 /* 869 /*
@@ -879,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
879 882
880static enum print_line_t 883static enum print_line_t
881print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
882 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
883{ 887{
884 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
885 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -909,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
909 } 913 }
910 } 914 }
911 915
912 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
913 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
914 918
915 /* Overhead */ 919 /* Overhead */
916 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
917 if (!ret) 921 if (!ret)
918 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
919 923
920 /* Duration */ 924 /* Duration */
921 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
922 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
923 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
924 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -948,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
948 } 952 }
949 953
950 /* Overrun */ 954 /* Overrun */
951 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
952 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
953 trace->overrun); 957 trace->overrun);
954 if (!ret) 958 if (!ret)
955 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
956 } 960 }
957 961
958 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
959 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
960 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
961 966
@@ -963,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
963} 968}
964 969
965static enum print_line_t 970static enum print_line_t
966print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
967 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
968{ 973{
969 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
970 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -976,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
976 if (data) 981 if (data)
977 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
978 983
979 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
980 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
981 986
982 /* No overhead */ 987 /* No overhead */
983 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
984 if (!ret) 989 if (!ret)
985 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
986 991
987 /* No time */ 992 /* No time */
988 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
989 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
990 if (!ret) 995 if (!ret)
991 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1020,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1020 if (!event) 1025 if (!event)
1021 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
1022 1027
1023 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
1024 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
1025 return ret; 1030 return ret;
1026 } 1031 }
@@ -1040,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1040 1045
1041 1046
1042enum print_line_t 1047enum print_line_t
1043print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1044{ 1049{
1045 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1046 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1061,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1061 if (data && data->failed) { 1066 if (data && data->failed) {
1062 field = &data->ent; 1067 field = &data->ent;
1063 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1064 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1065 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1066 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1067 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1081,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1081 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1082 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1083 saved = *field; 1088 saved = *field;
1084 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1085 } 1090 }
1086 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1087 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1088 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1089 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1090 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* dont trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1091 default: 1101 default:
1092 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1093 } 1103 }
1094 1104
1095 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1096} 1106}
1097 1107
1098static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1099{ 1122{
1100 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1101 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1102 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1103 int size = 0; 1126 int size = 0;
1104 1127
1105 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1106 size += 16; 1129 size += 16;
1107 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1108 size += 4; 1131 size += 4;
1109 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1110 size += 17; 1133 size += 17;
1111 1134
1112 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1117,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1117 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1118} 1141}
1119 1142
1120static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1121{ 1144{
1122 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1123 1146
1124 if (lat) 1147 if (lat)
1125 print_lat_header(s); 1148 print_lat_header(s, flags);
1126 1149
1127 /* 1st line */ 1150 /* 1st line */
1128 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1129 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1130 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1131 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1132 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1133 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1134 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1135 if (lat) 1158 if (lat)
1136 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1137 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1138 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1139 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1140 1163
1141 /* 2nd line */ 1164 /* 2nd line */
1142 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1143 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1144 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1145 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1146 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1147 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1148 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1149 if (lat) 1172 if (lat)
1150 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1151 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1152 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1153 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1154} 1177}
1155 1178
1156static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1157{ 1185{
1158 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1159 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1188,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1188 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1189} 1217}
1190 1218
1191static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1192{ 1220{
1193 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1194 1222
@@ -1198,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1198 } 1226 }
1199} 1227}
1200 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1201static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1202 .name = "function_graph", 1244 .name = "function_graph",
1203 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1219,6 +1261,16 @@ static __init int init_graph_trace(void)
1219{ 1261{
1220 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1221 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1222 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1223} 1275}
1224 1276
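
Aside: the hunks above are an instance of the new trace_event registration layout used throughout this series — the output callback now lives in a shared struct trace_event_functions, and struct trace_event only carries the type id plus a pointer to it. A minimal sketch of the same pattern for a hypothetical event follows; TRACE_MY_EVENT and all my_* names are invented for illustration and are not part of this patch.

	static enum print_line_t my_event_trace(struct trace_iterator *iter,
						int flags, struct trace_event *event)
	{
		/* trace_seq_printf() returns 0 when the seq buffer is full */
		if (!trace_seq_printf(&iter->seq, "my event on cpu %d\n", iter->cpu))
			return TRACE_TYPE_PARTIAL_LINE;
		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event_functions my_event_funcs = {
		.trace	= my_event_trace,
	};

	static struct trace_event my_event = {
		.type	= TRACE_MY_EVENT,	/* hypothetical type id */
		.funcs	= &my_event_funcs,
	};

	static __init int init_my_event(void)
	{
		if (!register_ftrace_event(&my_event))
			pr_warning("could not register my_event\n");
		return 0;
	}

As in init_graph_trace() above, register_ftrace_event() returns 0 on failure, which is why a zero return is treated as an error.
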
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
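
For readers unfamiliar with the per-tracer option machinery this hunk plugs into, the generic shape is sketched below; the my_* names are invented, while struct tracer_opt, struct tracer_flags, TRACER_OPT() and the set_flag() signature are the same ones the irqsoff changes use.

	#define MY_DISPLAY_FOO	0x1

	static struct tracer_opt my_opts[] = {
		/* shows up as options/display-foo in the tracing debugfs dir */
		{ TRACER_OPT(display-foo, MY_DISPLAY_FOO) },
		{ }	/* Empty entry */
	};

	static struct tracer_flags my_tracer_flags = {
		.val	= 0,
		.opts	= my_opts,
	};

	static int my_set_flag(u32 old_flags, u32 bit, int set)
	{
		if (!(bit & MY_DISPLAY_FOO))
			return -EINVAL;

		/* nothing to do if the option already has the requested value */
		if (!((old_flags & MY_DISPLAY_FOO) ^ (set ? MY_DISPLAY_FOO : 0)))
			return 0;

		/* ...tear down and restart the tracer in the new mode,
		 * as irqsoff_set_flag() does above... */
		return 0;
	}

The tracer wires this up through its .flags and .set_flag members; when the option file is toggled, the core passes the current flag word and the bit being changed, and only updates tracer_flags.val if set_flag() returns 0.
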
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae9..faf7cefd15da 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{ 93{
68 return f->func(regs, f->data); 94 return fprm->fn(regs, fprm->data, dest);
69} 95}
70 96
71/* fetch handlers */ 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 98/*
73 void *offset) 99 * Define macro for basic types - we don't need to define s* types, because
74{ 100 * we have to care only about bitwidth at recording time.
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 101 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
76} 121}
77 122DEFINE_BASIC_FETCH_FUNCS(reg)
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 123
79 void *num) 124#define DEFINE_FETCH_stack(type) \
80{ 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
81 return regs_get_kernel_stack_nth(regs, 126 void *offset, void *dest) \
82 (unsigned int)((unsigned long)num)); 127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
83} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
84 132
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 133#define DEFINE_FETCH_retval(type) \
86{ 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
87 unsigned long retval; 135 void *dummy, void *dest) \
88 136{ \
89 if (probe_kernel_address(addr, retval)) 137 *(type *)dest = (type)regs_return_value(regs); \
90 return 0;
91 return retval;
92} 138}
93 139DEFINE_BASIC_FETCH_FUNCS(retval)
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 140
95 void *dummy) 141#define DEFINE_FETCH_memory(type) \
96{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
97 return regs_return_value(regs); 143 void *addr, void *dest) \
98} 144{ \
99 145 type retval; \
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 146 if (probe_kernel_address(addr, retval)) \
101 void *dummy) 147 *(type *)dest = 0; \
102{ 148 else \
103 return kernel_stack_pointer(regs); 149 *(type *)dest = retval; \
104} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
105 152
106/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
107struct symbol_cache { 154struct symbol_cache {
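
To make the template machinery above concrete, here is what one instantiation expands to. This is derived mechanically from DEFINE_FETCH_reg() and FETCH_FUNC_NAME() as defined in this hunk; it is not additional code from the patch.

	/* DEFINE_FETCH_reg(u32) expands (roughly) to: */
	static __kprobes void fetch_reg_u32(struct pt_regs *regs,
					    void *offset, void *dest)
	{
		*(u32 *)dest = (u32)regs_get_register(regs,
					(unsigned int)((unsigned long)offset));
	}

So each (kind, type) pair gets its own small fetch routine that stores a fixed-width value through dest, instead of returning an unsigned long as the old fetch_func callbacks did.
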
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 192 return sc;
146} 193}
147 194
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
149{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 197 void *data, void *dest) \
151 198{ \
152 if (sc->addr) 199 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
154 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 202 else \
203 *(type *)dest = 0; \
156} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
157 206
158/* Special indirect memory access interface */ 207/* Dereference memory access function */
159struct indirect_fetch_data { 208struct deref_fetch_param {
160 struct fetch_func orig; 209 struct fetch_param orig;
161 long offset; 210 long offset;
162}; 211};
163 212
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
165{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
167 unsigned long addr; 216{ \
168 217 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
170 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 220 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
173 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 223 } else \
224 *(type *)dest = 0; \
175} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
176 227
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 229{
179 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
183 kfree(data); 234 kfree(data);
184} 235}
185 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
 266 const char *fmt; /* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
186/** 305/**
187 * Kprobe event core functions 306 * Kprobe event core functions
188 */ 307 */
189 308
190struct probe_arg { 309struct probe_arg {
191 struct fetch_func fetch; 310 struct fetch_param fetch;
192 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
193}; 315};
194 316
195/* Flags for trace_probe */ 317/* Flags for trace_probe */
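
A brief sketch of how the table above is consumed; it assumes a caller that already has a struct pt_regs *regs and a struct trace_seq *s, and the stack slot index is arbitrary.

	const struct fetch_type *t = find_fetch_type("u16");	/* NULL if unknown */
	u16 val;

	if (t) {
		/* fetch kernel stack entry 2 as a 16-bit value ... */
		t->stack(regs, (void *)2UL, &val);
		/* ... and print it as " myarg=<hex>" via the matching printer */
		t->print(s, "myarg", &val);
	}

Passing NULL to find_fetch_type() selects the default type, i.e. u32 or u64 depending on BITS_PER_LONG.
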
@@ -202,8 +324,9 @@ struct trace_probe {
202 unsigned long nhit; 324 unsigned long nhit;
203 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
204 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
205 struct ftrace_event_call call; 328 struct ftrace_event_call call;
206 struct trace_event event; 329 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 330 unsigned int nr_args;
208 struct probe_arg args[]; 331 struct probe_arg args[];
209}; 332};
@@ -212,6 +335,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
214 337
338
215static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 340{
217 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
223} 347}
224 348
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
270 351
@@ -323,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
323 goto error; 404 goto error;
324 } 405 }
325 406
407 tp->call.class = &tp->class;
326 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
327 if (!tp->call.name) 409 if (!tp->call.name)
328 goto error; 410 goto error;
@@ -332,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
332 goto error; 414 goto error;
333 } 415 }
334 416
335 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
336 if (!tp->call.system) 418 if (!tp->class.system)
337 goto error; 419 goto error;
338 420
339 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -347,11 +429,12 @@ error:
347 429
348static void free_probe_arg(struct probe_arg *arg) 430static void free_probe_arg(struct probe_arg *arg)
349{ 431{
350 if (arg->fetch.func == fetch_symbol) 432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 435 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 436 kfree(arg->name);
437 kfree(arg->comm);
355} 438}
356 439
357static void free_trace_probe(struct trace_probe *tp) 440static void free_trace_probe(struct trace_probe *tp)
@@ -361,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
361 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
362 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
363 446
364 kfree(tp->call.system); 447 kfree(tp->call.class->system);
365 kfree(tp->call.name); 448 kfree(tp->call.name);
366 kfree(tp->symbol); 449 kfree(tp->symbol);
367 kfree(tp); 450 kfree(tp);
@@ -374,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
374 457
375 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
376 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
377 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
378 return tp; 461 return tp;
379 return NULL; 462 return NULL;
380} 463}
@@ -399,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
399 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
400 483
401 /* register as an event */ 484 /* register as an event */
402 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
403 if (old_tp) { 486 if (old_tp) {
404 /* delete old event */ 487 /* delete old event */
405 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -457,28 +540,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 540#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 541#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 542
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 543static int parse_probe_vars(char *arg, const struct fetch_type *t,
544 struct fetch_param *f, int is_return)
461{ 545{
462 int ret = 0; 546 int ret = 0;
463 unsigned long param; 547 unsigned long param;
464 548
465 if (strcmp(arg, "retval") == 0) { 549 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 550 if (is_return)
467 ff->func = fetch_retvalue; 551 f->fn = t->retval;
468 ff->data = NULL; 552 else
469 } else
470 ret = -EINVAL; 553 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 554 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 555 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 556 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 557 f->fn = fetch_stack_address;
558 else
559 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 560 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 561 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 562 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 563 ret = -EINVAL;
479 else { 564 else {
480 ff->func = fetch_stack; 565 f->fn = t->stack;
481 ff->data = (void *)param; 566 f->data = (void *)param;
482 } 567 }
483 } else 568 } else
484 ret = -EINVAL; 569 ret = -EINVAL;
@@ -488,7 +573,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 573}
489 574
490/* Recursive argument parser */ 575/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 576static int __parse_probe_arg(char *arg, const struct fetch_type *t,
577 struct fetch_param *f, int is_return)
492{ 578{
493 int ret = 0; 579 int ret = 0;
494 unsigned long param; 580 unsigned long param;
@@ -497,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 583
498 switch (arg[0]) { 584 switch (arg[0]) {
499 case '$': 585 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 586 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 587 break;
502 case '%': /* named register */ 588 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 589 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 590 if (ret >= 0) {
505 ff->func = fetch_register; 591 f->fn = t->reg;
506 ff->data = (void *)(unsigned long)ret; 592 f->data = (void *)(unsigned long)ret;
507 ret = 0; 593 ret = 0;
508 } 594 }
509 break; 595 break;
@@ -512,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 598 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 599 if (ret)
514 break; 600 break;
515 ff->func = fetch_memory; 601 f->fn = t->memory;
516 ff->data = (void *)param; 602 f->data = (void *)param;
517 } else { 603 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 604 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 605 if (ret)
520 break; 606 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 607 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 608 if (f->data)
523 ff->func = fetch_symbol; 609 f->fn = t->symbol;
524 else
525 ret = -EINVAL;
526 } 610 }
527 break; 611 break;
528 case '+': /* indirect memory */ 612 case '+': /* deref memory */
529 case '-': 613 case '-':
530 tmp = strchr(arg, '('); 614 tmp = strchr(arg, '(');
531 if (!tmp) { 615 if (!tmp)
532 ret = -EINVAL;
533 break; 616 break;
534 }
535 *tmp = '\0'; 617 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 618 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 619 if (ret)
@@ -541,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 623 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 624 tmp = strrchr(arg, ')');
543 if (tmp) { 625 if (tmp) {
544 struct indirect_fetch_data *id; 626 struct deref_fetch_param *dprm;
627 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 628 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 629 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 630 GFP_KERNEL);
548 if (!id) 631 if (!dprm)
549 return -ENOMEM; 632 return -ENOMEM;
550 id->offset = offset; 633 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 634 ret = __parse_probe_arg(arg, t2, &dprm->orig,
635 is_return);
552 if (ret) 636 if (ret)
553 kfree(id); 637 kfree(dprm);
554 else { 638 else {
555 ff->func = fetch_indirect; 639 f->fn = t->deref;
556 ff->data = (void *)id; 640 f->data = (void *)dprm;
557 } 641 }
558 } else 642 }
559 ret = -EINVAL;
560 break; 643 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 } 644 }
645 if (!ret && !f->fn)
646 ret = -EINVAL;
565 return ret; 647 return ret;
566} 648}
567 649
568/* String length checking wrapper */ 650/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 651static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return)
570{ 653{
654 const char *t;
655
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 656 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long.: %s\n", arg); 657 pr_info("Argument is too long.: %s\n", arg);
573 return -ENOSPC; 658 return -ENOSPC;
574 } 659 }
575 return __parse_probe_arg(arg, ff, is_return); 660 parg->comm = kstrdup(arg, GFP_KERNEL);
661 if (!parg->comm) {
662 pr_info("Failed to allocate memory for command '%s'.\n", arg);
663 return -ENOMEM;
664 }
665 t = strchr(parg->comm, ':');
666 if (t) {
667 arg[t - parg->comm] = '\0';
668 t++;
669 }
670 parg->type = find_fetch_type(t);
671 if (!parg->type) {
672 pr_info("Unsupported type: %s\n", t);
673 return -EINVAL;
674 }
675 parg->offset = tp->size;
676 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
576} 678}
577 679
578/* Return 1 if name is reserved or already used by another argument */ 680/* Return 1 if name is reserved or already used by another argument */
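
To follow the new flow end to end with a concrete, made-up argument (x86-64 register naming assumed): parsing "myarg=+4(%di):u16" first splits the "u16" suffix off in parse_probe_arg(), which reserves two bytes at the current tp->size for this argument; __parse_probe_arg() then sees the leading '+', builds a deref_fetch_param whose orig member fetches %di at the default BITS_PER_LONG width, and finally stores t->deref, i.e. fetch_deref_u16, as the argument's fetch function. Any FETCHARG accepted by the syntax comment in the next hunk works the same way.
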
@@ -602,15 +704,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 704 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 705 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 706 * %REG : fetch register REG
605 * Indirect memory fetch: 707 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 708 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 709 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 710 * NAME=FETCHARG : set NAME as alias of FETCHARG.
711 * Type of args:
712 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 713 */
610 struct trace_probe *tp; 714 struct trace_probe *tp;
611 int i, ret = 0; 715 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 716 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 717 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp;
614 unsigned long offset = 0; 719 unsigned long offset = 0;
615 void *addr = NULL; 720 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 721 char buf[MAX_EVENT_NAME_LEN];
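
The comment block at the top of this hunk documents the user-visible syntax extension. A sketch of what definitions using the new type qualifiers could look like when written to the kprobe_events file is shown below; the probe names, argument names and register choices are invented for illustration only.

	/*
	 *   p:myprobe do_sys_open dfd=%di filename=%si flags=%dx:u32 mode=%cx:u16
	 *   r:myretprobe do_sys_open ret=$retval:s32
	 *
	 * Every FETCHARG may carry a ":TYPE" suffix; without one the default
	 * unsigned long (u32 or u64 depending on BITS_PER_LONG) is used.
	 */
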
@@ -723,13 +828,6 @@ static int create_trace_probe(int argc, char **argv)
723 else 828 else
724 arg = argv[i]; 829 arg = argv[i];
725 830
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) { 832 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n", 833 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +835,19 @@ static int create_trace_probe(int argc, char **argv)
737 ret = -ENOMEM; 835 ret = -ENOMEM;
738 goto error; 836 goto error;
739 } 837 }
838 tmp = strchr(tp->args[i].name, ':');
839 if (tmp)
840 *tmp = '_'; /* convert : to _ */
841
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with "
844 "another field.\n", i, argv[i]);
845 ret = -EINVAL;
846 goto error;
847 }
740 848
741 /* Parse fetch argument */ 849 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 851 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 852 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name); 853 kfree(tp->args[i].name);
@@ -794,11 +902,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 902static int probes_seq_show(struct seq_file *m, void *v)
795{ 903{
796 struct trace_probe *tp = v; 904 struct trace_probe *tp = v;
797 int i, ret; 905 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 906
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
802 909
803 if (!tp->symbol) 910 if (!tp->symbol)
804 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -807,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 914 else
808 seq_printf(m, " %s", probe_symbol(tp)); 915 seq_printf(m, " %s", probe_symbol(tp));
809 916
810 for (i = 0; i < tp->nr_args; i++) { 917 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 918 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 919 seq_printf(m, "\n");
920
819 return 0; 921 return 0;
820} 922}
821 923
@@ -945,9 +1047,10 @@ static const struct file_operations kprobe_profile_ops = {
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1048{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1049 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1050 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1051 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1052 struct ring_buffer *buffer;
1053 u8 *data;
951 int size, i, pc; 1054 int size, i, pc;
952 unsigned long irq_flags; 1055 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1056 struct ftrace_event_call *call = &tp->call;
@@ -957,18 +1060,18 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1060 local_save_flags(irq_flags);
958 pc = preempt_count(); 1061 pc = preempt_count();
959 1062
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1063 size = sizeof(*entry) + tp->size;
961 1064
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
963 irq_flags, pc); 1066 size, irq_flags, pc);
964 if (!event) 1067 if (!event)
965 return; 1068 return;
966 1069
967 entry = ring_buffer_event_data(event); 1070 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1071 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1];
970 for (i = 0; i < tp->nr_args; i++) 1073 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
972 1075
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1076 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
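
The two handlers in this file now record a variable-size blob of argument bytes behind a fixed entry header instead of an array of unsigned long. Assuming, purely for illustration, a probe with a u32 argument followed by a u64 one, the record reserved in the ring buffer is laid out as follows.

	/*
	 *   +--------------------------------+  <- ring_buffer_event_data(event)
	 *   | struct kprobe_trace_entry_head |     (ent + ip)
	 *   +--------------------------------+  <- data = (u8 *)&entry[1]
	 *   | u32 arg0                       |     args[0].offset == 0
	 *   +--------------------------------+
	 *   | u64 arg1                       |     args[1].offset == 4
	 *   +--------------------------------+
	 *
	 * tp->size is the running sum of the argument sizes (4 + 8 = 12 here),
	 * so the reservation is sizeof(*entry) + tp->size, and each fetch
	 * function stores its value at data + args[i].offset.
	 */
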
@@ -979,9 +1082,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1082 struct pt_regs *regs)
980{ 1083{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1084 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1085 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1086 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1087 struct ring_buffer *buffer;
1088 u8 *data;
985 int size, i, pc; 1089 int size, i, pc;
986 unsigned long irq_flags; 1090 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1091 struct ftrace_event_call *call = &tp->call;
@@ -989,19 +1093,19 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
989 local_save_flags(irq_flags); 1093 local_save_flags(irq_flags);
990 pc = preempt_count(); 1094 pc = preempt_count();
991 1095
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1096 size = sizeof(*entry) + tp->size;
993 1097
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
995 irq_flags, pc); 1099 size, irq_flags, pc);
996 if (!event) 1100 if (!event)
997 return; 1101 return;
998 1102
999 entry = ring_buffer_event_data(event); 1103 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1104 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1105 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1];
1003 for (i = 0; i < tp->nr_args; i++) 1107 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1005 1109
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1110 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1009,17 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1009 1113
1010/* Event entry printers */ 1114/* Event entry printers */
1011enum print_line_t 1115enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1013{ 1118{
1014 struct kprobe_trace_entry *field; 1119 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp; 1121 struct trace_probe *tp;
1122 u8 *data;
1018 int i; 1123 int i;
1019 1124
1020 field = (struct kprobe_trace_entry *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1022 tp = container_of(event, struct trace_probe, event);
1023 1127
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial; 1129 goto partial;
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1135 goto partial;
1032 1136
1033 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1036 goto partial; 1141 goto partial;
1037 1142
1038 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1044,17 +1149,17 @@ partial:
1044} 1149}
1045 1150
1046enum print_line_t 1151enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1048{ 1154{
1049 struct kretprobe_trace_entry *field; 1155 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1053 int i; 1159 int i;
1054 1160
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1057 tp = container_of(event, struct trace_probe, event);
1058 1163
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial; 1165 goto partial;
@@ -1071,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1176 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1177 goto partial;
1073 1178
1074 for (i = 0; i < field->nargs; i++) 1179 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1180 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1181 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset))
1077 goto partial; 1183 goto partial;
1078 1184
1079 if (!trace_seq_puts(s, "\n")) 1185 if (!trace_seq_puts(s, "\n"))
@@ -1110,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1110 1216
1111static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{ 1218{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0; 1219 return 0;
1116} 1220}
1117 1221
@@ -1129,29 +1233,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1233static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1234{
1131 int ret, i; 1235 int ret, i;
1132 struct kprobe_trace_entry field; 1236 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1237 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1238
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1240 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1241 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1242 ret = trace_define_field(event_call, tp->args[i].type->name,
1243 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size,
1246 tp->args[i].type->is_signed,
1247 FILTER_OTHER);
1248 if (ret)
1249 return ret;
1250 }
1140 return 0; 1251 return 0;
1141} 1252}
1142 1253
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1254static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1255{
1145 int ret, i; 1256 int ret, i;
1146 struct kretprobe_trace_entry field; 1257 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1258 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1259
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1260 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1262 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1263 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1264 ret = trace_define_field(event_call, tp->args[i].type->name,
1265 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size,
1268 tp->args[i].type->is_signed,
1269 FILTER_OTHER);
1270 if (ret)
1271 return ret;
1272 }
1155 return 0; 1273 return 0;
1156} 1274}
1157 1275
@@ -1176,8 +1294,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1294 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1295
1178 for (i = 0; i < tp->nr_args; i++) { 1296 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1298 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1299 }
1182 1300
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,28 +1337,30 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1219{ 1337{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1340 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head;
1342 u8 *data;
1223 int size, __size, i; 1343 int size, __size, i;
1224 unsigned long irq_flags;
1225 int rctx; 1344 int rctx;
1226 1345
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1346 __size = sizeof(*entry) + tp->size;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1347 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1348 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1232 return; 1351 return;
1233 1352
1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1235 if (!entry) 1354 if (!entry)
1236 return; 1355 return;
1237 1356
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1357 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1];
1240 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1242 1361
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1362 head = per_cpu_ptr(call->perf_events, smp_processor_id());
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1244} 1364}
1245 1365
1246/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
@@ -1249,30 +1369,31 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1249{ 1369{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1372 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head;
1374 u8 *data;
1253 int size, __size, i; 1375 int size, __size, i;
1254 unsigned long irq_flags;
1255 int rctx; 1376 int rctx;
1256 1377
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1378 __size = sizeof(*entry) + tp->size;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1379 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1380 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1262 return; 1383 return;
1263 1384
1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1265 if (!entry) 1386 if (!entry)
1266 return; 1387 return;
1267 1388
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1389 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1390 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1];
1271 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1273 1394
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1395 head = per_cpu_ptr(call->perf_events, smp_processor_id());
1275 irq_flags, regs); 1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1276} 1397}
1277 1398
1278static int probe_perf_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1302,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1302} 1423}
1303#endif /* CONFIG_PERF_EVENTS */ 1424#endif /* CONFIG_PERF_EVENTS */
1304 1425
1426static __kprobes
1427int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1428{
1429 switch (type) {
1430 case TRACE_REG_REGISTER:
1431 return probe_event_enable(event);
1432 case TRACE_REG_UNREGISTER:
1433 probe_event_disable(event);
1434 return 0;
1435
1436#ifdef CONFIG_PERF_EVENTS
1437 case TRACE_REG_PERF_REGISTER:
1438 return probe_perf_enable(event);
1439 case TRACE_REG_PERF_UNREGISTER:
1440 probe_perf_disable(event);
1441 return 0;
1442#endif
1443 }
1444 return 0;
1445}
1305 1446
1306static __kprobes 1447static __kprobes
1307int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1448int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1331,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1331 return 0; /* We don't tweak the kernel, so just return 0 */ 1472 return 0; /* We don't tweak the kernel, so just return 0 */
1332} 1473}
1333 1474
1475static struct trace_event_functions kretprobe_funcs = {
1476 .trace = print_kretprobe_event
1477};
1478
1479static struct trace_event_functions kprobe_funcs = {
1480 .trace = print_kprobe_event
1481};
1482
1334static int register_probe_event(struct trace_probe *tp) 1483static int register_probe_event(struct trace_probe *tp)
1335{ 1484{
1336 struct ftrace_event_call *call = &tp->call; 1485 struct ftrace_event_call *call = &tp->call;
@@ -1338,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
1338 1487
1339 /* Initialize ftrace_event_call */ 1488 /* Initialize ftrace_event_call */
1340 if (probe_is_return(tp)) { 1489 if (probe_is_return(tp)) {
1341 tp->event.trace = print_kretprobe_event; 1490 INIT_LIST_HEAD(&call->class->fields);
1342 call->raw_init = probe_event_raw_init; 1491 call->event.funcs = &kretprobe_funcs;
1343 call->define_fields = kretprobe_event_define_fields; 1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields;
1344 } else { 1494 } else {
1345 tp->event.trace = print_kprobe_event; 1495 INIT_LIST_HEAD(&call->class->fields);
1346 call->raw_init = probe_event_raw_init; 1496 call->event.funcs = &kprobe_funcs;
1347 call->define_fields = kprobe_event_define_fields; 1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields;
1348 } 1499 }
1349 if (set_print_fmt(tp) < 0) 1500 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM; 1501 return -ENOMEM;
1351 call->event = &tp->event; 1502 ret = register_ftrace_event(&call->event);
1352 call->id = register_ftrace_event(&tp->event); 1503 if (!ret) {
1353 if (!call->id) {
1354 kfree(call->print_fmt); 1504 kfree(call->print_fmt);
1355 return -ENODEV; 1505 return -ENODEV;
1356 } 1506 }
1357 call->enabled = 0; 1507 call->flags = 0;
1358 call->regfunc = probe_event_enable; 1508 call->class->reg = kprobe_register;
1359 call->unregfunc = probe_event_disable;
1360
1361#ifdef CONFIG_PERF_EVENTS
1362 call->perf_event_enable = probe_perf_enable;
1363 call->perf_event_disable = probe_perf_disable;
1364#endif
1365 call->data = tp; 1509 call->data = tp;
1366 ret = trace_add_event_call(call); 1510 ret = trace_add_event_call(call);
1367 if (ret) { 1511 if (ret) {
1368 pr_info("Failed to register kprobe event: %s\n", call->name); 1512 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt); 1513 kfree(call->print_fmt);
1370 unregister_ftrace_event(&tp->event); 1514 unregister_ftrace_event(&call->event);
1371 } 1515 }
1372 return ret; 1516 return ret;
1373} 1517}
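The trace_kprobe.c changes above have two coupled parts: probe arguments now live in a typed payload that starts right after a small *_entry_head struct (each argument carries its own size, signedness, fetch and print helpers, plus a fixed offset into that payload), and the per-call enable/disable and perf hooks are folded into the single class-level kprobe_register() dispatcher. A condensed sketch of the layout invariant, using only identifiers that appear in the hunks (the surrounding functions are omitted):

	/* define_fields side: tell the event core where each argument lives */
	ret = trace_define_field(event_call, tp->args[i].type->name,
				 tp->args[i].name,
				 sizeof(field) + tp->args[i].offset,	/* offset past the entry head */
				 tp->args[i].type->size,
				 tp->args[i].type->is_signed,
				 FILTER_OTHER);

	/* recording side: the fetch method must write to the same offset */
	data = (u8 *)&entry[1];			/* payload follows the entry head */
	call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);

Because both sides use tp->args[i].offset relative to the same head structure, filtering and format parsing stay in sync with what the probe handlers actually record; the perf handlers follow the same layout and now submit into the per-CPU hlist of attached perf events instead of passing irq_flags around.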
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index d59cd6879477..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -34,12 +34,6 @@
34 34
35#include <asm/atomic.h> 35#include <asm/atomic.h>
36 36
37/*
38 * For now, let us restrict the no. of symbols traced simultaneously to number
39 * of available hardware breakpoint registers.
40 */
41#define KSYM_TRACER_MAX HBP_NUM
42
43#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
44 38
45struct trace_ksym { 39struct trace_ksym {
@@ -53,7 +47,6 @@ struct trace_ksym {
53 47
54static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
55 49
56static unsigned int ksym_filter_entry_count;
57static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
58 51
59static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -181,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
181 struct trace_ksym *entry; 174 struct trace_ksym *entry;
182 int ret = -ENOMEM; 175 int ret = -ENOMEM;
183 176
184 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
185 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
186 " new requests for tracing can be accepted now.\n",
187 KSYM_TRACER_MAX);
188 return -ENOSPC;
189 }
190
191 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
192 if (!entry) 178 if (!entry)
193 return -ENOMEM; 179 return -ENOMEM;
@@ -203,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
203 189
204 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
205 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
206 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
207 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
208 goto err; 199 goto err;
209 } 200 }
210 201
211 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
212 ksym_filter_entry_count++;
213 203
214 return 0; 204 return 0;
215 205
@@ -265,7 +255,6 @@ static void __ksym_trace_reset(void)
265 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
266 ksym_hlist) { 256 ksym_hlist) {
267 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
268 ksym_filter_entry_count--;
269 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
270 synchronize_rcu(); 259 synchronize_rcu();
271 kfree(entry); 260 kfree(entry);
@@ -338,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
338 goto out_unlock; 327 goto out_unlock;
339 } 328 }
340 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
341 ksym_filter_entry_count--;
342 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
343 synchronize_rcu(); 331 synchronize_rcu();
344 kfree(entry); 332 kfree(entry);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 254 void *ret;
254 255
255 if (s->full) 256 if (s->full)
256 return 0; 257 return NULL;
257 258
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 259 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 260 s->full = 1;
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
@@ -726,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 742 if (WARN_ON(!event))
727 goto out; 743 goto out;
728 744
745 if (WARN_ON(!event->funcs))
746 goto out;
747
729 INIT_LIST_HEAD(&event->list); 748 INIT_LIST_HEAD(&event->list);
730 749
731 if (!event->type) { 750 if (!event->type) {
@@ -758,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 777 goto out;
759 } 778 }
760 779
761 if (event->trace == NULL) 780 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 781 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 782 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 783 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 784 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 785 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 786 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 787 event->funcs->binary = trace_nop_print;
769 788
770 key = event->type & (EVENT_HASHSIZE - 1); 789 key = event->type & (EVENT_HASHSIZE - 1);
771 790
@@ -807,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 826 * Standard events
808 */ 827 */
809 828
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 829enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
830 struct trace_event *event)
811{ 831{
812 return TRACE_TYPE_HANDLED; 832 return TRACE_TYPE_HANDLED;
813} 833}
814 834
815/* TRACE_FN */ 835/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 836static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
837 struct trace_event *event)
817{ 838{
818 struct ftrace_entry *field; 839 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 840 struct trace_seq *s = &iter->seq;
@@ -840,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 861 return TRACE_TYPE_PARTIAL_LINE;
841} 862}
842 863
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 864static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
865 struct trace_event *event)
844{ 866{
845 struct ftrace_entry *field; 867 struct ftrace_entry *field;
846 868
@@ -854,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 876 return TRACE_TYPE_HANDLED;
855} 877}
856 878
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 879static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
880 struct trace_event *event)
858{ 881{
859 struct ftrace_entry *field; 882 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 883 struct trace_seq *s = &iter->seq;
@@ -867,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 890 return TRACE_TYPE_HANDLED;
868} 891}
869 892
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 893static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
894 struct trace_event *event)
871{ 895{
872 struct ftrace_entry *field; 896 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 897 struct trace_seq *s = &iter->seq;
@@ -880,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 904 return TRACE_TYPE_HANDLED;
881} 905}
882 906
883static struct trace_event trace_fn_event = { 907static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 908 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 909 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 910 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 911 .binary = trace_fn_bin,
889}; 912};
890 913
914static struct trace_event trace_fn_event = {
915 .type = TRACE_FN,
916 .funcs = &trace_fn_funcs,
917};
918
891/* TRACE_CTX and TRACE_WAKE */ 919/* TRACE_CTX and TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 920static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 921 char *delim)
@@ -916,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 944 return TRACE_TYPE_HANDLED;
917} 945}
918 946
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 947static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
948 struct trace_event *event)
920{ 949{
921 return trace_ctxwake_print(iter, "==>"); 950 return trace_ctxwake_print(iter, "==>");
922} 951}
923 952
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 953static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 954 int flags, struct trace_event *event)
926{ 955{
927 return trace_ctxwake_print(iter, " +"); 956 return trace_ctxwake_print(iter, " +");
928} 957}
@@ -950,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 979 return TRACE_TYPE_HANDLED;
951} 980}
952 981
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 982static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
983 struct trace_event *event)
954{ 984{
955 return trace_ctxwake_raw(iter, 0); 985 return trace_ctxwake_raw(iter, 0);
956} 986}
957 987
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 988static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
989 struct trace_event *event)
959{ 990{
960 return trace_ctxwake_raw(iter, '+'); 991 return trace_ctxwake_raw(iter, '+');
961} 992}
@@ -984,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 1015 return TRACE_TYPE_HANDLED;
985} 1016}
986 1017
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1018static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1019 struct trace_event *event)
988{ 1020{
989 return trace_ctxwake_hex(iter, 0); 1021 return trace_ctxwake_hex(iter, 0);
990} 1022}
991 1023
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1024static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1025 struct trace_event *event)
993{ 1026{
994 return trace_ctxwake_hex(iter, '+'); 1027 return trace_ctxwake_hex(iter, '+');
995} 1028}
996 1029
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1030static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1031 int flags, struct trace_event *event)
999{ 1032{
1000 struct ctx_switch_entry *field; 1033 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1034 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1045 return TRACE_TYPE_HANDLED;
1013} 1046}
1014 1047
1015static struct trace_event trace_ctx_event = { 1048static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1049 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1050 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1051 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1052 .binary = trace_ctxwake_bin,
1021}; 1053};
1022 1054
1023static struct trace_event trace_wake_event = { 1055static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1056 .type = TRACE_CTX,
1057 .funcs = &trace_ctx_funcs,
1058};
1059
1060static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1061 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1062 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1063 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1064 .binary = trace_ctxwake_bin,
1029}; 1065};
1030 1066
1067static struct trace_event trace_wake_event = {
1068 .type = TRACE_WAKE,
1069 .funcs = &trace_wake_funcs,
1070};
1071
1031/* TRACE_SPECIAL */ 1072/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1074 int flags, struct trace_event *event)
1034{ 1075{
1035 struct special_entry *field; 1076 struct special_entry *field;
1036 1077
@@ -1046,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1087}
1047 1088
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1090 int flags, struct trace_event *event)
1050{ 1091{
1051 struct special_entry *field; 1092 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1093 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1102}
1062 1103
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1105 int flags, struct trace_event *event)
1065{ 1106{
1066 struct special_entry *field; 1107 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1108 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1116 return TRACE_TYPE_HANDLED;
1076} 1117}
1077 1118
1078static struct trace_event trace_special_event = { 1119static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1120 .trace = trace_special_print,
1081 .raw = trace_special_print, 1121 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1122 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1123 .binary = trace_special_bin,
1084}; 1124};
1085 1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1086/* TRACE_STACK */ 1131/* TRACE_STACK */
1087 1132
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1133static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1134 int flags, struct trace_event *event)
1090{ 1135{
1091 struct stack_entry *field; 1136 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1137 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1159 return TRACE_TYPE_PARTIAL_LINE;
1115} 1160}
1116 1161
1117static struct trace_event trace_stack_event = { 1162static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1163 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1164 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1165 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1166 .binary = trace_special_bin,
1123}; 1167};
1124 1168
1169static struct trace_event trace_stack_event = {
1170 .type = TRACE_STACK,
1171 .funcs = &trace_stack_funcs,
1172};
1173
1125/* TRACE_USER_STACK */ 1174/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1175static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1176 int flags, struct trace_event *event)
1128{ 1177{
1129 struct userstack_entry *field; 1178 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1179 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1192 return TRACE_TYPE_PARTIAL_LINE;
1144} 1193}
1145 1194
1146static struct trace_event trace_user_stack_event = { 1195static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1196 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1197 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1198 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1199 .binary = trace_special_bin,
1152}; 1200};
1153 1201
1202static struct trace_event trace_user_stack_event = {
1203 .type = TRACE_USER_STACK,
1204 .funcs = &trace_user_stack_funcs,
1205};
1206
1154/* TRACE_BPRINT */ 1207/* TRACE_BPRINT */
1155static enum print_line_t 1208static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1209trace_bprint_print(struct trace_iterator *iter, int flags,
1210 struct trace_event *event)
1157{ 1211{
1158 struct trace_entry *entry = iter->ent; 1212 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1213 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1232
1179 1233
1180static enum print_line_t 1234static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1235trace_bprint_raw(struct trace_iterator *iter, int flags,
1236 struct trace_event *event)
1182{ 1237{
1183 struct bprint_entry *field; 1238 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1239 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1252 return TRACE_TYPE_PARTIAL_LINE;
1198} 1253}
1199 1254
1255static struct trace_event_functions trace_bprint_funcs = {
1256 .trace = trace_bprint_print,
1257 .raw = trace_bprint_raw,
1258};
1200 1259
1201static struct trace_event trace_bprint_event = { 1260static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1261 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1262 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1263};
1206 1264
1207/* TRACE_PRINT */ 1265/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1266static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1267 int flags, struct trace_event *event)
1210{ 1268{
1211 struct print_entry *field; 1269 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1270 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1226} 1284}
1227 1285
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1286static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1287 struct trace_event *event)
1229{ 1288{
1230 struct print_entry *field; 1289 struct print_entry *field;
1231 1290
@@ -1240,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1299 return TRACE_TYPE_PARTIAL_LINE;
1241} 1300}
1242 1301
1243static struct trace_event trace_print_event = { 1302static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1303 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1304 .raw = trace_print_raw,
1247}; 1305};
1248 1306
1307static struct trace_event trace_print_event = {
1308 .type = TRACE_PRINT,
1309 .funcs = &trace_print_funcs,
1310};
1311
1249 1312
1250static struct trace_event *events[] __initdata = { 1313static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1314 &trace_fn_event,
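In trace_output.c the per-event print callbacks move out of struct trace_event into a shared struct trace_event_functions referenced through .funcs, and every callback now receives the struct trace_event that matched, so related events (ctx/wake, stack/user-stack) can share one function table. A minimal sketch of an event defined in the new style; TRACE_EXAMPLE and example_print are illustrative, the structure layout is the one used in the hunks above:

	static enum print_line_t example_print(struct trace_iterator *iter, int flags,
					       struct trace_event *event)
	{
		if (!trace_seq_puts(&iter->seq, "example\n"))
			return TRACE_TYPE_PARTIAL_LINE;
		return TRACE_TYPE_HANDLED;
	}

	static struct trace_event_functions example_funcs = {
		.trace	= example_print,
		/* .raw, .hex and .binary are filled in with trace_nop_print
		 * at registration time if left NULL (see register_ftrace_event
		 * above) */
	};

	static struct trace_event example_event = {
		.type	= TRACE_EXAMPLE,	/* illustrative entry type */
		.funcs	= &example_funcs,	/* register_ftrace_event() now refuses a NULL ->funcs */
	};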
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
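The sched tracer conversions above follow the updated tracepoint convention: probes are registered and unregistered with an extra private-data pointer, and that pointer is delivered back as the probe's first argument. These tracers keep no per-registration state, so they pass NULL and name the parameter "ignore". The shape of the pattern, with the tracing body elided:

	static void
	probe_sched_switch(void *ignore, struct task_struct *prev,
			   struct task_struct *next)
	{
		/* ... unchanged tracing body ... */
	}

	/* registration and unregistration pass the same data pointer (NULL here) */
	ret = register_trace_sched_switch(probe_sched_switch, NULL);
	...
	unregister_trace_sched_switch(probe_sched_switch, NULL);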
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 81003b4d617f..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -17,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
17 case TRACE_BRANCH: 17 case TRACE_BRANCH:
18 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
19 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
20 case TRACE_HW_BRANCHES:
21 case TRACE_KSYM: 20 case TRACE_KSYM:
22 return 1; 21 return 1;
23 } 22 }
@@ -30,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
30 struct trace_entry *entry; 29 struct trace_entry *entry;
31 unsigned int loops = 0; 30 unsigned int loops = 0;
32 31
33 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
34 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
35 34
36 /* 35 /*
@@ -256,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
256/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
257#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
258 257
259static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
260static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
261 261
262/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -267,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
267 ftrace_graph_stop(); 267 ftrace_graph_stop();
268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
269 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
270 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
271 return 0; 271 return 0;
272 } 272 }
273 273
@@ -755,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
755} 755}
756#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
757 757
758#ifdef CONFIG_HW_BRANCH_TRACER
759int
760trace_selftest_startup_hw_branches(struct tracer *trace,
761 struct trace_array *tr)
762{
763 struct trace_iterator *iter;
764 struct tracer tracer;
765 unsigned long count;
766 int ret;
767
768 if (!trace->open) {
769 printk(KERN_CONT "missing open function...");
770 return -1;
771 }
772
773 ret = tracer_init(trace, tr);
774 if (ret) {
775 warn_failed_init_tracer(trace, ret);
776 return ret;
777 }
778
779 /*
780 * The hw-branch tracer needs to collect the trace from the various
781 * cpu trace buffers - before tracing is stopped.
782 */
783 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
784 if (!iter)
785 return -ENOMEM;
786
787 memcpy(&tracer, trace, sizeof(tracer));
788
789 iter->trace = &tracer;
790 iter->tr = tr;
791 iter->pos = -1;
792 mutex_init(&iter->mutex);
793
794 trace->open(iter);
795
796 mutex_destroy(&iter->mutex);
797 kfree(iter);
798
799 tracing_stop();
800
801 ret = trace_test_buffer(tr, &count);
802 trace->reset(tr);
803 tracing_start();
804
805 if (!ret && !count) {
806 printk(KERN_CONT "no entries found..");
807 ret = -1;
808 }
809
810 return ret;
811}
812#endif /* CONFIG_HW_BRANCH_TRACER */
813
814#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
815static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
816 760
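The selftest updates track two interface changes visible here: __ftrace_dump() gained an enum ftrace_dump_mode argument (DUMP_ALL in the hang path), and ring_buffer_consume() gained a fourth parameter for reporting lost events; both output pointers may be NULL when the caller does not need them, which is what trace_test_buffer_cpu() does. A small sketch of the extended consume loop, with the optional outputs requested:

	u64 ts;
	unsigned long lost_events;
	struct ring_buffer_event *event;

	while ((event = ring_buffer_consume(tr->buffer, cpu, &ts, &lost_events))) {
		struct trace_entry *entry = ring_buffer_event_data(event);

		/* ts is the event timestamp; lost_events reports entries the
		 * ring buffer dropped before this read, if any */
		if (!trace_valid_entry(entry))
			break;
	}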
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 4d6d711717f2..d2c859cec9ea 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -15,6 +15,54 @@ static int sys_refcount_exit;
15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
17 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
18extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
19extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
20 68
@@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
53} 101}
54 102
55enum print_line_t 103enum print_line_t
56print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
57{ 106{
58 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
59 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
68 if (!entry) 117 if (!entry)
69 goto end; 118 goto end;
70 119
71 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
72 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
73 goto end; 122 goto end;
74 } 123 }
@@ -105,7 +154,8 @@ end:
105} 154}
106 155
107enum print_line_t 156enum print_line_t
108print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
109{ 159{
110 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
111 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
123 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
124 } 174 }
125 175
126 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
127 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
128 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
129 } 179 }
@@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
205 kfree(call->print_fmt); 255 kfree(call->print_fmt);
206} 256}
207 257
208int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
209{ 259{
210 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
211 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
@@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
228 return ret; 278 return ret;
229} 279}
230 280
231int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
232{ 282{
233 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
234 int ret; 284 int ret;
@@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
243 return ret; 293 return ret;
244} 294}
245 295
246void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
247{ 297{
248 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
249 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
265 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
266 316
267 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
268 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
269 if (!event) 319 if (!event)
270 return; 320 return;
271 321
@@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
278 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
279} 329}
280 330
281void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
282{ 332{
283 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
284 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
297 return; 347 return;
298 348
299 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
300 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
301 if (!event) 351 if (!event)
302 return; 352 return;
303 353
@@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
320 return -ENOSYS; 370 return -ENOSYS;
321 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
322 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
323 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
324 if (!ret) { 374 if (!ret) {
325 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
326 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
340 sys_refcount_enter--; 390 sys_refcount_enter--;
341 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
342 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
343 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
344 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
345} 395}
346 396
@@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
354 return -ENOSYS; 404 return -ENOSYS;
355 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
356 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
357 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
358 if (!ret) { 408 if (!ret) {
359 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
360 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
374 sys_refcount_exit--; 424 sys_refcount_exit--;
375 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
376 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
377 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
378 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
379} 429}
380 430
@@ -434,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
434static int sys_perf_refcount_enter; 484static int sys_perf_refcount_enter;
435static int sys_perf_refcount_exit; 485static int sys_perf_refcount_exit;
436 486
437static void perf_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
438{ 488{
439 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
440 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
441 unsigned long flags; 491 struct hlist_head *head;
442 int syscall_nr; 492 int syscall_nr;
443 int rctx; 493 int rctx;
444 int size; 494 int size;
@@ -461,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
461 return; 511 return;
462 512
463 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
464 sys_data->enter_event->id, &rctx, &flags); 514 sys_data->enter_event->event.type, regs, &rctx);
465 if (!rec) 515 if (!rec)
466 return; 516 return;
467 517
468 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
469 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
470 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
471 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 521
522 head = per_cpu_ptr(sys_data->enter_event->perf_events, smp_processor_id());
523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
472} 524}
473 525
474int perf_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -480,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
480 532
481 mutex_lock(&syscall_trace_lock); 533 mutex_lock(&syscall_trace_lock);
482 if (!sys_perf_refcount_enter) 534 if (!sys_perf_refcount_enter)
483 ret = register_trace_sys_enter(perf_syscall_enter); 535 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
484 if (ret) { 536 if (ret) {
485 pr_info("event trace: Could not activate" 537 pr_info("event trace: Could not activate"
486 "syscall entry trace point"); 538 "syscall entry trace point");
@@ -502,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
502 sys_perf_refcount_enter--; 554 sys_perf_refcount_enter--;
503 clear_bit(num, enabled_perf_enter_syscalls); 555 clear_bit(num, enabled_perf_enter_syscalls);
504 if (!sys_perf_refcount_enter) 556 if (!sys_perf_refcount_enter)
505 unregister_trace_sys_enter(perf_syscall_enter); 557 unregister_trace_sys_enter(perf_syscall_enter, NULL);
506 mutex_unlock(&syscall_trace_lock); 558 mutex_unlock(&syscall_trace_lock);
507} 559}
508 560
509static void perf_syscall_exit(struct pt_regs *regs, long ret) 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
510{ 562{
511 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
512 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
513 unsigned long flags; 565 struct hlist_head *head;
514 int syscall_nr; 566 int syscall_nr;
515 int rctx; 567 int rctx;
516 int size; 568 int size;
@@ -536,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
536 return; 588 return;
537 589
538 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
539 sys_data->exit_event->id, &rctx, &flags); 591 sys_data->exit_event->event.type, regs, &rctx);
540 if (!rec) 592 if (!rec)
541 return; 593 return;
542 594
543 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
544 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
545 597
546 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 598 head = per_cpu_ptr(sys_data->exit_event->perf_events, smp_processor_id());
599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
547} 600}
548 601
549int perf_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)
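
The perf-side probes follow the same convention and also pick up new buffer-helper signatures: perf_trace_buf_prepare() now takes the event type and regs (and no longer hands back irq flags), and perf_trace_buf_submit() is given the per-CPU hlist of perf events attached to the ftrace event instead of the flags word. Stripped of the syscall specifics, the pattern both probes above use is roughly the following sketch; 'event_call' stands for the struct ftrace_event_call being traced, and the 0 and 1 arguments are carried over unchanged from the old call:

	void *rec;
	struct hlist_head *head;
	int rctx;

	rec = perf_trace_buf_prepare(size, event_call->event.type, regs, &rctx);
	if (!rec)
		return;

	/* ... fill in the record ... */

	head = per_cpu_ptr(event_call->perf_events, smp_processor_id());
	perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
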
@@ -555,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
555 608
556 mutex_lock(&syscall_trace_lock); 609 mutex_lock(&syscall_trace_lock);
557 if (!sys_perf_refcount_exit) 610 if (!sys_perf_refcount_exit)
558 ret = register_trace_sys_exit(perf_syscall_exit); 611 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
559 if (ret) { 612 if (ret) {
560 pr_info("event trace: Could not activate" 613 pr_info("event trace: Could not activate"
561 "syscall exit trace point"); 614 "syscall exit trace point");
@@ -577,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
577 sys_perf_refcount_exit--; 630 sys_perf_refcount_exit--;
578 clear_bit(num, enabled_perf_exit_syscalls); 631 clear_bit(num, enabled_perf_exit_syscalls);
579 if (!sys_perf_refcount_exit) 632 if (!sys_perf_refcount_exit)
580 unregister_trace_sys_exit(perf_syscall_exit); 633 unregister_trace_sys_exit(perf_syscall_exit, NULL);
581 mutex_unlock(&syscall_trace_lock); 634 mutex_unlock(&syscall_trace_lock);
582} 635}
583 636
584#endif /* CONFIG_PERF_EVENTS */ 637#endif /* CONFIG_PERF_EVENTS */
585 638
639static int syscall_enter_register(struct ftrace_event_call *event,
640 enum trace_reg type)
641{
642 switch (type) {
643 case TRACE_REG_REGISTER:
644 return reg_event_syscall_enter(event);
645 case TRACE_REG_UNREGISTER:
646 unreg_event_syscall_enter(event);
647 return 0;
648
649#ifdef CONFIG_PERF_EVENTS
650 case TRACE_REG_PERF_REGISTER:
651 return perf_sysenter_enable(event);
652 case TRACE_REG_PERF_UNREGISTER:
653 perf_sysenter_disable(event);
654 return 0;
655#endif
656 }
657 return 0;
658}
659
660static int syscall_exit_register(struct ftrace_event_call *event,
661 enum trace_reg type)
662{
663 switch (type) {
664 case TRACE_REG_REGISTER:
665 return reg_event_syscall_exit(event);
666 case TRACE_REG_UNREGISTER:
667 unreg_event_syscall_exit(event);
668 return 0;
669
670#ifdef CONFIG_PERF_EVENTS
671 case TRACE_REG_PERF_REGISTER:
672 return perf_sysexit_enable(event);
673 case TRACE_REG_PERF_UNREGISTER:
674 perf_sysexit_disable(event);
675 return 0;
676#endif
677 }
678 return 0;
679}
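
The two dispatchers above collapse the separate ftrace and perf enable/disable entry points into one callback per direction, selected by enum trace_reg. The event core introduced elsewhere in this series is then expected to drive them through a single hook; the exact structure field the callbacks are stored in is not visible in these hunks, so the call sites below are only a sketch of the idea:

	/* enable ftrace tracing of one syscall event, later tear it down */
	ret = call->class->reg(call, TRACE_REG_REGISTER);
	if (ret)
		return ret;
	/* ... */
	call->class->reg(call, TRACE_REG_UNREGISTER);

	#ifdef CONFIG_PERF_EVENTS
	/* perf goes through the same hook with its own request pair */
	ret = call->class->reg(call, TRACE_REG_PERF_REGISTER);
	/* ... */
	call->class->reg(call, TRACE_REG_PERF_UNREGISTER);
	#endif
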
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cc2d2faa7d9e..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
49 49
50/* Insertion of a work */ 50/* Insertion of a work */
51static void 51static void
52probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
53 struct work_struct *work) 54 struct work_struct *work)
54{ 55{
55 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -70,7 +71,8 @@ found:
70 71
71/* Execution of a work */ 72/* Execution of a work */
72static void 73static void
73probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
74 struct work_struct *work) 76 struct work_struct *work)
75{ 77{
76 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -90,7 +92,8 @@ found:
90} 92}
91 93
92/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
93static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
94{ 97{
95 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
96 unsigned long flags; 99 unsigned long flags;
@@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
114} 117}
115 118
116/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
117static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
118{ 122{
119 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
120 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void)
259{ 263{
260 int ret, cpu; 264 int ret, cpu;
261 265
262 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
263 if (ret) 267 if (ret)
264 goto out; 268 goto out;
265 269
266 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
267 if (ret) 271 if (ret)
268 goto no_insertion; 272 goto no_insertion;
269 273
270 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
271 if (ret) 275 if (ret)
272 goto no_execution; 276 goto no_execution;
273 277
274 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
275 if (ret) 279 if (ret)
276 goto no_creation; 280 goto no_creation;
277 281
@@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void)
283 return 0; 287 return 0;
284 288
285no_creation: 289no_creation:
286 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
287no_execution: 291no_execution:
288 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
289no_insertion: 293no_insertion:
290 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
291out: 295out:
292 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
293 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
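
The bookkeeping above relies on each array slot now being a (func, data) pair instead of a bare function pointer, with the terminating slot recognised by a NULL func rather than a NULL element. The element type, as used throughout this file, is essentially:

	struct tracepoint_func {
		void *func;	/* probe, cast to the tracepoint's prototype  */
		void *data;	/* handed back as the probe's first argument  */
	};

	/* walking a probe array therefore tests .func, not the slot itself */
	for (i = 0; funcs[i].func; i++)
		/* use funcs[i].func and funcs[i].data */;
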
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
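
With the extra parameter, the exported API identifies a registration by its (probe, data) pair: the same callback can be attached several times with different private data (a duplicate pair is rejected with -EEXIST by the code above), and unregistering removes only the matching pair. A usage fragment; the tracepoint name, probe and tag values are purely illustrative, and return codes are ignored for brevity:

	/* probes now take the private pointer first, then the tracepoint's args */
	static void my_probe(void *data, int value)
	{
		pr_debug("my_tracepoint: tag=%ld value=%d\n", (long)data, value);
	}

	/* same callback, two registrations told apart by their data pointer */
	tracepoint_probe_register("my_tracepoint", (void *)my_probe, (void *)1L);
	tracepoint_probe_register("my_tracepoint", (void *)my_probe, (void *)2L);

	/* removes only the (my_probe, 1) entry; (my_probe, 2) keeps firing */
	tracepoint_probe_unregister("my_tracepoint", (void *)my_probe, (void *)1L);
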
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
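
The _noupdate variants grow the same data parameter. They only edit the probe arrays and leave the tracepoint sites untouched until tracepoint_probe_update_all() runs, which is what batch users rely on. A short sketch with hypothetical probe names:

	/* queue several changes, then pay the update cost once */
	tracepoint_probe_register_noupdate("ev_a", (void *)probe_a, NULL);
	tracepoint_probe_register_noupdate("ev_b", (void *)probe_b, NULL);
	tracepoint_probe_update_all();
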
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..b2d70d38dff4 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -54,8 +54,8 @@ int create_user_ns(struct cred *new)
54#endif 54#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 56
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */ 57 /* root_user holds a reference to ns, our reference can be dropped */
58 kref_set(&ns->kref, 1); 58 put_user_ns(ns);
59 59
60 return 0; 60 return 0;
61} 61}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5bfb213984b2..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
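
clear_wq_data() is stronger than the work_clear_pending() it replaces in __cancel_work_timer() below: instead of clearing only the pending bit it resets the whole data word, keeping nothing but the debugobjects WORK_STRUCT_STATIC marker, so the cancelled work also forgets the cpu_workqueue_struct it was last queued on. Conceptually (helper names as used in this file; the exact bit layout is config-dependent):

	work_clear_pending(work);	/* old: pending bit cleared, but          */
					/* get_wq_data(work) still returns the    */
					/* last cpu_workqueue_struct              */

	clear_wq_data(work);		/* new: get_wq_data(work) is NULL again;  */
					/* only the WORK_STRUCT_STATIC bit        */
					/* survives, as if never queued           */
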
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work); 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);
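
The new kerneldoc above spells out why flushing the whole kernel-global workqueue from driver code is usually the wrong tool. The targeted alternative it recommends looks like this (my_work and my_dwork are placeholders for a driver's own items):

	static struct work_struct my_work;
	static struct delayed_work my_dwork;

	/* teardown: wait for *our* items only, not everything on keventd_wq */
	cancel_work_sync(&my_work);		/* not queued and not running afterwards */
	cancel_delayed_work_sync(&my_dwork);	/* also stops a still-pending timer      */
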
@@ -1076,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1076 unsigned int cpu = (unsigned long)hcpu; 1110 unsigned int cpu = (unsigned long)hcpu;
1077 struct cpu_workqueue_struct *cwq; 1111 struct cpu_workqueue_struct *cwq;
1078 struct workqueue_struct *wq; 1112 struct workqueue_struct *wq;
1079 int ret = NOTIFY_OK; 1113 int err = 0;
1080 1114
1081 action &= ~CPU_TASKS_FROZEN; 1115 action &= ~CPU_TASKS_FROZEN;
1082 1116
@@ -1090,12 +1124,13 @@ undo:
1090 1124
1091 switch (action) { 1125 switch (action) {
1092 case CPU_UP_PREPARE: 1126 case CPU_UP_PREPARE:
1093 if (!create_workqueue_thread(cwq, cpu)) 1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1094 break; 1129 break;
1095 printk(KERN_ERR "workqueue [%s] for %i failed\n", 1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1096 wq->name, cpu); 1131 wq->name, cpu);
1097 action = CPU_UP_CANCELED; 1132 action = CPU_UP_CANCELED;
1098 ret = NOTIFY_BAD; 1133 err = -ENOMEM;
1099 goto undo; 1134 goto undo;
1100 1135
1101 case CPU_ONLINE: 1136 case CPU_ONLINE:
@@ -1116,7 +1151,7 @@ undo:
1116 cpumask_clear_cpu(cpu, cpu_populated_map); 1151 cpumask_clear_cpu(cpu, cpu_populated_map);
1117 } 1152 }
1118 1153
1119 return ret; 1154 return notifier_from_errno(err);
1120} 1155}
1121 1156
1122#ifdef CONFIG_SMP 1157#ifdef CONFIG_SMP
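
Returning notifier_from_errno(err) instead of a hand-rolled NOTIFY_BAD lets the CPU-hotplug core recover the original error code (via notifier_to_errno()) and report why the CPU failed to come up. For reference, the helper in include/linux/notifier.h encodes the errno roughly as follows, so err == 0 still yields NOTIFY_OK (paraphrased from memory; the exact encoding may differ):

	static inline int notifier_from_errno(int err)
	{
		if (err)
			return NOTIFY_STOP_MASK | (NOTIFY_BAD - err);
		return NOTIFY_OK;
	}
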