Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 4
-rw-r--r--  kernel/acct.c | 37
-rw-r--r--  kernel/async.c | 1
-rw-r--r--  kernel/audit.c | 3
-rw-r--r--  kernel/audit_tree.c | 1
-rw-r--r--  kernel/audit_watch.c | 1
-rw-r--r--  kernel/auditfilter.c | 1
-rw-r--r--  kernel/auditsc.c | 3
-rw-r--r--  kernel/capability.c | 1
-rw-r--r--  kernel/cgroup.c | 91
-rw-r--r--  kernel/cgroup_freezer.c | 36
-rw-r--r--  kernel/compat.c | 26
-rw-r--r--  kernel/cpu.c | 160
-rw-r--r--  kernel/cpuset.c | 253
-rw-r--r--  kernel/cred-internals.h | 21
-rw-r--r--  kernel/cred.c | 87
-rw-r--r--  kernel/debug/Makefile | 6
-rw-r--r--  kernel/debug/debug_core.c | 983
-rw-r--r--  kernel/debug/debug_core.h | 81
-rw-r--r--  kernel/debug/gdbstub.c | 1014
-rw-r--r--  kernel/debug/kdb/.gitignore | 1
-rw-r--r--  kernel/debug/kdb/Makefile | 25
-rw-r--r--  kernel/debug/kdb/kdb_bp.c | 564
-rw-r--r--  kernel/debug/kdb/kdb_bt.c | 210
-rw-r--r--  kernel/debug/kdb/kdb_cmds | 35
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 169
-rw-r--r--  kernel/debug/kdb/kdb_io.c | 826
-rw-r--r--  kernel/debug/kdb/kdb_keyboard.c | 212
-rw-r--r--  kernel/debug/kdb/kdb_main.c | 2846
-rw-r--r--  kernel/debug/kdb/kdb_private.h | 300
-rw-r--r--  kernel/debug/kdb/kdb_support.c | 927
-rw-r--r--  kernel/early_res.c | 12
-rw-r--r--  kernel/exec_domain.c | 18
-rw-r--r--  kernel/exit.c | 46
-rw-r--r--  kernel/fork.c | 58
-rw-r--r--  kernel/futex.c | 17
-rw-r--r--  kernel/groups.c | 6
-rw-r--r--  kernel/hrtimer.c | 69
-rw-r--r--  kernel/hw_breakpoint.c | 196
-rw-r--r--  kernel/irq/chip.c | 35
-rw-r--r--  kernel/irq/handle.c | 3
-rw-r--r--  kernel/irq/manage.c | 104
-rw-r--r--  kernel/irq/numa_migrate.c | 1
-rw-r--r--  kernel/irq/proc.c | 61
-rw-r--r--  kernel/kallsyms.c | 22
-rw-r--r--  kernel/kexec.c | 11
-rw-r--r--  kernel/kgdb.c | 1763
-rw-r--r--  kernel/kmod.c | 193
-rw-r--r--  kernel/kprobes.c | 132
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 1
-rw-r--r--  kernel/lockdep.c | 115
-rw-r--r--  kernel/lockdep_internals.h | 72
-rw-r--r--  kernel/lockdep_proc.c | 58
-rw-r--r--  kernel/module.c | 517
-rw-r--r--  kernel/mutex.c | 7
-rw-r--r--  kernel/nsproxy.c | 1
-rw-r--r--  kernel/padata.c | 883
-rw-r--r--  kernel/panic.c | 27
-rw-r--r--  kernel/perf_event.c | 1081
-rw-r--r--  kernel/pid.c | 7
-rw-r--r--  kernel/pid_namespace.c | 1
-rw-r--r--  kernel/pm_qos_params.c | 335
-rw-r--r--  kernel/posix-cpu-timers.c | 310
-rw-r--r--  kernel/posix-timers.c | 11
-rw-r--r--  kernel/power/Kconfig | 9
-rw-r--r--  kernel/power/Makefile | 5
-rw-r--r--  kernel/power/block_io.c | 103
-rw-r--r--  kernel/power/hibernate.c | 27
-rw-r--r--  kernel/power/main.c | 55
-rw-r--r--  kernel/power/nvs.c (renamed from kernel/power/hibernate_nvs.c) | 25
-rw-r--r--  kernel/power/power.h | 27
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/power/snapshot.c | 148
-rw-r--r--  kernel/power/suspend.c | 20
-rw-r--r--  kernel/power/swap.c | 338
-rw-r--r--  kernel/power/user.c | 39
-rw-r--r--  kernel/printk.c | 25
-rw-r--r--  kernel/profile.c | 12
-rw-r--r--  kernel/ptrace.c | 38
-rw-r--r--  kernel/rcupdate.c | 50
-rw-r--r--  kernel/rcutiny.c | 35
-rw-r--r--  kernel/rcutiny_plugin.h | 39
-rw-r--r--  kernel/rcutorture.c | 4
-rw-r--r--  kernel/rcutree.c | 131
-rw-r--r--  kernel/rcutree.h | 2
-rw-r--r--  kernel/rcutree_plugin.h | 69
-rw-r--r--  kernel/rcutree_trace.c | 4
-rw-r--r--  kernel/relay.c | 17
-rw-r--r--  kernel/res_counter.c | 1
-rw-r--r--  kernel/resource.c | 60
-rw-r--r--  kernel/sched.c | 976
-rw-r--r--  kernel/sched_clock.c | 1
-rw-r--r--  kernel/sched_cpupri.c | 1
-rw-r--r--  kernel/sched_debug.c | 124
-rw-r--r--  kernel/sched_fair.c | 374
-rw-r--r--  kernel/sched_features.h | 55
-rw-r--r--  kernel/sched_idletask.c | 8
-rw-r--r--  kernel/sched_rt.c | 15
-rw-r--r--  kernel/signal.c | 72
-rw-r--r--  kernel/slow-work.c | 2
-rw-r--r--  kernel/slow-work.h | 8
-rw-r--r--  kernel/smp.c | 3
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/softlockup.c | 4
-rw-r--r--  kernel/srcu.c | 1
-rw-r--r--  kernel/stop_machine.c | 537
-rw-r--r--  kernel/sys.c | 40
-rw-r--r--  kernel/sysctl.c | 613
-rw-r--r--  kernel/sysctl_binary.c | 11
-rw-r--r--  kernel/taskstats.c | 1
-rw-r--r--  kernel/time.c | 12
-rw-r--r--  kernel/time/clocksource.c | 48
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 2
-rw-r--r--  kernel/time/tick-oneshot.c | 52
-rw-r--r--  kernel/time/tick-sched.c | 87
-rw-r--r--  kernel/time/timecompare.c | 1
-rw-r--r--  kernel/time/timekeeping.c | 38
-rw-r--r--  kernel/time/timer_list.c | 4
-rw-r--r--  kernel/timer.c | 166
-rw-r--r--  kernel/trace/Kconfig | 11
-rw-r--r--  kernel/trace/Makefile | 1
-rw-r--r--  kernel/trace/blktrace.c | 141
-rw-r--r--  kernel/trace/ftrace.c | 37
-rw-r--r--  kernel/trace/kmemtrace.c | 70
-rw-r--r--  kernel/trace/power-traces.c | 1
-rw-r--r--  kernel/trace/ring_buffer.c | 217
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 5
-rw-r--r--  kernel/trace/trace.c | 204
-rw-r--r--  kernel/trace/trace.h | 56
-rw-r--r--  kernel/trace/trace_branch.c | 8
-rw-r--r--  kernel/trace/trace_clock.c | 4
-rw-r--r--  kernel/trace/trace_entries.h | 12
-rw-r--r--  kernel/trace/trace_event_perf.c | 203
-rw-r--r--  kernel/trace/trace_events.c | 140
-rw-r--r--  kernel/trace/trace_events_filter.c | 31
-rw-r--r--  kernel/trace/trace_export.c | 16
-rw-r--r--  kernel/trace/trace_functions_graph.c | 177
-rw-r--r--  kernel/trace/trace_hw_branches.c | 312
-rw-r--r--  kernel/trace/trace_irqsoff.c | 271
-rw-r--r--  kernel/trace/trace_kprobe.c | 648
-rw-r--r--  kernel/trace/trace_ksym.c | 27
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 1
-rw-r--r--  kernel/trace/trace_output.c | 155
-rw-r--r--  kernel/trace/trace_output.h | 2
-rw-r--r--  kernel/trace/trace_sched_switch.c | 21
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 29
-rw-r--r--  kernel/trace/trace_selftest.c | 65
-rw-r--r--  kernel/trace/trace_stat.c | 1
-rw-r--r--  kernel/trace/trace_syscalls.c | 147
-rw-r--r--  kernel/trace/trace_workqueue.c | 27
-rw-r--r--  kernel/tracepoint.c | 91
-rw-r--r--  kernel/user.c | 11
-rw-r--r--  kernel/user_namespace.c | 48
-rw-r--r--  kernel/workqueue.c | 47
157 files changed, 16005 insertions, 6887 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index a987aa1676b5..057472fbc272 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -68,14 +68,14 @@ obj-$(CONFIG_USER_NS) += user_namespace.o
68obj-$(CONFIG_PID_NS) += pid_namespace.o 68obj-$(CONFIG_PID_NS) += pid_namespace.o
69obj-$(CONFIG_IKCONFIG) += configs.o 69obj-$(CONFIG_IKCONFIG) += configs.o
70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o 70obj-$(CONFIG_RESOURCE_COUNTERS) += res_counter.o
71obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 71obj-$(CONFIG_SMP) += stop_machine.o
72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o 72obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o 73obj-$(CONFIG_AUDIT) += audit.o auditfilter.o audit_watch.o
74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 74obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
75obj-$(CONFIG_GCOV_KERNEL) += gcov/ 75obj-$(CONFIG_GCOV_KERNEL) += gcov/
76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 76obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
77obj-$(CONFIG_KPROBES) += kprobes.o 77obj-$(CONFIG_KPROBES) += kprobes.o
78obj-$(CONFIG_KGDB) += kgdb.o 78obj-$(CONFIG_KGDB) += debug/
79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o 79obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o 80obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 81obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
diff --git a/kernel/acct.c b/kernel/acct.c
index 24f8c81fc48d..385b88461c29 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -216,7 +216,6 @@ static int acct_on(char *name)
216{ 216{
217 struct file *file; 217 struct file *file;
218 struct vfsmount *mnt; 218 struct vfsmount *mnt;
219 int error;
220 struct pid_namespace *ns; 219 struct pid_namespace *ns;
221 struct bsd_acct_struct *acct = NULL; 220 struct bsd_acct_struct *acct = NULL;
222 221
@@ -244,13 +243,6 @@ static int acct_on(char *name)
244 } 243 }
245 } 244 }
246 245
247 error = security_acct(file);
248 if (error) {
249 kfree(acct);
250 filp_close(file, NULL);
251 return error;
252 }
253
254 spin_lock(&acct_lock); 246 spin_lock(&acct_lock);
255 if (ns->bacct == NULL) { 247 if (ns->bacct == NULL) {
256 ns->bacct = acct; 248 ns->bacct = acct;
@@ -281,7 +273,7 @@ static int acct_on(char *name)
281 */ 273 */
282SYSCALL_DEFINE1(acct, const char __user *, name) 274SYSCALL_DEFINE1(acct, const char __user *, name)
283{ 275{
284 int error; 276 int error = 0;
285 277
286 if (!capable(CAP_SYS_PACCT)) 278 if (!capable(CAP_SYS_PACCT))
287 return -EPERM; 279 return -EPERM;
@@ -299,13 +291,11 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
299 if (acct == NULL) 291 if (acct == NULL)
300 return 0; 292 return 0;
301 293
302 error = security_acct(NULL); 294 spin_lock(&acct_lock);
303 if (!error) { 295 acct_file_reopen(acct, NULL, NULL);
304 spin_lock(&acct_lock); 296 spin_unlock(&acct_lock);
305 acct_file_reopen(acct, NULL, NULL);
306 spin_unlock(&acct_lock);
307 }
308 } 297 }
298
309 return error; 299 return error;
310} 300}
311 301
@@ -353,17 +343,18 @@ restart:
353 343
354void acct_exit_ns(struct pid_namespace *ns) 344void acct_exit_ns(struct pid_namespace *ns)
355{ 345{
356 struct bsd_acct_struct *acct; 346 struct bsd_acct_struct *acct = ns->bacct;
357 347
358 spin_lock(&acct_lock); 348 if (acct == NULL)
359 acct = ns->bacct; 349 return;
360 if (acct != NULL) {
361 if (acct->file != NULL)
362 acct_file_reopen(acct, NULL, NULL);
363 350
364 kfree(acct); 351 del_timer_sync(&acct->timer);
365 } 352 spin_lock(&acct_lock);
353 if (acct->file != NULL)
354 acct_file_reopen(acct, NULL, NULL);
366 spin_unlock(&acct_lock); 355 spin_unlock(&acct_lock);
356
357 kfree(acct);
367} 358}
368 359
369/* 360/*
diff --git a/kernel/async.c b/kernel/async.c
index 27235f5de198..15319d6c18fe 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -56,6 +56,7 @@ asynchronous and synchronous parts of the kernel.
56#include <linux/init.h> 56#include <linux/init.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/delay.h> 58#include <linux/delay.h>
59#include <linux/slab.h>
59#include <asm/atomic.h> 60#include <asm/atomic.h>
60 61
61static async_cookie_t next_cookie = 1; 62static async_cookie_t next_cookie = 1;
diff --git a/kernel/audit.c b/kernel/audit.c
index 78f7f86aa238..8296aa516c5a 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -46,6 +46,7 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/slab.h>
49#include <linux/err.h> 50#include <linux/err.h>
50#include <linux/kthread.h> 51#include <linux/kthread.h>
51 52
@@ -406,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb)
406 audit_hold_skb(skb); 407 audit_hold_skb(skb);
407 } else 408 } else
408 /* drop the extra reference if sent ok */ 409 /* drop the extra reference if sent ok */
409 kfree_skb(skb); 410 consume_skb(skb);
410} 411}
411 412
412static int kauditd_thread(void *dummy) 413static int kauditd_thread(void *dummy)
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 028e85663f27..46a57b57a335 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -3,6 +3,7 @@
3#include <linux/namei.h> 3#include <linux/namei.h>
4#include <linux/mount.h> 4#include <linux/mount.h>
5#include <linux/kthread.h> 5#include <linux/kthread.h>
6#include <linux/slab.h>
6 7
7struct audit_tree; 8struct audit_tree;
8struct audit_chunk; 9struct audit_chunk;
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index cc7e87936cbc..8df43696f4ba 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/inotify.h> 31#include <linux/inotify.h>
31#include <linux/security.h> 32#include <linux/security.h>
32#include "audit.h" 33#include "audit.h"
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index a70604047f3c..ce08041f578d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -27,6 +27,7 @@
27#include <linux/namei.h> 27#include <linux/namei.h>
28#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/slab.h>
30#include <linux/security.h> 31#include <linux/security.h>
31#include "audit.h" 32#include "audit.h"
32 33
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f3a461c0970a..3828ad5fb8f1 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -49,6 +49,7 @@
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/module.h>
52#include <linux/slab.h>
52#include <linux/mount.h> 53#include <linux/mount.h>
53#include <linux/socket.h> 54#include <linux/socket.h>
54#include <linux/mqueue.h> 55#include <linux/mqueue.h>
@@ -1893,7 +1894,7 @@ static int audit_inc_name_count(struct audit_context *context,
1893{ 1894{
1894 if (context->name_count >= AUDIT_NAMES) { 1895 if (context->name_count >= AUDIT_NAMES) {
1895 if (inode) 1896 if (inode)
1896 printk(KERN_DEBUG "name_count maxed, losing inode data: " 1897 printk(KERN_DEBUG "audit: name_count maxed, losing inode data: "
1897 "dev=%02x:%02x, inode=%lu\n", 1898 "dev=%02x:%02x, inode=%lu\n",
1898 MAJOR(inode->i_sb->s_dev), 1899 MAJOR(inode->i_sb->s_dev),
1899 MINOR(inode->i_sb->s_dev), 1900 MINOR(inode->i_sb->s_dev),
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e4697e9b276..2f05303715a5 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -15,7 +15,6 @@
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
17#include <asm/uaccess.h> 17#include <asm/uaccess.h>
18#include "cred-internals.h"
19 18
20/* 19/*
21 * Leveraged for setting/resetting capabilities 20 * Leveraged for setting/resetting capabilities
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ef909a329750..a8ce09954404 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -27,7 +27,6 @@
27 */ 27 */
28 28
29#include <linux/cgroup.h> 29#include <linux/cgroup.h>
30#include <linux/module.h>
31#include <linux/ctype.h> 30#include <linux/ctype.h>
32#include <linux/errno.h> 31#include <linux/errno.h>
33#include <linux/fs.h> 32#include <linux/fs.h>
@@ -1647,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
1647int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1646int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1648{ 1647{
1649 char *start; 1648 char *start;
1650 struct dentry *dentry = rcu_dereference(cgrp->dentry); 1649 struct dentry *dentry = rcu_dereference_check(cgrp->dentry,
1650 rcu_read_lock_held() ||
1651 cgroup_lock_is_held());
1651 1652
1652 if (!dentry || cgrp == dummytop) { 1653 if (!dentry || cgrp == dummytop) {
1653 /* 1654 /*
@@ -1663,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
1663 *--start = '\0'; 1664 *--start = '\0';
1664 for (;;) { 1665 for (;;) {
1665 int len = dentry->d_name.len; 1666 int len = dentry->d_name.len;
1667
1666 if ((start -= len) < buf) 1668 if ((start -= len) < buf)
1667 return -ENAMETOOLONG; 1669 return -ENAMETOOLONG;
1668 memcpy(start, cgrp->dentry->d_name.name, len); 1670 memcpy(start, dentry->d_name.name, len);
1669 cgrp = cgrp->parent; 1671 cgrp = cgrp->parent;
1670 if (!cgrp) 1672 if (!cgrp)
1671 break; 1673 break;
1672 dentry = rcu_dereference(cgrp->dentry); 1674
1675 dentry = rcu_dereference_check(cgrp->dentry,
1676 rcu_read_lock_held() ||
1677 cgroup_lock_is_held());
1673 if (!cgrp->parent) 1678 if (!cgrp->parent)
1674 continue; 1679 continue;
1675 if (--start < buf) 1680 if (--start < buf)
@@ -1783,6 +1788,29 @@ out:
1783 return retval; 1788 return retval;
1784} 1789}
1785 1790
1791/**
1792 * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup
1793 * @tsk: the task to be attached
1794 */
1795int cgroup_attach_task_current_cg(struct task_struct *tsk)
1796{
1797 struct cgroupfs_root *root;
1798 struct cgroup *cur_cg;
1799 int retval = 0;
1800
1801 cgroup_lock();
1802 for_each_active_root(root) {
1803 cur_cg = task_cgroup_from_root(current, root);
1804 retval = cgroup_attach_task(cur_cg, tsk);
1805 if (retval)
1806 break;
1807 }
1808 cgroup_unlock();
1809
1810 return retval;
1811}
1812EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg);
1813
1786/* 1814/*
1787 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex 1815 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1788 * held. May take task_lock of task 1816 * held. May take task_lock of task
@@ -2989,7 +3017,6 @@ static void cgroup_event_remove(struct work_struct *work)
2989 remove); 3017 remove);
2990 struct cgroup *cgrp = event->cgrp; 3018 struct cgroup *cgrp = event->cgrp;
2991 3019
2992 /* TODO: check return code */
2993 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3020 event->cft->unregister_event(cgrp, event->cft, event->eventfd);
2994 3021
2995 eventfd_ctx_put(event->eventfd); 3022 eventfd_ctx_put(event->eventfd);
@@ -3011,7 +3038,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
3011 unsigned long flags = (unsigned long)key; 3038 unsigned long flags = (unsigned long)key;
3012 3039
3013 if (flags & POLLHUP) { 3040 if (flags & POLLHUP) {
3014 remove_wait_queue_locked(event->wqh, &event->wait); 3041 __remove_wait_queue(event->wqh, &event->wait);
3015 spin_lock(&cgrp->event_list_lock); 3042 spin_lock(&cgrp->event_list_lock);
3016 list_del(&event->list); 3043 list_del(&event->list);
3017 spin_unlock(&cgrp->event_list_lock); 3044 spin_unlock(&cgrp->event_list_lock);
@@ -3610,7 +3637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
3610 * @ss: the subsystem to load 3637 * @ss: the subsystem to load
3611 * 3638 *
3612 * This function should be called in a modular subsystem's initcall. If the 3639 * This function should be called in a modular subsystem's initcall. If the
3613 * subsytem is built as a module, it will be assigned a new subsys_id and set 3640 * subsystem is built as a module, it will be assigned a new subsys_id and set
3614 * up for use. If the subsystem is built-in anyway, work is delegated to the 3641 * up for use. If the subsystem is built-in anyway, work is delegated to the
3615 * simpler cgroup_init_subsys. 3642 * simpler cgroup_init_subsys.
3616 */ 3643 */
@@ -4430,7 +4457,15 @@ __setup("cgroup_disable=", cgroup_disable);
4430 */ 4457 */
4431unsigned short css_id(struct cgroup_subsys_state *css) 4458unsigned short css_id(struct cgroup_subsys_state *css)
4432{ 4459{
4433 struct css_id *cssid = rcu_dereference(css->id); 4460 struct css_id *cssid;
4461
4462 /*
4463 * This css_id() can return correct value when somone has refcnt
4464 * on this or this is under rcu_read_lock(). Once css->id is allocated,
4465 * it's unchanged until freed.
4466 */
4467 cssid = rcu_dereference_check(css->id,
4468 rcu_read_lock_held() || atomic_read(&css->refcnt));
4434 4469
4435 if (cssid) 4470 if (cssid)
4436 return cssid->id; 4471 return cssid->id;
@@ -4440,7 +4475,10 @@ EXPORT_SYMBOL_GPL(css_id);
4440 4475
4441unsigned short css_depth(struct cgroup_subsys_state *css) 4476unsigned short css_depth(struct cgroup_subsys_state *css)
4442{ 4477{
4443 struct css_id *cssid = rcu_dereference(css->id); 4478 struct css_id *cssid;
4479
4480 cssid = rcu_dereference_check(css->id,
4481 rcu_read_lock_held() || atomic_read(&css->refcnt));
4444 4482
4445 if (cssid) 4483 if (cssid)
4446 return cssid->depth; 4484 return cssid->depth;
@@ -4448,15 +4486,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
4448} 4486}
4449EXPORT_SYMBOL_GPL(css_depth); 4487EXPORT_SYMBOL_GPL(css_depth);
4450 4488
4489/**
4490 * css_is_ancestor - test "root" css is an ancestor of "child"
4491 * @child: the css to be tested.
4492 * @root: the css supporsed to be an ancestor of the child.
4493 *
4494 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
4495 * this function reads css->id, this use rcu_dereference() and rcu_read_lock().
4496 * But, considering usual usage, the csses should be valid objects after test.
4497 * Assuming that the caller will do some action to the child if this returns
4498 * returns true, the caller must take "child";s reference count.
4499 * If "child" is valid object and this returns true, "root" is valid, too.
4500 */
4501
4451bool css_is_ancestor(struct cgroup_subsys_state *child, 4502bool css_is_ancestor(struct cgroup_subsys_state *child,
4452 const struct cgroup_subsys_state *root) 4503 const struct cgroup_subsys_state *root)
4453{ 4504{
4454 struct css_id *child_id = rcu_dereference(child->id); 4505 struct css_id *child_id;
4455 struct css_id *root_id = rcu_dereference(root->id); 4506 struct css_id *root_id;
4507 bool ret = true;
4456 4508
4457 if (!child_id || !root_id || (child_id->depth < root_id->depth)) 4509 rcu_read_lock();
4458 return false; 4510 child_id = rcu_dereference(child->id);
4459 return child_id->stack[root_id->depth] == root_id->id; 4511 root_id = rcu_dereference(root->id);
4512 if (!child_id
4513 || !root_id
4514 || (child_id->depth < root_id->depth)
4515 || (child_id->stack[root_id->depth] != root_id->id))
4516 ret = false;
4517 rcu_read_unlock();
4518 return ret;
4460} 4519}
4461 4520
4462static void __free_css_id_cb(struct rcu_head *head) 4521static void __free_css_id_cb(struct rcu_head *head)
@@ -4556,13 +4615,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
4556{ 4615{
4557 int subsys_id, i, depth = 0; 4616 int subsys_id, i, depth = 0;
4558 struct cgroup_subsys_state *parent_css, *child_css; 4617 struct cgroup_subsys_state *parent_css, *child_css;
4559 struct css_id *child_id, *parent_id = NULL; 4618 struct css_id *child_id, *parent_id;
4560 4619
4561 subsys_id = ss->subsys_id; 4620 subsys_id = ss->subsys_id;
4562 parent_css = parent->subsys[subsys_id]; 4621 parent_css = parent->subsys[subsys_id];
4563 child_css = child->subsys[subsys_id]; 4622 child_css = child->subsys[subsys_id];
4564 depth = css_depth(parent_css) + 1;
4565 parent_id = parent_css->id; 4623 parent_id = parent_css->id;
4624 depth = parent_id->depth + 1;
4566 4625
4567 child_id = get_new_cssid(ss, depth); 4626 child_id = get_new_cssid(ss, depth);
4568 if (IS_ERR(child_id)) 4627 if (IS_ERR(child_id))
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index 59e9ef6aab40..ce71ed53e88f 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -15,6 +15,7 @@
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/slab.h>
18#include <linux/cgroup.h> 19#include <linux/cgroup.h>
19#include <linux/fs.h> 20#include <linux/fs.h>
20#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -47,17 +48,20 @@ static inline struct freezer *task_freezer(struct task_struct *task)
47 struct freezer, css); 48 struct freezer, css);
48} 49}
49 50
50int cgroup_frozen(struct task_struct *task) 51int cgroup_freezing_or_frozen(struct task_struct *task)
51{ 52{
52 struct freezer *freezer; 53 struct freezer *freezer;
53 enum freezer_state state; 54 enum freezer_state state;
54 55
55 task_lock(task); 56 task_lock(task);
56 freezer = task_freezer(task); 57 freezer = task_freezer(task);
57 state = freezer->state; 58 if (!freezer->css.cgroup->parent)
59 state = CGROUP_THAWED; /* root cgroup can't be frozen */
60 else
61 state = freezer->state;
58 task_unlock(task); 62 task_unlock(task);
59 63
60 return state == CGROUP_FROZEN; 64 return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
61} 65}
62 66
63/* 67/*
@@ -85,10 +89,10 @@ struct cgroup_subsys freezer_subsys;
85 89
86/* Locks taken and their ordering 90/* Locks taken and their ordering
87 * ------------------------------ 91 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock) 92 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock 93 * freezer->lock
94 * css_set_lock
95 * task->alloc_lock (AKA task_lock)
92 * task->sighand->siglock 96 * task->sighand->siglock
93 * 97 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock 98 * cgroup code forces css_set_lock to be taken before task->alloc_lock
@@ -96,33 +100,38 @@ struct cgroup_subsys freezer_subsys;
96 * freezer_create(), freezer_destroy(): 100 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ] 101 * cgroup_mutex [ by cgroup core ]
98 * 102 *
99 * can_attach(): 103 * freezer_can_attach():
100 * cgroup_mutex 104 * cgroup_mutex (held by caller of can_attach)
101 * 105 *
102 * cgroup_frozen(): 106 * cgroup_freezing_or_frozen():
103 * task->alloc_lock (to get task's cgroup) 107 * task->alloc_lock (to get task's cgroup)
104 * 108 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex): 109 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock 110 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing) 111 * sighand->siglock (if the cgroup is freezing)
109 * 112 *
110 * freezer_read(): 113 * freezer_read():
111 * cgroup_mutex 114 * cgroup_mutex
112 * freezer->lock 115 * freezer->lock
116 * write_lock css_set_lock (cgroup iterator start)
117 * task->alloc_lock
113 * read_lock css_set_lock (cgroup iterator start) 118 * read_lock css_set_lock (cgroup iterator start)
114 * 119 *
115 * freezer_write() (freeze): 120 * freezer_write() (freeze):
116 * cgroup_mutex 121 * cgroup_mutex
117 * freezer->lock 122 * freezer->lock
123 * write_lock css_set_lock (cgroup iterator start)
124 * task->alloc_lock
118 * read_lock css_set_lock (cgroup iterator start) 125 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock 126 * sighand->siglock (fake signal delivery inside freeze_task())
120 * 127 *
121 * freezer_write() (unfreeze): 128 * freezer_write() (unfreeze):
122 * cgroup_mutex 129 * cgroup_mutex
123 * freezer->lock 130 * freezer->lock
131 * write_lock css_set_lock (cgroup iterator start)
132 * task->alloc_lock
124 * read_lock css_set_lock (cgroup iterator start) 133 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task()) 134 * task->alloc_lock (inside thaw_process(), prevents race with refrigerator())
126 * sighand->siglock 135 * sighand->siglock
127 */ 136 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss, 137static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
@@ -201,9 +210,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
201 * No lock is needed, since the task isn't on tasklist yet, 210 * No lock is needed, since the task isn't on tasklist yet,
202 * so it can't be moved to another cgroup, which means the 211 * so it can't be moved to another cgroup, which means the
203 * freezer won't be removed and will be valid during this 212 * freezer won't be removed and will be valid during this
204 * function call. 213 * function call. Nevertheless, apply RCU read-side critical
214 * section to suppress RCU lockdep false positives.
205 */ 215 */
216 rcu_read_lock();
206 freezer = task_freezer(task); 217 freezer = task_freezer(task);
218 rcu_read_unlock();
207 219
208 /* 220 /*
209 * The root cgroup is non-freezable, so we can skip the 221 * The root cgroup is non-freezable, so we can skip the
diff --git a/kernel/compat.c b/kernel/compat.c
index f6c204f07ea6..5adab05a3172 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -25,6 +25,7 @@
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h> 26#include <linux/times.h>
27#include <linux/ptrace.h> 27#include <linux/ptrace.h>
28#include <linux/gfp.h>
28 29
29#include <asm/uaccess.h> 30#include <asm/uaccess.h>
30 31
@@ -494,29 +495,26 @@ asmlinkage long compat_sys_sched_getaffinity(compat_pid_t pid, unsigned int len,
494{ 495{
495 int ret; 496 int ret;
496 cpumask_var_t mask; 497 cpumask_var_t mask;
497 unsigned long *k;
498 unsigned int min_length = cpumask_size();
499
500 if (nr_cpu_ids <= BITS_PER_COMPAT_LONG)
501 min_length = sizeof(compat_ulong_t);
502 498
503 if (len < min_length) 499 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
500 return -EINVAL;
501 if (len & (sizeof(compat_ulong_t)-1))
504 return -EINVAL; 502 return -EINVAL;
505 503
506 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 504 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
507 return -ENOMEM; 505 return -ENOMEM;
508 506
509 ret = sched_getaffinity(pid, mask); 507 ret = sched_getaffinity(pid, mask);
510 if (ret < 0) 508 if (ret == 0) {
511 goto out; 509 size_t retlen = min_t(size_t, len, cpumask_size());
512 510
513 k = cpumask_bits(mask); 511 if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8))
514 ret = compat_put_bitmap(user_mask_ptr, k, min_length * 8); 512 ret = -EFAULT;
515 if (ret == 0) 513 else
516 ret = min_length; 514 ret = retlen;
517 515 }
518out:
519 free_cpumask_var(mask); 516 free_cpumask_var(mask);
517
520 return ret; 518 return ret;
521} 519}
522 520
diff --git a/kernel/cpu.c b/kernel/cpu.c
index f8cced2692b3..97d1b426a4ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -14,18 +14,35 @@
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h>
17 18
18#ifdef CONFIG_SMP 19#ifdef CONFIG_SMP
19/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 20/* Serializes the updates to cpu_online_mask, cpu_present_mask */
20static DEFINE_MUTEX(cpu_add_remove_lock); 21static DEFINE_MUTEX(cpu_add_remove_lock);
21 22
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 23/*
24 * The following two API's must be used when attempting
25 * to serialize the updates to cpu_online_mask, cpu_present_mask.
26 */
27void cpu_maps_update_begin(void)
28{
29 mutex_lock(&cpu_add_remove_lock);
30}
31
32void cpu_maps_update_done(void)
33{
34 mutex_unlock(&cpu_add_remove_lock);
35}
36
37static RAW_NOTIFIER_HEAD(cpu_chain);
23 38
24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing. 39/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
25 * Should always be manipulated under cpu_add_remove_lock 40 * Should always be manipulated under cpu_add_remove_lock
26 */ 41 */
27static int cpu_hotplug_disabled; 42static int cpu_hotplug_disabled;
28 43
44#ifdef CONFIG_HOTPLUG_CPU
45
29static struct { 46static struct {
30 struct task_struct *active_writer; 47 struct task_struct *active_writer;
31 struct mutex lock; /* Synchronizes accesses to refcount, */ 48 struct mutex lock; /* Synchronizes accesses to refcount, */
@@ -40,8 +57,6 @@ static struct {
40 .refcount = 0, 57 .refcount = 0,
41}; 58};
42 59
43#ifdef CONFIG_HOTPLUG_CPU
44
45void get_online_cpus(void) 60void get_online_cpus(void)
46{ 61{
47 might_sleep(); 62 might_sleep();
@@ -66,22 +81,6 @@ void put_online_cpus(void)
66} 81}
67EXPORT_SYMBOL_GPL(put_online_cpus); 82EXPORT_SYMBOL_GPL(put_online_cpus);
68 83
69#endif /* CONFIG_HOTPLUG_CPU */
70
71/*
72 * The following two API's must be used when attempting
73 * to serialize the updates to cpu_online_mask, cpu_present_mask.
74 */
75void cpu_maps_update_begin(void)
76{
77 mutex_lock(&cpu_add_remove_lock);
78}
79
80void cpu_maps_update_done(void)
81{
82 mutex_unlock(&cpu_add_remove_lock);
83}
84
85/* 84/*
86 * This ensures that the hotplug operation can begin only when the 85 * This ensures that the hotplug operation can begin only when the
87 * refcount goes to zero. 86 * refcount goes to zero.
@@ -123,6 +122,12 @@ static void cpu_hotplug_done(void)
123 cpu_hotplug.active_writer = NULL; 122 cpu_hotplug.active_writer = NULL;
124 mutex_unlock(&cpu_hotplug.lock); 123 mutex_unlock(&cpu_hotplug.lock);
125} 124}
125
126#else /* #if CONFIG_HOTPLUG_CPU */
127static void cpu_hotplug_begin(void) {}
128static void cpu_hotplug_done(void) {}
129#endif /* #esle #if CONFIG_HOTPLUG_CPU */
130
126/* Need to know about CPUs going up/down? */ 131/* Need to know about CPUs going up/down? */
127int __ref register_cpu_notifier(struct notifier_block *nb) 132int __ref register_cpu_notifier(struct notifier_block *nb)
128{ 133{
@@ -133,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb)
133 return ret; 138 return ret;
134} 139}
135 140
141static int __cpu_notify(unsigned long val, void *v, int nr_to_call,
142 int *nr_calls)
143{
144 int ret;
145
146 ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call,
147 nr_calls);
148
149 return notifier_to_errno(ret);
150}
151
152static int cpu_notify(unsigned long val, void *v)
153{
154 return __cpu_notify(val, v, -1, NULL);
155}
156
136#ifdef CONFIG_HOTPLUG_CPU 157#ifdef CONFIG_HOTPLUG_CPU
137 158
159static void cpu_notify_nofail(unsigned long val, void *v)
160{
161 BUG_ON(cpu_notify(val, v));
162}
163
138EXPORT_SYMBOL(register_cpu_notifier); 164EXPORT_SYMBOL(register_cpu_notifier);
139 165
140void __ref unregister_cpu_notifier(struct notifier_block *nb) 166void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -163,6 +189,7 @@ static inline void check_for_tasks(int cpu)
163} 189}
164 190
165struct take_cpu_down_param { 191struct take_cpu_down_param {
192 struct task_struct *caller;
166 unsigned long mod; 193 unsigned long mod;
167 void *hcpu; 194 void *hcpu;
168}; 195};
@@ -171,6 +198,7 @@ struct take_cpu_down_param {
171static int __ref take_cpu_down(void *_param) 198static int __ref take_cpu_down(void *_param)
172{ 199{
173 struct take_cpu_down_param *param = _param; 200 struct take_cpu_down_param *param = _param;
201 unsigned int cpu = (unsigned long)param->hcpu;
174 int err; 202 int err;
175 203
176 /* Ensure this CPU doesn't handle any more interrupts. */ 204 /* Ensure this CPU doesn't handle any more interrupts. */
@@ -178,9 +206,10 @@ static int __ref take_cpu_down(void *_param)
178 if (err < 0) 206 if (err < 0)
179 return err; 207 return err;
180 208
181 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, 209 cpu_notify(CPU_DYING | param->mod, param->hcpu);
182 param->hcpu);
183 210
211 if (task_cpu(param->caller) == cpu)
212 move_task_off_dead_cpu(cpu, param->caller);
184 /* Force idle task to run as soon as we yield: it should 213 /* Force idle task to run as soon as we yield: it should
185 immediately notice cpu is offline and die quickly. */ 214 immediately notice cpu is offline and die quickly. */
186 sched_idle_next(); 215 sched_idle_next();
@@ -191,10 +220,10 @@ static int __ref take_cpu_down(void *_param)
191static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 220static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
192{ 221{
193 int err, nr_calls = 0; 222 int err, nr_calls = 0;
194 cpumask_var_t old_allowed;
195 void *hcpu = (void *)(long)cpu; 223 void *hcpu = (void *)(long)cpu;
196 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 224 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
197 struct take_cpu_down_param tcd_param = { 225 struct take_cpu_down_param tcd_param = {
226 .caller = current,
198 .mod = mod, 227 .mod = mod,
199 .hcpu = hcpu, 228 .hcpu = hcpu,
200 }; 229 };
@@ -205,38 +234,26 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
205 if (!cpu_online(cpu)) 234 if (!cpu_online(cpu))
206 return -EINVAL; 235 return -EINVAL;
207 236
208 if (!alloc_cpumask_var(&old_allowed, GFP_KERNEL))
209 return -ENOMEM;
210
211 cpu_hotplug_begin(); 237 cpu_hotplug_begin();
212 set_cpu_active(cpu, false); 238 set_cpu_active(cpu, false);
213 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 239 err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
214 hcpu, -1, &nr_calls); 240 if (err) {
215 if (err == NOTIFY_BAD) {
216 set_cpu_active(cpu, true); 241 set_cpu_active(cpu, true);
217 242
218 nr_calls--; 243 nr_calls--;
219 __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 244 __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
220 hcpu, nr_calls, NULL);
221 printk("%s: attempt to take down CPU %u failed\n", 245 printk("%s: attempt to take down CPU %u failed\n",
222 __func__, cpu); 246 __func__, cpu);
223 err = -EINVAL;
224 goto out_release; 247 goto out_release;
225 } 248 }
226 249
227 /* Ensure that we are not runnable on dying cpu */
228 cpumask_copy(old_allowed, &current->cpus_allowed);
229 set_cpus_allowed_ptr(current, cpu_active_mask);
230
231 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 250 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
232 if (err) { 251 if (err) {
233 set_cpu_active(cpu, true); 252 set_cpu_active(cpu, true);
234 /* CPU didn't die: tell everyone. Can't complain. */ 253 /* CPU didn't die: tell everyone. Can't complain. */
235 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 254 cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu);
236 hcpu) == NOTIFY_BAD)
237 BUG();
238 255
239 goto out_allowed; 256 goto out_release;
240 } 257 }
241 BUG_ON(cpu_online(cpu)); 258 BUG_ON(cpu_online(cpu));
242 259
@@ -248,22 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
248 __cpu_die(cpu); 265 __cpu_die(cpu);
249 266
250 /* CPU is completely dead: tell everyone. Too late to complain. */ 267 /* CPU is completely dead: tell everyone. Too late to complain. */
251 if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, 268 cpu_notify_nofail(CPU_DEAD | mod, hcpu);
252 hcpu) == NOTIFY_BAD)
253 BUG();
254 269
255 check_for_tasks(cpu); 270 check_for_tasks(cpu);
256 271
257out_allowed:
258 set_cpus_allowed_ptr(current, old_allowed);
259out_release: 272out_release:
260 cpu_hotplug_done(); 273 cpu_hotplug_done();
261 if (!err) { 274 if (!err)
262 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, 275 cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
263 hcpu) == NOTIFY_BAD)
264 BUG();
265 }
266 free_cpumask_var(old_allowed);
267 return err; 276 return err;
268} 277}
269 278
@@ -271,9 +280,6 @@ int __ref cpu_down(unsigned int cpu)
271{ 280{
272 int err; 281 int err;
273 282
274 err = stop_machine_create();
275 if (err)
276 return err;
277 cpu_maps_update_begin(); 283 cpu_maps_update_begin();
278 284
279 if (cpu_hotplug_disabled) { 285 if (cpu_hotplug_disabled) {
@@ -285,7 +291,6 @@ int __ref cpu_down(unsigned int cpu)
285 291
286out: 292out:
287 cpu_maps_update_done(); 293 cpu_maps_update_done();
288 stop_machine_destroy();
289 return err; 294 return err;
290} 295}
291EXPORT_SYMBOL(cpu_down); 296EXPORT_SYMBOL(cpu_down);
@@ -302,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
302 return -EINVAL; 307 return -EINVAL;
303 308
304 cpu_hotplug_begin(); 309 cpu_hotplug_begin();
305 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 310 ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
306 -1, &nr_calls); 311 if (ret) {
307 if (ret == NOTIFY_BAD) {
308 nr_calls--; 312 nr_calls--;
309 printk("%s: attempt to bring up CPU %u failed\n", 313 printk("%s: attempt to bring up CPU %u failed\n",
310 __func__, cpu); 314 __func__, cpu);
311 ret = -EINVAL;
312 goto out_notify; 315 goto out_notify;
313 } 316 }
314 317
@@ -321,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
321 set_cpu_active(cpu, true); 324 set_cpu_active(cpu, true);
322 325
323 /* Now call notifier in preparation. */ 326 /* Now call notifier in preparation. */
324 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 327 cpu_notify(CPU_ONLINE | mod, hcpu);
325 328
326out_notify: 329out_notify:
327 if (ret != 0) 330 if (ret != 0)
328 __raw_notifier_call_chain(&cpu_chain, 331 __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
329 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
330 cpu_hotplug_done(); 332 cpu_hotplug_done();
331 333
332 return ret; 334 return ret;
@@ -335,6 +337,12 @@ out_notify:
335int __cpuinit cpu_up(unsigned int cpu) 337int __cpuinit cpu_up(unsigned int cpu)
336{ 338{
337 int err = 0; 339 int err = 0;
340
341#ifdef CONFIG_MEMORY_HOTPLUG
342 int nid;
343 pg_data_t *pgdat;
344#endif
345
338 if (!cpu_possible(cpu)) { 346 if (!cpu_possible(cpu)) {
339 printk(KERN_ERR "can't online cpu %d because it is not " 347 printk(KERN_ERR "can't online cpu %d because it is not "
340 "configured as may-hotadd at boot time\n", cpu); 348 "configured as may-hotadd at boot time\n", cpu);
@@ -345,6 +353,28 @@ int __cpuinit cpu_up(unsigned int cpu)
345 return -EINVAL; 353 return -EINVAL;
346 } 354 }
347 355
356#ifdef CONFIG_MEMORY_HOTPLUG
357 nid = cpu_to_node(cpu);
358 if (!node_online(nid)) {
359 err = mem_online_node(nid);
360 if (err)
361 return err;
362 }
363
364 pgdat = NODE_DATA(nid);
365 if (!pgdat) {
366 printk(KERN_ERR
367 "Can't online cpu %d due to NULL pgdat\n", cpu);
368 return -ENOMEM;
369 }
370
371 if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
372 mutex_lock(&zonelists_mutex);
373 build_all_zonelists(NULL);
374 mutex_unlock(&zonelists_mutex);
375 }
376#endif
377
348 cpu_maps_update_begin(); 378 cpu_maps_update_begin();
349 379
350 if (cpu_hotplug_disabled) { 380 if (cpu_hotplug_disabled) {
@@ -364,11 +394,8 @@ static cpumask_var_t frozen_cpus;
364 394
365int disable_nonboot_cpus(void) 395int disable_nonboot_cpus(void)
366{ 396{
367 int cpu, first_cpu, error; 397 int cpu, first_cpu, error = 0;
368 398
369 error = stop_machine_create();
370 if (error)
371 return error;
372 cpu_maps_update_begin(); 399 cpu_maps_update_begin();
373 first_cpu = cpumask_first(cpu_online_mask); 400 first_cpu = cpumask_first(cpu_online_mask);
374 /* 401 /*
@@ -399,7 +426,6 @@ int disable_nonboot_cpus(void)
399 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 426 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
400 } 427 }
401 cpu_maps_update_done(); 428 cpu_maps_update_done();
402 stop_machine_destroy();
403 return error; 429 return error;
404} 430}
405 431
@@ -466,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
466 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) 492 if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus))
467 val = CPU_STARTING_FROZEN; 493 val = CPU_STARTING_FROZEN;
468#endif /* CONFIG_PM_SLEEP_SMP */ 494#endif /* CONFIG_PM_SLEEP_SMP */
469 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); 495 cpu_notify(val, (void *)(long)cpu);
470} 496}
471 497
472#endif /* CONFIG_SMP */ 498#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ba401fab459f..7cb37d86a005 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -105,7 +105,7 @@ struct cpuset {
105 /* for custom sched domain */ 105 /* for custom sched domain */
106 int relax_domain_level; 106 int relax_domain_level;
107 107
108 /* used for walking a cpuset heirarchy */ 108 /* used for walking a cpuset hierarchy */
109 struct list_head stack_list; 109 struct list_head stack_list;
110}; 110};
111 111
@@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
920 * call to guarantee_online_mems(), as we know no one is changing 920 * call to guarantee_online_mems(), as we know no one is changing
921 * our task's cpuset. 921 * our task's cpuset.
922 * 922 *
923 * Hold callback_mutex around the two modifications of our tasks
924 * mems_allowed to synchronize with cpuset_mems_allowed().
925 *
926 * While the mm_struct we are migrating is typically from some 923 * While the mm_struct we are migrating is typically from some
927 * other task, the task_struct mems_allowed that we are hacking 924 * other task, the task_struct mems_allowed that we are hacking
928 * is for our current task, which must allocate new pages for that 925 * is for our current task, which must allocate new pages for that
@@ -949,16 +946,62 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949 * In order to avoid seeing no nodes if the old and new nodes are disjoint, 946 * In order to avoid seeing no nodes if the old and new nodes are disjoint,
950 * we structure updates as setting all new allowed nodes, then clearing newly 947 * we structure updates as setting all new allowed nodes, then clearing newly
951 * disallowed ones. 948 * disallowed ones.
952 *
953 * Called with task's alloc_lock held
954 */ 949 */
955static void cpuset_change_task_nodemask(struct task_struct *tsk, 950static void cpuset_change_task_nodemask(struct task_struct *tsk,
956 nodemask_t *newmems) 951 nodemask_t *newmems)
957{ 952{
953repeat:
954 /*
955 * Allow tasks that have access to memory reserves because they have
956 * been OOM killed to get memory anywhere.
957 */
958 if (unlikely(test_thread_flag(TIF_MEMDIE)))
959 return;
960 if (current->flags & PF_EXITING) /* Let dying task have memory */
961 return;
962
963 task_lock(tsk);
958 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 964 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
959 mpol_rebind_task(tsk, &tsk->mems_allowed); 965 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
960 mpol_rebind_task(tsk, newmems); 966
967
968 /*
969 * ensure checking ->mems_allowed_change_disable after setting all new
970 * allowed nodes.
971 *
972 * the read-side task can see an nodemask with new allowed nodes and
973 * old allowed nodes. and if it allocates page when cpuset clears newly
974 * disallowed ones continuous, it can see the new allowed bits.
975 *
976 * And if setting all new allowed nodes is after the checking, setting
977 * all new allowed nodes and clearing newly disallowed ones will be done
978 * continuous, and the read-side task may find no node to alloc page.
979 */
980 smp_mb();
981
982 /*
983 * Allocation of memory is very fast, we needn't sleep when waiting
984 * for the read-side.
985 */
986 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
987 task_unlock(tsk);
988 if (!task_curr(tsk))
989 yield();
990 goto repeat;
991 }
992
993 /*
994 * ensure checking ->mems_allowed_change_disable before clearing all new
995 * disallowed nodes.
996 *
997 * if clearing newly disallowed bits before the checking, the read-side
998 * task may find no node to alloc page.
999 */
1000 smp_mb();
1001
1002 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
961 tsk->mems_allowed = *newmems; 1003 tsk->mems_allowed = *newmems;
1004 task_unlock(tsk);
962} 1005}
963 1006
964/* 1007/*
@@ -973,14 +1016,17 @@ static void cpuset_change_nodemask(struct task_struct *p,
973 struct cpuset *cs; 1016 struct cpuset *cs;
974 int migrate; 1017 int migrate;
975 const nodemask_t *oldmem = scan->data; 1018 const nodemask_t *oldmem = scan->data;
976 nodemask_t newmems; 1019 NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
1020
1021 if (!newmems)
1022 return;
977 1023
978 cs = cgroup_cs(scan->cg); 1024 cs = cgroup_cs(scan->cg);
979 guarantee_online_mems(cs, &newmems); 1025 guarantee_online_mems(cs, newmems);
980 1026
981 task_lock(p); 1027 cpuset_change_task_nodemask(p, newmems);
982 cpuset_change_task_nodemask(p, &newmems); 1028
983 task_unlock(p); 1029 NODEMASK_FREE(newmems);
984 1030
985 mm = get_task_mm(p); 1031 mm = get_task_mm(p);
986 if (!mm) 1032 if (!mm)
@@ -1051,16 +1097,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
1051static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, 1097static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1052 const char *buf) 1098 const char *buf)
1053{ 1099{
1054 nodemask_t oldmem; 1100 NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
1055 int retval; 1101 int retval;
1056 struct ptr_heap heap; 1102 struct ptr_heap heap;
1057 1103
1104 if (!oldmem)
1105 return -ENOMEM;
1106
1058 /* 1107 /*
1059 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; 1108 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1060 * it's read-only 1109 * it's read-only
1061 */ 1110 */
1062 if (cs == &top_cpuset) 1111 if (cs == &top_cpuset) {
1063 return -EACCES; 1112 retval = -EACCES;
1113 goto done;
1114 }
1064 1115
1065 /* 1116 /*
1066 * An empty mems_allowed is ok iff there are no tasks in the cpuset. 1117 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
@@ -1076,11 +1127,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1076 goto done; 1127 goto done;
1077 1128
1078 if (!nodes_subset(trialcs->mems_allowed, 1129 if (!nodes_subset(trialcs->mems_allowed,
1079 node_states[N_HIGH_MEMORY])) 1130 node_states[N_HIGH_MEMORY])) {
1080 return -EINVAL; 1131 retval = -EINVAL;
1132 goto done;
1133 }
1081 } 1134 }
1082 oldmem = cs->mems_allowed; 1135 *oldmem = cs->mems_allowed;
1083 if (nodes_equal(oldmem, trialcs->mems_allowed)) { 1136 if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
1084 retval = 0; /* Too easy - nothing to do */ 1137 retval = 0; /* Too easy - nothing to do */
1085 goto done; 1138 goto done;
1086 } 1139 }
@@ -1096,10 +1149,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1096 cs->mems_allowed = trialcs->mems_allowed; 1149 cs->mems_allowed = trialcs->mems_allowed;
1097 mutex_unlock(&callback_mutex); 1150 mutex_unlock(&callback_mutex);
1098 1151
1099 update_tasks_nodemask(cs, &oldmem, &heap); 1152 update_tasks_nodemask(cs, oldmem, &heap);
1100 1153
1101 heap_free(&heap); 1154 heap_free(&heap);
1102done: 1155done:
1156 NODEMASK_FREE(oldmem);
1103 return retval; 1157 return retval;
1104} 1158}
1105 1159
@@ -1373,9 +1427,7 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
1373 err = set_cpus_allowed_ptr(tsk, cpus_attach); 1427 err = set_cpus_allowed_ptr(tsk, cpus_attach);
1374 WARN_ON_ONCE(err); 1428 WARN_ON_ONCE(err);
1375 1429
1376 task_lock(tsk);
1377 cpuset_change_task_nodemask(tsk, to); 1430 cpuset_change_task_nodemask(tsk, to);
1378 task_unlock(tsk);
1379 cpuset_update_task_spread_flag(cs, tsk); 1431 cpuset_update_task_spread_flag(cs, tsk);
1380 1432
1381} 1433}
@@ -1384,40 +1436,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
1384 struct cgroup *oldcont, struct task_struct *tsk, 1436 struct cgroup *oldcont, struct task_struct *tsk,
1385 bool threadgroup) 1437 bool threadgroup)
1386{ 1438{
1387 nodemask_t from, to;
1388 struct mm_struct *mm; 1439 struct mm_struct *mm;
1389 struct cpuset *cs = cgroup_cs(cont); 1440 struct cpuset *cs = cgroup_cs(cont);
1390 struct cpuset *oldcs = cgroup_cs(oldcont); 1441 struct cpuset *oldcs = cgroup_cs(oldcont);
1442 NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
1443 NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
1444
1445 if (from == NULL || to == NULL)
1446 goto alloc_fail;
1391 1447
1392 if (cs == &top_cpuset) { 1448 if (cs == &top_cpuset) {
1393 cpumask_copy(cpus_attach, cpu_possible_mask); 1449 cpumask_copy(cpus_attach, cpu_possible_mask);
1394 to = node_possible_map;
1395 } else { 1450 } else {
1396 guarantee_online_cpus(cs, cpus_attach); 1451 guarantee_online_cpus(cs, cpus_attach);
1397 guarantee_online_mems(cs, &to);
1398 } 1452 }
1453 guarantee_online_mems(cs, to);
1399 1454
1400 /* do per-task migration stuff possibly for each in the threadgroup */ 1455 /* do per-task migration stuff possibly for each in the threadgroup */
1401 cpuset_attach_task(tsk, &to, cs); 1456 cpuset_attach_task(tsk, to, cs);
1402 if (threadgroup) { 1457 if (threadgroup) {
1403 struct task_struct *c; 1458 struct task_struct *c;
1404 rcu_read_lock(); 1459 rcu_read_lock();
1405 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { 1460 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
1406 cpuset_attach_task(c, &to, cs); 1461 cpuset_attach_task(c, to, cs);
1407 } 1462 }
1408 rcu_read_unlock(); 1463 rcu_read_unlock();
1409 } 1464 }
1410 1465
1411 /* change mm; only needs to be done once even if threadgroup */ 1466 /* change mm; only needs to be done once even if threadgroup */
1412 from = oldcs->mems_allowed; 1467 *from = oldcs->mems_allowed;
1413 to = cs->mems_allowed; 1468 *to = cs->mems_allowed;
1414 mm = get_task_mm(tsk); 1469 mm = get_task_mm(tsk);
1415 if (mm) { 1470 if (mm) {
1416 mpol_rebind_mm(mm, &to); 1471 mpol_rebind_mm(mm, to);
1417 if (is_memory_migrate(cs)) 1472 if (is_memory_migrate(cs))
1418 cpuset_migrate_mm(mm, &from, &to); 1473 cpuset_migrate_mm(mm, from, to);
1419 mmput(mm); 1474 mmput(mm);
1420 } 1475 }
1476
1477alloc_fail:
1478 NODEMASK_FREE(from);
1479 NODEMASK_FREE(to);
1421} 1480}
1422 1481
1423/* The various types of files and directories in a cpuset file system */ 1482/* The various types of files and directories in a cpuset file system */
@@ -1562,13 +1621,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
1562 1621
1563static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) 1622static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
1564{ 1623{
1565 nodemask_t mask; 1624 NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
1625 int retval;
1626
1627 if (mask == NULL)
1628 return -ENOMEM;
1566 1629
1567 mutex_lock(&callback_mutex); 1630 mutex_lock(&callback_mutex);
1568 mask = cs->mems_allowed; 1631 *mask = cs->mems_allowed;
1569 mutex_unlock(&callback_mutex); 1632 mutex_unlock(&callback_mutex);
1570 1633
1571 return nodelist_scnprintf(page, PAGE_SIZE, mask); 1634 retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
1635
1636 NODEMASK_FREE(mask);
1637
1638 return retval;
1572} 1639}
1573 1640
1574static ssize_t cpuset_common_file_read(struct cgroup *cont, 1641static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1997,7 +2064,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
1997 struct cpuset *cp; /* scans cpusets being updated */ 2064 struct cpuset *cp; /* scans cpusets being updated */
1998 struct cpuset *child; /* scans child cpusets of cp */ 2065 struct cpuset *child; /* scans child cpusets of cp */
1999 struct cgroup *cont; 2066 struct cgroup *cont;
2000 nodemask_t oldmems; 2067 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2068
2069 if (oldmems == NULL)
2070 return;
2001 2071
2002 list_add_tail((struct list_head *)&root->stack_list, &queue); 2072 list_add_tail((struct list_head *)&root->stack_list, &queue);
2003 2073
@@ -2014,7 +2084,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2014 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 2084 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
2015 continue; 2085 continue;
2016 2086
2017 oldmems = cp->mems_allowed; 2087 *oldmems = cp->mems_allowed;
2018 2088
2019 /* Remove offline cpus and mems from this cpuset. */ 2089 /* Remove offline cpus and mems from this cpuset. */
2020 mutex_lock(&callback_mutex); 2090 mutex_lock(&callback_mutex);
@@ -2030,9 +2100,10 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2030 remove_tasks_in_empty_cpuset(cp); 2100 remove_tasks_in_empty_cpuset(cp);
2031 else { 2101 else {
2032 update_tasks_cpumask(cp, NULL); 2102 update_tasks_cpumask(cp, NULL);
2033 update_tasks_nodemask(cp, &oldmems, NULL); 2103 update_tasks_nodemask(cp, oldmems, NULL);
2034 } 2104 }
2035 } 2105 }
2106 NODEMASK_FREE(oldmems);
2036} 2107}
2037 2108
2038/* 2109/*
@@ -2090,20 +2161,33 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2090static int cpuset_track_online_nodes(struct notifier_block *self, 2161static int cpuset_track_online_nodes(struct notifier_block *self,
2091 unsigned long action, void *arg) 2162 unsigned long action, void *arg)
2092{ 2163{
2164 NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
2165
2166 if (oldmems == NULL)
2167 return NOTIFY_DONE;
2168
2093 cgroup_lock(); 2169 cgroup_lock();
2094 switch (action) { 2170 switch (action) {
2095 case MEM_ONLINE: 2171 case MEM_ONLINE:
2096 case MEM_OFFLINE: 2172 *oldmems = top_cpuset.mems_allowed;
2097 mutex_lock(&callback_mutex); 2173 mutex_lock(&callback_mutex);
2098 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2174 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099 mutex_unlock(&callback_mutex); 2175 mutex_unlock(&callback_mutex);
2100 if (action == MEM_OFFLINE) 2176 update_tasks_nodemask(&top_cpuset, oldmems, NULL);
2101 scan_for_empty_cpusets(&top_cpuset); 2177 break;
2178 case MEM_OFFLINE:
2179 /*
2180 * needn't update top_cpuset.mems_allowed explicitly because
2181 * scan_for_empty_cpusets() will update it.
2182 */
2183 scan_for_empty_cpusets(&top_cpuset);
2102 break; 2184 break;
2103 default: 2185 default:
2104 break; 2186 break;
2105 } 2187 }
2106 cgroup_unlock(); 2188 cgroup_unlock();
2189
2190 NODEMASK_FREE(oldmems);
2107 return NOTIFY_OK; 2191 return NOTIFY_OK;
2108} 2192}
2109#endif 2193#endif
@@ -2140,19 +2224,52 @@ void __init cpuset_init_smp(void)
2140void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) 2224void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
2141{ 2225{
2142 mutex_lock(&callback_mutex); 2226 mutex_lock(&callback_mutex);
2143 cpuset_cpus_allowed_locked(tsk, pmask); 2227 task_lock(tsk);
2228 guarantee_online_cpus(task_cs(tsk), pmask);
2229 task_unlock(tsk);
2144 mutex_unlock(&callback_mutex); 2230 mutex_unlock(&callback_mutex);
2145} 2231}
2146 2232
2147/** 2233int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
2148 * cpuset_cpus_allowed_locked - return cpus_allowed mask from a tasks cpuset.
2149 * Must be called with callback_mutex held.
2150 **/
2151void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
2152{ 2234{
2153 task_lock(tsk); 2235 const struct cpuset *cs;
2154 guarantee_online_cpus(task_cs(tsk), pmask); 2236 int cpu;
2155 task_unlock(tsk); 2237
2238 rcu_read_lock();
2239 cs = task_cs(tsk);
2240 if (cs)
2241 cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
2242 rcu_read_unlock();
2243
2244 /*
2245 * We own tsk->cpus_allowed, nobody can change it under us.
2246 *
2247 * But we used cs && cs->cpus_allowed lockless and thus can
2248 * race with cgroup_attach_task() or update_cpumask() and get
2249 * the wrong tsk->cpus_allowed. However, both cases imply the
2250 * subsequent cpuset_change_cpumask()->set_cpus_allowed_ptr()
2251 * which takes task_rq_lock().
2252 *
2253 * If we are called after it dropped the lock we must see all
 2254 * changes in tsk_cs()->cpus_allowed. Otherwise we can temporarily
 2255 * set any mask even if it is not right from the task_cs() p.o.v.;
 2256 * the pending set_cpus_allowed_ptr() will fix things.
2257 */
2258
2259 cpu = cpumask_any_and(&tsk->cpus_allowed, cpu_active_mask);
2260 if (cpu >= nr_cpu_ids) {
2261 /*
2262 * Either tsk->cpus_allowed is wrong (see above) or it
2263 * is actually empty. The latter case is only possible
2264 * if we are racing with remove_tasks_in_empty_cpuset().
 2265 * Like above, we can temporarily set any mask and rely on
2266 * set_cpus_allowed_ptr() as synchronization point.
2267 */
2268 cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
2269 cpu = cpumask_any(cpu_active_mask);
2270 }
2271
2272 return cpu;
2156} 2273}
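
cpuset_cpus_allowed_fallback() above copes with an empty intersection between the task's allowed CPUs and the active CPUs by temporarily widening the task's mask and letting a later set_cpus_allowed_ptr() repair it. A toy userspace rendering of that selection with plain bitmasks follows; pick_cpu() and NR_DEMO_CPUS are invented for the demo.

#include <stdio.h>

#define NR_DEMO_CPUS 8

/* Pick any CPU set in both masks; if the intersection is empty, fall back
 * to the full possible mask, mirroring the shape of the function above
 * with plain bitmasks instead of cpumasks. */
static int pick_cpu(unsigned int *allowed, unsigned int active, unsigned int possible)
{
	unsigned int usable = *allowed & active;

	if (!usable) {
		*allowed = possible;       /* temporary mask; a later set_cpus_allowed_ptr() fixes it up */
		usable = possible & active;
	}
	for (int cpu = 0; cpu < NR_DEMO_CPUS; cpu++)
		if (usable & (1u << cpu))
			return cpu;
	return -1;
}

int main(void)
{
	unsigned int allowed = 0x10;       /* only CPU 4 allowed ...            */
	unsigned int active  = 0x0f;       /* ... but CPUs 0-3 are the active ones */

	printf("fallback CPU: %d\n", pick_cpu(&allowed, active, 0xff));
	return 0;
}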
2157 2274
2158void cpuset_init_current_mems_allowed(void) 2275void cpuset_init_current_mems_allowed(void)
@@ -2341,22 +2458,6 @@ int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2341} 2458}
2342 2459
2343/** 2460/**
2344 * cpuset_lock - lock out any changes to cpuset structures
2345 *
2346 * The out of memory (oom) code needs to mutex_lock cpusets
2347 * from being changed while it scans the tasklist looking for a
2348 * task in an overlapping cpuset. Expose callback_mutex via this
2349 * cpuset_lock() routine, so the oom code can lock it, before
2350 * locking the task list. The tasklist_lock is a spinlock, so
2351 * must be taken inside callback_mutex.
2352 */
2353
2354void cpuset_lock(void)
2355{
2356 mutex_lock(&callback_mutex);
2357}
2358
2359/**
2360 * cpuset_unlock - release lock on cpuset changes 2461 * cpuset_unlock - release lock on cpuset changes
2361 * 2462 *
2362 * Undo the lock taken in a previous cpuset_lock() call. 2463 * Undo the lock taken in a previous cpuset_lock() call.
@@ -2368,7 +2469,8 @@ void cpuset_unlock(void)
2368} 2469}
2369 2470
2370/** 2471/**
2371 * cpuset_mem_spread_node() - On which node to begin search for a page 2472 * cpuset_mem_spread_node() - On which node to begin search for a file page
2473 * cpuset_slab_spread_node() - On which node to begin search for a slab page
2372 * 2474 *
2373 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for 2475 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2374 * tasks in a cpuset with is_spread_page or is_spread_slab set), 2476 * tasks in a cpuset with is_spread_page or is_spread_slab set),
@@ -2393,16 +2495,27 @@ void cpuset_unlock(void)
2393 * See kmem_cache_alloc_node(). 2495 * See kmem_cache_alloc_node().
2394 */ 2496 */
2395 2497
2396int cpuset_mem_spread_node(void) 2498static int cpuset_spread_node(int *rotor)
2397{ 2499{
2398 int node; 2500 int node;
2399 2501
2400 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); 2502 node = next_node(*rotor, current->mems_allowed);
2401 if (node == MAX_NUMNODES) 2503 if (node == MAX_NUMNODES)
2402 node = first_node(current->mems_allowed); 2504 node = first_node(current->mems_allowed);
2403 current->cpuset_mem_spread_rotor = node; 2505 *rotor = node;
2404 return node; 2506 return node;
2405} 2507}
2508
2509int cpuset_mem_spread_node(void)
2510{
2511 return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
2512}
2513
2514int cpuset_slab_spread_node(void)
2515{
2516 return cpuset_spread_node(&current->cpuset_slab_spread_rotor);
2517}
2518
2406EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); 2519EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
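
The refactor above folds the next_node()/first_node() wrap-around into one cpuset_spread_node() helper driven by a per-task rotor, so page and slab spreading share the same round-robin walk over mems_allowed. A standalone sketch of that walk, using a plain bitmask in place of a nodemask_t (all names below are invented for the demo):

#include <stdio.h>

#define MAX_NODES 8                    /* demo-sized stand-in for MAX_NUMNODES */

/* Return the first allowed node strictly after 'n', or MAX_NODES if none. */
static int next_node_demo(int n, unsigned int allowed)
{
	for (int i = n + 1; i < MAX_NODES; i++)
		if (allowed & (1u << i))
			return i;
	return MAX_NODES;
}

static int first_node_demo(unsigned int allowed)
{
	return next_node_demo(-1, allowed);
}

/* Mirrors cpuset_spread_node(): advance the rotor, wrapping when needed. */
static int spread_node_demo(int *rotor, unsigned int allowed)
{
	int node = next_node_demo(*rotor, allowed);

	if (node == MAX_NODES)
		node = first_node_demo(allowed);
	*rotor = node;
	return node;
}

int main(void)
{
	unsigned int mems_allowed = 0x2a;  /* nodes 1, 3 and 5 allowed */
	int rotor = 0;

	for (int i = 0; i < 6; i++)
		printf("allocation %d goes to node %d\n", i,
		       spread_node_demo(&rotor, mems_allowed));
	return 0;
}

Run against nodes {1, 3, 5} the rotor yields 1, 3, 5, 1, 3, 5, which is the even spreading described in the comment block above.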
2407 2520
2408/** 2521/**
diff --git a/kernel/cred-internals.h b/kernel/cred-internals.h
deleted file mode 100644
index 2dc4fc2d0bf1..000000000000
--- a/kernel/cred-internals.h
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Internal credentials stuff
2 *
3 * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12/*
13 * user.c
14 */
15static inline void sched_switch_user(struct task_struct *p)
16{
17#ifdef CONFIG_USER_SCHED
18 sched_move_task(p);
19#endif /* CONFIG_USER_SCHED */
20}
21
diff --git a/kernel/cred.c b/kernel/cred.c
index 1ed8ca18790c..60bc8b1e32e6 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -10,13 +10,13 @@
10 */ 10 */
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h>
13#include <linux/sched.h> 14#include <linux/sched.h>
14#include <linux/key.h> 15#include <linux/key.h>
15#include <linux/keyctl.h> 16#include <linux/keyctl.h>
16#include <linux/init_task.h> 17#include <linux/init_task.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cn_proc.h> 19#include <linux/cn_proc.h>
19#include "cred-internals.h"
20 20
21#if 0 21#if 0
22#define kdebug(FMT, ...) \ 22#define kdebug(FMT, ...) \
@@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk)
209 } 209 }
210} 210}
211 211
212/**
213 * get_task_cred - Get another task's objective credentials
214 * @task: The task to query
215 *
216 * Get the objective credentials of a task, pinning them so that they can't go
217 * away. Accessing a task's credentials directly is not permitted.
218 *
 219 * The caller must also make sure the task doesn't get deleted, either by holding
 220 * a ref on it or by holding tasklist_lock to prevent it from being unlinked.
221 */
222const struct cred *get_task_cred(struct task_struct *task)
223{
224 const struct cred *cred;
225
226 rcu_read_lock();
227
228 do {
229 cred = __task_cred((task));
230 BUG_ON(!cred);
231 } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
232
233 rcu_read_unlock();
234 return cred;
235}
236
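
get_task_cred() above pins another task's credentials with the usual RCU-plus-inc-not-zero idiom: keep re-reading the pointer until the usage count can be raised from a non-zero value, so a concurrently released cred can never be resurrected. Below is a standalone sketch of just the inc-not-zero step, written with C11 atomics instead of the kernel's atomic_t; the RCU read-side protection around the pointer load is omitted.

#include <stdatomic.h>
#include <stdio.h>

struct obj {
	atomic_int usage;              /* stands in for cred->usage */
};

/* Increment usage only if it is currently non-zero; returns 1 on success.
 * This is the property the retry loop in get_task_cred() relies on. */
static int inc_not_zero(atomic_int *v)
{
	int old = atomic_load(v);

	do {
		if (old == 0)
			return 0;      /* object already released; caller must look it up again */
	} while (!atomic_compare_exchange_weak(v, &old, old + 1));
	return 1;
}

int main(void)
{
	struct obj live = { .usage = 1 };
	struct obj dead = { .usage = 0 };

	printf("pin live object: %d (usage now %d)\n",
	       inc_not_zero(&live.usage), atomic_load(&live.usage));
	printf("pin dead object: %d (usage still %d)\n",
	       inc_not_zero(&dead.usage), atomic_load(&dead.usage));
	return 0;
}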
212/* 237/*
213 * Allocate blank credentials, such that the credentials can be filled in at a 238 * Allocate blank credentials, such that the credentials can be filled in at a
214 * later date without risk of ENOMEM. 239 * later date without risk of ENOMEM.
@@ -347,60 +372,6 @@ struct cred *prepare_exec_creds(void)
347} 372}
348 373
349/* 374/*
350 * prepare new credentials for the usermode helper dispatcher
351 */
352struct cred *prepare_usermodehelper_creds(void)
353{
354#ifdef CONFIG_KEYS
355 struct thread_group_cred *tgcred = NULL;
356#endif
357 struct cred *new;
358
359#ifdef CONFIG_KEYS
360 tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC);
361 if (!tgcred)
362 return NULL;
363#endif
364
365 new = kmem_cache_alloc(cred_jar, GFP_ATOMIC);
366 if (!new)
367 return NULL;
368
369 kdebug("prepare_usermodehelper_creds() alloc %p", new);
370
371 memcpy(new, &init_cred, sizeof(struct cred));
372
373 atomic_set(&new->usage, 1);
374 set_cred_subscribers(new, 0);
375 get_group_info(new->group_info);
376 get_uid(new->user);
377
378#ifdef CONFIG_KEYS
379 new->thread_keyring = NULL;
380 new->request_key_auth = NULL;
381 new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT;
382
383 atomic_set(&tgcred->usage, 1);
384 spin_lock_init(&tgcred->lock);
385 new->tgcred = tgcred;
386#endif
387
388#ifdef CONFIG_SECURITY
389 new->security = NULL;
390#endif
391 if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0)
392 goto error;
393 validate_creds(new);
394
395 BUG_ON(atomic_read(&new->usage) != 1);
396 return new;
397
398error:
399 put_cred(new);
400 return NULL;
401}
402
403/*
404 * Copy credentials for the new process created by fork() 375 * Copy credentials for the new process created by fork()
405 * 376 *
406 * We share if we can, but under some circumstances we have to generate a new 377 * We share if we can, but under some circumstances we have to generate a new
@@ -516,8 +487,6 @@ int commit_creds(struct cred *new)
516#endif 487#endif
517 BUG_ON(atomic_read(&new->usage) < 1); 488 BUG_ON(atomic_read(&new->usage) < 1);
518 489
519 security_commit_creds(new, old);
520
521 get_cred(new); /* we will require a ref for the subj creds too */ 490 get_cred(new); /* we will require a ref for the subj creds too */
522 491
523 /* dumpability changes */ 492 /* dumpability changes */
@@ -553,8 +522,6 @@ int commit_creds(struct cred *new)
553 atomic_dec(&old->user->processes); 522 atomic_dec(&old->user->processes);
554 alter_cred_subscribers(old, -2); 523 alter_cred_subscribers(old, -2);
555 524
556 sched_switch_user(task);
557
558 /* send notifications */ 525 /* send notifications */
559 if (new->uid != old->uid || 526 if (new->uid != old->uid ||
560 new->euid != old->euid || 527 new->euid != old->euid ||
@@ -786,8 +753,6 @@ bool creds_are_invalid(const struct cred *cred)
786{ 753{
787 if (cred->magic != CRED_MAGIC) 754 if (cred->magic != CRED_MAGIC)
788 return true; 755 return true;
789 if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers))
790 return true;
791#ifdef CONFIG_SECURITY_SELINUX 756#ifdef CONFIG_SECURITY_SELINUX
792 if (selinux_is_enabled()) { 757 if (selinux_is_enabled()) {
793 if ((unsigned long) cred->security < PAGE_SIZE) 758 if ((unsigned long) cred->security < PAGE_SIZE)
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
new file mode 100644
index 000000000000..a85edc339985
--- /dev/null
+++ b/kernel/debug/Makefile
@@ -0,0 +1,6 @@
1#
2# Makefile for the linux kernel debugger
3#
4
5obj-$(CONFIG_KGDB) += debug_core.o gdbstub.o
6obj-$(CONFIG_KGDB_KDB) += kdb/
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
new file mode 100644
index 000000000000..51d14fe87648
--- /dev/null
+++ b/kernel/debug/debug_core.c
@@ -0,0 +1,983 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/string.h>
41#include <linux/delay.h>
42#include <linux/sched.h>
43#include <linux/sysrq.h>
44#include <linux/init.h>
45#include <linux/kgdb.h>
46#include <linux/kdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55
56#include "debug_core.h"
57
58static int kgdb_break_asap;
59
60struct debuggerinfo_struct kgdb_info[NR_CPUS];
61
62/**
63 * kgdb_connected - Is a host GDB connected to us?
64 */
65int kgdb_connected;
66EXPORT_SYMBOL_GPL(kgdb_connected);
67
68/* All the KGDB handlers are installed */
69int kgdb_io_module_registered;
70
71/* Guard for recursive entry */
72static int exception_level;
73
74struct kgdb_io *dbg_io_ops;
75static DEFINE_SPINLOCK(kgdb_registration_lock);
76
77/* kgdb console driver is loaded */
78static int kgdb_con_registered;
79/* determine if kgdb console output should be used */
80static int kgdb_use_con;
81/* Flag for alternate operations for early debugging */
82bool dbg_is_early = true;
83/* Next cpu to become the master debug core */
84int dbg_switch_cpu;
85
86/* Use kdb or gdbserver mode */
87int dbg_kdb_mode = 1;
88
89static int __init opt_kgdb_con(char *str)
90{
91 kgdb_use_con = 1;
92 return 0;
93}
94
95early_param("kgdbcon", opt_kgdb_con);
96
97module_param(kgdb_use_con, int, 0644);
98
99/*
100 * Holds information about breakpoints in a kernel. These breakpoints are
101 * added and removed by gdb.
102 */
103static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
104 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
105};
106
107/*
108 * The CPU# of the active CPU, or -1 if none:
109 */
110atomic_t kgdb_active = ATOMIC_INIT(-1);
111EXPORT_SYMBOL_GPL(kgdb_active);
112
113/*
114 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
115 * bootup code (which might not have percpu set up yet):
116 */
117static atomic_t passive_cpu_wait[NR_CPUS];
118static atomic_t cpu_in_kgdb[NR_CPUS];
119static atomic_t kgdb_break_tasklet_var;
120atomic_t kgdb_setting_breakpoint;
121
122struct task_struct *kgdb_usethread;
123struct task_struct *kgdb_contthread;
124
125int kgdb_single_step;
126static pid_t kgdb_sstep_pid;
127
 128 /* To keep track of the CPU which is doing the single stepping */
129atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
130
131/*
132 * If you are debugging a problem where roundup (the collection of
133 * all other CPUs) is a problem [this should be extremely rare],
134 * then use the nokgdbroundup option to avoid roundup. In that case
135 * the other CPUs might interfere with your debugging context, so
136 * use this with care:
137 */
138static int kgdb_do_roundup = 1;
139
140static int __init opt_nokgdbroundup(char *str)
141{
142 kgdb_do_roundup = 0;
143
144 return 0;
145}
146
147early_param("nokgdbroundup", opt_nokgdbroundup);
148
149/*
150 * Finally, some KGDB code :-)
151 */
152
153/*
154 * Weak aliases for breakpoint management,
 155 * can be overridden by architectures when needed:
156 */
157int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
158{
159 int err;
160
161 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
162 if (err)
163 return err;
164
165 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
166 BREAK_INSTR_SIZE);
167}
168
169int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
170{
171 return probe_kernel_write((char *)addr,
172 (char *)bundle, BREAK_INSTR_SIZE);
173}
174
175int __weak kgdb_validate_break_address(unsigned long addr)
176{
177 char tmp_variable[BREAK_INSTR_SIZE];
178 int err;
 179 /* Validate setting the breakpoint and then removing it. If the
 180 * remove fails, the kernel needs to emit a bad message because we
 181 * are in deep trouble, not being able to put things back the way we
182 * found them.
183 */
184 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
185 if (err)
186 return err;
187 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
188 if (err)
189 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
190 "memory destroyed at: %lx", addr);
191 return err;
192}
193
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{
196 return instruction_pointer(regs);
197}
198
199int __weak kgdb_arch_init(void)
200{
201 return 0;
202}
203
204int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
205{
206 return 0;
207}
208
209/**
 210 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
211 * @regs: Current &struct pt_regs.
212 *
213 * This function will be called if the particular architecture must
214 * disable hardware debugging while it is processing gdb packets or
 215 * handling an exception.
216 */
217void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
218{
219}
220
221/*
222 * Some architectures need cache flushes when we set/clear a
223 * breakpoint:
224 */
225static void kgdb_flush_swbreak_addr(unsigned long addr)
226{
227 if (!CACHE_FLUSH_IS_SAFE)
228 return;
229
230 if (current->mm && current->mm->mmap_cache) {
231 flush_cache_range(current->mm->mmap_cache,
232 addr, addr + BREAK_INSTR_SIZE);
233 }
234 /* Force flush instruction cache if it was outside the mm */
235 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
236}
237
238/*
239 * SW breakpoint management:
240 */
241int dbg_activate_sw_breakpoints(void)
242{
243 unsigned long addr;
244 int error;
245 int ret = 0;
246 int i;
247
248 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
249 if (kgdb_break[i].state != BP_SET)
250 continue;
251
252 addr = kgdb_break[i].bpt_addr;
253 error = kgdb_arch_set_breakpoint(addr,
254 kgdb_break[i].saved_instr);
255 if (error) {
256 ret = error;
257 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
258 continue;
259 }
260
261 kgdb_flush_swbreak_addr(addr);
262 kgdb_break[i].state = BP_ACTIVE;
263 }
264 return ret;
265}
266
267int dbg_set_sw_break(unsigned long addr)
268{
269 int err = kgdb_validate_break_address(addr);
270 int breakno = -1;
271 int i;
272
273 if (err)
274 return err;
275
276 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
277 if ((kgdb_break[i].state == BP_SET) &&
278 (kgdb_break[i].bpt_addr == addr))
279 return -EEXIST;
280 }
281 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
282 if (kgdb_break[i].state == BP_REMOVED &&
283 kgdb_break[i].bpt_addr == addr) {
284 breakno = i;
285 break;
286 }
287 }
288
289 if (breakno == -1) {
290 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
291 if (kgdb_break[i].state == BP_UNDEFINED) {
292 breakno = i;
293 break;
294 }
295 }
296 }
297
298 if (breakno == -1)
299 return -E2BIG;
300
301 kgdb_break[breakno].state = BP_SET;
302 kgdb_break[breakno].type = BP_BREAKPOINT;
303 kgdb_break[breakno].bpt_addr = addr;
304
305 return 0;
306}
307
308int dbg_deactivate_sw_breakpoints(void)
309{
310 unsigned long addr;
311 int error;
312 int ret = 0;
313 int i;
314
315 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
316 if (kgdb_break[i].state != BP_ACTIVE)
317 continue;
318 addr = kgdb_break[i].bpt_addr;
319 error = kgdb_arch_remove_breakpoint(addr,
320 kgdb_break[i].saved_instr);
321 if (error) {
322 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
323 ret = error;
324 }
325
326 kgdb_flush_swbreak_addr(addr);
327 kgdb_break[i].state = BP_SET;
328 }
329 return ret;
330}
331
332int dbg_remove_sw_break(unsigned long addr)
333{
334 int i;
335
336 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
337 if ((kgdb_break[i].state == BP_SET) &&
338 (kgdb_break[i].bpt_addr == addr)) {
339 kgdb_break[i].state = BP_REMOVED;
340 return 0;
341 }
342 }
343 return -ENOENT;
344}
345
346int kgdb_isremovedbreak(unsigned long addr)
347{
348 int i;
349
350 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
351 if ((kgdb_break[i].state == BP_REMOVED) &&
352 (kgdb_break[i].bpt_addr == addr))
353 return 1;
354 }
355 return 0;
356}
357
358int dbg_remove_all_break(void)
359{
360 unsigned long addr;
361 int error;
362 int i;
363
364 /* Clear memory breakpoints. */
365 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
366 if (kgdb_break[i].state != BP_ACTIVE)
367 goto setundefined;
368 addr = kgdb_break[i].bpt_addr;
369 error = kgdb_arch_remove_breakpoint(addr,
370 kgdb_break[i].saved_instr);
371 if (error)
372 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
373 addr);
374setundefined:
375 kgdb_break[i].state = BP_UNDEFINED;
376 }
377
378 /* Clear hardware breakpoints. */
379 if (arch_kgdb_ops.remove_all_hw_break)
380 arch_kgdb_ops.remove_all_hw_break();
381
382 return 0;
383}
384
385/*
386 * Return true if there is a valid kgdb I/O module. Also if no
387 * debugger is attached a message can be printed to the console about
388 * waiting for the debugger to attach.
389 *
390 * The print_wait argument is only to be true when called from inside
391 * the core kgdb_handle_exception, because it will wait for the
392 * debugger to attach.
393 */
394static int kgdb_io_ready(int print_wait)
395{
396 if (!dbg_io_ops)
397 return 0;
398 if (kgdb_connected)
399 return 1;
400 if (atomic_read(&kgdb_setting_breakpoint))
401 return 1;
402 if (print_wait) {
403#ifdef CONFIG_KGDB_KDB
404 if (!dbg_kdb_mode)
405 printk(KERN_CRIT "KGDB: waiting... or $3#33 for KDB\n");
406#else
407 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
408#endif
409 }
410 return 1;
411}
412
413static int kgdb_reenter_check(struct kgdb_state *ks)
414{
415 unsigned long addr;
416
417 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
418 return 0;
419
420 /* Panic on recursive debugger calls: */
421 exception_level++;
422 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
423 dbg_deactivate_sw_breakpoints();
424
425 /*
 426 * If the breakpoint was removed OK at the place the exception
427 * occurred, try to recover and print a warning to the end
428 * user because the user planted a breakpoint in a place that
429 * KGDB needs in order to function.
430 */
431 if (dbg_remove_sw_break(addr) == 0) {
432 exception_level = 0;
433 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
434 dbg_activate_sw_breakpoints();
435 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
436 addr);
437 WARN_ON_ONCE(1);
438
439 return 1;
440 }
441 dbg_remove_all_break();
442 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
443
444 if (exception_level > 1) {
445 dump_stack();
446 panic("Recursive entry to debugger");
447 }
448
449 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
450#ifdef CONFIG_KGDB_KDB
451 /* Allow kdb to debug itself one level */
452 return 0;
453#endif
454 dump_stack();
455 panic("Recursive entry to debugger");
456
457 return 1;
458}
459
460static void dbg_cpu_switch(int cpu, int next_cpu)
461{
462 /* Mark the cpu we are switching away from as a slave when it
463 * holds the kgdb_active token. This must be done so that the
 464 * loop that all the cpus wait in for the debug core is not
 465 * re-entered by this cpu as the master. */
466 if (cpu == atomic_read(&kgdb_active)) {
467 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
468 kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
469 }
470 kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
471}
472
473static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
474{
475 unsigned long flags;
476 int sstep_tries = 100;
477 int error;
478 int i, cpu;
479 int trace_on = 0;
480acquirelock:
481 /*
482 * Interrupts will be restored by the 'trap return' code, except when
483 * single stepping.
484 */
485 local_irq_save(flags);
486
487 cpu = ks->cpu;
488 kgdb_info[cpu].debuggerinfo = regs;
489 kgdb_info[cpu].task = current;
490 kgdb_info[cpu].ret_state = 0;
491 kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
492 /*
493 * Make sure the above info reaches the primary CPU before
494 * our cpu_in_kgdb[] flag setting does:
495 */
496 atomic_inc(&cpu_in_kgdb[cpu]);
497
498 if (exception_level == 1)
499 goto cpu_master_loop;
500
501 /*
502 * CPU will loop if it is a slave or request to become a kgdb
503 * master cpu and acquire the kgdb_active lock:
504 */
505 while (1) {
506cpu_loop:
507 if (kgdb_info[cpu].exception_state & DCPU_NEXT_MASTER) {
508 kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
509 goto cpu_master_loop;
510 } else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
511 if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
512 break;
513 } else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
514 if (!atomic_read(&passive_cpu_wait[cpu]))
515 goto return_normal;
516 } else {
517return_normal:
518 /* Return to normal operation by executing any
519 * hw breakpoint fixup.
520 */
521 if (arch_kgdb_ops.correct_hw_break)
522 arch_kgdb_ops.correct_hw_break();
523 if (trace_on)
524 tracing_on();
525 atomic_dec(&cpu_in_kgdb[cpu]);
526 touch_softlockup_watchdog_sync();
527 clocksource_touch_watchdog();
528 local_irq_restore(flags);
529 return 0;
530 }
531 cpu_relax();
532 }
533
534 /*
535 * For single stepping, try to only enter on the processor
 536 * that was single stepping. To guard against a deadlock, the
537 * kernel will only try for the value of sstep_tries before
538 * giving up and continuing on.
539 */
540 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
541 (kgdb_info[cpu].task &&
542 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
543 atomic_set(&kgdb_active, -1);
544 touch_softlockup_watchdog_sync();
545 clocksource_touch_watchdog();
546 local_irq_restore(flags);
547
548 goto acquirelock;
549 }
550
551 if (!kgdb_io_ready(1)) {
552 kgdb_info[cpu].ret_state = 1;
553 goto kgdb_restore; /* No I/O connection, resume the system */
554 }
555
556 /*
557 * Don't enter if we have hit a removed breakpoint.
558 */
559 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
560 goto kgdb_restore;
561
562 /* Call the I/O driver's pre_exception routine */
563 if (dbg_io_ops->pre_exception)
564 dbg_io_ops->pre_exception();
565
566 kgdb_disable_hw_debug(ks->linux_regs);
567
568 /*
569 * Get the passive CPU lock which will hold all the non-primary
 570 * CPUs in a spin state while the debugger is active
571 */
572 if (!kgdb_single_step) {
573 for (i = 0; i < NR_CPUS; i++)
574 atomic_inc(&passive_cpu_wait[i]);
575 }
576
577#ifdef CONFIG_SMP
578 /* Signal the other CPUs to enter kgdb_wait() */
579 if ((!kgdb_single_step) && kgdb_do_roundup)
580 kgdb_roundup_cpus(flags);
581#endif
582
583 /*
584 * Wait for the other CPUs to be notified and be waiting for us:
585 */
586 for_each_online_cpu(i) {
587 while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
588 cpu_relax();
589 }
590
591 /*
592 * At this point the primary processor is completely
593 * in the debugger and all secondary CPUs are quiescent
594 */
595 dbg_deactivate_sw_breakpoints();
596 kgdb_single_step = 0;
597 kgdb_contthread = current;
598 exception_level = 0;
599 trace_on = tracing_is_on();
600 if (trace_on)
601 tracing_off();
602
603 while (1) {
604cpu_master_loop:
605 if (dbg_kdb_mode) {
606 kgdb_connected = 1;
607 error = kdb_stub(ks);
608 kgdb_connected = 0;
609 } else {
610 error = gdb_serial_stub(ks);
611 }
612
613 if (error == DBG_PASS_EVENT) {
614 dbg_kdb_mode = !dbg_kdb_mode;
615 } else if (error == DBG_SWITCH_CPU_EVENT) {
616 dbg_cpu_switch(cpu, dbg_switch_cpu);
617 goto cpu_loop;
618 } else {
619 kgdb_info[cpu].ret_state = error;
620 break;
621 }
622 }
623
624 /* Call the I/O driver's post_exception routine */
625 if (dbg_io_ops->post_exception)
626 dbg_io_ops->post_exception();
627
628 atomic_dec(&cpu_in_kgdb[ks->cpu]);
629
630 if (!kgdb_single_step) {
631 for (i = NR_CPUS-1; i >= 0; i--)
632 atomic_dec(&passive_cpu_wait[i]);
633 /*
634 * Wait till all the CPUs have quit from the debugger,
635 * but allow a CPU that hit an exception and is
636 * waiting to become the master to remain in the debug
637 * core.
638 */
639 for_each_online_cpu(i) {
640 while (kgdb_do_roundup &&
641 atomic_read(&cpu_in_kgdb[i]) &&
642 !(kgdb_info[i].exception_state &
643 DCPU_WANT_MASTER))
644 cpu_relax();
645 }
646 }
647
648kgdb_restore:
649 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
650 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
651 if (kgdb_info[sstep_cpu].task)
652 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
653 else
654 kgdb_sstep_pid = 0;
655 }
656 if (trace_on)
657 tracing_on();
658 /* Free kgdb_active */
659 atomic_set(&kgdb_active, -1);
660 touch_softlockup_watchdog_sync();
661 clocksource_touch_watchdog();
662 local_irq_restore(flags);
663
664 return kgdb_info[cpu].ret_state;
665}
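
Most of kgdb_cpu_enter() above is about electing one master CPU: the kgdb_active token starts at -1, a CPU that wants to become the master tries to compare-and-swap its own number in, and every other CPU spins as a slave until the master releases the token on exit. The standalone sketch below shows only that token handoff, with C11 atomics standing in for atomic_t; the IPI roundup, the passive_cpu_wait[] spinning and the single-step handling are left out.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int active = -1;         /* -1 means "no master", like kgdb_active */

/* Try to become the master debug CPU: succeed only if nobody holds the token. */
static int try_become_master(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&active, &expected, cpu);
}

static void release_master(void)
{
	atomic_store(&active, -1);     /* mirrors atomic_set(&kgdb_active, -1) on exit */
}

int main(void)
{
	printf("cpu0 acquires: %d\n", try_become_master(0));  /* 1: token was free */
	printf("cpu1 acquires: %d\n", try_become_master(1));  /* 0: cpu0 holds it, cpu1 would spin as a slave */
	release_master();
	printf("cpu1 retries:  %d\n", try_become_master(1));  /* 1: token released */
	return 0;
}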
666
667/*
668 * kgdb_handle_exception() - main entry point from a kernel exception
669 *
670 * Locking hierarchy:
671 * interface locks, if any (begin_session)
672 * kgdb lock (kgdb_active)
673 */
674int
675kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
676{
677 struct kgdb_state kgdb_var;
678 struct kgdb_state *ks = &kgdb_var;
679 int ret;
680
681 ks->cpu = raw_smp_processor_id();
682 ks->ex_vector = evector;
683 ks->signo = signo;
684 ks->err_code = ecode;
685 ks->kgdb_usethreadid = 0;
686 ks->linux_regs = regs;
687
688 if (kgdb_reenter_check(ks))
689 return 0; /* Ouch, double exception ! */
690 kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
691 ret = kgdb_cpu_enter(ks, regs);
692 kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
693 DCPU_IS_SLAVE);
694 return ret;
695}
696
697int kgdb_nmicallback(int cpu, void *regs)
698{
699#ifdef CONFIG_SMP
700 struct kgdb_state kgdb_var;
701 struct kgdb_state *ks = &kgdb_var;
702
703 memset(ks, 0, sizeof(struct kgdb_state));
704 ks->cpu = cpu;
705 ks->linux_regs = regs;
706
707 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
708 atomic_read(&kgdb_active) != -1 &&
709 atomic_read(&kgdb_active) != cpu) {
710 kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
711 kgdb_cpu_enter(ks, regs);
712 kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
713 return 0;
714 }
715#endif
716 return 1;
717}
718
719static void kgdb_console_write(struct console *co, const char *s,
720 unsigned count)
721{
722 unsigned long flags;
723
724 /* If we're debugging, or KGDB has not connected, don't try
 725 * to print. */
726 if (!kgdb_connected || atomic_read(&kgdb_active) != -1 || dbg_kdb_mode)
727 return;
728
729 local_irq_save(flags);
730 gdbstub_msg_write(s, count);
731 local_irq_restore(flags);
732}
733
734static struct console kgdbcons = {
735 .name = "kgdb",
736 .write = kgdb_console_write,
737 .flags = CON_PRINTBUFFER | CON_ENABLED,
738 .index = -1,
739};
740
741#ifdef CONFIG_MAGIC_SYSRQ
742static void sysrq_handle_dbg(int key, struct tty_struct *tty)
743{
744 if (!dbg_io_ops) {
745 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
746 return;
747 }
748 if (!kgdb_connected) {
749#ifdef CONFIG_KGDB_KDB
750 if (!dbg_kdb_mode)
751 printk(KERN_CRIT "KGDB or $3#33 for KDB\n");
752#else
753 printk(KERN_CRIT "Entering KGDB\n");
754#endif
755 }
756
757 kgdb_breakpoint();
758}
759
760static struct sysrq_key_op sysrq_dbg_op = {
761 .handler = sysrq_handle_dbg,
762 .help_msg = "debug(G)",
763 .action_msg = "DEBUG",
764};
765#endif
766
767static int kgdb_panic_event(struct notifier_block *self,
768 unsigned long val,
769 void *data)
770{
771 if (dbg_kdb_mode)
772 kdb_printf("PANIC: %s\n", (char *)data);
773 kgdb_breakpoint();
774 return NOTIFY_DONE;
775}
776
777static struct notifier_block kgdb_panic_event_nb = {
778 .notifier_call = kgdb_panic_event,
779 .priority = INT_MAX,
780};
781
782void __weak kgdb_arch_late(void)
783{
784}
785
786void __init dbg_late_init(void)
787{
788 dbg_is_early = false;
789 if (kgdb_io_module_registered)
790 kgdb_arch_late();
791 kdb_init(KDB_INIT_FULL);
792}
793
794static void kgdb_register_callbacks(void)
795{
796 if (!kgdb_io_module_registered) {
797 kgdb_io_module_registered = 1;
798 kgdb_arch_init();
799 if (!dbg_is_early)
800 kgdb_arch_late();
801 atomic_notifier_chain_register(&panic_notifier_list,
802 &kgdb_panic_event_nb);
803#ifdef CONFIG_MAGIC_SYSRQ
804 register_sysrq_key('g', &sysrq_dbg_op);
805#endif
806 if (kgdb_use_con && !kgdb_con_registered) {
807 register_console(&kgdbcons);
808 kgdb_con_registered = 1;
809 }
810 }
811}
812
813static void kgdb_unregister_callbacks(void)
814{
815 /*
816 * When this routine is called KGDB should unregister from the
817 * panic handler and clean up, making sure it is not handling any
818 * break exceptions at the time.
819 */
820 if (kgdb_io_module_registered) {
821 kgdb_io_module_registered = 0;
822 atomic_notifier_chain_unregister(&panic_notifier_list,
823 &kgdb_panic_event_nb);
824 kgdb_arch_exit();
825#ifdef CONFIG_MAGIC_SYSRQ
826 unregister_sysrq_key('g', &sysrq_dbg_op);
827#endif
828 if (kgdb_con_registered) {
829 unregister_console(&kgdbcons);
830 kgdb_con_registered = 0;
831 }
832 }
833}
834
835/*
 836 * There are times a tasklet needs to be used instead of a compiled-in
 837 * breakpoint so as to cause an exception outside a kgdb I/O module,
838 * such as is the case with kgdboe, where calling a breakpoint in the
839 * I/O driver itself would be fatal.
840 */
841static void kgdb_tasklet_bpt(unsigned long ing)
842{
843 kgdb_breakpoint();
844 atomic_set(&kgdb_break_tasklet_var, 0);
845}
846
847static DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
848
849void kgdb_schedule_breakpoint(void)
850{
851 if (atomic_read(&kgdb_break_tasklet_var) ||
852 atomic_read(&kgdb_active) != -1 ||
853 atomic_read(&kgdb_setting_breakpoint))
854 return;
855 atomic_inc(&kgdb_break_tasklet_var);
856 tasklet_schedule(&kgdb_tasklet_breakpoint);
857}
858EXPORT_SYMBOL_GPL(kgdb_schedule_breakpoint);
859
860static void kgdb_initial_breakpoint(void)
861{
862 kgdb_break_asap = 0;
863
864 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
865 kgdb_breakpoint();
866}
867
868/**
869 * kgdb_register_io_module - register KGDB IO module
870 * @new_dbg_io_ops: the io ops vector
871 *
872 * Register it with the KGDB core.
873 */
874int kgdb_register_io_module(struct kgdb_io *new_dbg_io_ops)
875{
876 int err;
877
878 spin_lock(&kgdb_registration_lock);
879
880 if (dbg_io_ops) {
881 spin_unlock(&kgdb_registration_lock);
882
883 printk(KERN_ERR "kgdb: Another I/O driver is already "
884 "registered with KGDB.\n");
885 return -EBUSY;
886 }
887
888 if (new_dbg_io_ops->init) {
889 err = new_dbg_io_ops->init();
890 if (err) {
891 spin_unlock(&kgdb_registration_lock);
892 return err;
893 }
894 }
895
896 dbg_io_ops = new_dbg_io_ops;
897
898 spin_unlock(&kgdb_registration_lock);
899
900 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
901 new_dbg_io_ops->name);
902
903 /* Arm KGDB now. */
904 kgdb_register_callbacks();
905
906 if (kgdb_break_asap)
907 kgdb_initial_breakpoint();
908
909 return 0;
910}
911EXPORT_SYMBOL_GPL(kgdb_register_io_module);
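
For context on how a driver would use the registration API above: a back end fills in a struct kgdb_io with a name and polled read_char/write_char callbacks (returning NO_POLL_CHAR when nothing is pending, as dbg_io_get_char() expects) and hands it to kgdb_register_io_module(). The fragment below is only a hypothetical sketch, not a buildable module; the exact field types should be checked against include/linux/kgdb.h, and the character transport itself is stubbed out.

/* Hypothetical polled I/O backend; only the fields referenced elsewhere in
 * this file (name, read_char, write_char) are filled in. */
static int demo_dbg_read_char(void)
{
	return NO_POLL_CHAR;           /* nothing pending on the (stubbed) transport */
}

static void demo_dbg_write_char(u8 c)
{
	/* push 'c' out the debug transport */
}

static struct kgdb_io demo_dbg_io_ops = {
	.name		= "demo_dbg",
	.read_char	= demo_dbg_read_char,
	.write_char	= demo_dbg_write_char,
};

/* Typical driver init: hand the vector to the debug core. */
static int __init demo_dbg_init(void)
{
	return kgdb_register_io_module(&demo_dbg_io_ops);
}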
912
913/**
 914 * kgdb_unregister_io_module - unregister KGDB IO module
915 * @old_dbg_io_ops: the io ops vector
916 *
917 * Unregister it with the KGDB core.
918 */
919void kgdb_unregister_io_module(struct kgdb_io *old_dbg_io_ops)
920{
921 BUG_ON(kgdb_connected);
922
923 /*
924 * KGDB is no longer able to communicate out, so
925 * unregister our callbacks and reset state.
926 */
927 kgdb_unregister_callbacks();
928
929 spin_lock(&kgdb_registration_lock);
930
931 WARN_ON_ONCE(dbg_io_ops != old_dbg_io_ops);
932 dbg_io_ops = NULL;
933
934 spin_unlock(&kgdb_registration_lock);
935
936 printk(KERN_INFO
937 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
938 old_dbg_io_ops->name);
939}
940EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
941
942int dbg_io_get_char(void)
943{
944 int ret = dbg_io_ops->read_char();
945 if (ret == NO_POLL_CHAR)
946 return -1;
947 if (!dbg_kdb_mode)
948 return ret;
949 if (ret == 127)
950 return 8;
951 return ret;
952}
953
954/**
955 * kgdb_breakpoint - generate breakpoint exception
956 *
957 * This function will generate a breakpoint exception. It is used at the
958 * beginning of a program to sync up with a debugger and can be used
959 * otherwise as a quick means to stop program execution and "break" into
960 * the debugger.
961 */
962void kgdb_breakpoint(void)
963{
964 atomic_inc(&kgdb_setting_breakpoint);
965 wmb(); /* Sync point before breakpoint */
966 arch_kgdb_breakpoint();
967 wmb(); /* Sync point after breakpoint */
968 atomic_dec(&kgdb_setting_breakpoint);
969}
970EXPORT_SYMBOL_GPL(kgdb_breakpoint);
971
972static int __init opt_kgdb_wait(char *str)
973{
974 kgdb_break_asap = 1;
975
976 kdb_init(KDB_INIT_EARLY);
977 if (kgdb_io_module_registered)
978 kgdb_initial_breakpoint();
979
980 return 0;
981}
982
983early_param("kgdbwait", opt_kgdb_wait);
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
new file mode 100644
index 000000000000..c5d753d80f67
--- /dev/null
+++ b/kernel/debug/debug_core.h
@@ -0,0 +1,81 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#ifndef _DEBUG_CORE_H_
12#define _DEBUG_CORE_H_
13/*
14 * These are the private implementation headers between the kernel
15 * debugger core and the debugger front end code.
16 */
17
18/* kernel debug core data structures */
19struct kgdb_state {
20 int ex_vector;
21 int signo;
22 int err_code;
23 int cpu;
24 int pass_exception;
25 unsigned long thr_query;
26 unsigned long threadid;
27 long kgdb_usethreadid;
28 struct pt_regs *linux_regs;
29};
30
31/* Exception state values */
32#define DCPU_WANT_MASTER 0x1 /* Waiting to become a master kgdb cpu */
33#define DCPU_NEXT_MASTER 0x2 /* Transition from one master cpu to another */
34#define DCPU_IS_SLAVE 0x4 /* Slave cpu enter exception */
35#define DCPU_SSTEP 0x8 /* CPU is single stepping */
36
37struct debuggerinfo_struct {
38 void *debuggerinfo;
39 struct task_struct *task;
40 int exception_state;
41 int ret_state;
42 int irq_depth;
43};
44
45extern struct debuggerinfo_struct kgdb_info[];
46
47/* kernel debug core break point routines */
48extern int dbg_remove_all_break(void);
49extern int dbg_set_sw_break(unsigned long addr);
50extern int dbg_remove_sw_break(unsigned long addr);
51extern int dbg_activate_sw_breakpoints(void);
52extern int dbg_deactivate_sw_breakpoints(void);
53
54/* polled character access to i/o module */
55extern int dbg_io_get_char(void);
56
57/* stub return value for switching between the gdbstub and kdb */
58#define DBG_PASS_EVENT -12345
59/* Switch from one cpu to another */
60#define DBG_SWITCH_CPU_EVENT -123456
61extern int dbg_switch_cpu;
62
63/* gdbstub interface functions */
64extern int gdb_serial_stub(struct kgdb_state *ks);
65extern void gdbstub_msg_write(const char *s, int len);
66
67/* gdbstub functions used for kdb <-> gdbstub transition */
68extern int gdbstub_state(struct kgdb_state *ks, char *cmd);
69extern int dbg_kdb_mode;
70
71#ifdef CONFIG_KGDB_KDB
72extern int kdb_stub(struct kgdb_state *ks);
73extern int kdb_parse(const char *cmdstr);
74#else /* ! CONFIG_KGDB_KDB */
75static inline int kdb_stub(struct kgdb_state *ks)
76{
77 return DBG_PASS_EVENT;
78}
79#endif /* CONFIG_KGDB_KDB */
80
81#endif /* _DEBUG_CORE_H_ */
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
new file mode 100644
index 000000000000..6e81fd59566b
--- /dev/null
+++ b/kernel/debug/gdbstub.c
@@ -0,0 +1,1014 @@
1/*
2 * Kernel Debug Core
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@ucw.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2009 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30
31#include <linux/kernel.h>
32#include <linux/kgdb.h>
33#include <linux/kdb.h>
34#include <linux/reboot.h>
35#include <linux/uaccess.h>
36#include <asm/cacheflush.h>
37#include <asm/unaligned.h>
38#include "debug_core.h"
39
40#define KGDB_MAX_THREAD_QUERY 17
41
42/* Our I/O buffers. */
43static char remcom_in_buffer[BUFMAX];
44static char remcom_out_buffer[BUFMAX];
45
46/* Storage for the registers, in GDB format. */
47static unsigned long gdb_regs[(NUMREGBYTES +
48 sizeof(unsigned long) - 1) /
49 sizeof(unsigned long)];
50
51/*
52 * GDB remote protocol parser:
53 */
54
55static int hex(char ch)
56{
57 if ((ch >= 'a') && (ch <= 'f'))
58 return ch - 'a' + 10;
59 if ((ch >= '0') && (ch <= '9'))
60 return ch - '0';
61 if ((ch >= 'A') && (ch <= 'F'))
62 return ch - 'A' + 10;
63 return -1;
64}
65
66#ifdef CONFIG_KGDB_KDB
67static int gdbstub_read_wait(void)
68{
69 int ret = -1;
70 int i;
71
72 /* poll any additional I/O interfaces that are defined */
73 while (ret < 0)
74 for (i = 0; kdb_poll_funcs[i] != NULL; i++) {
75 ret = kdb_poll_funcs[i]();
76 if (ret > 0)
77 break;
78 }
79 return ret;
80}
81#else
82static int gdbstub_read_wait(void)
83{
84 int ret = dbg_io_ops->read_char();
85 while (ret == NO_POLL_CHAR)
86 ret = dbg_io_ops->read_char();
87 return ret;
88}
89#endif
90/* scan for the sequence $<data>#<checksum> */
91static void get_packet(char *buffer)
92{
93 unsigned char checksum;
94 unsigned char xmitcsum;
95 int count;
96 char ch;
97
98 do {
99 /*
100 * Spin and wait around for the start character, ignore all
101 * other characters:
102 */
103 while ((ch = (gdbstub_read_wait())) != '$')
104 /* nothing */;
105
106 kgdb_connected = 1;
107 checksum = 0;
108 xmitcsum = -1;
109
110 count = 0;
111
112 /*
113 * now, read until a # or end of buffer is found:
114 */
115 while (count < (BUFMAX - 1)) {
116 ch = gdbstub_read_wait();
117 if (ch == '#')
118 break;
119 checksum = checksum + ch;
120 buffer[count] = ch;
121 count = count + 1;
122 }
123 buffer[count] = 0;
124
125 if (ch == '#') {
126 xmitcsum = hex(gdbstub_read_wait()) << 4;
127 xmitcsum += hex(gdbstub_read_wait());
128
129 if (checksum != xmitcsum)
130 /* failed checksum */
131 dbg_io_ops->write_char('-');
132 else
133 /* successful transfer */
134 dbg_io_ops->write_char('+');
135 if (dbg_io_ops->flush)
136 dbg_io_ops->flush();
137 }
138 } while (checksum != xmitcsum);
139}
140
141/*
142 * Send the packet in buffer.
143 * Check for gdb connection if asked for.
144 */
145static void put_packet(char *buffer)
146{
147 unsigned char checksum;
148 int count;
149 char ch;
150
151 /*
152 * $<packet info>#<checksum>.
153 */
154 while (1) {
155 dbg_io_ops->write_char('$');
156 checksum = 0;
157 count = 0;
158
159 while ((ch = buffer[count])) {
160 dbg_io_ops->write_char(ch);
161 checksum += ch;
162 count++;
163 }
164
165 dbg_io_ops->write_char('#');
166 dbg_io_ops->write_char(hex_asc_hi(checksum));
167 dbg_io_ops->write_char(hex_asc_lo(checksum));
168 if (dbg_io_ops->flush)
169 dbg_io_ops->flush();
170
171 /* Now see what we get in reply. */
172 ch = gdbstub_read_wait();
173
174 if (ch == 3)
175 ch = gdbstub_read_wait();
176
177 /* If we get an ACK, we are done. */
178 if (ch == '+')
179 return;
180
181 /*
182 * If we get the start of another packet, this means
183 * that GDB is attempting to reconnect. We will NAK
184 * the packet being sent, and stop trying to send this
185 * packet.
186 */
187 if (ch == '$') {
188 dbg_io_ops->write_char('-');
189 if (dbg_io_ops->flush)
190 dbg_io_ops->flush();
191 return;
192 }
193 }
194}
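
get_packet() and put_packet() above implement the gdb remote serial framing: a payload travels as $<data>#<checksum>, where the checksum is the low eight bits of the byte sum of the payload, and the receiver answers '+' (ACK) or '-' (NAK). A standalone encoder for that framing; frame_packet() is invented for the demo.

#include <stdio.h>

/* Frame a payload as $<data>#<checksum>; the checksum is the same byte sum
 * that get_packet()/put_packet() accumulate one character at a time. */
static void frame_packet(const char *payload, char *out, size_t outsz)
{
	unsigned char checksum = 0;

	for (const char *p = payload; *p; p++)
		checksum += (unsigned char)*p;
	snprintf(out, outsz, "$%s#%02x", payload, checksum);
}

int main(void)
{
	char framed[64];

	frame_packet("OK", framed, sizeof(framed));    /* -> $OK#9a, the usual success reply */
	printf("%s\n", framed);
	frame_packet("S05", framed, sizeof(framed));   /* -> $S05#b8, a stop reply for signal 5 */
	printf("%s\n", framed);
	return 0;
}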
195
196static char gdbmsgbuf[BUFMAX + 1];
197
198void gdbstub_msg_write(const char *s, int len)
199{
200 char *bufptr;
201 int wcount;
202 int i;
203
204 if (len == 0)
205 len = strlen(s);
206
207 /* 'O'utput */
208 gdbmsgbuf[0] = 'O';
209
210 /* Fill and send buffers... */
211 while (len > 0) {
212 bufptr = gdbmsgbuf + 1;
213
214 /* Calculate how many this time */
215 if ((len << 1) > (BUFMAX - 2))
216 wcount = (BUFMAX - 2) >> 1;
217 else
218 wcount = len;
219
220 /* Pack in hex chars */
221 for (i = 0; i < wcount; i++)
222 bufptr = pack_hex_byte(bufptr, s[i]);
223 *bufptr = '\0';
224
225 /* Move up */
226 s += wcount;
227 len -= wcount;
228
229 /* Write packet */
230 put_packet(gdbmsgbuf);
231 }
232}
233
234/*
235 * Convert the memory pointed to by mem into hex, placing result in
236 * buf. Return a pointer to the last char put in buf (null). May
237 * return an error.
238 */
239int kgdb_mem2hex(char *mem, char *buf, int count)
240{
241 char *tmp;
242 int err;
243
244 /*
245 * We use the upper half of buf as an intermediate buffer for the
246 * raw memory copy. Hex conversion will work against this one.
247 */
248 tmp = buf + count;
249
250 err = probe_kernel_read(tmp, mem, count);
251 if (!err) {
252 while (count > 0) {
253 buf = pack_hex_byte(buf, *tmp);
254 tmp++;
255 count--;
256 }
257
258 *buf = 0;
259 }
260
261 return err;
262}
263
264/*
265 * Convert the hex array pointed to by buf into binary to be placed in
266 * mem. Return a pointer to the character AFTER the last byte
267 * written. May return an error.
268 */
269int kgdb_hex2mem(char *buf, char *mem, int count)
270{
271 char *tmp_raw;
272 char *tmp_hex;
273
274 /*
275 * We use the upper half of buf as an intermediate buffer for the
276 * raw memory that is converted from hex.
277 */
278 tmp_raw = buf + count * 2;
279
280 tmp_hex = tmp_raw - 1;
281 while (tmp_hex >= buf) {
282 tmp_raw--;
283 *tmp_raw = hex(*tmp_hex--);
284 *tmp_raw |= hex(*tmp_hex--) << 4;
285 }
286
287 return probe_kernel_write(mem, tmp_raw, count);
288}
289
290/*
291 * While we find nice hex chars, build a long_val.
292 * Return number of chars processed.
293 */
294int kgdb_hex2long(char **ptr, unsigned long *long_val)
295{
296 int hex_val;
297 int num = 0;
298 int negate = 0;
299
300 *long_val = 0;
301
302 if (**ptr == '-') {
303 negate = 1;
304 (*ptr)++;
305 }
306 while (**ptr) {
307 hex_val = hex(**ptr);
308 if (hex_val < 0)
309 break;
310
311 *long_val = (*long_val << 4) | hex_val;
312 num++;
313 (*ptr)++;
314 }
315
316 if (negate)
317 *long_val = -*long_val;
318
319 return num;
320}
321
322/*
 323 * Copy the binary array pointed to by buf into mem. Unescape the $, #
 324 * and 0x7d bytes that were escaped with 0x7d. Return -EFAULT on failure
 325 * or 0 on success. The input buf is overwritten with the result to write to mem.
326 */
327static int kgdb_ebin2mem(char *buf, char *mem, int count)
328{
329 int size = 0;
330 char *c = buf;
331
332 while (count-- > 0) {
333 c[size] = *buf++;
334 if (c[size] == 0x7d)
335 c[size] = *buf++ ^ 0x20;
336 size++;
337 }
338
339 return probe_kernel_write(mem, c, size);
340}
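
kgdb_ebin2mem() above undoes the binary-packet escaping used by the 'X' command: a 0x7d byte means "XOR the next byte with 0x20", which is how $, # and 0x7d itself survive the packet framing. A standalone decoder doing the same transformation in place; unescape_demo() is invented for the demo and skips the probe_kernel_write() step.

#include <stdio.h>

/* Decode the gdb remote binary escaping in place and return the decoded length. */
static int unescape_demo(unsigned char *buf, int count)
{
	int in = 0, out = 0;

	while (in < count) {
		unsigned char c = buf[in++];

		if (c == 0x7d && in < count)
			c = buf[in++] ^ 0x20;
		buf[out++] = c;
	}
	return out;
}

int main(void)
{
	/* 0x7d 0x03 decodes to '#' (0x23); 0x7d 0x5d decodes to 0x7d itself. */
	unsigned char data[] = { 'a', 0x7d, 0x03, 'b', 0x7d, 0x5d, 'c' };
	int n = unescape_demo(data, sizeof(data));

	for (int i = 0; i < n; i++)
		printf("%02x ", data[i]);      /* prints: 61 23 62 7d 63 */
	printf("\n");
	return 0;
}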
341
342/* Write memory due to an 'M' or 'X' packet. */
343static int write_mem_msg(int binary)
344{
345 char *ptr = &remcom_in_buffer[1];
346 unsigned long addr;
347 unsigned long length;
348 int err;
349
350 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
351 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
352 if (binary)
353 err = kgdb_ebin2mem(ptr, (char *)addr, length);
354 else
355 err = kgdb_hex2mem(ptr, (char *)addr, length);
356 if (err)
357 return err;
358 if (CACHE_FLUSH_IS_SAFE)
359 flush_icache_range(addr, addr + length);
360 return 0;
361 }
362
363 return -EINVAL;
364}
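
write_mem_msg() above expects the body of an 'M' or 'X' packet to look like <addr>,<length>:<data>, with the address and length pulled out by kgdb_hex2long(). A standalone version of that parse; hex_digit() and hex2long_demo() are simplified stand-ins that skip the leading '-' handling of the real helper.

#include <stdio.h>

static int hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
	return -1;
}

/* Consume hex digits, advance *ptr past them, return how many were used. */
static int hex2long_demo(char **ptr, unsigned long *val)
{
	int num = 0, d;

	*val = 0;
	while ((d = hex_digit(**ptr)) >= 0) {
		*val = (*val << 4) | d;
		(*ptr)++;
		num++;
	}
	return num;
}

int main(void)
{
	/* Body of an 'M' packet after the command byte: <addr>,<length>:<data> */
	char buf[] = "c01fa0,4:deadbeef";
	char *p = buf;
	unsigned long addr, len;

	if (hex2long_demo(&p, &addr) > 0 && *p++ == ',' &&
	    hex2long_demo(&p, &len) > 0 && *p++ == ':')
		printf("write %lu bytes at 0x%lx, payload \"%s\"\n", len, addr, p);
	return 0;
}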
365
366static void error_packet(char *pkt, int error)
367{
368 error = -error;
369 pkt[0] = 'E';
370 pkt[1] = hex_asc[(error / 10)];
371 pkt[2] = hex_asc[(error % 10)];
372 pkt[3] = '\0';
373}
374
375/*
376 * Thread ID accessors. We represent a flat TID space to GDB, where
377 * the per CPU idle threads (which under Linux all have PID 0) are
378 * remapped to negative TIDs.
379 */
380
381#define BUF_THREAD_ID_SIZE 16
382
383static char *pack_threadid(char *pkt, unsigned char *id)
384{
385 char *limit;
386
387 limit = pkt + BUF_THREAD_ID_SIZE;
388 while (pkt < limit)
389 pkt = pack_hex_byte(pkt, *id++);
390
391 return pkt;
392}
393
394static void int_to_threadref(unsigned char *id, int value)
395{
396 unsigned char *scan;
397 int i = 4;
398
399 scan = (unsigned char *)id;
400 while (i--)
401 *scan++ = 0;
402 put_unaligned_be32(value, scan);
403}
404
405static struct task_struct *getthread(struct pt_regs *regs, int tid)
406{
407 /*
408 * Non-positive TIDs are remapped to the cpu shadow information
409 */
410 if (tid == 0 || tid == -1)
411 tid = -atomic_read(&kgdb_active) - 2;
412 if (tid < -1 && tid > -NR_CPUS - 2) {
413 if (kgdb_info[-tid - 2].task)
414 return kgdb_info[-tid - 2].task;
415 else
416 return idle_task(-tid - 2);
417 }
418 if (tid <= 0) {
419 printk(KERN_ERR "KGDB: Internal thread select error\n");
420 dump_stack();
421 return NULL;
422 }
423
424 /*
425 * find_task_by_pid_ns() does not take the tasklist lock anymore
426 * but is nicely RCU locked - hence is a pretty resilient
427 * thing to use:
428 */
429 return find_task_by_pid_ns(tid, &init_pid_ns);
430}
431
432
433/*
434 * Remap normal tasks to their real PID,
435 * CPU shadow threads are mapped to -CPU - 2
436 */
437static inline int shadow_pid(int realpid)
438{
439 if (realpid)
440 return realpid;
441
442 return -raw_smp_processor_id() - 2;
443}
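
getthread(), int_to_threadref() and shadow_pid() above present GDB with a flat thread-ID space: real tasks keep their PID, the per-CPU idle/shadow threads (PID 0) become -cpu - 2, and a thread reference is eight bytes with the value packed big-endian into the tail. A standalone illustration of both mappings; the _demo helpers are invented and put_unaligned_be32() is spelled out by hand.

#include <stdio.h>
#include <string.h>

static int shadow_pid_demo(int realpid, int cpu)
{
	return realpid ? realpid : -cpu - 2;
}

static void int_to_threadref_demo(unsigned char *id, int value)
{
	memset(id, 0, 8);                   /* first four bytes stay zero */
	id[4] = (value >> 24) & 0xff;       /* equivalent of put_unaligned_be32(value, id + 4) */
	id[5] = (value >> 16) & 0xff;
	id[6] = (value >> 8) & 0xff;
	id[7] = value & 0xff;
}

int main(void)
{
	unsigned char ref[8];
	int tid = shadow_pid_demo(0, 1);    /* idle thread on CPU 1 maps to -3 */

	int_to_threadref_demo(ref, tid);
	printf("tid %d packs to ", tid);
	for (int i = 0; i < 8; i++)
		printf("%02x", ref[i]);     /* 00000000fffffffd */
	printf("\n");
	return 0;
}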
444
445/*
446 * All the functions that start with gdb_cmd are the various
447 * operations to implement the handlers for the gdbserial protocol
448 * where KGDB is communicating with an external debugger
449 */
450
451/* Handle the '?' status packets */
452static void gdb_cmd_status(struct kgdb_state *ks)
453{
454 /*
455 * We know that this packet is only sent
456 * during initial connect. So to be safe,
457 * we clear out our breakpoints now in case
458 * GDB is reconnecting.
459 */
460 dbg_remove_all_break();
461
462 remcom_out_buffer[0] = 'S';
463 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
464}
465
466/* Handle the 'g' get registers request */
467static void gdb_cmd_getregs(struct kgdb_state *ks)
468{
469 struct task_struct *thread;
470 void *local_debuggerinfo;
471 int i;
472
473 thread = kgdb_usethread;
474 if (!thread) {
475 thread = kgdb_info[ks->cpu].task;
476 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
477 } else {
478 local_debuggerinfo = NULL;
479 for_each_online_cpu(i) {
480 /*
 481 * Try to find the task on some other
 482 * or possibly this node. If we do not
 483 * find the matching task, we try
 484 * to approximate the results.
485 */
486 if (thread == kgdb_info[i].task)
487 local_debuggerinfo = kgdb_info[i].debuggerinfo;
488 }
489 }
490
491 /*
492 * All threads that don't have debuggerinfo should be
493 * in schedule() sleeping, since all other CPUs
494 * are in kgdb_wait, and thus have debuggerinfo.
495 */
496 if (local_debuggerinfo) {
497 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
498 } else {
499 /*
500 * Pull stuff saved during switch_to; nothing
501 * else is accessible (or even particularly
502 * relevant).
503 *
504 * This should be enough for a stack trace.
505 */
506 sleeping_thread_to_gdb_regs(gdb_regs, thread);
507 }
508 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
509}
510
511/* Handle the 'G' set registers request */
512static void gdb_cmd_setregs(struct kgdb_state *ks)
513{
514 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
515
516 if (kgdb_usethread && kgdb_usethread != current) {
517 error_packet(remcom_out_buffer, -EINVAL);
518 } else {
519 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
520 strcpy(remcom_out_buffer, "OK");
521 }
522}
523
524/* Handle the 'm' memory read bytes */
525static void gdb_cmd_memread(struct kgdb_state *ks)
526{
527 char *ptr = &remcom_in_buffer[1];
528 unsigned long length;
529 unsigned long addr;
530 int err;
531
532 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
533 kgdb_hex2long(&ptr, &length) > 0) {
534 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
535 if (err)
536 error_packet(remcom_out_buffer, err);
537 } else {
538 error_packet(remcom_out_buffer, -EINVAL);
539 }
540}
541
542/* Handle the 'M' memory write bytes */
543static void gdb_cmd_memwrite(struct kgdb_state *ks)
544{
545 int err = write_mem_msg(0);
546
547 if (err)
548 error_packet(remcom_out_buffer, err);
549 else
550 strcpy(remcom_out_buffer, "OK");
551}
552
553/* Handle the 'X' memory binary write bytes */
554static void gdb_cmd_binwrite(struct kgdb_state *ks)
555{
556 int err = write_mem_msg(1);
557
558 if (err)
559 error_packet(remcom_out_buffer, err);
560 else
561 strcpy(remcom_out_buffer, "OK");
562}
563
564/* Handle the 'D' or 'k', detach or kill packets */
565static void gdb_cmd_detachkill(struct kgdb_state *ks)
566{
567 int error;
568
569 /* The detach case */
570 if (remcom_in_buffer[0] == 'D') {
571 error = dbg_remove_all_break();
572 if (error < 0) {
573 error_packet(remcom_out_buffer, error);
574 } else {
575 strcpy(remcom_out_buffer, "OK");
576 kgdb_connected = 0;
577 }
578 put_packet(remcom_out_buffer);
579 } else {
580 /*
581 * Assume the kill case, with no exit code checking,
582 * trying to force detach the debugger:
583 */
584 dbg_remove_all_break();
585 kgdb_connected = 0;
586 }
587}
588
589/* Handle the 'R' reboot packets */
590static int gdb_cmd_reboot(struct kgdb_state *ks)
591{
592 /* For now, only honor R0 */
593 if (strcmp(remcom_in_buffer, "R0") == 0) {
594 printk(KERN_CRIT "Executing emergency reboot\n");
595 strcpy(remcom_out_buffer, "OK");
596 put_packet(remcom_out_buffer);
597
598 /*
599 * Execution should not return from
600 * machine_emergency_restart()
601 */
602 machine_emergency_restart();
603 kgdb_connected = 0;
604
605 return 1;
606 }
607 return 0;
608}
609
610/* Handle the 'q' query packets */
611static void gdb_cmd_query(struct kgdb_state *ks)
612{
613 struct task_struct *g;
614 struct task_struct *p;
615 unsigned char thref[8];
616 char *ptr;
617 int i;
618 int cpu;
619 int finished = 0;
620
621 switch (remcom_in_buffer[1]) {
622 case 's':
623 case 'f':
624 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10))
625 break;
626
627 i = 0;
628 remcom_out_buffer[0] = 'm';
629 ptr = remcom_out_buffer + 1;
630 if (remcom_in_buffer[1] == 'f') {
631 /* Each cpu is a shadow thread */
632 for_each_online_cpu(cpu) {
633 ks->thr_query = 0;
634 int_to_threadref(thref, -cpu - 2);
635 pack_threadid(ptr, thref);
636 ptr += BUF_THREAD_ID_SIZE;
637 *(ptr++) = ',';
638 i++;
639 }
640 }
641
642 do_each_thread(g, p) {
643 if (i >= ks->thr_query && !finished) {
644 int_to_threadref(thref, p->pid);
645 pack_threadid(ptr, thref);
646 ptr += BUF_THREAD_ID_SIZE;
647 *(ptr++) = ',';
648 ks->thr_query++;
649 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
650 finished = 1;
651 }
652 i++;
653 } while_each_thread(g, p);
654
655 *(--ptr) = '\0';
656 break;
657
658 case 'C':
659 /* Current thread id */
660 strcpy(remcom_out_buffer, "QC");
661 ks->threadid = shadow_pid(current->pid);
662 int_to_threadref(thref, ks->threadid);
663 pack_threadid(remcom_out_buffer + 2, thref);
664 break;
665 case 'T':
666 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16))
667 break;
668
669 ks->threadid = 0;
670 ptr = remcom_in_buffer + 17;
671 kgdb_hex2long(&ptr, &ks->threadid);
672 if (!getthread(ks->linux_regs, ks->threadid)) {
673 error_packet(remcom_out_buffer, -EINVAL);
674 break;
675 }
676 if ((int)ks->threadid > 0) {
677 kgdb_mem2hex(getthread(ks->linux_regs,
678 ks->threadid)->comm,
679 remcom_out_buffer, 16);
680 } else {
681 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
682
683 sprintf(tmpstr, "shadowCPU%d",
684 (int)(-ks->threadid - 2));
685 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
686 }
687 break;
688#ifdef CONFIG_KGDB_KDB
689 case 'R':
690 if (strncmp(remcom_in_buffer, "qRcmd,", 6) == 0) {
691 int len = strlen(remcom_in_buffer + 6);
692
693 if ((len % 2) != 0) {
694 strcpy(remcom_out_buffer, "E01");
695 break;
696 }
697 kgdb_hex2mem(remcom_in_buffer + 6,
698 remcom_out_buffer, len);
699 len = len / 2;
700 remcom_out_buffer[len++] = 0;
701
702 kdb_parse(remcom_out_buffer);
703 strcpy(remcom_out_buffer, "OK");
704 }
705 break;
706#endif
707 }
708}
709
710/* Handle the 'H' task query packets */
711static void gdb_cmd_task(struct kgdb_state *ks)
712{
713 struct task_struct *thread;
714 char *ptr;
715
716 switch (remcom_in_buffer[1]) {
717 case 'g':
718 ptr = &remcom_in_buffer[2];
719 kgdb_hex2long(&ptr, &ks->threadid);
720 thread = getthread(ks->linux_regs, ks->threadid);
721 if (!thread && ks->threadid > 0) {
722 error_packet(remcom_out_buffer, -EINVAL);
723 break;
724 }
725 kgdb_usethread = thread;
726 ks->kgdb_usethreadid = ks->threadid;
727 strcpy(remcom_out_buffer, "OK");
728 break;
729 case 'c':
730 ptr = &remcom_in_buffer[2];
731 kgdb_hex2long(&ptr, &ks->threadid);
732 if (!ks->threadid) {
733 kgdb_contthread = NULL;
734 } else {
735 thread = getthread(ks->linux_regs, ks->threadid);
736 if (!thread && ks->threadid > 0) {
737 error_packet(remcom_out_buffer, -EINVAL);
738 break;
739 }
740 kgdb_contthread = thread;
741 }
742 strcpy(remcom_out_buffer, "OK");
743 break;
744 }
745}
746
747/* Handle the 'T' thread query packets */
748static void gdb_cmd_thread(struct kgdb_state *ks)
749{
750 char *ptr = &remcom_in_buffer[1];
751 struct task_struct *thread;
752
753 kgdb_hex2long(&ptr, &ks->threadid);
754 thread = getthread(ks->linux_regs, ks->threadid);
755 if (thread)
756 strcpy(remcom_out_buffer, "OK");
757 else
758 error_packet(remcom_out_buffer, -EINVAL);
759}
760
761/* Handle the 'z' or 'Z' breakpoint remove or set packets */
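/*
 * Packet format is "Z<type>,<addr>,<length>" to set and
 * "z<type>,<addr>,<length>" to remove; type '0' is a software breakpoint
 * and types '1'-'4' are hardware breakpoints/watchpoints handled through
 * arch_kgdb_ops when the architecture provides them.
 */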
762static void gdb_cmd_break(struct kgdb_state *ks)
763{
764 /*
 765	 * Since GDB 5.3 the remote protocol defines '0' as a software
 766	 * breakpoint and '1' as a hardware breakpoint, so honor that.
767 */
768 char *bpt_type = &remcom_in_buffer[1];
769 char *ptr = &remcom_in_buffer[2];
770 unsigned long addr;
771 unsigned long length;
772 int error = 0;
773
774 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
775 /* Unsupported */
776 if (*bpt_type > '4')
777 return;
778 } else {
779 if (*bpt_type != '0' && *bpt_type != '1')
780 /* Unsupported. */
781 return;
782 }
783
784 /*
785 * Test if this is a hardware breakpoint, and
786 * if we support it:
787 */
788 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
789 /* Unsupported. */
790 return;
791
792 if (*(ptr++) != ',') {
793 error_packet(remcom_out_buffer, -EINVAL);
794 return;
795 }
796 if (!kgdb_hex2long(&ptr, &addr)) {
797 error_packet(remcom_out_buffer, -EINVAL);
798 return;
799 }
800 if (*(ptr++) != ',' ||
801 !kgdb_hex2long(&ptr, &length)) {
802 error_packet(remcom_out_buffer, -EINVAL);
803 return;
804 }
805
806 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
807 error = dbg_set_sw_break(addr);
808 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
809 error = dbg_remove_sw_break(addr);
810 else if (remcom_in_buffer[0] == 'Z')
811 error = arch_kgdb_ops.set_hw_breakpoint(addr,
812 (int)length, *bpt_type - '0');
813 else if (remcom_in_buffer[0] == 'z')
814 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
815 (int) length, *bpt_type - '0');
816
817 if (error == 0)
818 strcpy(remcom_out_buffer, "OK");
819 else
820 error_packet(remcom_out_buffer, error);
821}
822
823/* Handle the 'C' signal / exception passing packets */
824static int gdb_cmd_exception_pass(struct kgdb_state *ks)
825{
826 /* C09 == pass exception
827 * C15 == detach kgdb, pass exception
828 */
829 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
830
831 ks->pass_exception = 1;
832 remcom_in_buffer[0] = 'c';
833
834 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
835
836 ks->pass_exception = 1;
837 remcom_in_buffer[0] = 'D';
838 dbg_remove_all_break();
839 kgdb_connected = 0;
840 return 1;
841
842 } else {
843 gdbstub_msg_write("KGDB only knows signal 9 (pass)"
844 " and 15 (pass and disconnect)\n"
845 "Executing a continue without signal passing\n", 0);
846 remcom_in_buffer[0] = 'c';
847 }
848
849 /* Indicate fall through */
850 return -1;
851}
852
853/*
 854 * This function performs all gdbserial command processing
855 */
856int gdb_serial_stub(struct kgdb_state *ks)
857{
858 int error = 0;
859 int tmp;
860
861 /* Clear the out buffer. */
862 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
863
864 if (kgdb_connected) {
865 unsigned char thref[8];
866 char *ptr;
867
868 /* Reply to host that an exception has occurred */
869 ptr = remcom_out_buffer;
870 *ptr++ = 'T';
871 ptr = pack_hex_byte(ptr, ks->signo);
872 ptr += strlen(strcpy(ptr, "thread:"));
873 int_to_threadref(thref, shadow_pid(current->pid));
874 ptr = pack_threadid(ptr, thref);
875 *ptr++ = ';';
876 put_packet(remcom_out_buffer);
877 }
878
879 kgdb_usethread = kgdb_info[ks->cpu].task;
880 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
881 ks->pass_exception = 0;
882
883 while (1) {
884 error = 0;
885
886 /* Clear the out buffer. */
887 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
888
889 get_packet(remcom_in_buffer);
890
891 switch (remcom_in_buffer[0]) {
892 case '?': /* gdbserial status */
893 gdb_cmd_status(ks);
894 break;
895 case 'g': /* return the value of the CPU registers */
896 gdb_cmd_getregs(ks);
897 break;
898 case 'G': /* set the value of the CPU registers - return OK */
899 gdb_cmd_setregs(ks);
900 break;
901 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
902 gdb_cmd_memread(ks);
903 break;
904 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
905 gdb_cmd_memwrite(ks);
906 break;
907 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
908 gdb_cmd_binwrite(ks);
909 break;
910 /* kill or detach. KGDB should treat this like a
911 * continue.
912 */
913 case 'D': /* Debugger detach */
914 case 'k': /* Debugger detach via kill */
915 gdb_cmd_detachkill(ks);
916 goto default_handle;
917 case 'R': /* Reboot */
918 if (gdb_cmd_reboot(ks))
919 goto default_handle;
920 break;
921 case 'q': /* query command */
922 gdb_cmd_query(ks);
923 break;
924 case 'H': /* task related */
925 gdb_cmd_task(ks);
926 break;
927 case 'T': /* Query thread status */
928 gdb_cmd_thread(ks);
929 break;
930 case 'z': /* Break point remove */
931 case 'Z': /* Break point set */
932 gdb_cmd_break(ks);
933 break;
934#ifdef CONFIG_KGDB_KDB
 935		case '3': /* Escape back into kdb */
936 if (remcom_in_buffer[1] == '\0') {
937 gdb_cmd_detachkill(ks);
938 return DBG_PASS_EVENT;
939 }
940#endif
941 case 'C': /* Exception passing */
942 tmp = gdb_cmd_exception_pass(ks);
943 if (tmp > 0)
944 goto default_handle;
945 if (tmp == 0)
946 break;
947 /* Fall through on tmp < 0 */
948 case 'c': /* Continue packet */
949 case 's': /* Single step packet */
950 if (kgdb_contthread && kgdb_contthread != current) {
951 /* Can't switch threads in kgdb */
952 error_packet(remcom_out_buffer, -EINVAL);
953 break;
954 }
955 dbg_activate_sw_breakpoints();
956 /* Fall through to default processing */
957 default:
958default_handle:
959 error = kgdb_arch_handle_exception(ks->ex_vector,
960 ks->signo,
961 ks->err_code,
962 remcom_in_buffer,
963 remcom_out_buffer,
964 ks->linux_regs);
965 /*
966 * Leave cmd processing on error, detach,
967 * kill, continue, or single step.
968 */
969 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
970 remcom_in_buffer[0] == 'k') {
971 error = 0;
972 goto kgdb_exit;
973 }
974
975 }
976
977 /* reply to the request */
978 put_packet(remcom_out_buffer);
979 }
980
981kgdb_exit:
982 if (ks->pass_exception)
983 error = 1;
984 return error;
985}
986
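/*
 * gdbstub_state - kdb's hook into the gdbstub, driven with a synthesized
 * command: "e" runs the arch exception handler to set up the resume state,
 * "s"/"c" queue a single-step or continue packet, "?" reports stop status,
 * and "" sends an empty reply; the last two also ack with '+' and transmit
 * the response packet.
 */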
987int gdbstub_state(struct kgdb_state *ks, char *cmd)
988{
989 int error;
990
991 switch (cmd[0]) {
992 case 'e':
993 error = kgdb_arch_handle_exception(ks->ex_vector,
994 ks->signo,
995 ks->err_code,
996 remcom_in_buffer,
997 remcom_out_buffer,
998 ks->linux_regs);
999 return error;
1000 case 's':
1001 case 'c':
1002 strcpy(remcom_in_buffer, cmd);
1003 return 0;
1004 case '?':
1005 gdb_cmd_status(ks);
1006 break;
1007 case '\0':
1008 strcpy(remcom_out_buffer, "");
1009 break;
1010 }
1011 dbg_io_ops->write_char('+');
1012 put_packet(remcom_out_buffer);
1013 return 0;
1014}
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
new file mode 100644
index 000000000000..396d12eda9e8
--- /dev/null
+++ b/kernel/debug/kdb/.gitignore
@@ -0,0 +1 @@
gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
new file mode 100644
index 000000000000..d4fc58f4b88d
--- /dev/null
+++ b/kernel/debug/kdb/Makefile
@@ -0,0 +1,25 @@
1# This file is subject to the terms and conditions of the GNU General Public
2# License. See the file "COPYING" in the main directory of this archive
3# for more details.
4#
5# Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
6# Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
7#
8
9CCVERSION := $(shell $(CC) -v 2>&1 | sed -ne '$$p')
10obj-y := kdb_io.o kdb_main.o kdb_support.o kdb_bt.o gen-kdb_cmds.o kdb_bp.o kdb_debugger.o
11obj-$(CONFIG_KDB_KEYBOARD) += kdb_keyboard.o
12
13clean-files := gen-kdb_cmds.c
14
15quiet_cmd_gen-kdb = GENKDB $@
16 cmd_gen-kdb = $(AWK) 'BEGIN {print "\#include <linux/stddef.h>"; print "\#include <linux/init.h>"} \
17 /^\#/{next} \
18 /^[ \t]*$$/{next} \
19 {gsub(/"/, "\\\"", $$0); \
20 print "static __initdata char kdb_cmd" cmds++ "[] = \"" $$0 "\\n\";"} \
21 END {print "extern char *kdb_cmds[]; char __initdata *kdb_cmds[] = {"; for (i = 0; i < cmds; ++i) {print " kdb_cmd" i ","}; print(" NULL\n};");}' \
22 $(filter-out %/Makefile,$^) > $@#
23
24$(obj)/gen-kdb_cmds.c: $(src)/kdb_cmds $(src)/Makefile
25 $(call cmd,gen-kdb)
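# The awk rule above turns each non-comment, non-blank line of kdb_cmds into
# a static __initdata string and collects them in the NULL-terminated
# kdb_cmds[] array.  For example, a kdb_cmds line such as
#	set LINES 10000
# ends up (roughly) as
#	static __initdata char kdb_cmd0[] = "set LINES 10000\n";
# with every kdb_cmdN listed in kdb_cmds[].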
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
new file mode 100644
index 000000000000..75bd9b3ebbb7
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -0,0 +1,564 @@
1/*
2 * Kernel Debugger Architecture Independent Breakpoint Handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/string.h>
13#include <linux/kernel.h>
14#include <linux/init.h>
15#include <linux/kdb.h>
16#include <linux/kgdb.h>
17#include <linux/smp.h>
18#include <linux/sched.h>
19#include <linux/interrupt.h>
20#include "kdb_private.h"
21
22/*
23 * Table of kdb_breakpoints
24 */
25kdb_bp_t kdb_breakpoints[KDB_MAXBPT];
26
27static void kdb_setsinglestep(struct pt_regs *regs)
28{
29 KDB_STATE_SET(DOING_SS);
30}
31
32static char *kdb_rwtypes[] = {
33 "Instruction(i)",
34 "Instruction(Register)",
35 "Data Write",
36 "I/O",
37 "Data Access"
38};
39
40static char *kdb_bptype(kdb_bp_t *bp)
41{
42 if (bp->bp_type < 0 || bp->bp_type > 4)
43 return "";
44
45 return kdb_rwtypes[bp->bp_type];
46}
47
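/*
 * Parse the optional trailing arguments of the bp/bph commands:
 * "datar"/"dataw"/"inst" select the hardware breakpoint type and may be
 * followed by a length of at most 8 bytes (default 1).  Returns 0 on
 * success or a kdb diagnostic on bad input.
 */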
48static int kdb_parsebp(int argc, const char **argv, int *nextargp, kdb_bp_t *bp)
49{
50 int nextarg = *nextargp;
51 int diag;
52
53 bp->bph_length = 1;
54 if ((argc + 1) != nextarg) {
55 if (strnicmp(argv[nextarg], "datar", sizeof("datar")) == 0)
56 bp->bp_type = BP_ACCESS_WATCHPOINT;
57 else if (strnicmp(argv[nextarg], "dataw", sizeof("dataw")) == 0)
58 bp->bp_type = BP_WRITE_WATCHPOINT;
59 else if (strnicmp(argv[nextarg], "inst", sizeof("inst")) == 0)
60 bp->bp_type = BP_HARDWARE_BREAKPOINT;
61 else
62 return KDB_ARGCOUNT;
63
64 bp->bph_length = 1;
65
66 nextarg++;
67
68 if ((argc + 1) != nextarg) {
69 unsigned long len;
70
71 diag = kdbgetularg((char *)argv[nextarg],
72 &len);
73 if (diag)
74 return diag;
75
76
77 if (len > 8)
78 return KDB_BADLENGTH;
79
80 bp->bph_length = len;
81 nextarg++;
82 }
83
84 if ((argc + 1) != nextarg)
85 return KDB_ARGCOUNT;
86 }
87
88 *nextargp = nextarg;
89 return 0;
90}
91
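/*
 * Uninstall a single breakpoint if it is currently installed; software
 * breakpoints go through dbg_remove_sw_break() and hardware ones through
 * arch_kgdb_ops.remove_hw_breakpoint().  Returns 0 on success, non-zero
 * if nothing was removed or removal failed.
 */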
92static int _kdb_bp_remove(kdb_bp_t *bp)
93{
94 int ret = 1;
95 if (!bp->bp_installed)
96 return ret;
97 if (!bp->bp_type)
98 ret = dbg_remove_sw_break(bp->bp_addr);
99 else
100 ret = arch_kgdb_ops.remove_hw_breakpoint(bp->bp_addr,
101 bp->bph_length,
102 bp->bp_type);
103 if (ret == 0)
104 bp->bp_installed = 0;
105 return ret;
106}
107
108static void kdb_handle_bp(struct pt_regs *regs, kdb_bp_t *bp)
109{
110 if (KDB_DEBUG(BP))
111 kdb_printf("regs->ip = 0x%lx\n", instruction_pointer(regs));
112
113 /*
114 * Setup single step
115 */
116 kdb_setsinglestep(regs);
117
118 /*
119 * Reset delay attribute
120 */
121 bp->bp_delay = 0;
122 bp->bp_delayed = 1;
123}
124
125static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
126{
127 int ret;
128 /*
129 * Install the breakpoint, if it is not already installed.
130 */
131
132 if (KDB_DEBUG(BP))
133 kdb_printf("%s: bp_installed %d\n",
134 __func__, bp->bp_installed);
135 if (!KDB_STATE(SSBPT))
136 bp->bp_delay = 0;
137 if (bp->bp_installed)
138 return 1;
139 if (bp->bp_delay || (bp->bp_delayed && KDB_STATE(DOING_SS))) {
140 if (KDB_DEBUG(BP))
141 kdb_printf("%s: delayed bp\n", __func__);
142 kdb_handle_bp(regs, bp);
143 return 0;
144 }
145 if (!bp->bp_type)
146 ret = dbg_set_sw_break(bp->bp_addr);
147 else
148 ret = arch_kgdb_ops.set_hw_breakpoint(bp->bp_addr,
149 bp->bph_length,
150 bp->bp_type);
151 if (ret == 0) {
152 bp->bp_installed = 1;
153 } else {
154 kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
155 __func__, bp->bp_addr);
156 return 1;
157 }
158 return 0;
159}
160
161/*
162 * kdb_bp_install
163 *
164 * Install kdb_breakpoints prior to returning from the
 165 * kernel debugger. This allows breakpoints to be set
 166 * on functions that are used internally by kdb, such as
167 * printk(). This function is only called once per kdb session.
168 */
169void kdb_bp_install(struct pt_regs *regs)
170{
171 int i;
172
173 for (i = 0; i < KDB_MAXBPT; i++) {
174 kdb_bp_t *bp = &kdb_breakpoints[i];
175
176 if (KDB_DEBUG(BP)) {
177 kdb_printf("%s: bp %d bp_enabled %d\n",
178 __func__, i, bp->bp_enabled);
179 }
180 if (bp->bp_enabled)
181 _kdb_bp_install(regs, bp);
182 }
183}
184
185/*
186 * kdb_bp_remove
187 *
188 * Remove kdb_breakpoints upon entry to the kernel debugger.
189 *
190 * Parameters:
191 * None.
192 * Outputs:
193 * None.
194 * Returns:
195 * None.
196 * Locking:
197 * None.
198 * Remarks:
199 */
200void kdb_bp_remove(void)
201{
202 int i;
203
204 for (i = KDB_MAXBPT - 1; i >= 0; i--) {
205 kdb_bp_t *bp = &kdb_breakpoints[i];
206
207 if (KDB_DEBUG(BP)) {
208 kdb_printf("%s: bp %d bp_enabled %d\n",
209 __func__, i, bp->bp_enabled);
210 }
211 if (bp->bp_enabled)
212 _kdb_bp_remove(bp);
213 }
214}
215
216
217/*
218 * kdb_printbp
219 *
220 * Internal function to format and print a breakpoint entry.
221 *
222 * Parameters:
 223 *	bp, i	Breakpoint entry to print and its table index.
224 * Outputs:
225 * None.
226 * Returns:
227 * None.
228 * Locking:
229 * None.
230 * Remarks:
231 */
232
233static void kdb_printbp(kdb_bp_t *bp, int i)
234{
235 kdb_printf("%s ", kdb_bptype(bp));
236 kdb_printf("BP #%d at ", i);
237 kdb_symbol_print(bp->bp_addr, NULL, KDB_SP_DEFAULT);
238
239 if (bp->bp_enabled)
240 kdb_printf("\n is enabled");
241 else
242 kdb_printf("\n is disabled");
243
244 kdb_printf("\taddr at %016lx, hardtype=%d installed=%d\n",
245 bp->bp_addr, bp->bp_type, bp->bp_installed);
246
247 kdb_printf("\n");
248}
249
250/*
251 * kdb_bp
252 *
253 * Handle the bp commands.
254 *
255 * [bp|bph] <addr-expression> [DATAR|DATAW]
256 *
257 * Parameters:
258 * argc Count of arguments in argv
259 * argv Space delimited command line arguments
260 * Outputs:
261 * None.
262 * Returns:
263 * Zero for success, a kdb diagnostic if failure.
264 * Locking:
265 * None.
266 * Remarks:
267 *
 268 *	bp	Set breakpoint on all cpus.  Only use hardware assist if needed.
 269 *	bph	Set breakpoint on all cpus.  Force use of a hardware register.
270 */
271
272static int kdb_bp(int argc, const char **argv)
273{
274 int i, bpno;
275 kdb_bp_t *bp, *bp_check;
276 int diag;
277 int free;
278 char *symname = NULL;
279 long offset = 0ul;
280 int nextarg;
281 kdb_bp_t template = {0};
282
283 if (argc == 0) {
284 /*
285 * Display breakpoint table
286 */
287 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT;
288 bpno++, bp++) {
289 if (bp->bp_free)
290 continue;
291 kdb_printbp(bp, bpno);
292 }
293
294 return 0;
295 }
296
297 nextarg = 1;
298 diag = kdbgetaddrarg(argc, argv, &nextarg, &template.bp_addr,
299 &offset, &symname);
300 if (diag)
301 return diag;
302 if (!template.bp_addr)
303 return KDB_BADINT;
304
305 /*
306 * Find an empty bp structure to allocate
307 */
308 free = KDB_MAXBPT;
309 for (bpno = 0, bp = kdb_breakpoints; bpno < KDB_MAXBPT; bpno++, bp++) {
310 if (bp->bp_free)
311 break;
312 }
313
314 if (bpno == KDB_MAXBPT)
315 return KDB_TOOMANYBPT;
316
317 if (strcmp(argv[0], "bph") == 0) {
318 template.bp_type = BP_HARDWARE_BREAKPOINT;
319 diag = kdb_parsebp(argc, argv, &nextarg, &template);
320 if (diag)
321 return diag;
322 } else {
323 template.bp_type = BP_BREAKPOINT;
324 }
325
326 /*
327 * Check for clashing breakpoints.
328 *
329 * Note, in this design we can't have hardware breakpoints
330 * enabled for both read and write on the same address.
331 */
332 for (i = 0, bp_check = kdb_breakpoints; i < KDB_MAXBPT;
333 i++, bp_check++) {
334 if (!bp_check->bp_free &&
335 bp_check->bp_addr == template.bp_addr) {
336 kdb_printf("You already have a breakpoint at "
337 kdb_bfd_vma_fmt0 "\n", template.bp_addr);
338 return KDB_DUPBPT;
339 }
340 }
341
342 template.bp_enabled = 1;
343
344 /*
345 * Actually allocate the breakpoint found earlier
346 */
347 *bp = template;
348 bp->bp_free = 0;
349
350 kdb_printbp(bp, bpno);
351
352 return 0;
353}
354
355/*
356 * kdb_bc
357 *
358 * Handles the 'bc', 'be', and 'bd' commands
359 *
360 * [bd|bc|be] <breakpoint-number>
361 * [bd|bc|be] *
362 *
363 * Parameters:
364 * argc Count of arguments in argv
365 * argv Space delimited command line arguments
366 * Outputs:
367 * None.
368 * Returns:
369 * Zero for success, a kdb diagnostic for failure
370 * Locking:
371 * None.
372 * Remarks:
373 */
374static int kdb_bc(int argc, const char **argv)
375{
376 unsigned long addr;
377 kdb_bp_t *bp = NULL;
378 int lowbp = KDB_MAXBPT;
379 int highbp = 0;
380 int done = 0;
381 int i;
382 int diag = 0;
383
384 int cmd; /* KDBCMD_B? */
385#define KDBCMD_BC 0
386#define KDBCMD_BE 1
387#define KDBCMD_BD 2
388
389 if (strcmp(argv[0], "be") == 0)
390 cmd = KDBCMD_BE;
391 else if (strcmp(argv[0], "bd") == 0)
392 cmd = KDBCMD_BD;
393 else
394 cmd = KDBCMD_BC;
395
396 if (argc != 1)
397 return KDB_ARGCOUNT;
398
399 if (strcmp(argv[1], "*") == 0) {
400 lowbp = 0;
401 highbp = KDB_MAXBPT;
402 } else {
403 diag = kdbgetularg(argv[1], &addr);
404 if (diag)
405 return diag;
406
407 /*
408 * For addresses less than the maximum breakpoint number,
409 * assume that the breakpoint number is desired.
410 */
411 if (addr < KDB_MAXBPT) {
412 bp = &kdb_breakpoints[addr];
413 lowbp = highbp = addr;
414 highbp++;
415 } else {
416 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT;
417 i++, bp++) {
418 if (bp->bp_addr == addr) {
419 lowbp = highbp = i;
420 highbp++;
421 break;
422 }
423 }
424 }
425 }
426
427 /*
428 * Now operate on the set of breakpoints matching the input
429 * criteria (either '*' for all, or an individual breakpoint).
430 */
431 for (bp = &kdb_breakpoints[lowbp], i = lowbp;
432 i < highbp;
433 i++, bp++) {
434 if (bp->bp_free)
435 continue;
436
437 done++;
438
439 switch (cmd) {
440 case KDBCMD_BC:
441 bp->bp_enabled = 0;
442
443 kdb_printf("Breakpoint %d at "
444 kdb_bfd_vma_fmt " cleared\n",
445 i, bp->bp_addr);
446
447 bp->bp_addr = 0;
448 bp->bp_free = 1;
449
450 break;
451 case KDBCMD_BE:
452 bp->bp_enabled = 1;
453
454 kdb_printf("Breakpoint %d at "
455 kdb_bfd_vma_fmt " enabled",
456 i, bp->bp_addr);
457
458 kdb_printf("\n");
459 break;
460 case KDBCMD_BD:
461 if (!bp->bp_enabled)
462 break;
463
464 bp->bp_enabled = 0;
465
466 kdb_printf("Breakpoint %d at "
467 kdb_bfd_vma_fmt " disabled\n",
468 i, bp->bp_addr);
469
470 break;
471 }
472 if (bp->bp_delay && (cmd == KDBCMD_BC || cmd == KDBCMD_BD)) {
473 bp->bp_delay = 0;
474 KDB_STATE_CLEAR(SSBPT);
475 }
476 }
477
478 return (!done) ? KDB_BPTNOTFOUND : 0;
479}
480
481/*
482 * kdb_ss
483 *
484 * Process the 'ss' (Single Step) and 'ssb' (Single Step to Branch)
485 * commands.
486 *
487 * ss
488 * ssb
489 *
490 * Parameters:
491 * argc Argument count
492 * argv Argument vector
493 * Outputs:
494 * None.
495 * Returns:
496 * KDB_CMD_SS[B] for success, a kdb error if failure.
497 * Locking:
498 * None.
499 * Remarks:
500 *
501 * Set the arch specific option to trigger a debug trap after the next
502 * instruction.
503 *
504 * For 'ssb', set the trace flag in the debug trap handler
505 * after printing the current insn and return directly without
506 * invoking the kdb command processor, until a branch instruction
507 * is encountered.
508 */
509
510static int kdb_ss(int argc, const char **argv)
511{
512 int ssb = 0;
513
514 ssb = (strcmp(argv[0], "ssb") == 0);
515 if (argc != 0)
516 return KDB_ARGCOUNT;
517 /*
518 * Set trace flag and go.
519 */
520 KDB_STATE_SET(DOING_SS);
521 if (ssb) {
522 KDB_STATE_SET(DOING_SSB);
523 return KDB_CMD_SSB;
524 }
525 return KDB_CMD_SS;
526}
527
528/* Initialize the breakpoint table and register breakpoint commands. */
529
530void __init kdb_initbptab(void)
531{
532 int i;
533 kdb_bp_t *bp;
534
535 /*
536 * First time initialization.
537 */
538 memset(&kdb_breakpoints, '\0', sizeof(kdb_breakpoints));
539
540 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++)
541 bp->bp_free = 1;
542
543 kdb_register_repeat("bp", kdb_bp, "[<vaddr>]",
544 "Set/Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
545 kdb_register_repeat("bl", kdb_bp, "[<vaddr>]",
546 "Display breakpoints", 0, KDB_REPEAT_NO_ARGS);
547 if (arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT)
548 kdb_register_repeat("bph", kdb_bp, "[<vaddr>]",
549 "[datar [length]|dataw [length]] Set hw brk", 0, KDB_REPEAT_NO_ARGS);
550 kdb_register_repeat("bc", kdb_bc, "<bpnum>",
551 "Clear Breakpoint", 0, KDB_REPEAT_NONE);
552 kdb_register_repeat("be", kdb_bc, "<bpnum>",
553 "Enable Breakpoint", 0, KDB_REPEAT_NONE);
554 kdb_register_repeat("bd", kdb_bc, "<bpnum>",
555 "Disable Breakpoint", 0, KDB_REPEAT_NONE);
556
557 kdb_register_repeat("ss", kdb_ss, "",
558 "Single Step", 1, KDB_REPEAT_NO_ARGS);
559 kdb_register_repeat("ssb", kdb_ss, "",
560 "Single step to branch/call", 0, KDB_REPEAT_NO_ARGS);
561 /*
562 * Architecture dependent initialization.
563 */
564}
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
new file mode 100644
index 000000000000..2f62fe85f16a
--- /dev/null
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -0,0 +1,210 @@
1/*
2 * Kernel Debugger Architecture Independent Stack Traceback
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/ctype.h>
13#include <linux/string.h>
14#include <linux/kernel.h>
15#include <linux/sched.h>
16#include <linux/kdb.h>
17#include <linux/nmi.h>
18#include <asm/system.h>
19#include "kdb_private.h"
20
21
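/*
 * Print a stack trace for task p, optionally starting at addr.  The
 * console loglevel is raised and printk output is routed through
 * kdb_printf (kdb_trap_printk) for the duration of the dump.
 */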
22static void kdb_show_stack(struct task_struct *p, void *addr)
23{
24 int old_lvl = console_loglevel;
25 console_loglevel = 15;
26 kdb_trap_printk++;
27 kdb_set_current_task(p);
28 if (addr) {
29 show_stack((struct task_struct *)p, addr);
30 } else if (kdb_current_regs) {
31#ifdef CONFIG_X86
32 show_stack(p, &kdb_current_regs->sp);
33#else
34 show_stack(p, NULL);
35#endif
36 } else {
37 show_stack(p, NULL);
38 }
39 console_loglevel = old_lvl;
40 kdb_trap_printk--;
41}
42
43/*
44 * kdb_bt
45 *
46 * This function implements the 'bt' command. Print a stack
47 * traceback.
48 *
49 * bt [<address-expression>] (addr-exp is for alternate stacks)
50 * btp <pid> Kernel stack for <pid>
51 * btt <address-expression> Kernel stack for task structure at
52 * <address-expression>
53 * bta [DRSTCZEUIMA] All useful processes, optionally
54 * filtered by state
55 * btc [<cpu>] The current process on one cpu,
56 * default is all cpus
57 *
 58 * bt <address-expression> refers to an address on the stack; that location
59 * is assumed to contain a return address.
60 *
61 * btt <address-expression> refers to the address of a struct task.
62 *
63 * Inputs:
64 * argc argument count
65 * argv argument vector
66 * Outputs:
67 * None.
68 * Returns:
69 * zero for success, a kdb diagnostic if error
70 * Locking:
71 * none.
72 * Remarks:
 73 *	Backtracing works best when the code uses frame pointers.  But even
74 * without frame pointers we should get a reasonable trace.
75 *
76 * mds comes in handy when examining the stack to do a manual traceback or
77 * to get a starting point for bt <address-expression>.
78 */
79
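/*
 * Backtrace helper: verify that p spans readable memory, skip tasks
 * filtered out by the state mask, then print the ps-style header and
 * stack for the task.  When btaprompt is set (the bta case) the user is
 * prompted after each task; returns 1 if they enter 'q' to stop.
 */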
80static int
81kdb_bt1(struct task_struct *p, unsigned long mask,
82 int argcount, int btaprompt)
83{
84 char buffer[2];
85 if (kdb_getarea(buffer[0], (unsigned long)p) ||
86 kdb_getarea(buffer[0], (unsigned long)(p+1)-1))
87 return KDB_BADADDR;
88 if (!kdb_task_state(p, mask))
89 return 0;
90 kdb_printf("Stack traceback for pid %d\n", p->pid);
91 kdb_ps1(p);
92 kdb_show_stack(p, NULL);
93 if (btaprompt) {
94 kdb_getstr(buffer, sizeof(buffer),
95 "Enter <q> to end, <cr> to continue:");
96 if (buffer[0] == 'q') {
97 kdb_printf("\n");
98 return 1;
99 }
100 }
101 touch_nmi_watchdog();
102 return 0;
103}
104
105int
106kdb_bt(int argc, const char **argv)
107{
108 int diag;
109 int argcount = 5;
110 int btaprompt = 1;
111 int nextarg;
112 unsigned long addr;
113 long offset;
114
115 kdbgetintenv("BTARGS", &argcount); /* Arguments to print */
116 kdbgetintenv("BTAPROMPT", &btaprompt); /* Prompt after each
117 * proc in bta */
118
119 if (strcmp(argv[0], "bta") == 0) {
120 struct task_struct *g, *p;
121 unsigned long cpu;
122 unsigned long mask = kdb_task_state_string(argc ? argv[1] :
123 NULL);
124 if (argc == 0)
125 kdb_ps_suppressed();
126 /* Run the active tasks first */
127 for_each_online_cpu(cpu) {
128 p = kdb_curr_task(cpu);
129 if (kdb_bt1(p, mask, argcount, btaprompt))
130 return 0;
131 }
132 /* Now the inactive tasks */
133 kdb_do_each_thread(g, p) {
134 if (task_curr(p))
135 continue;
136 if (kdb_bt1(p, mask, argcount, btaprompt))
137 return 0;
138 } kdb_while_each_thread(g, p);
139 } else if (strcmp(argv[0], "btp") == 0) {
140 struct task_struct *p;
141 unsigned long pid;
142 if (argc != 1)
143 return KDB_ARGCOUNT;
144 diag = kdbgetularg((char *)argv[1], &pid);
145 if (diag)
146 return diag;
147 p = find_task_by_pid_ns(pid, &init_pid_ns);
148 if (p) {
149 kdb_set_current_task(p);
150 return kdb_bt1(p, ~0UL, argcount, 0);
151 }
152 kdb_printf("No process with pid == %ld found\n", pid);
153 return 0;
154 } else if (strcmp(argv[0], "btt") == 0) {
155 if (argc != 1)
156 return KDB_ARGCOUNT;
157 diag = kdbgetularg((char *)argv[1], &addr);
158 if (diag)
159 return diag;
160 kdb_set_current_task((struct task_struct *)addr);
161 return kdb_bt1((struct task_struct *)addr, ~0UL, argcount, 0);
162 } else if (strcmp(argv[0], "btc") == 0) {
163 unsigned long cpu = ~0;
164 struct task_struct *save_current_task = kdb_current_task;
165 char buf[80];
166 if (argc > 1)
167 return KDB_ARGCOUNT;
168 if (argc == 1) {
169 diag = kdbgetularg((char *)argv[1], &cpu);
170 if (diag)
171 return diag;
172 }
173 /* Recursive use of kdb_parse, do not use argv after
174 * this point */
175 argv = NULL;
176 if (cpu != ~0) {
177 if (cpu >= num_possible_cpus() || !cpu_online(cpu)) {
178 kdb_printf("no process for cpu %ld\n", cpu);
179 return 0;
180 }
181 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
182 kdb_parse(buf);
183 return 0;
184 }
185 kdb_printf("btc: cpu status: ");
186 kdb_parse("cpu\n");
187 for_each_online_cpu(cpu) {
188 sprintf(buf, "btt 0x%p\n", KDB_TSK(cpu));
189 kdb_parse(buf);
190 touch_nmi_watchdog();
191 }
192 kdb_set_current_task(save_current_task);
193 return 0;
194 } else {
195 if (argc) {
196 nextarg = 1;
197 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
198 &offset, NULL);
199 if (diag)
200 return diag;
201 kdb_show_stack(kdb_current_task, (void *)addr);
202 return 0;
203 } else {
204 return kdb_bt1(kdb_current_task, ~0UL, argcount, 0);
205 }
206 }
207
208 /* NOTREACHED */
209 return 0;
210}
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
new file mode 100644
index 000000000000..56c88e4db309
--- /dev/null
+++ b/kernel/debug/kdb/kdb_cmds
@@ -0,0 +1,35 @@
1# Initial commands for kdb, alter to suit your needs.
2# These commands are executed in kdb_init() context, no SMP, no
3# processes. Commands that require process data (including stack or
4# registers) are not reliable this early. set and bp commands should
5# be safe. Global breakpoint commands affect each cpu as it is booted.
6
7# Standard debugging information for first level support, just type dumpall,
8# dumpcpu or dumpcommon at the kdb prompt.
9
10defcmd dumpcommon "" "Common kdb debugging"
11 set BTAPROMPT 0
12 set LINES 10000
13 -summary
14 -cpu
15 -ps
16 -dmesg 600
17 -bt
18endefcmd
19
20defcmd dumpall "" "First line debugging"
21 set BTSYMARG 1
22 set BTARGS 9
23 pid R
24 -dumpcommon
25 -bta
26endefcmd
27
28defcmd dumpcpu "" "Same as dumpall but only tasks on cpus"
29 set BTSYMARG 1
30 set BTARGS 9
31 pid R
32 -dumpcommon
33 -btc
34endefcmd
35
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
new file mode 100644
index 000000000000..bf6e8270e957
--- /dev/null
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -0,0 +1,169 @@
1/*
2 * Created by: Jason Wessel <jason.wessel@windriver.com>
3 *
4 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
5 *
6 * This file is licensed under the terms of the GNU General Public
7 * License version 2. This program is licensed "as is" without any
8 * warranty of any kind, whether express or implied.
9 */
10
11#include <linux/kgdb.h>
12#include <linux/kdb.h>
13#include <linux/kdebug.h>
14#include "kdb_private.h"
15#include "../debug_core.h"
16
17/*
18 * KDB interface to KGDB internals
19 */
20get_char_func kdb_poll_funcs[] = {
21 dbg_io_get_char,
22 NULL,
23 NULL,
24 NULL,
25 NULL,
26 NULL,
27};
28EXPORT_SYMBOL_GPL(kdb_poll_funcs);
29
30int kdb_poll_idx = 1;
31EXPORT_SYMBOL_GPL(kdb_poll_idx);
32
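/*
 * kdb_stub - entry point called by the kgdb debug core when kdb owns the
 * debug event.  It works out why we stopped (breakpoint, single step,
 * keyboard entry, oops or cpu switch), removes kdb breakpoints, runs
 * kdb_main_loop(), then re-installs breakpoints and tells the debug core
 * how to resume by feeding gdbstub_state() the appropriate packets.
 */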
33int kdb_stub(struct kgdb_state *ks)
34{
35 int error = 0;
36 kdb_bp_t *bp;
37 unsigned long addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
38 kdb_reason_t reason = KDB_REASON_OOPS;
39 kdb_dbtrap_t db_result = KDB_DB_NOBPT;
40 int i;
41
42 if (KDB_STATE(REENTRY)) {
43 reason = KDB_REASON_SWITCH;
44 KDB_STATE_CLEAR(REENTRY);
45 addr = instruction_pointer(ks->linux_regs);
46 }
47 ks->pass_exception = 0;
48 if (atomic_read(&kgdb_setting_breakpoint))
49 reason = KDB_REASON_KEYBOARD;
50
51 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
52 if ((bp->bp_enabled) && (bp->bp_addr == addr)) {
53 reason = KDB_REASON_BREAK;
54 db_result = KDB_DB_BPT;
55 if (addr != instruction_pointer(ks->linux_regs))
56 kgdb_arch_set_pc(ks->linux_regs, addr);
57 break;
58 }
59 }
60 if (reason == KDB_REASON_BREAK || reason == KDB_REASON_SWITCH) {
61 for (i = 0, bp = kdb_breakpoints; i < KDB_MAXBPT; i++, bp++) {
62 if (bp->bp_free)
63 continue;
64 if (bp->bp_addr == addr) {
65 bp->bp_delay = 1;
66 bp->bp_delayed = 1;
67 /*
68 * SSBPT is set when the kernel debugger must single step a
69 * task in order to re-establish an instruction breakpoint
70 * which uses the instruction replacement mechanism. It is
71 * cleared by any action that removes the need to single-step
72 * the breakpoint.
73 */
74 reason = KDB_REASON_BREAK;
75 db_result = KDB_DB_BPT;
76 KDB_STATE_SET(SSBPT);
77 break;
78 }
79 }
80 }
81
82 if (reason != KDB_REASON_BREAK && ks->ex_vector == 0 &&
83 ks->signo == SIGTRAP) {
84 reason = KDB_REASON_SSTEP;
85 db_result = KDB_DB_BPT;
86 }
87 /* Set initial kdb state variables */
88 KDB_STATE_CLEAR(KGDB_TRANS);
89 kdb_initial_cpu = ks->cpu;
90 kdb_current_task = kgdb_info[ks->cpu].task;
91 kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
92 /* Remove any breakpoints as needed by kdb and clear single step */
93 kdb_bp_remove();
94 KDB_STATE_CLEAR(DOING_SS);
95 KDB_STATE_CLEAR(DOING_SSB);
96 KDB_STATE_SET(PAGER);
97 /* zero out any offline cpu data */
98 for_each_present_cpu(i) {
99 if (!cpu_online(i)) {
100 kgdb_info[i].debuggerinfo = NULL;
101 kgdb_info[i].task = NULL;
102 }
103 }
104 if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
105 ks->pass_exception = 1;
106 KDB_FLAG_SET(CATASTROPHIC);
107 }
108 kdb_initial_cpu = ks->cpu;
109 if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
110 KDB_STATE_CLEAR(SSBPT);
111 KDB_STATE_CLEAR(DOING_SS);
112 } else {
113 /* Start kdb main loop */
114 error = kdb_main_loop(KDB_REASON_ENTER, reason,
115 ks->err_code, db_result, ks->linux_regs);
116 }
117 /*
118 * Upon exit from the kdb main loop setup break points and restart
119 * the system based on the requested continue state
120 */
121 kdb_initial_cpu = -1;
122 kdb_current_task = NULL;
123 kdb_current_regs = NULL;
124 KDB_STATE_CLEAR(PAGER);
125 kdbnearsym_cleanup();
126 if (error == KDB_CMD_KGDB) {
127 if (KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)) {
128 /*
129 * This inteface glue which allows kdb to transition in into
130 * the gdb stub. In order to do this the '?' or '' gdb serial
131 * packet response is processed here. And then control is
132 * passed to the gdbstub.
133 */
134 if (KDB_STATE(DOING_KGDB))
135 gdbstub_state(ks, "?");
136 else
137 gdbstub_state(ks, "");
138 KDB_STATE_CLEAR(DOING_KGDB);
139 KDB_STATE_CLEAR(DOING_KGDB2);
140 }
141 return DBG_PASS_EVENT;
142 }
143 kdb_bp_install(ks->linux_regs);
144 dbg_activate_sw_breakpoints();
145 /* Set the exit state to a single step or a continue */
146 if (KDB_STATE(DOING_SS))
147 gdbstub_state(ks, "s");
148 else
149 gdbstub_state(ks, "c");
150
151 KDB_FLAG_CLEAR(CATASTROPHIC);
152
153 /* Invoke arch specific exception handling prior to system resume */
154 kgdb_info[ks->cpu].ret_state = gdbstub_state(ks, "e");
155 if (ks->pass_exception)
156 kgdb_info[ks->cpu].ret_state = 1;
157 if (error == KDB_CMD_CPU) {
158 KDB_STATE_SET(REENTRY);
159 /*
160 * Force clear the single step bit because kdb emulates this
161 * differently vs the gdbstub
162 */
163 kgdb_single_step = 0;
164 dbg_deactivate_sw_breakpoints();
165 return DBG_SWITCH_CPU_EVENT;
166 }
167 return kgdb_info[ks->cpu].ret_state;
168}
169
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
new file mode 100644
index 000000000000..c9b7f4f90bba
--- /dev/null
+++ b/kernel/debug/kdb/kdb_io.c
@@ -0,0 +1,826 @@
1/*
2 * Kernel Debugger Architecture Independent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 */
11
12#include <linux/module.h>
13#include <linux/types.h>
14#include <linux/ctype.h>
15#include <linux/kernel.h>
16#include <linux/init.h>
17#include <linux/kdev_t.h>
18#include <linux/console.h>
19#include <linux/string.h>
20#include <linux/sched.h>
21#include <linux/smp.h>
22#include <linux/nmi.h>
23#include <linux/delay.h>
24#include <linux/kgdb.h>
25#include <linux/kdb.h>
26#include <linux/kallsyms.h>
27#include "kdb_private.h"
28
29#define CMD_BUFLEN 256
30char kdb_prompt_str[CMD_BUFLEN];
31
32int kdb_trap_printk;
33
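/*
 * While characters are being typed, decide whether they could still be the
 * start of one of the gdb attach packets ("$?#3f" or "$qSupported#37").
 * Once the input can no longer match, give up on a kgdb handover: set
 * KGDB_TRANS and echo what has been typed so far.
 */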
34static void kgdb_transition_check(char *buffer)
35{
36 int slen = strlen(buffer);
37 if (strncmp(buffer, "$?#3f", slen) != 0 &&
38 strncmp(buffer, "$qSupported#37", slen) != 0 &&
39 strncmp(buffer, "+$qSupported#37", slen) != 0) {
40 KDB_STATE_SET(KGDB_TRANS);
41 kdb_printf("%s", buffer);
42 }
43}
44
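/*
 * Poll the registered input functions for a single key, translating vt100
 * arrow/home/del/end escape sequences into the one-byte editing codes used
 * by kdb_read() (1=Home, 2=Left, 4=Del, 5=End, 6=Right, 14=Down, 16=Up).
 * A lone ESC is resolved by waiting roughly two seconds for the rest of a
 * sequence.
 */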
45static int kdb_read_get_key(char *buffer, size_t bufsize)
46{
47#define ESCAPE_UDELAY 1000
48#define ESCAPE_DELAY (2*1000000/ESCAPE_UDELAY) /* 2 seconds worth of udelays */
49 char escape_data[5]; /* longest vt100 escape sequence is 4 bytes */
50 char *ped = escape_data;
51 int escape_delay = 0;
52 get_char_func *f, *f_escape = NULL;
53 int key;
54
55 for (f = &kdb_poll_funcs[0]; ; ++f) {
56 if (*f == NULL) {
57 /* Reset NMI watchdog once per poll loop */
58 touch_nmi_watchdog();
59 f = &kdb_poll_funcs[0];
60 }
61 if (escape_delay == 2) {
62 *ped = '\0';
63 ped = escape_data;
64 --escape_delay;
65 }
66 if (escape_delay == 1) {
67 key = *ped++;
68 if (!*ped)
69 --escape_delay;
70 break;
71 }
72 key = (*f)();
73 if (key == -1) {
74 if (escape_delay) {
75 udelay(ESCAPE_UDELAY);
76 --escape_delay;
77 }
78 continue;
79 }
80 if (bufsize <= 2) {
81 if (key == '\r')
82 key = '\n';
83 *buffer++ = key;
84 *buffer = '\0';
85 return -1;
86 }
87 if (escape_delay == 0 && key == '\e') {
88 escape_delay = ESCAPE_DELAY;
89 ped = escape_data;
90 f_escape = f;
91 }
92 if (escape_delay) {
93 *ped++ = key;
94 if (f_escape != f) {
95 escape_delay = 2;
96 continue;
97 }
98 if (ped - escape_data == 1) {
99 /* \e */
100 continue;
101 } else if (ped - escape_data == 2) {
102 /* \e<something> */
103 if (key != '[')
104 escape_delay = 2;
105 continue;
106 } else if (ped - escape_data == 3) {
107 /* \e[<something> */
108 int mapkey = 0;
109 switch (key) {
110 case 'A': /* \e[A, up arrow */
111 mapkey = 16;
112 break;
113 case 'B': /* \e[B, down arrow */
114 mapkey = 14;
115 break;
116 case 'C': /* \e[C, right arrow */
117 mapkey = 6;
118 break;
119 case 'D': /* \e[D, left arrow */
120 mapkey = 2;
121 break;
122 case '1': /* dropthrough */
123 case '3': /* dropthrough */
124 /* \e[<1,3,4>], may be home, del, end */
125 case '4':
126 mapkey = -1;
127 break;
128 }
129 if (mapkey != -1) {
130 if (mapkey > 0) {
131 escape_data[0] = mapkey;
132 escape_data[1] = '\0';
133 }
134 escape_delay = 2;
135 }
136 continue;
137 } else if (ped - escape_data == 4) {
138 /* \e[<1,3,4><something> */
139 int mapkey = 0;
140 if (key == '~') {
141 switch (escape_data[2]) {
142 case '1': /* \e[1~, home */
143 mapkey = 1;
144 break;
145 case '3': /* \e[3~, del */
146 mapkey = 4;
147 break;
148 case '4': /* \e[4~, end */
149 mapkey = 5;
150 break;
151 }
152 }
153 if (mapkey > 0) {
154 escape_data[0] = mapkey;
155 escape_data[1] = '\0';
156 }
157 escape_delay = 2;
158 continue;
159 }
160 }
161 break; /* A key to process */
162 }
163 return key;
164}
165
166/*
167 * kdb_read
168 *
169 * This function reads a string of characters, terminated by
170 * a newline, or by reaching the end of the supplied buffer,
171 * from the current kernel debugger console device.
172 * Parameters:
173 * buffer - Address of character buffer to receive input characters.
174 * bufsize - size, in bytes, of the character buffer
175 * Returns:
176 * Returns a pointer to the buffer containing the received
177 * character string. This string will be terminated by a
178 * newline character.
179 * Locking:
180 * No locks are required to be held upon entry to this
 181 *	function.  It is not reentrant - it relies on the fact
 182 *	that kdb runs on only one "master debug" cpu at a time.
183 * Remarks:
184 *
185 * The buffer size must be >= 2. A buffer size of 2 means that the caller only
186 * wants a single key.
187 *
188 * An escape key could be the start of a vt100 control sequence such as \e[D
189 * (left arrow) or it could be a character in its own right. The standard
190 * method for detecting the difference is to wait for 2 seconds to see if there
191 * are any other characters. kdb is complicated by the lack of a timer service
192 * (interrupts are off), by multiple input sources and by the need to sometimes
193 * return after just one key. Escape sequence processing has to be done as
194 * states in the polling loop.
195 */
196
197static char *kdb_read(char *buffer, size_t bufsize)
198{
199 char *cp = buffer;
200 char *bufend = buffer+bufsize-2; /* Reserve space for newline
201 * and null byte */
202 char *lastchar;
203 char *p_tmp;
204 char tmp;
205 static char tmpbuffer[CMD_BUFLEN];
206 int len = strlen(buffer);
207 int len_tmp;
208 int tab = 0;
209 int count;
210 int i;
211 int diag, dtab_count;
212 int key;
213
214
215 diag = kdbgetintenv("DTABCOUNT", &dtab_count);
216 if (diag)
217 dtab_count = 30;
218
219 if (len > 0) {
220 cp += len;
221 if (*(buffer+len-1) == '\n')
222 cp--;
223 }
224
225 lastchar = cp;
226 *cp = '\0';
227 kdb_printf("%s", buffer);
228poll_again:
229 key = kdb_read_get_key(buffer, bufsize);
230 if (key == -1)
231 return buffer;
232 if (key != 9)
233 tab = 0;
234 switch (key) {
235 case 8: /* backspace */
236 if (cp > buffer) {
237 if (cp < lastchar) {
238 memcpy(tmpbuffer, cp, lastchar - cp);
239 memcpy(cp-1, tmpbuffer, lastchar - cp);
240 }
241 *(--lastchar) = '\0';
242 --cp;
243 kdb_printf("\b%s \r", cp);
244 tmp = *cp;
245 *cp = '\0';
246 kdb_printf(kdb_prompt_str);
247 kdb_printf("%s", buffer);
248 *cp = tmp;
249 }
250 break;
251 case 13: /* enter */
252 *lastchar++ = '\n';
253 *lastchar++ = '\0';
254 kdb_printf("\n");
255 return buffer;
256 case 4: /* Del */
257 if (cp < lastchar) {
258 memcpy(tmpbuffer, cp+1, lastchar - cp - 1);
259 memcpy(cp, tmpbuffer, lastchar - cp - 1);
260 *(--lastchar) = '\0';
261 kdb_printf("%s \r", cp);
262 tmp = *cp;
263 *cp = '\0';
264 kdb_printf(kdb_prompt_str);
265 kdb_printf("%s", buffer);
266 *cp = tmp;
267 }
268 break;
269 case 1: /* Home */
270 if (cp > buffer) {
271 kdb_printf("\r");
272 kdb_printf(kdb_prompt_str);
273 cp = buffer;
274 }
275 break;
276 case 5: /* End */
277 if (cp < lastchar) {
278 kdb_printf("%s", cp);
279 cp = lastchar;
280 }
281 break;
282 case 2: /* Left */
283 if (cp > buffer) {
284 kdb_printf("\b");
285 --cp;
286 }
287 break;
288 case 14: /* Down */
289 memset(tmpbuffer, ' ',
290 strlen(kdb_prompt_str) + (lastchar-buffer));
291 *(tmpbuffer+strlen(kdb_prompt_str) +
292 (lastchar-buffer)) = '\0';
293 kdb_printf("\r%s\r", tmpbuffer);
294 *lastchar = (char)key;
295 *(lastchar+1) = '\0';
296 return lastchar;
297 case 6: /* Right */
298 if (cp < lastchar) {
299 kdb_printf("%c", *cp);
300 ++cp;
301 }
302 break;
303 case 16: /* Up */
304 memset(tmpbuffer, ' ',
305 strlen(kdb_prompt_str) + (lastchar-buffer));
306 *(tmpbuffer+strlen(kdb_prompt_str) +
307 (lastchar-buffer)) = '\0';
308 kdb_printf("\r%s\r", tmpbuffer);
309 *lastchar = (char)key;
310 *(lastchar+1) = '\0';
311 return lastchar;
312 case 9: /* Tab */
313 if (tab < 2)
314 ++tab;
315 p_tmp = buffer;
316 while (*p_tmp == ' ')
317 p_tmp++;
318 if (p_tmp > cp)
319 break;
320 memcpy(tmpbuffer, p_tmp, cp-p_tmp);
321 *(tmpbuffer + (cp-p_tmp)) = '\0';
322 p_tmp = strrchr(tmpbuffer, ' ');
323 if (p_tmp)
324 ++p_tmp;
325 else
326 p_tmp = tmpbuffer;
327 len = strlen(p_tmp);
328 count = kallsyms_symbol_complete(p_tmp,
329 sizeof(tmpbuffer) -
330 (p_tmp - tmpbuffer));
331 if (tab == 2 && count > 0) {
332 kdb_printf("\n%d symbols are found.", count);
333 if (count > dtab_count) {
334 count = dtab_count;
335 kdb_printf(" But only first %d symbols will"
336 " be printed.\nYou can change the"
337 " environment variable DTABCOUNT.",
338 count);
339 }
340 kdb_printf("\n");
341 for (i = 0; i < count; i++) {
342 if (kallsyms_symbol_next(p_tmp, i) < 0)
343 break;
344 kdb_printf("%s ", p_tmp);
345 *(p_tmp + len) = '\0';
346 }
347 if (i >= dtab_count)
348 kdb_printf("...");
349 kdb_printf("\n");
350 kdb_printf(kdb_prompt_str);
351 kdb_printf("%s", buffer);
352 } else if (tab != 2 && count > 0) {
353 len_tmp = strlen(p_tmp);
354 strncpy(p_tmp+len_tmp, cp, lastchar-cp+1);
355 len_tmp = strlen(p_tmp);
356 strncpy(cp, p_tmp+len, len_tmp-len + 1);
357 len = len_tmp - len;
358 kdb_printf("%s", cp);
359 cp += len;
360 lastchar += len;
361 }
362 kdb_nextline = 1; /* reset output line number */
363 break;
364 default:
365 if (key >= 32 && lastchar < bufend) {
366 if (cp < lastchar) {
367 memcpy(tmpbuffer, cp, lastchar - cp);
368 memcpy(cp+1, tmpbuffer, lastchar - cp);
369 *++lastchar = '\0';
370 *cp = key;
371 kdb_printf("%s\r", cp);
372 ++cp;
373 tmp = *cp;
374 *cp = '\0';
375 kdb_printf(kdb_prompt_str);
376 kdb_printf("%s", buffer);
377 *cp = tmp;
378 } else {
379 *++lastchar = '\0';
380 *cp++ = key;
381 /* The kgdb transition check will hide
382 * printed characters if we think that
383 * kgdb is connecting, until the check
384 * fails */
385 if (!KDB_STATE(KGDB_TRANS))
386 kgdb_transition_check(buffer);
387 else
388 kdb_printf("%c", key);
389 }
390 /* Special escape to kgdb */
391 if (lastchar - buffer >= 5 &&
392 strcmp(lastchar - 5, "$?#3f") == 0) {
393 strcpy(buffer, "kgdb");
394 KDB_STATE_SET(DOING_KGDB);
395 return buffer;
396 }
397 if (lastchar - buffer >= 14 &&
398 strcmp(lastchar - 14, "$qSupported#37") == 0) {
399 strcpy(buffer, "kgdb");
400 KDB_STATE_SET(DOING_KGDB2);
401 return buffer;
402 }
403 }
404 break;
405 }
406 goto poll_again;
407}
408
409/*
410 * kdb_getstr
411 *
412 * Print the prompt string and read a command from the
413 * input device.
414 *
415 * Parameters:
416 * buffer Address of buffer to receive command
417 * bufsize Size of buffer in bytes
418 * prompt Pointer to string to use as prompt string
419 * Returns:
420 * Pointer to command buffer.
421 * Locking:
422 * None.
423 * Remarks:
424 * For SMP kernels, the processor number will be
425 * substituted for %d, %x or %o in the prompt.
426 */
427
428char *kdb_getstr(char *buffer, size_t bufsize, char *prompt)
429{
430 if (prompt && kdb_prompt_str != prompt)
431 strncpy(kdb_prompt_str, prompt, CMD_BUFLEN);
432 kdb_printf(kdb_prompt_str);
433 kdb_nextline = 1; /* Prompt and input resets line number */
434 return kdb_read(buffer, bufsize);
435}
436
437/*
438 * kdb_input_flush
439 *
440 * Get rid of any buffered console input.
441 *
442 * Parameters:
443 * none
444 * Returns:
445 * nothing
446 * Locking:
447 * none
448 * Remarks:
449 * Call this function whenever you want to flush input. If there is any
450 * outstanding input, it ignores all characters until there has been no
451 * data for approximately 1ms.
452 */
453
454static void kdb_input_flush(void)
455{
456 get_char_func *f;
457 int res;
458 int flush_delay = 1;
459 while (flush_delay) {
460 flush_delay--;
461empty:
462 touch_nmi_watchdog();
463 for (f = &kdb_poll_funcs[0]; *f; ++f) {
464 res = (*f)();
465 if (res != -1) {
466 flush_delay = 1;
467 goto empty;
468 }
469 }
470 if (flush_delay)
471 mdelay(1);
472 }
473}
474
475/*
476 * kdb_printf
477 *
478 * Print a string to the output device(s).
479 *
480 * Parameters:
481 * printf-like format and optional args.
482 * Returns:
483 * 0
484 * Locking:
485 * None.
486 * Remarks:
487 * use 'kdbcons->write()' to avoid polluting 'log_buf' with
488 * kdb output.
489 *
490 * If the user is doing a cmd args | grep srch
491 * then kdb_grepping_flag is set.
492 * In that case we need to accumulate full lines (ending in \n) before
493 * searching for the pattern.
494 */
495
496static char kdb_buffer[256]; /* A bit too big to go on stack */
497static char *next_avail = kdb_buffer;
498static int size_avail;
499static int suspend_grep;
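/* These statics let vkdb_printf() carry state across calls: next_avail and
 * size_avail track where the next vsnprintf() lands in kdb_buffer so that
 * partial lines can be accumulated until a newline arrives when "| grep"
 * is active, and suspend_grep temporarily bypasses the filtering (for the
 * pager prompt, for instance).
 */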
500
501/*
502 * search arg1 to see if it contains arg2
 503 * (kdb_main.c provides flags for ^pat and pat$)
504 *
505 * return 1 for found, 0 for not found
506 */
507static int kdb_search_string(char *searched, char *searchfor)
508{
509 char firstchar, *cp;
510 int len1, len2;
511
512 /* not counting the newline at the end of "searched" */
513 len1 = strlen(searched)-1;
514 len2 = strlen(searchfor);
515 if (len1 < len2)
516 return 0;
517 if (kdb_grep_leading && kdb_grep_trailing && len1 != len2)
518 return 0;
519 if (kdb_grep_leading) {
520 if (!strncmp(searched, searchfor, len2))
521 return 1;
522 } else if (kdb_grep_trailing) {
523 if (!strncmp(searched+len1-len2, searchfor, len2))
524 return 1;
525 } else {
526 firstchar = *searchfor;
527 cp = searched;
528 while ((cp = strchr(cp, firstchar))) {
529 if (!strncmp(cp, searchfor, len2))
530 return 1;
531 cp++;
532 }
533 }
534 return 0;
535}
536
537int vkdb_printf(const char *fmt, va_list ap)
538{
539 int diag;
540 int linecount;
541 int logging, saved_loglevel = 0;
542 int saved_trap_printk;
543 int got_printf_lock = 0;
544 int retlen = 0;
545 int fnd, len;
546 char *cp, *cp2, *cphold = NULL, replaced_byte = ' ';
547 char *moreprompt = "more> ";
548 struct console *c = console_drivers;
549 static DEFINE_SPINLOCK(kdb_printf_lock);
550 unsigned long uninitialized_var(flags);
551
552 preempt_disable();
553 saved_trap_printk = kdb_trap_printk;
554 kdb_trap_printk = 0;
555
556 /* Serialize kdb_printf if multiple cpus try to write at once.
557 * But if any cpu goes recursive in kdb, just print the output,
558 * even if it is interleaved with any other text.
559 */
560 if (!KDB_STATE(PRINTF_LOCK)) {
561 KDB_STATE_SET(PRINTF_LOCK);
562 spin_lock_irqsave(&kdb_printf_lock, flags);
563 got_printf_lock = 1;
564 atomic_inc(&kdb_event);
565 } else {
566 __acquire(kdb_printf_lock);
567 }
568
569 diag = kdbgetintenv("LINES", &linecount);
570 if (diag || linecount <= 1)
571 linecount = 24;
572
573 diag = kdbgetintenv("LOGGING", &logging);
574 if (diag)
575 logging = 0;
576
577 if (!kdb_grepping_flag || suspend_grep) {
578 /* normally, every vsnprintf starts a new buffer */
579 next_avail = kdb_buffer;
580 size_avail = sizeof(kdb_buffer);
581 }
582 vsnprintf(next_avail, size_avail, fmt, ap);
583
584 /*
585 * If kdb_parse() found that the command was cmd xxx | grep yyy
586 * then kdb_grepping_flag is set, and kdb_grep_string contains yyy
587 *
588 * Accumulate the print data up to a newline before searching it.
589 * (vsnprintf does null-terminate the string that it generates)
590 */
591
592 /* skip the search if prints are temporarily unconditional */
593 if (!suspend_grep && kdb_grepping_flag) {
594 cp = strchr(kdb_buffer, '\n');
595 if (!cp) {
596 /*
597 * Special cases that don't end with newlines
598 * but should be written without one:
599 * The "[nn]kdb> " prompt should
600 * appear at the front of the buffer.
601 *
 602			 *   The "[nn]more " prompt (MOREPROMPT -> moreprompt)
 603			 *   should also be written without a newline, but we
 604			 *   print that prompt ourselves, so the suspend_grep
 605			 *   flag is set to make its output unconditional
 606			 *   for this case.
607 *
608 */
609 if (next_avail == kdb_buffer) {
610 /*
611 * these should occur after a newline,
612 * so they will be at the front of the
613 * buffer
614 */
615 cp2 = kdb_buffer;
616 len = strlen(kdb_prompt_str);
617 if (!strncmp(cp2, kdb_prompt_str, len)) {
618 /*
619 * We're about to start a new
620 * command, so we can go back
621 * to normal mode.
622 */
623 kdb_grepping_flag = 0;
624 goto kdb_printit;
625 }
626 }
627 /* no newline; don't search/write the buffer
628 until one is there */
629 len = strlen(kdb_buffer);
630 next_avail = kdb_buffer + len;
631 size_avail = sizeof(kdb_buffer) - len;
632 goto kdb_print_out;
633 }
634
635 /*
636 * The newline is present; print through it or discard
637 * it, depending on the results of the search.
638 */
639 cp++; /* to byte after the newline */
640 replaced_byte = *cp; /* remember what/where it was */
641 cphold = cp;
642 *cp = '\0'; /* end the string for our search */
643
644 /*
645 * We now have a newline at the end of the string
646 * Only continue with this output if it contains the
647 * search string.
648 */
649 fnd = kdb_search_string(kdb_buffer, kdb_grep_string);
650 if (!fnd) {
651 /*
652 * At this point the complete line at the start
653 * of kdb_buffer can be discarded, as it does
654 * not contain what the user is looking for.
655 * Shift the buffer left.
656 */
657 *cphold = replaced_byte;
658 strcpy(kdb_buffer, cphold);
659 len = strlen(kdb_buffer);
660 next_avail = kdb_buffer + len;
661 size_avail = sizeof(kdb_buffer) - len;
662 goto kdb_print_out;
663 }
664 /*
665 * at this point the string is a full line and
666 * should be printed, up to the null.
667 */
668 }
669kdb_printit:
670
671 /*
672 * Write to all consoles.
673 */
674 retlen = strlen(kdb_buffer);
675 if (!dbg_kdb_mode && kgdb_connected) {
676 gdbstub_msg_write(kdb_buffer, retlen);
677 } else {
678 if (!dbg_io_ops->is_console) {
679 len = strlen(kdb_buffer);
680 cp = kdb_buffer;
681 while (len--) {
682 dbg_io_ops->write_char(*cp);
683 cp++;
684 }
685 }
686 while (c) {
687 c->write(c, kdb_buffer, retlen);
688 touch_nmi_watchdog();
689 c = c->next;
690 }
691 }
692 if (logging) {
693 saved_loglevel = console_loglevel;
694 console_loglevel = 0;
695 printk(KERN_INFO "%s", kdb_buffer);
696 }
697
698 if (KDB_STATE(PAGER) && strchr(kdb_buffer, '\n'))
699 kdb_nextline++;
700
701 /* check for having reached the LINES number of printed lines */
702 if (kdb_nextline == linecount) {
703 char buf1[16] = "";
704#if defined(CONFIG_SMP)
705 char buf2[32];
706#endif
707
708 /* Watch out for recursion here. Any routine that calls
709 * kdb_printf will come back through here. And kdb_read
710 * uses kdb_printf to echo on serial consoles ...
711 */
712 kdb_nextline = 1; /* In case of recursion */
713
714 /*
715 * Pause until cr.
716 */
717 moreprompt = kdbgetenv("MOREPROMPT");
718 if (moreprompt == NULL)
719 moreprompt = "more> ";
720
721#if defined(CONFIG_SMP)
722 if (strchr(moreprompt, '%')) {
723 sprintf(buf2, moreprompt, get_cpu());
724 put_cpu();
725 moreprompt = buf2;
726 }
727#endif
728
729 kdb_input_flush();
730 c = console_drivers;
731
732 if (!dbg_io_ops->is_console) {
733 len = strlen(moreprompt);
734 cp = moreprompt;
735 while (len--) {
736 dbg_io_ops->write_char(*cp);
737 cp++;
738 }
739 }
740 while (c) {
741 c->write(c, moreprompt, strlen(moreprompt));
742 touch_nmi_watchdog();
743 c = c->next;
744 }
745
746 if (logging)
747 printk("%s", moreprompt);
748
749 kdb_read(buf1, 2); /* '2' indicates to return
750 * immediately after getting one key. */
751 kdb_nextline = 1; /* Really set output line 1 */
752
753 /* empty and reset the buffer: */
754 kdb_buffer[0] = '\0';
755 next_avail = kdb_buffer;
756 size_avail = sizeof(kdb_buffer);
757 if ((buf1[0] == 'q') || (buf1[0] == 'Q')) {
758 /* user hit q or Q */
759 KDB_FLAG_SET(CMD_INTERRUPT); /* command interrupted */
760 KDB_STATE_CLEAR(PAGER);
761 /* end of command output; back to normal mode */
762 kdb_grepping_flag = 0;
763 kdb_printf("\n");
764 } else if (buf1[0] == ' ') {
765 kdb_printf("\n");
766 suspend_grep = 1; /* for this recursion */
767 } else if (buf1[0] == '\n') {
768 kdb_nextline = linecount - 1;
769 kdb_printf("\r");
770 suspend_grep = 1; /* for this recursion */
771 } else if (buf1[0] && buf1[0] != '\n') {
772 /* user hit something other than enter */
773 suspend_grep = 1; /* for this recursion */
774 kdb_printf("\nOnly 'q' or 'Q' are processed at more "
775 "prompt, input ignored\n");
776 } else if (kdb_grepping_flag) {
777 /* user hit enter */
778 suspend_grep = 1; /* for this recursion */
779 kdb_printf("\n");
780 }
781 kdb_input_flush();
782 }
783
784 /*
785 * For grep searches, shift the printed string left.
786 * replaced_byte contains the character that was overwritten with
787 * the terminating null, and cphold points to the null.
788 * Then adjust the notion of available space in the buffer.
789 */
790 if (kdb_grepping_flag && !suspend_grep) {
791 *cphold = replaced_byte;
792 strcpy(kdb_buffer, cphold);
793 len = strlen(kdb_buffer);
794 next_avail = kdb_buffer + len;
795 size_avail = sizeof(kdb_buffer) - len;
796 }
797
798kdb_print_out:
799 suspend_grep = 0; /* end of what may have been a recursive call */
800 if (logging)
801 console_loglevel = saved_loglevel;
802 if (KDB_STATE(PRINTF_LOCK) && got_printf_lock) {
803 got_printf_lock = 0;
804 spin_unlock_irqrestore(&kdb_printf_lock, flags);
805 KDB_STATE_CLEAR(PRINTF_LOCK);
806 atomic_dec(&kdb_event);
807 } else {
808 __release(kdb_printf_lock);
809 }
810 kdb_trap_printk = saved_trap_printk;
811 preempt_enable();
812 return retlen;
813}
814
815int kdb_printf(const char *fmt, ...)
816{
817 va_list ap;
818 int r;
819
820 va_start(ap, fmt);
821 r = vkdb_printf(fmt, ap);
822 va_end(ap);
823
824 return r;
825}
826
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
new file mode 100644
index 000000000000..4bca634975c0
--- /dev/null
+++ b/kernel/debug/kdb/kdb_keyboard.c
@@ -0,0 +1,212 @@
1/*
2 * Kernel Debugger Architecture Dependent Console I/O handler
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License.
6 *
7 * Copyright (c) 1999-2006 Silicon Graphics, Inc. All Rights Reserved.
8 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
9 */
10
11#include <linux/kdb.h>
12#include <linux/keyboard.h>
13#include <linux/ctype.h>
14#include <linux/module.h>
15#include <linux/io.h>
16
17/* Keyboard Controller Registers on normal PCs. */
18
19#define KBD_STATUS_REG 0x64 /* Status register (R) */
20#define KBD_DATA_REG 0x60 /* Keyboard data register (R/W) */
21
22/* Status Register Bits */
23
24#define KBD_STAT_OBF 0x01 /* Keyboard output buffer full */
25#define KBD_STAT_MOUSE_OBF 0x20 /* Mouse output buffer full */
26
27static int kbd_exists;
28
29/*
30 * Check if the keyboard controller has a keypress for us.
 31 * Some parts (Enter Release, LED change) are still polled here in a blocking fashion,
32 * but hopefully they are all short.
33 */
34int kdb_get_kbd_char(void)
35{
36 int scancode, scanstatus;
37 static int shift_lock; /* CAPS LOCK state (0-off, 1-on) */
38 static int shift_key; /* Shift next keypress */
39 static int ctrl_key;
40 u_short keychar;
41
42 if (KDB_FLAG(NO_I8042) || KDB_FLAG(NO_VT_CONSOLE) ||
43 (inb(KBD_STATUS_REG) == 0xff && inb(KBD_DATA_REG) == 0xff)) {
44 kbd_exists = 0;
45 return -1;
46 }
47 kbd_exists = 1;
48
49 if ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
50 return -1;
51
52 /*
53 * Fetch the scancode
54 */
55 scancode = inb(KBD_DATA_REG);
56 scanstatus = inb(KBD_STATUS_REG);
57
58 /*
59 * Ignore mouse events.
60 */
61 if (scanstatus & KBD_STAT_MOUSE_OBF)
62 return -1;
63
64 /*
65 * Ignore release, trigger on make
66 * (except for shift keys, where we want to
67 * keep the shift state so long as the key is
68 * held down).
69 */
70
71 if (((scancode&0x7f) == 0x2a) || ((scancode&0x7f) == 0x36)) {
72 /*
73 * Next key may use shift table
74 */
75 if ((scancode & 0x80) == 0)
76 shift_key = 1;
77 else
78 shift_key = 0;
79 return -1;
80 }
81
82 if ((scancode&0x7f) == 0x1d) {
83 /*
84 * Left ctrl key
85 */
86 if ((scancode & 0x80) == 0)
87 ctrl_key = 1;
88 else
89 ctrl_key = 0;
90 return -1;
91 }
92
93 if ((scancode & 0x80) != 0)
94 return -1;
95
96 scancode &= 0x7f;
97
98 /*
99 * Translate scancode
100 */
101
102 if (scancode == 0x3a) {
103 /*
104 * Toggle caps lock
105 */
106 shift_lock ^= 1;
107
108#ifdef KDB_BLINK_LED
109 kdb_toggleled(0x4);
110#endif
111 return -1;
112 }
113
114 if (scancode == 0x0e) {
115 /*
116 * Backspace
117 */
118 return 8;
119 }
120
121 /* Special Key */
122 switch (scancode) {
123 case 0xF: /* Tab */
124 return 9;
125 case 0x53: /* Del */
126 return 4;
127 case 0x47: /* Home */
128 return 1;
129 case 0x4F: /* End */
130 return 5;
131 case 0x4B: /* Left */
132 return 2;
133 case 0x48: /* Up */
134 return 16;
135 case 0x50: /* Down */
136 return 14;
137 case 0x4D: /* Right */
138 return 6;
139 }
140
141 if (scancode == 0xe0)
142 return -1;
143
144 /*
145 * For Japanese 86/106 keyboards
146 * See comment in drivers/char/pc_keyb.c.
147 * - Masahiro Adegawa
148 */
149 if (scancode == 0x73)
150 scancode = 0x59;
151 else if (scancode == 0x7d)
152 scancode = 0x7c;
153
154 if (!shift_lock && !shift_key && !ctrl_key) {
155 keychar = plain_map[scancode];
156 } else if ((shift_lock || shift_key) && key_maps[1]) {
157 keychar = key_maps[1][scancode];
158 } else if (ctrl_key && key_maps[4]) {
159 keychar = key_maps[4][scancode];
160 } else {
161 keychar = 0x0020;
162 kdb_printf("Unknown state/scancode (%d)\n", scancode);
163 }
164 keychar &= 0x0fff;
165 if (keychar == '\t')
166 keychar = ' ';
167 switch (KTYP(keychar)) {
168 case KT_LETTER:
169 case KT_LATIN:
170 if (isprint(keychar))
171 break; /* printable characters */
172 /* drop through */
173 case KT_SPEC:
174 if (keychar == K_ENTER)
175 break;
176 /* drop through */
177 default:
178 return -1; /* ignore unprintables */
179 }
180
181 if ((scancode & 0x7f) == 0x1c) {
182 /*
183 * enter key. All done. Absorb the release scancode.
184 */
185 while ((inb(KBD_STATUS_REG) & KBD_STAT_OBF) == 0)
186 ;
187
188 /*
189 * Fetch the scancode
190 */
191 scancode = inb(KBD_DATA_REG);
192 scanstatus = inb(KBD_STATUS_REG);
193
194 while (scanstatus & KBD_STAT_MOUSE_OBF) {
195 scancode = inb(KBD_DATA_REG);
196 scanstatus = inb(KBD_STATUS_REG);
197 }
198
199 if (scancode != 0x9c) {
200 /*
201 * Wasn't an enter-release, why not?
202 */
203 kdb_printf("kdb: expected enter got 0x%x status 0x%x\n",
204 scancode, scanstatus);
205 }
206
207 return 13;
208 }
209
210 return keychar & 0xff;
211}
212EXPORT_SYMBOL_GPL(kdb_get_kbd_char);
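
The keypress path above polls the legacy i8042 controller directly: it checks the output-buffer-full bit in the status port, drops mouse bytes, and only then reads a scancode from the data port. A minimal standalone sketch of that poll loop follows; the fake_* helpers are hypothetical stand-ins for the kernel's inb() reads of ports 0x64/0x60 and exist only so the example compiles and runs.

#include <stdio.h>

#define KBD_STAT_OBF        0x01    /* keyboard output buffer full */
#define KBD_STAT_MOUSE_OBF  0x20    /* mouse output buffer full */

/* Hypothetical stand-ins for inb(KBD_STATUS_REG) and inb(KBD_DATA_REG). */
static unsigned char fake_status = KBD_STAT_OBF;
static unsigned char fake_data = 0x1c;              /* 'enter' make code */
static unsigned char read_status(void) { return fake_status; }
static unsigned char read_data(void) { fake_status = 0; return fake_data; }

/* Return the next keyboard scancode, or -1 if nothing is pending. */
static int poll_scancode(void)
{
    unsigned char status = read_status();

    if ((status & KBD_STAT_OBF) == 0)
        return -1;                      /* output buffer empty */
    if (status & KBD_STAT_MOUSE_OBF) {
        (void)read_data();              /* drain and ignore mouse byte */
        return -1;
    }
    return read_data();                 /* keyboard scancode */
}

int main(void)
{
    printf("scancode: 0x%02x\n", poll_scancode());
    return 0;
}
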
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
new file mode 100644
index 000000000000..ebe4a287419e
--- /dev/null
+++ b/kernel/debug/kdb/kdb_main.c
@@ -0,0 +1,2846 @@
1/*
2 * Kernel Debugger Architecture Independent Main Code
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (C) 2000 Stephane Eranian <eranian@hpl.hp.com>
10 * Xscale (R) modifications copyright (C) 2003 Intel Corporation.
11 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
12 */
13
14#include <linux/ctype.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/reboot.h>
18#include <linux/sched.h>
19#include <linux/sysrq.h>
20#include <linux/smp.h>
21#include <linux/utsname.h>
22#include <linux/vmalloc.h>
23#include <linux/module.h>
24#include <linux/mm.h>
25#include <linux/init.h>
26#include <linux/kallsyms.h>
27#include <linux/kgdb.h>
28#include <linux/kdb.h>
29#include <linux/notifier.h>
30#include <linux/interrupt.h>
31#include <linux/delay.h>
32#include <linux/nmi.h>
33#include <linux/time.h>
34#include <linux/ptrace.h>
35#include <linux/sysctl.h>
36#include <linux/cpu.h>
37#include <linux/kdebug.h>
38#include <linux/proc_fs.h>
39#include <linux/uaccess.h>
40#include <linux/slab.h>
41#include "kdb_private.h"
42
43#define GREP_LEN 256
44char kdb_grep_string[GREP_LEN];
45int kdb_grepping_flag;
46EXPORT_SYMBOL(kdb_grepping_flag);
47int kdb_grep_leading;
48int kdb_grep_trailing;
49
50/*
51 * Kernel debugger state flags
52 */
53int kdb_flags;
54atomic_t kdb_event;
55
56/*
57 * kdb_lock protects updates to kdb_initial_cpu. Used to
58 * single thread processors through the kernel debugger.
59 */
60int kdb_initial_cpu = -1; /* cpu number that owns kdb */
61int kdb_nextline = 1;
62int kdb_state; /* General KDB state */
63
64struct task_struct *kdb_current_task;
65EXPORT_SYMBOL(kdb_current_task);
66struct pt_regs *kdb_current_regs;
67
68const char *kdb_diemsg;
69static int kdb_go_count;
70#ifdef CONFIG_KDB_CONTINUE_CATASTROPHIC
71static unsigned int kdb_continue_catastrophic =
72 CONFIG_KDB_CONTINUE_CATASTROPHIC;
73#else
74static unsigned int kdb_continue_catastrophic;
75#endif
76
77/* kdb_commands describes the available commands. */
78static kdbtab_t *kdb_commands;
79#define KDB_BASE_CMD_MAX 50
80static int kdb_max_commands = KDB_BASE_CMD_MAX;
81static kdbtab_t kdb_base_commands[50];
82#define for_each_kdbcmd(cmd, num) \
83 for ((cmd) = kdb_base_commands, (num) = 0; \
84 num < kdb_max_commands; \
85 num == KDB_BASE_CMD_MAX ? cmd = kdb_commands : cmd++, num++)
86
87typedef struct _kdbmsg {
88 int km_diag; /* kdb diagnostic */
89 char *km_msg; /* Corresponding message text */
90} kdbmsg_t;
91
92#define KDBMSG(msgnum, text) \
93 { KDB_##msgnum, text }
94
95static kdbmsg_t kdbmsgs[] = {
96 KDBMSG(NOTFOUND, "Command Not Found"),
97 KDBMSG(ARGCOUNT, "Improper argument count, see usage."),
98 KDBMSG(BADWIDTH, "Illegal value for BYTESPERWORD use 1, 2, 4 or 8, "
99 "8 is only allowed on 64 bit systems"),
100 KDBMSG(BADRADIX, "Illegal value for RADIX use 8, 10 or 16"),
101 KDBMSG(NOTENV, "Cannot find environment variable"),
102 KDBMSG(NOENVVALUE, "Environment variable should have value"),
103 KDBMSG(NOTIMP, "Command not implemented"),
104 KDBMSG(ENVFULL, "Environment full"),
105 KDBMSG(ENVBUFFULL, "Environment buffer full"),
106 KDBMSG(TOOMANYBPT, "Too many breakpoints defined"),
107#ifdef CONFIG_CPU_XSCALE
108 KDBMSG(TOOMANYDBREGS, "More breakpoints than ibcr registers defined"),
109#else
110 KDBMSG(TOOMANYDBREGS, "More breakpoints than db registers defined"),
111#endif
112 KDBMSG(DUPBPT, "Duplicate breakpoint address"),
113 KDBMSG(BPTNOTFOUND, "Breakpoint not found"),
114 KDBMSG(BADMODE, "Invalid IDMODE"),
115 KDBMSG(BADINT, "Illegal numeric value"),
116 KDBMSG(INVADDRFMT, "Invalid symbolic address format"),
117 KDBMSG(BADREG, "Invalid register name"),
118 KDBMSG(BADCPUNUM, "Invalid cpu number"),
119 KDBMSG(BADLENGTH, "Invalid length field"),
120 KDBMSG(NOBP, "No Breakpoint exists"),
121 KDBMSG(BADADDR, "Invalid address"),
122};
123#undef KDBMSG
124
125static const int __nkdb_err = sizeof(kdbmsgs) / sizeof(kdbmsg_t);
126
127
128/*
129 * Initial environment. This is all kept static and local to
130 * this file. We don't want to rely on the memory allocation
131 * mechanisms in the kernel, so we use a very limited allocate-only
132 * heap for new and altered environment variables. The entire
133 * environment is limited to a fixed number of entries (add more
134 * to __env[] if required) and a fixed amount of heap (add more to
135 * KDB_ENVBUFSIZE if required).
136 */
137
138static char *__env[] = {
139#if defined(CONFIG_SMP)
140 "PROMPT=[%d]kdb> ",
141 "MOREPROMPT=[%d]more> ",
142#else
143 "PROMPT=kdb> ",
144 "MOREPROMPT=more> ",
145#endif
146 "RADIX=16",
147 "MDCOUNT=8", /* lines of md output */
148 "BTARGS=9", /* 9 possible args in bt */
149 KDB_PLATFORM_ENV,
150 "DTABCOUNT=30",
151 "NOSECT=1",
152 (char *)0,
153 (char *)0,
154 (char *)0,
155 (char *)0,
156 (char *)0,
157 (char *)0,
158 (char *)0,
159 (char *)0,
160 (char *)0,
161 (char *)0,
162 (char *)0,
163 (char *)0,
164 (char *)0,
165 (char *)0,
166 (char *)0,
167 (char *)0,
168 (char *)0,
169 (char *)0,
170 (char *)0,
171 (char *)0,
172 (char *)0,
173 (char *)0,
174 (char *)0,
175};
176
177static const int __nenv = (sizeof(__env) / sizeof(char *));
178
179struct task_struct *kdb_curr_task(int cpu)
180{
181 struct task_struct *p = curr_task(cpu);
182#ifdef _TIF_MCA_INIT
183 if ((task_thread_info(p)->flags & _TIF_MCA_INIT) && KDB_TSK(cpu))
184 p = krp->p;
185#endif
186 return p;
187}
188
189/*
190 * kdbgetenv - This function will return the character string value of
191 * an environment variable.
192 * Parameters:
193 * match A character string representing an environment variable.
194 * Returns:
195 * NULL No environment variable matches 'match'
196 * char* Pointer to string value of environment variable.
197 */
198char *kdbgetenv(const char *match)
199{
200 char **ep = __env;
201 int matchlen = strlen(match);
202 int i;
203
204 for (i = 0; i < __nenv; i++) {
205 char *e = *ep++;
206
207 if (!e)
208 continue;
209
210 if ((strncmp(match, e, matchlen) == 0)
211 && ((e[matchlen] == '\0')
212 || (e[matchlen] == '='))) {
213 char *cp = strchr(e, '=');
214 return cp ? ++cp : "";
215 }
216 }
217 return NULL;
218}
219
220/*
221 * kdballocenv - This function is used to allocate bytes for
222 * environment entries.
223 * Parameters:
224 * bytes The number of bytes of environment heap space to allocate.
225 * Returns:
226 * A pointer to the allocated space on success.
227 * NULL if the static environment buffer is exhausted.
228 *
229 * Remarks:
230 * We use a static environment buffer (envbuffer) to hold the values
231 * of dynamically generated environment variables (see kdb_set). Buffer
232 * space once allocated is never freed, so over time, the amount of space
233 * (currently 512 bytes) will be exhausted if env variables are changed
234 * frequently.
235 */
236static char *kdballocenv(size_t bytes)
237{
238#define KDB_ENVBUFSIZE 512
239 static char envbuffer[KDB_ENVBUFSIZE];
240 static int envbufsize;
241 char *ep = NULL;
242
243 if ((KDB_ENVBUFSIZE - envbufsize) >= bytes) {
244 ep = &envbuffer[envbufsize];
245 envbufsize += bytes;
246 }
247 return ep;
248}
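
As the kdballocenv() comment notes, kdb's environment heap is allocate-only: space is handed out from a fixed static buffer and never reclaimed, so repeated 'set' commands eventually exhaust it. A small userspace sketch of the same bump-allocation idea (buffer size and names here are illustrative, not kdb's):

#include <stdio.h>
#include <stddef.h>

#define ENVBUFSIZE 512

static char envbuffer[ENVBUFSIZE];
static size_t envbufused;

/* Hand out 'bytes' from the static buffer, or NULL when it is exhausted.
 * Nothing is ever freed, mirroring the allocate-only heap above. */
static char *alloc_env(size_t bytes)
{
    char *p = NULL;

    if (ENVBUFSIZE - envbufused >= bytes) {
        p = &envbuffer[envbufused];
        envbufused += bytes;
    }
    return p;
}

int main(void)
{
    char *a = alloc_env(100);
    char *b = alloc_env(500);   /* fails: only 412 bytes remain */

    printf("first: %p, second: %p\n", (void *)a, (void *)b);
    return 0;
}
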
249
250/*
251 * kdbgetulenv - This function will return the value of an unsigned
252 * long-valued environment variable.
253 * Parameters:
254 * match A character string representing a numeric value
255 * Outputs:
256 * *value the unsigned long representation of the env variable 'match'
257 * Returns:
258 * Zero on success, a kdb diagnostic on failure.
259 */
260static int kdbgetulenv(const char *match, unsigned long *value)
261{
262 char *ep;
263
264 ep = kdbgetenv(match);
265 if (!ep)
266 return KDB_NOTENV;
267 if (strlen(ep) == 0)
268 return KDB_NOENVVALUE;
269
270 *value = simple_strtoul(ep, NULL, 0);
271
272 return 0;
273}
274
275/*
276 * kdbgetintenv - This function will return the value of an
277 * integer-valued environment variable.
278 * Parameters:
279 * match A character string representing an integer-valued env variable
280 * Outputs:
281 * *value the integer representation of the environment variable 'match'
282 * Returns:
283 * Zero on success, a kdb diagnostic on failure.
284 */
285int kdbgetintenv(const char *match, int *value)
286{
287 unsigned long val;
288 int diag;
289
290 diag = kdbgetulenv(match, &val);
291 if (!diag)
292 *value = (int) val;
293 return diag;
294}
295
296/*
297 * kdbgetularg - This function will convert a numeric string into an
298 * unsigned long value.
299 * Parameters:
300 * arg A character string representing a numeric value
301 * Outputs:
302 * *value the unsigned long representation of arg.
303 * Returns:
304 * Zero on success, a kdb diagnostic on failure.
305 */
306int kdbgetularg(const char *arg, unsigned long *value)
307{
308 char *endp;
309 unsigned long val;
310
311 val = simple_strtoul(arg, &endp, 0);
312
313 if (endp == arg) {
314 /*
315 * Try base 16, for us folks too lazy to type the
316 * leading 0x...
317 */
318 val = simple_strtoul(arg, &endp, 16);
319 if (endp == arg)
320 return KDB_BADINT;
321 }
322
323 *value = val;
324
325 return 0;
326}
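
kdbgetularg() above first lets the library auto-detect the base (so 0x-prefixed hex and octal work), then retries in base 16 so a bare hex string such as "c0ffee" is still accepted. A standalone sketch of that fallback, using the C library's strtoul in place of the kernel's simple_strtoul:

#include <stdio.h>
#include <stdlib.h>

/* Parse 'arg' the way kdbgetularg() does: base 0 first, then plain hex. */
static int parse_ulong(const char *arg, unsigned long *value)
{
    char *endp;
    unsigned long val = strtoul(arg, &endp, 0);

    if (endp == arg) {
        val = strtoul(arg, &endp, 16);      /* allow hex without 0x */
        if (endp == arg)
            return -1;                      /* not a number at all */
    }
    *value = val;
    return 0;
}

int main(void)
{
    const char *samples[] = { "42", "0x2a", "c0ffee", "xyz" };
    unsigned long v = 0;

    for (int i = 0; i < 4; i++) {
        if (parse_ulong(samples[i], &v) == 0)
            printf("%-8s -> %lu\n", samples[i], v);
        else
            printf("%-8s -> KDB_BADINT\n", samples[i]);
    }
    return 0;
}
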
327
328/*
329 * kdb_set - This function implements the 'set' command. Alter an
330 * existing environment variable or create a new one.
331 */
332int kdb_set(int argc, const char **argv)
333{
334 int i;
335 char *ep;
336 size_t varlen, vallen;
337
338 /*
339 * we can be invoked in two ways:
340 * set var=value argv[1]="var", argv[2]="value"
341 * set var = value argv[1]="var", argv[2]="=", argv[3]="value"
342 * - if the latter, shift 'em down.
343 */
344 if (argc == 3) {
345 argv[2] = argv[3];
346 argc--;
347 }
348
349 if (argc != 2)
350 return KDB_ARGCOUNT;
351
352 /*
353 * Check for internal variables
354 */
355 if (strcmp(argv[1], "KDBDEBUG") == 0) {
356 unsigned int debugflags;
357 char *cp;
358
359 debugflags = simple_strtoul(argv[2], &cp, 0);
360 if (cp == argv[2] || debugflags & ~KDB_DEBUG_FLAG_MASK) {
361 kdb_printf("kdb: illegal debug flags '%s'\n",
362 argv[2]);
363 return 0;
364 }
365 kdb_flags = (kdb_flags &
366 ~(KDB_DEBUG_FLAG_MASK << KDB_DEBUG_FLAG_SHIFT))
367 | (debugflags << KDB_DEBUG_FLAG_SHIFT);
368
369 return 0;
370 }
371
372 /*
373 * Tokenizer squashed the '=' sign. argv[1] is variable
374 * name, argv[2] = value.
375 */
376 varlen = strlen(argv[1]);
377 vallen = strlen(argv[2]);
378 ep = kdballocenv(varlen + vallen + 2);
379 if (ep == (char *)0)
380 return KDB_ENVBUFFULL;
381
382 sprintf(ep, "%s=%s", argv[1], argv[2]);
383
384 ep[varlen+vallen+1] = '\0';
385
386 for (i = 0; i < __nenv; i++) {
387 if (__env[i]
388 && ((strncmp(__env[i], argv[1], varlen) == 0)
389 && ((__env[i][varlen] == '\0')
390 || (__env[i][varlen] == '=')))) {
391 __env[i] = ep;
392 return 0;
393 }
394 }
395
396 /*
397 * Not an existing variable. Fit it into an empty slot.
398 */
399 for (i = 0; i < __nenv-1; i++) {
400 if (__env[i] == (char *)0) {
401 __env[i] = ep;
402 return 0;
403 }
404 }
405
406 return KDB_ENVFULL;
407}
408
409static int kdb_check_regs(void)
410{
411 if (!kdb_current_regs) {
412 kdb_printf("No current kdb registers."
413 " You may need to select another task\n");
414 return KDB_BADREG;
415 }
416 return 0;
417}
418
419/*
420 * kdbgetaddrarg - This function is responsible for parsing an
421 * address-expression and returning the value of the expression,
422 * symbol name, and offset to the caller.
423 *
424 * The argument may consist of a numeric value (decimal or
425 * hexadecimal), a symbol name, a register name (preceded by the
426 * percent sign), an environment variable with a numeric value
427 * (preceded by a dollar sign) or a simple arithmetic expression
428 * consisting of a symbol name, +/-, and a numeric constant value
429 * (offset).
430 * Parameters:
431 * argc - count of arguments in argv
432 * argv - argument vector
433 * *nextarg - index to next unparsed argument in argv[]
434 * regs - Register state at time of KDB entry
435 * Outputs:
436 * *value - receives the value of the address-expression
437 * *offset - receives the offset specified, if any
438 * *name - receives the symbol name, if any
439 * *nextarg - index to next unparsed argument in argv[]
440 * Returns:
441 * zero is returned on success, a kdb diagnostic code is
442 * returned on error.
443 */
444int kdbgetaddrarg(int argc, const char **argv, int *nextarg,
445 unsigned long *value, long *offset,
446 char **name)
447{
448 unsigned long addr;
449 unsigned long off = 0;
450 int positive;
451 int diag;
452 int found = 0;
453 char *symname;
454 char symbol = '\0';
455 char *cp;
456 kdb_symtab_t symtab;
457
458 /*
459 * Process arguments which follow the following syntax:
460 *
461 * symbol | numeric-address [+/- numeric-offset]
462 * %register
463 * $environment-variable
464 */
465
466 if (*nextarg > argc)
467 return KDB_ARGCOUNT;
468
469 symname = (char *)argv[*nextarg];
470
471 /*
472 * If there is no whitespace between the symbol
473 * or address and the '+' or '-' symbols, we
474 * remember the character and replace it with a
475 * null so the symbol/value can be properly parsed
476 */
477 cp = strpbrk(symname, "+-");
478 if (cp != NULL) {
479 symbol = *cp;
480 *cp++ = '\0';
481 }
482
483 if (symname[0] == '$') {
484 diag = kdbgetulenv(&symname[1], &addr);
485 if (diag)
486 return diag;
487 } else if (symname[0] == '%') {
488 diag = kdb_check_regs();
489 if (diag)
490 return diag;
491 /* Implement register values with % at a later time as it is
492 * arch optional.
493 */
494 return KDB_NOTIMP;
495 } else {
496 found = kdbgetsymval(symname, &symtab);
497 if (found) {
498 addr = symtab.sym_start;
499 } else {
500 diag = kdbgetularg(argv[*nextarg], &addr);
501 if (diag)
502 return diag;
503 }
504 }
505
506 if (!found)
507 found = kdbnearsym(addr, &symtab);
508
509 (*nextarg)++;
510
511 if (name)
512 *name = symname;
513 if (value)
514 *value = addr;
515 if (offset && name && *name)
516 *offset = addr - symtab.sym_start;
517
518 if ((*nextarg > argc)
519 && (symbol == '\0'))
520 return 0;
521
522 /*
523 * check for +/- and offset
524 */
525
526 if (symbol == '\0') {
527 if ((argv[*nextarg][0] != '+')
528 && (argv[*nextarg][0] != '-')) {
529 /*
530 * Not our argument. Return.
531 */
532 return 0;
533 } else {
534 positive = (argv[*nextarg][0] == '+');
535 (*nextarg)++;
536 }
537 } else
538 positive = (symbol == '+');
539
540 /*
541 * Now there must be an offset!
542 */
543 if ((*nextarg > argc)
544 && (symbol == '\0')) {
545 return KDB_INVADDRFMT;
546 }
547
548 if (!symbol) {
549 cp = (char *)argv[*nextarg];
550 (*nextarg)++;
551 }
552
553 diag = kdbgetularg(cp, &off);
554 if (diag)
555 return diag;
556
557 if (!positive)
558 off = -off;
559
560 if (offset)
561 *offset += off;
562
563 if (value)
564 *value += off;
565
566 return 0;
567}
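
The parser above copes with expressions such as "symbol+0x10" even when there is no whitespace around the operator: strpbrk() finds the first '+' or '-', the character is remembered, and the operator position is overwritten with a NUL so the symbol and the offset can be resolved separately. A minimal sketch of just that splitting step (symbol lookup is out of scope here and omitted):

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/* Split "name+0x10" (or "name-8") into its parts, in the spirit of the
 * kdbgetaddrarg() front end.  Hypothetical demo only. */
static void split_addr_expr(char *expr)
{
    char sign = '\0';
    long off = 0;
    char *cp = strpbrk(expr, "+-");

    if (cp) {
        sign = *cp;
        *cp++ = '\0';                      /* terminate the symbol part */
        off = strtol(cp, NULL, 0);
        if (sign == '-')
            off = -off;
    }
    printf("symbol '%s', offset %ld\n", expr, off);
}

int main(void)
{
    char a[] = "schedule+0x10";
    char b[] = "jiffies";

    split_addr_expr(a);     /* symbol 'schedule', offset 16 */
    split_addr_expr(b);     /* symbol 'jiffies', offset 0   */
    return 0;
}
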
568
569static void kdb_cmderror(int diag)
570{
571 int i;
572
573 if (diag >= 0) {
574 kdb_printf("no error detected (diagnostic is %d)\n", diag);
575 return;
576 }
577
578 for (i = 0; i < __nkdb_err; i++) {
579 if (kdbmsgs[i].km_diag == diag) {
580 kdb_printf("diag: %d: %s\n", diag, kdbmsgs[i].km_msg);
581 return;
582 }
583 }
584
585 kdb_printf("Unknown diag %d\n", -diag);
586}
587
588/*
589 * kdb_defcmd, kdb_defcmd2 - These functions implement the 'defcmd'
590 * command which defines one command as a set of other commands,
591 * terminated by endefcmd. kdb_defcmd processes the initial
592 * 'defcmd' command, kdb_defcmd2 is invoked from kdb_parse for
593 * the following commands until 'endefcmd'.
594 * Inputs:
595 * argc argument count
596 * argv argument vector
597 * Returns:
598 * zero for success, a kdb diagnostic if error
599 */
600struct defcmd_set {
601 int count;
602 int usable;
603 char *name;
604 char *usage;
605 char *help;
606 char **command;
607};
608static struct defcmd_set *defcmd_set;
609static int defcmd_set_count;
610static int defcmd_in_progress;
611
612/* Forward references */
613static int kdb_exec_defcmd(int argc, const char **argv);
614
615static int kdb_defcmd2(const char *cmdstr, const char *argv0)
616{
617 struct defcmd_set *s = defcmd_set + defcmd_set_count - 1;
618 char **save_command = s->command;
619 if (strcmp(argv0, "endefcmd") == 0) {
620 defcmd_in_progress = 0;
621 if (!s->count)
622 s->usable = 0;
623 if (s->usable)
624 kdb_register(s->name, kdb_exec_defcmd,
625 s->usage, s->help, 0);
626 return 0;
627 }
628 if (!s->usable)
629 return KDB_NOTIMP;
630 s->command = kmalloc((s->count + 1) * sizeof(*(s->command)), GFP_KDB);
631 if (!s->command) {
632 kdb_printf("Could not allocate new kdb_defcmd table for %s\n",
633 cmdstr);
634 s->usable = 0;
635 return KDB_NOTIMP;
636 }
637 memcpy(s->command, save_command, s->count * sizeof(*(s->command)));
638 s->command[s->count++] = kdb_strdup(cmdstr, GFP_KDB);
639 kfree(save_command);
640 return 0;
641}
642
643static int kdb_defcmd(int argc, const char **argv)
644{
645 struct defcmd_set *save_defcmd_set = defcmd_set, *s;
646 if (defcmd_in_progress) {
647 kdb_printf("kdb: nested defcmd detected, assuming missing "
648 "endefcmd\n");
649 kdb_defcmd2("endefcmd", "endefcmd");
650 }
651 if (argc == 0) {
652 int i;
653 for (s = defcmd_set; s < defcmd_set + defcmd_set_count; ++s) {
654 kdb_printf("defcmd %s \"%s\" \"%s\"\n", s->name,
655 s->usage, s->help);
656 for (i = 0; i < s->count; ++i)
657 kdb_printf("%s", s->command[i]);
658 kdb_printf("endefcmd\n");
659 }
660 return 0;
661 }
662 if (argc != 3)
663 return KDB_ARGCOUNT;
664 defcmd_set = kmalloc((defcmd_set_count + 1) * sizeof(*defcmd_set),
665 GFP_KDB);
666 if (!defcmd_set) {
667 kdb_printf("Could not allocate new defcmd_set entry for %s\n",
668 argv[1]);
669 defcmd_set = save_defcmd_set;
670 return KDB_NOTIMP;
671 }
672 memcpy(defcmd_set, save_defcmd_set,
673 defcmd_set_count * sizeof(*defcmd_set));
674 kfree(save_defcmd_set);
675 s = defcmd_set + defcmd_set_count;
676 memset(s, 0, sizeof(*s));
677 s->usable = 1;
678 s->name = kdb_strdup(argv[1], GFP_KDB);
679 s->usage = kdb_strdup(argv[2], GFP_KDB);
680 s->help = kdb_strdup(argv[3], GFP_KDB);
681 if (s->usage[0] == '"') {
682 strcpy(s->usage, s->usage+1);
683 s->usage[strlen(s->usage)-1] = '\0';
684 }
685 if (s->help[0] == '"') {
686 strcpy(s->help, s->help+1);
687 s->help[strlen(s->help)-1] = '\0';
688 }
689 ++defcmd_set_count;
690 defcmd_in_progress = 1;
691 return 0;
692}
693
694/*
695 * kdb_exec_defcmd - Execute the set of commands associated with this
696 * defcmd name.
697 * Inputs:
698 * argc argument count
699 * argv argument vector
700 * Returns:
701 * zero for success, a kdb diagnostic if error
702 */
703static int kdb_exec_defcmd(int argc, const char **argv)
704{
705 int i, ret;
706 struct defcmd_set *s;
707 if (argc != 0)
708 return KDB_ARGCOUNT;
709 for (s = defcmd_set, i = 0; i < defcmd_set_count; ++i, ++s) {
710 if (strcmp(s->name, argv[0]) == 0)
711 break;
712 }
713 if (i == defcmd_set_count) {
714 kdb_printf("kdb_exec_defcmd: could not find commands for %s\n",
715 argv[0]);
716 return KDB_NOTIMP;
717 }
718 for (i = 0; i < s->count; ++i) {
719 /* Recursive use of kdb_parse, do not use argv after
720 * this point */
721 argv = NULL;
722 kdb_printf("[%s]kdb> %s\n", s->name, s->command[i]);
723 ret = kdb_parse(s->command[i]);
724 if (ret)
725 return ret;
726 }
727 return 0;
728}
729
730/* Command history */
731#define KDB_CMD_HISTORY_COUNT 32
732#define CMD_BUFLEN 200 /* kdb_printf: max printline
733 * size == 256 */
734static unsigned int cmd_head, cmd_tail;
735static unsigned int cmdptr;
736static char cmd_hist[KDB_CMD_HISTORY_COUNT][CMD_BUFLEN];
737static char cmd_cur[CMD_BUFLEN];
738
739/*
740 * The "str" argument may point to something like | grep xyz
741 */
742static void parse_grep(const char *str)
743{
744 int len;
745 char *cp = (char *)str, *cp2;
746
747 /* sanity check: we should have been called with the '|' first */
748 if (*cp != '|')
749 return;
750 cp++;
751 while (isspace(*cp))
752 cp++;
753 if (strncmp(cp, "grep ", 5)) {
754 kdb_printf("invalid 'pipe', see grephelp\n");
755 return;
756 }
757 cp += 5;
758 while (isspace(*cp))
759 cp++;
760 cp2 = strchr(cp, '\n');
761 if (cp2)
762 *cp2 = '\0'; /* remove the trailing newline */
763 len = strlen(cp);
764 if (len == 0) {
765 kdb_printf("invalid 'pipe', see grephelp\n");
766 return;
767 }
768 /* now cp points to a nonzero length search string */
769 if (*cp == '"') {
770 /* allow it to be "x y z" by removing the "'s - there must
771 be two of them */
772 cp++;
773 cp2 = strchr(cp, '"');
774 if (!cp2) {
775 kdb_printf("invalid quoted string, see grephelp\n");
776 return;
777 }
778 *cp2 = '\0'; /* end the string where the 2nd " was */
779 }
780 kdb_grep_leading = 0;
781 if (*cp == '^') {
782 kdb_grep_leading = 1;
783 cp++;
784 }
785 len = strlen(cp);
786 kdb_grep_trailing = 0;
787 if (*(cp+len-1) == '$') {
788 kdb_grep_trailing = 1;
789 *(cp+len-1) = '\0';
790 }
791 len = strlen(cp);
792 if (!len)
793 return;
794 if (len >= GREP_LEN) {
795 kdb_printf("search string too long\n");
796 return;
797 }
798 strcpy(kdb_grep_string, cp);
799 kdb_grepping_flag++;
800 return;
801}
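
parse_grep() above strips optional surrounding quotes and records whether the pattern carried a leading '^' or trailing '$' anchor; the actual matching happens later in kdb's output path (kdb_io.c). The helper below is a purely illustrative sketch of how a single output line could be tested against such an anchored pattern:

#include <stdio.h>
#include <string.h>

/* Match 'line' against 'pat', honouring optional ^/$ anchors. */
static int grep_match(const char *line, const char *pat,
                      int leading, int trailing)
{
    size_t plen = strlen(pat), llen = strlen(line);

    if (leading && trailing)
        return llen == plen && strcmp(line, pat) == 0;
    if (leading)
        return strncmp(line, pat, plen) == 0;       /* prefix match */
    if (trailing)
        return llen >= plen && strcmp(line + llen - plen, pat) == 0;
    return strstr(line, pat) != NULL;               /* substring match */
}

int main(void)
{
    printf("%d\n", grep_match("kdb_main.c", "kdb", 1, 0));   /* 1 */
    printf("%d\n", grep_match("kdb_main.c", "main", 1, 0));  /* 0 */
    printf("%d\n", grep_match("kdb_main.c", ".c", 0, 1));    /* 1 */
    return 0;
}
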
802
803/*
804 * kdb_parse - Parse the command line, search the command table for a
805 * matching command and invoke the command function. This
806 * function may be called recursively, if it is, the second call
807 * will overwrite argv and cbuf. It is the caller's
808 * responsibility to save their argv if they recursively call
809 * kdb_parse().
810 * Parameters:
811 * cmdstr The input command line to be parsed.
812 * regs The registers at the time kdb was entered.
813 * Returns:
814 * Zero for success, a kdb diagnostic if failure.
815 * Remarks:
816 * Limited to 20 tokens.
817 *
818 * Real rudimentary tokenization. Basically only whitespace
819 * is considered a token delimiter (but special consideration
820 * is taken of the '=' sign as used by the 'set' command).
821 *
822 * The algorithm used to tokenize the input string relies on
823 * there being at least one whitespace (or otherwise useless)
824 * character between tokens as the character immediately following
825 * the token is altered in-place to a null-byte to terminate the
826 * token string.
827 */
828
829#define MAXARGC 20
830
831int kdb_parse(const char *cmdstr)
832{
833 static char *argv[MAXARGC];
834 static int argc;
835 static char cbuf[CMD_BUFLEN+2];
836 char *cp;
837 char *cpp, quoted;
838 kdbtab_t *tp;
839 int i, escaped, ignore_errors = 0, check_grep;
840
841 /*
842 * First tokenize the command string.
843 */
844 cp = (char *)cmdstr;
845 kdb_grepping_flag = check_grep = 0;
846
847 if (KDB_FLAG(CMD_INTERRUPT)) {
848 /* Previous command was interrupted, newline must not
849 * repeat the command */
850 KDB_FLAG_CLEAR(CMD_INTERRUPT);
851 KDB_STATE_SET(PAGER);
852 argc = 0; /* no repeat */
853 }
854
855 if (*cp != '\n' && *cp != '\0') {
856 argc = 0;
857 cpp = cbuf;
858 while (*cp) {
859 /* skip whitespace */
860 while (isspace(*cp))
861 cp++;
862 if ((*cp == '\0') || (*cp == '\n') ||
863 (*cp == '#' && !defcmd_in_progress))
864 break;
865 /* special case: check for | grep pattern */
866 if (*cp == '|') {
867 check_grep++;
868 break;
869 }
870 if (cpp >= cbuf + CMD_BUFLEN) {
871 kdb_printf("kdb_parse: command buffer "
872 "overflow, command ignored\n%s\n",
873 cmdstr);
874 return KDB_NOTFOUND;
875 }
876 if (argc >= MAXARGC - 1) {
877 kdb_printf("kdb_parse: too many arguments, "
878 "command ignored\n%s\n", cmdstr);
879 return KDB_NOTFOUND;
880 }
881 argv[argc++] = cpp;
882 escaped = 0;
883 quoted = '\0';
884 /* Copy to next unquoted and unescaped
885 * whitespace or '=' */
886 while (*cp && *cp != '\n' &&
887 (escaped || quoted || !isspace(*cp))) {
888 if (cpp >= cbuf + CMD_BUFLEN)
889 break;
890 if (escaped) {
891 escaped = 0;
892 *cpp++ = *cp++;
893 continue;
894 }
895 if (*cp == '\\') {
896 escaped = 1;
897 ++cp;
898 continue;
899 }
900 if (*cp == quoted)
901 quoted = '\0';
902 else if (*cp == '\'' || *cp == '"')
903 quoted = *cp;
904 *cpp = *cp++;
905 if (*cpp == '=' && !quoted)
906 break;
907 ++cpp;
908 }
909 *cpp++ = '\0'; /* Squash a ws or '=' character */
910 }
911 }
912 if (!argc)
913 return 0;
914 if (check_grep)
915 parse_grep(cp);
916 if (defcmd_in_progress) {
917 int result = kdb_defcmd2(cmdstr, argv[0]);
918 if (!defcmd_in_progress) {
919 argc = 0; /* avoid repeat on endefcmd */
920 *(argv[0]) = '\0';
921 }
922 return result;
923 }
924 if (argv[0][0] == '-' && argv[0][1] &&
925 (argv[0][1] < '0' || argv[0][1] > '9')) {
926 ignore_errors = 1;
927 ++argv[0];
928 }
929
930 for_each_kdbcmd(tp, i) {
931 if (tp->cmd_name) {
932 /*
933 * If this command is allowed to be abbreviated,
934 * check to see if this is it.
935 */
936
937 if (tp->cmd_minlen
938 && (strlen(argv[0]) <= tp->cmd_minlen)) {
939 if (strncmp(argv[0],
940 tp->cmd_name,
941 tp->cmd_minlen) == 0) {
942 break;
943 }
944 }
945
946 if (strcmp(argv[0], tp->cmd_name) == 0)
947 break;
948 }
949 }
950
951 /*
952 * If we don't find a command by this name, see if the first
953 * few characters of this match any of the known commands.
954 * e.g., md1c20 should match md.
955 */
956 if (i == kdb_max_commands) {
957 for_each_kdbcmd(tp, i) {
958 if (tp->cmd_name) {
959 if (strncmp(argv[0],
960 tp->cmd_name,
961 strlen(tp->cmd_name)) == 0) {
962 break;
963 }
964 }
965 }
966 }
967
968 if (i < kdb_max_commands) {
969 int result;
970 KDB_STATE_SET(CMD);
971 result = (*tp->cmd_func)(argc-1, (const char **)argv);
972 if (result && ignore_errors && result > KDB_CMD_GO)
973 result = 0;
974 KDB_STATE_CLEAR(CMD);
975 switch (tp->cmd_repeat) {
976 case KDB_REPEAT_NONE:
977 argc = 0;
978 if (argv[0])
979 *(argv[0]) = '\0';
980 break;
981 case KDB_REPEAT_NO_ARGS:
982 argc = 1;
983 if (argv[1])
984 *(argv[1]) = '\0';
985 break;
986 case KDB_REPEAT_WITH_ARGS:
987 break;
988 }
989 return result;
990 }
991
992 /*
993 * If the input with which we were presented does not
994 * map to an existing command, attempt to parse it as an
995 * address argument and display the result. Useful for
996 * obtaining the address of a variable, or the nearest symbol
997 * to an address contained in a register.
998 */
999 {
1000 unsigned long value;
1001 char *name = NULL;
1002 long offset;
1003 int nextarg = 0;
1004
1005 if (kdbgetaddrarg(0, (const char **)argv, &nextarg,
1006 &value, &offset, &name)) {
1007 return KDB_NOTFOUND;
1008 }
1009
1010 kdb_printf("%s = ", argv[0]);
1011 kdb_symbol_print(value, NULL, KDB_SP_DEFAULT);
1012 kdb_printf("\n");
1013 return 0;
1014 }
1015}
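
As the kdb_parse() comment explains, tokenization is done in place: each argv[] entry points into one command buffer and the delimiter after every token is overwritten with a NUL. A compact standalone sketch of that style of whitespace tokenizer (no quoting, escaping, or '=' handling, unlike the full parser above):

#include <stdio.h>
#include <ctype.h>

#define MAXTOK 20

/* In-place whitespace tokenizer: argv[] entries point into 'buf'. */
static int tokenize(char *buf, char *argv[], int maxtok)
{
    int argc = 0;

    while (*buf && argc < maxtok) {
        while (isspace((unsigned char)*buf))
            buf++;
        if (!*buf)
            break;
        argv[argc++] = buf;
        while (*buf && !isspace((unsigned char)*buf))
            buf++;
        if (*buf)
            *buf++ = '\0';          /* squash the delimiter */
    }
    return argc;
}

int main(void)
{
    char cmd[] = "md 0xc0000000 2 16";
    char *argv[MAXTOK];
    int argc = tokenize(cmd, argv, MAXTOK);

    for (int i = 0; i < argc; i++)
        printf("argv[%d] = '%s'\n", i, argv[i]);
    return 0;
}
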
1016
1017
1018static int handle_ctrl_cmd(char *cmd)
1019{
1020#define CTRL_P 16
1021#define CTRL_N 14
1022
1023 /* initial situation */
1024 if (cmd_head == cmd_tail)
1025 return 0;
1026 switch (*cmd) {
1027 case CTRL_P:
1028 if (cmdptr != cmd_tail)
1029 cmdptr = (cmdptr-1) % KDB_CMD_HISTORY_COUNT;
1030 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1031 return 1;
1032 case CTRL_N:
1033 if (cmdptr != cmd_head)
1034 cmdptr = (cmdptr+1) % KDB_CMD_HISTORY_COUNT;
1035 strncpy(cmd_cur, cmd_hist[cmdptr], CMD_BUFLEN);
1036 return 1;
1037 }
1038 return 0;
1039}
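
handle_ctrl_cmd() above walks a fixed-size ring of saved commands: CTRL-P moves the cursor back towards cmd_tail and CTRL-N forward towards cmd_head, both modulo the history size. A small sketch of the same ring-cursor arithmetic; the size and helper names are illustrative, and the wrap-around is written out explicitly so it works for any ring size:

#include <stdio.h>

#define HIST_COUNT 8

static unsigned int hist_prev(unsigned int ptr, unsigned int tail)
{
    if (ptr != tail)
        ptr = (ptr + HIST_COUNT - 1) % HIST_COUNT;  /* CTRL-P: older */
    return ptr;
}

static unsigned int hist_next(unsigned int ptr, unsigned int head)
{
    if (ptr != head)
        ptr = (ptr + 1) % HIST_COUNT;               /* CTRL-N: newer */
    return ptr;
}

int main(void)
{
    unsigned int head = 3, tail = 0, ptr = head;    /* 3 saved commands */

    ptr = hist_prev(ptr, tail);     /* slot 2: most recent command */
    ptr = hist_prev(ptr, tail);     /* slot 1 */
    ptr = hist_next(ptr, head);     /* back to slot 2 */
    printf("cursor at slot %u\n", ptr);
    return 0;
}
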
1040
1041/*
1042 * kdb_reboot - This function implements the 'reboot' command. Reboot
1043 * the system immediately, or loop for ever on failure.
1044 */
1045static int kdb_reboot(int argc, const char **argv)
1046{
1047 emergency_restart();
1048 kdb_printf("Hmm, kdb_reboot did not reboot, spinning here\n");
1049 while (1)
1050 cpu_relax();
1051 /* NOTREACHED */
1052 return 0;
1053}
1054
1055static void kdb_dumpregs(struct pt_regs *regs)
1056{
1057 int old_lvl = console_loglevel;
1058 console_loglevel = 15;
1059 kdb_trap_printk++;
1060 show_regs(regs);
1061 kdb_trap_printk--;
1062 kdb_printf("\n");
1063 console_loglevel = old_lvl;
1064}
1065
1066void kdb_set_current_task(struct task_struct *p)
1067{
1068 kdb_current_task = p;
1069
1070 if (kdb_task_has_cpu(p)) {
1071 kdb_current_regs = KDB_TSKREGS(kdb_process_cpu(p));
1072 return;
1073 }
1074 kdb_current_regs = NULL;
1075}
1076
1077/*
1078 * kdb_local - The main code for kdb. This routine is invoked on a
1079 * specific processor, it is not global. The main kdb() routine
1080 * ensures that only one processor at a time is in this routine.
1081 * This code is called with the real reason code on the first
1082 * entry to a kdb session, thereafter it is called with reason
1083 * SWITCH, even if the user goes back to the original cpu.
1084 * Inputs:
1085 * reason The reason KDB was invoked
1086 * error The hardware-defined error code
1087 * regs The exception frame at time of fault/breakpoint.
1088 * db_result Result code from the break or debug point.
1089 * Returns:
1090 * 0 KDB was invoked for an event for which it was not responsible
1091 * 1 KDB handled the event for which it was invoked.
1092 * KDB_CMD_GO User typed 'go'.
1093 * KDB_CMD_CPU User switched to another cpu.
1094 * KDB_CMD_SS Single step.
1095 * KDB_CMD_SSB Single step until branch.
1096 */
1097static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
1098 kdb_dbtrap_t db_result)
1099{
1100 char *cmdbuf;
1101 int diag;
1102 struct task_struct *kdb_current =
1103 kdb_curr_task(raw_smp_processor_id());
1104
1105 KDB_DEBUG_STATE("kdb_local 1", reason);
1106 kdb_go_count = 0;
1107 if (reason == KDB_REASON_DEBUG) {
1108 /* special case below */
1109 } else {
1110 kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
1111 kdb_current, kdb_current->pid);
1112#if defined(CONFIG_SMP)
1113 kdb_printf("on processor %d ", raw_smp_processor_id());
1114#endif
1115 }
1116
1117 switch (reason) {
1118 case KDB_REASON_DEBUG:
1119 {
1120 /*
1121 * If re-entering kdb after a single step
1122 * command, don't print the message.
1123 */
1124 switch (db_result) {
1125 case KDB_DB_BPT:
1126 kdb_printf("\nEntering kdb (0x%p, pid %d) ",
1127 kdb_current, kdb_current->pid);
1128#if defined(CONFIG_SMP)
1129 kdb_printf("on processor %d ", raw_smp_processor_id());
1130#endif
1131 kdb_printf("due to Debug @ " kdb_machreg_fmt "\n",
1132 instruction_pointer(regs));
1133 break;
1134 case KDB_DB_SSB:
1135 /*
1136 * In the midst of ssb command. Just return.
1137 */
1138 KDB_DEBUG_STATE("kdb_local 3", reason);
1139 return KDB_CMD_SSB; /* Continue with SSB command */
1140
1141 break;
1142 case KDB_DB_SS:
1143 break;
1144 case KDB_DB_SSBPT:
1145 KDB_DEBUG_STATE("kdb_local 4", reason);
1146 return 1; /* kdba_db_trap did the work */
1147 default:
1148 kdb_printf("kdb: Bad result from kdba_db_trap: %d\n",
1149 db_result);
1150 break;
1151 }
1152
1153 }
1154 break;
1155 case KDB_REASON_ENTER:
1156 if (KDB_STATE(KEYBOARD))
1157 kdb_printf("due to Keyboard Entry\n");
1158 else
1159 kdb_printf("due to KDB_ENTER()\n");
1160 break;
1161 case KDB_REASON_KEYBOARD:
1162 KDB_STATE_SET(KEYBOARD);
1163 kdb_printf("due to Keyboard Entry\n");
1164 break;
1165 case KDB_REASON_ENTER_SLAVE:
1166 /* drop through, slaves only get released via cpu switch */
1167 case KDB_REASON_SWITCH:
1168 kdb_printf("due to cpu switch\n");
1169 break;
1170 case KDB_REASON_OOPS:
1171 kdb_printf("Oops: %s\n", kdb_diemsg);
1172 kdb_printf("due to oops @ " kdb_machreg_fmt "\n",
1173 instruction_pointer(regs));
1174 kdb_dumpregs(regs);
1175 break;
1176 case KDB_REASON_NMI:
1177 kdb_printf("due to NonMaskable Interrupt @ "
1178 kdb_machreg_fmt "\n",
1179 instruction_pointer(regs));
1180 kdb_dumpregs(regs);
1181 break;
1182 case KDB_REASON_SSTEP:
1183 case KDB_REASON_BREAK:
1184 kdb_printf("due to %s @ " kdb_machreg_fmt "\n",
1185 reason == KDB_REASON_BREAK ?
1186 "Breakpoint" : "SS trap", instruction_pointer(regs));
1187 /*
1188 * Determine if this breakpoint is one that we
1189 * are interested in.
1190 */
1191 if (db_result != KDB_DB_BPT) {
1192 kdb_printf("kdb: error return from kdba_bp_trap: %d\n",
1193 db_result);
1194 KDB_DEBUG_STATE("kdb_local 6", reason);
1195 return 0; /* Not for us, dismiss it */
1196 }
1197 break;
1198 case KDB_REASON_RECURSE:
1199 kdb_printf("due to Recursion @ " kdb_machreg_fmt "\n",
1200 instruction_pointer(regs));
1201 break;
1202 default:
1203 kdb_printf("kdb: unexpected reason code: %d\n", reason);
1204 KDB_DEBUG_STATE("kdb_local 8", reason);
1205 return 0; /* Not for us, dismiss it */
1206 }
1207
1208 while (1) {
1209 /*
1210 * Initialize pager context.
1211 */
1212 kdb_nextline = 1;
1213 KDB_STATE_CLEAR(SUPPRESS);
1214
1215 cmdbuf = cmd_cur;
1216 *cmdbuf = '\0';
1217 *(cmd_hist[cmd_head]) = '\0';
1218
1219 if (KDB_FLAG(ONLY_DO_DUMP)) {
1220 /* kdb is off but a catastrophic error requires a dump.
1221 * Take the dump and reboot.
1222 * Turn on logging so the kdb output appears in the log
1223 * buffer in the dump.
1224 */
1225 const char *setargs[] = { "set", "LOGGING", "1" };
1226 kdb_set(2, setargs);
1227 kdb_reboot(0, NULL);
1228 /*NOTREACHED*/
1229 }
1230
1231do_full_getstr:
1232#if defined(CONFIG_SMP)
1233 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"),
1234 raw_smp_processor_id());
1235#else
1236 snprintf(kdb_prompt_str, CMD_BUFLEN, kdbgetenv("PROMPT"));
1237#endif
1238 if (defcmd_in_progress)
1239 strncat(kdb_prompt_str, "[defcmd]", CMD_BUFLEN);
1240
1241 /*
1242 * Fetch command from keyboard
1243 */
1244 cmdbuf = kdb_getstr(cmdbuf, CMD_BUFLEN, kdb_prompt_str);
1245 if (*cmdbuf != '\n') {
1246 if (*cmdbuf < 32) {
1247 if (cmdptr == cmd_head) {
1248 strncpy(cmd_hist[cmd_head], cmd_cur,
1249 CMD_BUFLEN);
1250 *(cmd_hist[cmd_head] +
1251 strlen(cmd_hist[cmd_head])-1) = '\0';
1252 }
1253 if (!handle_ctrl_cmd(cmdbuf))
1254 *(cmd_cur+strlen(cmd_cur)-1) = '\0';
1255 cmdbuf = cmd_cur;
1256 goto do_full_getstr;
1257 } else {
1258 strncpy(cmd_hist[cmd_head], cmd_cur,
1259 CMD_BUFLEN);
1260 }
1261
1262 cmd_head = (cmd_head+1) % KDB_CMD_HISTORY_COUNT;
1263 if (cmd_head == cmd_tail)
1264 cmd_tail = (cmd_tail+1) % KDB_CMD_HISTORY_COUNT;
1265 }
1266
1267 cmdptr = cmd_head;
1268 diag = kdb_parse(cmdbuf);
1269 if (diag == KDB_NOTFOUND) {
1270 kdb_printf("Unknown kdb command: '%s'\n", cmdbuf);
1271 diag = 0;
1272 }
1273 if (diag == KDB_CMD_GO
1274 || diag == KDB_CMD_CPU
1275 || diag == KDB_CMD_SS
1276 || diag == KDB_CMD_SSB
1277 || diag == KDB_CMD_KGDB)
1278 break;
1279
1280 if (diag)
1281 kdb_cmderror(diag);
1282 }
1283 KDB_DEBUG_STATE("kdb_local 9", diag);
1284 return diag;
1285}
1286
1287
1288/*
1289 * kdb_print_state - Print the state data for the current processor
1290 * for debugging.
1291 * Inputs:
1292 * text Identifies the debug point
1293 * value Any integer value to be printed, e.g. reason code.
1294 */
1295void kdb_print_state(const char *text, int value)
1296{
1297 kdb_printf("state: %s cpu %d value %d initial %d state %x\n",
1298 text, raw_smp_processor_id(), value, kdb_initial_cpu,
1299 kdb_state);
1300}
1301
1302/*
1303 * kdb_main_loop - After initial setup and assignment of the
1304 * controlling cpu, all cpus are in this loop. One cpu is in
1305 * control and will issue the kdb prompt, the others will spin
1306 * until 'go' or cpu switch.
1307 *
1308 * To get a consistent view of the kernel stacks for all
1309 * processes, this routine is invoked from the main kdb code via
1310 * an architecture specific routine. kdba_main_loop is
1311 * responsible for making the kernel stacks consistent for all
1312 * processes, there should be no difference between a blocked
1313 * process and a running process as far as kdb is concerned.
1314 * Inputs:
1315 * reason The reason KDB was invoked
1316 * error The hardware-defined error code
1317 * reason2 kdb's current reason code.
1318 * Initially error but can change
1319 * according to kdb state.
1320 * db_result Result code from break or debug point.
1321 * regs The exception frame at time of fault/breakpoint.
1322 * should always be valid.
1323 * Returns:
1324 * 0 KDB was invoked for an event for which it was not responsible
1325 * 1 KDB handled the event for which it was invoked.
1326 */
1327int kdb_main_loop(kdb_reason_t reason, kdb_reason_t reason2, int error,
1328 kdb_dbtrap_t db_result, struct pt_regs *regs)
1329{
1330 int result = 1;
1331 /* Stay in kdb() until 'go', 'ss[b]' or an error */
1332 while (1) {
1333 /*
1334 * All processors except the one that is in control
1335 * will spin here.
1336 */
1337 KDB_DEBUG_STATE("kdb_main_loop 1", reason);
1338 while (KDB_STATE(HOLD_CPU)) {
1339 /* state KDB is turned off by kdb_cpu to see if the
1340 * other cpus are still live, each cpu in this loop
1341 * turns it back on.
1342 */
1343 if (!KDB_STATE(KDB))
1344 KDB_STATE_SET(KDB);
1345 }
1346
1347 KDB_STATE_CLEAR(SUPPRESS);
1348 KDB_DEBUG_STATE("kdb_main_loop 2", reason);
1349 if (KDB_STATE(LEAVING))
1350 break; /* Another cpu said 'go' */
1351 /* Still using kdb, this processor is in control */
1352 result = kdb_local(reason2, error, regs, db_result);
1353 KDB_DEBUG_STATE("kdb_main_loop 3", result);
1354
1355 if (result == KDB_CMD_CPU)
1356 break;
1357
1358 if (result == KDB_CMD_SS) {
1359 KDB_STATE_SET(DOING_SS);
1360 break;
1361 }
1362
1363 if (result == KDB_CMD_SSB) {
1364 KDB_STATE_SET(DOING_SS);
1365 KDB_STATE_SET(DOING_SSB);
1366 break;
1367 }
1368
1369 if (result == KDB_CMD_KGDB) {
1370 if (!(KDB_STATE(DOING_KGDB) || KDB_STATE(DOING_KGDB2)))
1371 kdb_printf("Entering please attach debugger "
1372 "or use $D#44+ or $3#33\n");
1373 break;
1374 }
1375 if (result && result != 1 && result != KDB_CMD_GO)
1376 kdb_printf("\nUnexpected kdb_local return code %d\n",
1377 result);
1378 KDB_DEBUG_STATE("kdb_main_loop 4", reason);
1379 break;
1380 }
1381 if (KDB_STATE(DOING_SS))
1382 KDB_STATE_CLEAR(SSBPT);
1383
1384 return result;
1385}
1386
1387/*
1388 * kdb_mdr - This function implements the guts of the 'mdr', memory
1389 * read command.
1390 * mdr <addr arg>,<byte count>
1391 * Inputs:
1392 * addr Start address
1393 * count Number of bytes
1394 * Returns:
1395 * Always 0. Any errors are detected and printed by kdb_getarea.
1396 */
1397static int kdb_mdr(unsigned long addr, unsigned int count)
1398{
1399 unsigned char c;
1400 while (count--) {
1401 if (kdb_getarea(c, addr))
1402 return 0;
1403 kdb_printf("%02x", c);
1404 addr++;
1405 }
1406 kdb_printf("\n");
1407 return 0;
1408}
1409
1410/*
1411 * kdb_md - This function implements the 'md', 'md1', 'md2', 'md4',
1412 * 'md8', 'mdr' and 'mds' commands.
1413 *
1414 * md|mds [<addr arg> [<line count> [<radix>]]]
1415 * mdWcN [<addr arg> [<line count> [<radix>]]]
1416 * where W is the width (1, 2, 4 or 8) and N is the count.
1417 * e.g., md1c20 reads 20 bytes, 1 at a time.
1418 * mdr <addr arg>,<byte count>
1419 */
1420static void kdb_md_line(const char *fmtstr, unsigned long addr,
1421 int symbolic, int nosect, int bytesperword,
1422 int num, int repeat, int phys)
1423{
1424 /* print just one line of data */
1425 kdb_symtab_t symtab;
1426 char cbuf[32];
1427 char *c = cbuf;
1428 int i;
1429 unsigned long word;
1430
1431 memset(cbuf, '\0', sizeof(cbuf));
1432 if (phys)
1433 kdb_printf("phys " kdb_machreg_fmt0 " ", addr);
1434 else
1435 kdb_printf(kdb_machreg_fmt0 " ", addr);
1436
1437 for (i = 0; i < num && repeat--; i++) {
1438 if (phys) {
1439 if (kdb_getphysword(&word, addr, bytesperword))
1440 break;
1441 } else if (kdb_getword(&word, addr, bytesperword))
1442 break;
1443 kdb_printf(fmtstr, word);
1444 if (symbolic)
1445 kdbnearsym(word, &symtab);
1446 else
1447 memset(&symtab, 0, sizeof(symtab));
1448 if (symtab.sym_name) {
1449 kdb_symbol_print(word, &symtab, 0);
1450 if (!nosect) {
1451 kdb_printf("\n");
1452 kdb_printf(" %s %s "
1453 kdb_machreg_fmt " "
1454 kdb_machreg_fmt " "
1455 kdb_machreg_fmt, symtab.mod_name,
1456 symtab.sec_name, symtab.sec_start,
1457 symtab.sym_start, symtab.sym_end);
1458 }
1459 addr += bytesperword;
1460 } else {
1461 union {
1462 u64 word;
1463 unsigned char c[8];
1464 } wc;
1465 unsigned char *cp;
1466#ifdef __BIG_ENDIAN
1467 cp = wc.c + 8 - bytesperword;
1468#else
1469 cp = wc.c;
1470#endif
1471 wc.word = word;
1472#define printable_char(c) \
1473 ({unsigned char __c = c; isascii(__c) && isprint(__c) ? __c : '.'; })
1474 switch (bytesperword) {
1475 case 8:
1476 *c++ = printable_char(*cp++);
1477 *c++ = printable_char(*cp++);
1478 *c++ = printable_char(*cp++);
1479 *c++ = printable_char(*cp++);
1480 addr += 4;
1481 case 4:
1482 *c++ = printable_char(*cp++);
1483 *c++ = printable_char(*cp++);
1484 addr += 2;
1485 case 2:
1486 *c++ = printable_char(*cp++);
1487 addr++;
1488 case 1:
1489 *c++ = printable_char(*cp++);
1490 addr++;
1491 break;
1492 }
1493#undef printable_char
1494 }
1495 }
1496 kdb_printf("%*s %s\n", (int)((num-i)*(2*bytesperword + 1)+1),
1497 " ", cbuf);
1498}
1499
1500static int kdb_md(int argc, const char **argv)
1501{
1502 static unsigned long last_addr;
1503 static int last_radix, last_bytesperword, last_repeat;
1504 int radix = 16, mdcount = 8, bytesperword = KDB_WORD_SIZE, repeat;
1505 int nosect = 0;
1506 char fmtchar, fmtstr[64];
1507 unsigned long addr;
1508 unsigned long word;
1509 long offset = 0;
1510 int symbolic = 0;
1511 int valid = 0;
1512 int phys = 0;
1513
1514 kdbgetintenv("MDCOUNT", &mdcount);
1515 kdbgetintenv("RADIX", &radix);
1516 kdbgetintenv("BYTESPERWORD", &bytesperword);
1517
1518 /* Assume 'md <addr>' and start with environment values */
1519 repeat = mdcount * 16 / bytesperword;
1520
1521 if (strcmp(argv[0], "mdr") == 0) {
1522 if (argc != 2)
1523 return KDB_ARGCOUNT;
1524 valid = 1;
1525 } else if (isdigit(argv[0][2])) {
1526 bytesperword = (int)(argv[0][2] - '0');
1527 if (bytesperword == 0) {
1528 bytesperword = last_bytesperword;
1529 if (bytesperword == 0)
1530 bytesperword = 4;
1531 }
1532 last_bytesperword = bytesperword;
1533 repeat = mdcount * 16 / bytesperword;
1534 if (!argv[0][3])
1535 valid = 1;
1536 else if (argv[0][3] == 'c' && argv[0][4]) {
1537 char *p;
1538 repeat = simple_strtoul(argv[0] + 4, &p, 10);
1539 mdcount = ((repeat * bytesperword) + 15) / 16;
1540 valid = !*p;
1541 }
1542 last_repeat = repeat;
1543 } else if (strcmp(argv[0], "md") == 0)
1544 valid = 1;
1545 else if (strcmp(argv[0], "mds") == 0)
1546 valid = 1;
1547 else if (strcmp(argv[0], "mdp") == 0) {
1548 phys = valid = 1;
1549 }
1550 if (!valid)
1551 return KDB_NOTFOUND;
1552
1553 if (argc == 0) {
1554 if (last_addr == 0)
1555 return KDB_ARGCOUNT;
1556 addr = last_addr;
1557 radix = last_radix;
1558 bytesperword = last_bytesperword;
1559 repeat = last_repeat;
1560 mdcount = ((repeat * bytesperword) + 15) / 16;
1561 }
1562
1563 if (argc) {
1564 unsigned long val;
1565 int diag, nextarg = 1;
1566 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr,
1567 &offset, NULL);
1568 if (diag)
1569 return diag;
1570 if (argc > nextarg+2)
1571 return KDB_ARGCOUNT;
1572
1573 if (argc >= nextarg) {
1574 diag = kdbgetularg(argv[nextarg], &val);
1575 if (!diag) {
1576 mdcount = (int) val;
1577 repeat = mdcount * 16 / bytesperword;
1578 }
1579 }
1580 if (argc >= nextarg+1) {
1581 diag = kdbgetularg(argv[nextarg+1], &val);
1582 if (!diag)
1583 radix = (int) val;
1584 }
1585 }
1586
1587 if (strcmp(argv[0], "mdr") == 0)
1588 return kdb_mdr(addr, mdcount);
1589
1590 switch (radix) {
1591 case 10:
1592 fmtchar = 'd';
1593 break;
1594 case 16:
1595 fmtchar = 'x';
1596 break;
1597 case 8:
1598 fmtchar = 'o';
1599 break;
1600 default:
1601 return KDB_BADRADIX;
1602 }
1603
1604 last_radix = radix;
1605
1606 if (bytesperword > KDB_WORD_SIZE)
1607 return KDB_BADWIDTH;
1608
1609 switch (bytesperword) {
1610 case 8:
1611 sprintf(fmtstr, "%%16.16l%c ", fmtchar);
1612 break;
1613 case 4:
1614 sprintf(fmtstr, "%%8.8l%c ", fmtchar);
1615 break;
1616 case 2:
1617 sprintf(fmtstr, "%%4.4l%c ", fmtchar);
1618 break;
1619 case 1:
1620 sprintf(fmtstr, "%%2.2l%c ", fmtchar);
1621 break;
1622 default:
1623 return KDB_BADWIDTH;
1624 }
1625
1626 last_repeat = repeat;
1627 last_bytesperword = bytesperword;
1628
1629 if (strcmp(argv[0], "mds") == 0) {
1630 symbolic = 1;
1631 /* Do not save these changes as last_*, they are temporary mds
1632 * overrides.
1633 */
1634 bytesperword = KDB_WORD_SIZE;
1635 repeat = mdcount;
1636 kdbgetintenv("NOSECT", &nosect);
1637 }
1638
1639 /* Round address down modulo BYTESPERWORD */
1640
1641 addr &= ~(bytesperword-1);
1642
1643 while (repeat > 0) {
1644 unsigned long a;
1645 int n, z, num = (symbolic ? 1 : (16 / bytesperword));
1646
1647 if (KDB_FLAG(CMD_INTERRUPT))
1648 return 0;
1649 for (a = addr, z = 0; z < repeat; a += bytesperword, ++z) {
1650 if (phys) {
1651 if (kdb_getphysword(&word, a, bytesperword)
1652 || word)
1653 break;
1654 } else if (kdb_getword(&word, a, bytesperword) || word)
1655 break;
1656 }
1657 n = min(num, repeat);
1658 kdb_md_line(fmtstr, addr, symbolic, nosect, bytesperword,
1659 num, repeat, phys);
1660 addr += bytesperword * n;
1661 repeat -= n;
1662 z = (z + num - 1) / num;
1663 if (z > 2) {
1664 int s = num * (z-2);
1665 kdb_printf(kdb_machreg_fmt0 "-" kdb_machreg_fmt0
1666 " zero suppressed\n",
1667 addr, addr + bytesperword * s - 1);
1668 addr += bytesperword * s;
1669 repeat -= s;
1670 }
1671 }
1672 last_addr = addr;
1673
1674 return 0;
1675}
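
kdb_md() above derives the word width and repeat count from the command name itself, so "md1c20" means twenty 1-byte reads and "md4c8" means eight 4-byte reads. A standalone sketch of that name parsing only (the MDCOUNT/RADIX environment defaults and the fallback widths are omitted):

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>

/* Parse a command name of the form mdWcN, e.g. "md1c20".
 * Returns 0 on success and fills in *width and *count. */
static int parse_md_name(const char *name, int *width, int *count)
{
    if (name[0] != 'm' || name[1] != 'd' || !isdigit((unsigned char)name[2]))
        return -1;
    *width = name[2] - '0';
    *count = 1;
    if (name[3] == 'c' && name[4]) {
        char *end;
        *count = (int)strtoul(name + 4, &end, 10);
        if (*end)
            return -1;
    } else if (name[3]) {
        return -1;
    }
    return 0;
}

int main(void)
{
    int w, c;

    if (parse_md_name("md1c20", &w, &c) == 0)
        printf("width %d, count %d\n", w, c);   /* width 1, count 20 */
    return 0;
}
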
1676
1677/*
1678 * kdb_mm - This function implements the 'mm' command.
1679 * mm address-expression new-value
1680 * Remarks:
1681 * mm works on machine words, mmW works on bytes.
1682 */
1683static int kdb_mm(int argc, const char **argv)
1684{
1685 int diag;
1686 unsigned long addr;
1687 long offset = 0;
1688 unsigned long contents;
1689 int nextarg;
1690 int width;
1691
1692 if (argv[0][2] && !isdigit(argv[0][2]))
1693 return KDB_NOTFOUND;
1694
1695 if (argc < 2)
1696 return KDB_ARGCOUNT;
1697
1698 nextarg = 1;
1699 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1700 if (diag)
1701 return diag;
1702
1703 if (nextarg > argc)
1704 return KDB_ARGCOUNT;
1705 diag = kdbgetaddrarg(argc, argv, &nextarg, &contents, NULL, NULL);
1706 if (diag)
1707 return diag;
1708
1709 if (nextarg != argc + 1)
1710 return KDB_ARGCOUNT;
1711
1712 width = argv[0][2] ? (argv[0][2] - '0') : (KDB_WORD_SIZE);
1713 diag = kdb_putword(addr, contents, width);
1714 if (diag)
1715 return diag;
1716
1717 kdb_printf(kdb_machreg_fmt " = " kdb_machreg_fmt "\n", addr, contents);
1718
1719 return 0;
1720}
1721
1722/*
1723 * kdb_go - This function implements the 'go' command.
1724 * go [address-expression]
1725 */
1726static int kdb_go(int argc, const char **argv)
1727{
1728 unsigned long addr;
1729 int diag;
1730 int nextarg;
1731 long offset;
1732
1733 if (argc == 1) {
1734 if (raw_smp_processor_id() != kdb_initial_cpu) {
1735 kdb_printf("go <address> must be issued from the "
1736 "initial cpu, do cpu %d first\n",
1737 kdb_initial_cpu);
1738 return KDB_ARGCOUNT;
1739 }
1740 nextarg = 1;
1741 diag = kdbgetaddrarg(argc, argv, &nextarg,
1742 &addr, &offset, NULL);
1743 if (diag)
1744 return diag;
1745 } else if (argc) {
1746 return KDB_ARGCOUNT;
1747 }
1748
1749 diag = KDB_CMD_GO;
1750 if (KDB_FLAG(CATASTROPHIC)) {
1751 kdb_printf("Catastrophic error detected\n");
1752 kdb_printf("kdb_continue_catastrophic=%d, ",
1753 kdb_continue_catastrophic);
1754 if (kdb_continue_catastrophic == 0 && kdb_go_count++ == 0) {
1755 kdb_printf("type go a second time if you really want "
1756 "to continue\n");
1757 return 0;
1758 }
1759 if (kdb_continue_catastrophic == 2) {
1760 kdb_printf("forcing reboot\n");
1761 kdb_reboot(0, NULL);
1762 }
1763 kdb_printf("attempting to continue\n");
1764 }
1765 return diag;
1766}
1767
1768/*
1769 * kdb_rd - This function implements the 'rd' command.
1770 */
1771static int kdb_rd(int argc, const char **argv)
1772{
1773 int diag = kdb_check_regs();
1774 if (diag)
1775 return diag;
1776
1777 kdb_dumpregs(kdb_current_regs);
1778 return 0;
1779}
1780
1781/*
1782 * kdb_rm - This function implements the 'rm' (register modify) command.
1783 * rm register-name new-contents
1784 * Remarks:
1785 * Currently doesn't allow modification of control or
1786 * debug registers.
1787 */
1788static int kdb_rm(int argc, const char **argv)
1789{
1790 int diag;
1791 int ind = 0;
1792 unsigned long contents;
1793
1794 if (argc != 2)
1795 return KDB_ARGCOUNT;
1796 /*
1797 * Allow presence or absence of leading '%' symbol.
1798 */
1799 if (argv[1][0] == '%')
1800 ind = 1;
1801
1802 diag = kdbgetularg(argv[2], &contents);
1803 if (diag)
1804 return diag;
1805
1806 diag = kdb_check_regs();
1807 if (diag)
1808 return diag;
1809 kdb_printf("ERROR: Register set currently not implemented\n");
1810 return 0;
1811}
1812
1813#if defined(CONFIG_MAGIC_SYSRQ)
1814/*
1815 * kdb_sr - This function implements the 'sr' (SYSRQ key) command
1816 * which interfaces to the soi-disant MAGIC SYSRQ functionality.
1817 * sr <magic-sysrq-code>
1818 */
1819static int kdb_sr(int argc, const char **argv)
1820{
1821 if (argc != 1)
1822 return KDB_ARGCOUNT;
1823 kdb_trap_printk++;
1824 __handle_sysrq(*argv[1], NULL, 0);
1825 kdb_trap_printk--;
1826
1827 return 0;
1828}
1829#endif /* CONFIG_MAGIC_SYSRQ */
1830
1831/*
1832 * kdb_ef - This function implements the 'regs' (display exception
1833 * frame) command. This command takes an address and expects to
1834 * find an exception frame at that address, formats and prints
1835 * it.
1836 * regs address-expression
1837 * Remarks:
1838 * Not done yet.
1839 */
1840static int kdb_ef(int argc, const char **argv)
1841{
1842 int diag;
1843 unsigned long addr;
1844 long offset;
1845 int nextarg;
1846
1847 if (argc != 1)
1848 return KDB_ARGCOUNT;
1849
1850 nextarg = 1;
1851 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
1852 if (diag)
1853 return diag;
1854 show_regs((struct pt_regs *)addr);
1855 return 0;
1856}
1857
1858#if defined(CONFIG_MODULES)
1859/*
1860 * kdb_lsmod - This function implements the 'lsmod' command. Lists
1861 * currently loaded kernel modules.
1862 * Mostly taken from userland lsmod.
1863 */
1864static int kdb_lsmod(int argc, const char **argv)
1865{
1866 struct module *mod;
1867
1868 if (argc != 0)
1869 return KDB_ARGCOUNT;
1870
1871 kdb_printf("Module Size modstruct Used by\n");
1872 list_for_each_entry(mod, kdb_modules, list) {
1873
1874 kdb_printf("%-20s%8u 0x%p ", mod->name,
1875 mod->core_size, (void *)mod);
1876#ifdef CONFIG_MODULE_UNLOAD
1877 kdb_printf("%4d ", module_refcount(mod));
1878#endif
1879 if (mod->state == MODULE_STATE_GOING)
1880 kdb_printf(" (Unloading)");
1881 else if (mod->state == MODULE_STATE_COMING)
1882 kdb_printf(" (Loading)");
1883 else
1884 kdb_printf(" (Live)");
1885 kdb_printf(" 0x%p", mod->module_core);
1886
1887#ifdef CONFIG_MODULE_UNLOAD
1888 {
1889 struct module_use *use;
1890 kdb_printf(" [ ");
1891 list_for_each_entry(use, &mod->source_list,
1892 source_list)
1893 kdb_printf("%s ", use->target->name);
1894 kdb_printf("]\n");
1895 }
1896#endif
1897 }
1898
1899 return 0;
1900}
1901
1902#endif /* CONFIG_MODULES */
1903
1904/*
1905 * kdb_env - This function implements the 'env' command. Display the
1906 * current environment variables.
1907 */
1908
1909static int kdb_env(int argc, const char **argv)
1910{
1911 int i;
1912
1913 for (i = 0; i < __nenv; i++) {
1914 if (__env[i])
1915 kdb_printf("%s\n", __env[i]);
1916 }
1917
1918 if (KDB_DEBUG(MASK))
1919 kdb_printf("KDBFLAGS=0x%x\n", kdb_flags);
1920
1921 return 0;
1922}
1923
1924#ifdef CONFIG_PRINTK
1925/*
1926 * kdb_dmesg - This function implements the 'dmesg' command to display
1927 * the contents of the syslog buffer.
1928 * dmesg [lines] [adjust]
1929 */
1930static int kdb_dmesg(int argc, const char **argv)
1931{
1932 char *syslog_data[4], *start, *end, c = '\0', *p;
1933 int diag, logging, logsize, lines = 0, adjust = 0, n;
1934
1935 if (argc > 2)
1936 return KDB_ARGCOUNT;
1937 if (argc) {
1938 char *cp;
1939 lines = simple_strtol(argv[1], &cp, 0);
1940 if (*cp)
1941 lines = 0;
1942 if (argc > 1) {
1943 adjust = simple_strtoul(argv[2], &cp, 0);
1944 if (*cp || adjust < 0)
1945 adjust = 0;
1946 }
1947 }
1948
1949 /* disable LOGGING if set */
1950 diag = kdbgetintenv("LOGGING", &logging);
1951 if (!diag && logging) {
1952 const char *setargs[] = { "set", "LOGGING", "0" };
1953 kdb_set(2, setargs);
1954 }
1955
1956 /* syslog_data[0,1] physical start, end+1. syslog_data[2,3]
1957 * logical start, end+1. */
1958 kdb_syslog_data(syslog_data);
1959 if (syslog_data[2] == syslog_data[3])
1960 return 0;
1961 logsize = syslog_data[1] - syslog_data[0];
1962 start = syslog_data[2];
1963 end = syslog_data[3];
1964#define KDB_WRAP(p) (((p - syslog_data[0]) % logsize) + syslog_data[0])
1965 for (n = 0, p = start; p < end; ++p) {
1966 c = *KDB_WRAP(p);
1967 if (c == '\n')
1968 ++n;
1969 }
1970 if (c != '\n')
1971 ++n;
1972 if (lines < 0) {
1973 if (adjust >= n)
1974 kdb_printf("buffer only contains %d lines, nothing "
1975 "printed\n", n);
1976 else if (adjust - lines >= n)
1977 kdb_printf("buffer only contains %d lines, last %d "
1978 "lines printed\n", n, n - adjust);
1979 if (adjust) {
1980 for (; start < end && adjust; ++start) {
1981 if (*KDB_WRAP(start) == '\n')
1982 --adjust;
1983 }
1984 if (start < end)
1985 ++start;
1986 }
1987 for (p = start; p < end && lines; ++p) {
1988 if (*KDB_WRAP(p) == '\n')
1989 ++lines;
1990 }
1991 end = p;
1992 } else if (lines > 0) {
1993 int skip = n - (adjust + lines);
1994 if (adjust >= n) {
1995 kdb_printf("buffer only contains %d lines, "
1996 "nothing printed\n", n);
1997 skip = n;
1998 } else if (skip < 0) {
1999 lines += skip;
2000 skip = 0;
2001 kdb_printf("buffer only contains %d lines, first "
2002 "%d lines printed\n", n, lines);
2003 }
2004 for (; start < end && skip; ++start) {
2005 if (*KDB_WRAP(start) == '\n')
2006 --skip;
2007 }
2008 for (p = start; p < end && lines; ++p) {
2009 if (*KDB_WRAP(p) == '\n')
2010 --lines;
2011 }
2012 end = p;
2013 }
2014 /* Do a line at a time (max 200 chars) to reduce protocol overhead */
2015 c = '\n';
2016 while (start != end) {
2017 char buf[201];
2018 p = buf;
2019 if (KDB_FLAG(CMD_INTERRUPT))
2020 return 0;
2021 while (start < end && (c = *KDB_WRAP(start)) &&
2022 (p - buf) < sizeof(buf)-1) {
2023 ++start;
2024 *p++ = c;
2025 if (c == '\n')
2026 break;
2027 }
2028 *p = '\0';
2029 kdb_printf("%s", buf);
2030 }
2031 if (c != '\n')
2032 kdb_printf("\n");
2033
2034 return 0;
2035}
2036#endif /* CONFIG_PRINTK */
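
kdb_dmesg() above walks the syslog ring through the KDB_WRAP() helper: logical positions are allowed to run past the physical end of the buffer and are folded back in with a modulo by the buffer size. A tiny sketch of that index mapping on a simulated ring; the buffer contents and sizes below are made up for illustration:

#include <stdio.h>

#define LOGSIZE 16

/* Simulated ring whose logical text "ring buffer demo" starts at
 * logical position 10 and wraps past the physical end of the buffer. */
static const char logbuf[LOGSIZE + 1] = "uffer demoring b";

/* Map a logical offset into the physical ring, like KDB_WRAP(). */
static char log_at(unsigned long pos)
{
    return logbuf[pos % LOGSIZE];
}

int main(void)
{
    for (unsigned long pos = 10; pos < 10 + LOGSIZE; pos++)
        putchar(log_at(pos));
    putchar('\n');                  /* prints "ring buffer demo" */
    return 0;
}
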
2037/*
2038 * kdb_cpu - This function implements the 'cpu' command.
2039 * cpu [<cpunum>]
2040 * Returns:
2041 * KDB_CMD_CPU for success, a kdb diagnostic if error
2042 */
2043static void kdb_cpu_status(void)
2044{
2045 int i, start_cpu, first_print = 1;
2046 char state, prev_state = '?';
2047
2048 kdb_printf("Currently on cpu %d\n", raw_smp_processor_id());
2049 kdb_printf("Available cpus: ");
2050 for (start_cpu = -1, i = 0; i < NR_CPUS; i++) {
2051 if (!cpu_online(i)) {
2052 state = 'F'; /* cpu is offline */
2053 } else {
2054 state = ' '; /* cpu is responding to kdb */
2055 if (kdb_task_state_char(KDB_TSK(i)) == 'I')
2056 state = 'I'; /* idle task */
2057 }
2058 if (state != prev_state) {
2059 if (prev_state != '?') {
2060 if (!first_print)
2061 kdb_printf(", ");
2062 first_print = 0;
2063 kdb_printf("%d", start_cpu);
2064 if (start_cpu < i-1)
2065 kdb_printf("-%d", i-1);
2066 if (prev_state != ' ')
2067 kdb_printf("(%c)", prev_state);
2068 }
2069 prev_state = state;
2070 start_cpu = i;
2071 }
2072 }
2073 /* print the trailing cpus, ignoring them if they are all offline */
2074 if (prev_state != 'F') {
2075 if (!first_print)
2076 kdb_printf(", ");
2077 kdb_printf("%d", start_cpu);
2078 if (start_cpu < i-1)
2079 kdb_printf("-%d", i-1);
2080 if (prev_state != ' ')
2081 kdb_printf("(%c)", prev_state);
2082 }
2083 kdb_printf("\n");
2084}
2085
2086static int kdb_cpu(int argc, const char **argv)
2087{
2088 unsigned long cpunum;
2089 int diag;
2090
2091 if (argc == 0) {
2092 kdb_cpu_status();
2093 return 0;
2094 }
2095
2096 if (argc != 1)
2097 return KDB_ARGCOUNT;
2098
2099 diag = kdbgetularg(argv[1], &cpunum);
2100 if (diag)
2101 return diag;
2102
2103 /*
2104 * Validate cpunum
2105 */
2106	if ((cpunum >= NR_CPUS) || !cpu_online(cpunum))
2107 return KDB_BADCPUNUM;
2108
2109 dbg_switch_cpu = cpunum;
2110
2111 /*
2112 * Switch to other cpu
2113 */
2114 return KDB_CMD_CPU;
2115}
2116
2117/* The user may not realize that ps/bta with no parameters does not print idle
2118 * or sleeping system daemon processes, so tell them how many were suppressed.
2119 */
2120void kdb_ps_suppressed(void)
2121{
2122 int idle = 0, daemon = 0;
2123 unsigned long mask_I = kdb_task_state_string("I"),
2124 mask_M = kdb_task_state_string("M");
2125 unsigned long cpu;
2126 const struct task_struct *p, *g;
2127 for_each_online_cpu(cpu) {
2128 p = kdb_curr_task(cpu);
2129 if (kdb_task_state(p, mask_I))
2130 ++idle;
2131 }
2132 kdb_do_each_thread(g, p) {
2133 if (kdb_task_state(p, mask_M))
2134 ++daemon;
2135 } kdb_while_each_thread(g, p);
2136 if (idle || daemon) {
2137 if (idle)
2138 kdb_printf("%d idle process%s (state I)%s\n",
2139 idle, idle == 1 ? "" : "es",
2140 daemon ? " and " : "");
2141 if (daemon)
2142 kdb_printf("%d sleeping system daemon (state M) "
2143 "process%s", daemon,
2144 daemon == 1 ? "" : "es");
2145 kdb_printf(" suppressed,\nuse 'ps A' to see all.\n");
2146 }
2147}
2148
2149/*
2150 * kdb_ps - This function implements the 'ps' command which shows a
2151 * list of the active processes.
2152 * ps [DRSTCZEUIMA] All processes, optionally filtered by state
2153 */
2154void kdb_ps1(const struct task_struct *p)
2155{
2156 int cpu;
2157 unsigned long tmp;
2158
2159 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
2160 return;
2161
2162 cpu = kdb_process_cpu(p);
2163 kdb_printf("0x%p %8d %8d %d %4d %c 0x%p %c%s\n",
2164 (void *)p, p->pid, p->parent->pid,
2165 kdb_task_has_cpu(p), kdb_process_cpu(p),
2166 kdb_task_state_char(p),
2167 (void *)(&p->thread),
2168 p == kdb_curr_task(raw_smp_processor_id()) ? '*' : ' ',
2169 p->comm);
2170 if (kdb_task_has_cpu(p)) {
2171 if (!KDB_TSK(cpu)) {
2172 kdb_printf(" Error: no saved data for this cpu\n");
2173 } else {
2174 if (KDB_TSK(cpu) != p)
2175 kdb_printf(" Error: does not match running "
2176 "process table (0x%p)\n", KDB_TSK(cpu));
2177 }
2178 }
2179}
2180
2181static int kdb_ps(int argc, const char **argv)
2182{
2183 struct task_struct *g, *p;
2184 unsigned long mask, cpu;
2185
2186 if (argc == 0)
2187 kdb_ps_suppressed();
2188 kdb_printf("%-*s Pid Parent [*] cpu State %-*s Command\n",
2189 (int)(2*sizeof(void *))+2, "Task Addr",
2190 (int)(2*sizeof(void *))+2, "Thread");
2191 mask = kdb_task_state_string(argc ? argv[1] : NULL);
2192 /* Run the active tasks first */
2193 for_each_online_cpu(cpu) {
2194 if (KDB_FLAG(CMD_INTERRUPT))
2195 return 0;
2196 p = kdb_curr_task(cpu);
2197 if (kdb_task_state(p, mask))
2198 kdb_ps1(p);
2199 }
2200 kdb_printf("\n");
2201 /* Now the real tasks */
2202 kdb_do_each_thread(g, p) {
2203 if (KDB_FLAG(CMD_INTERRUPT))
2204 return 0;
2205 if (kdb_task_state(p, mask))
2206 kdb_ps1(p);
2207 } kdb_while_each_thread(g, p);
2208
2209 return 0;
2210}
2211
2212/*
2213 * kdb_pid - This function implements the 'pid' command which switches
2214 * the currently active process.
2215 * pid [<pid> | R]
2216 */
2217static int kdb_pid(int argc, const char **argv)
2218{
2219 struct task_struct *p;
2220 unsigned long val;
2221 int diag;
2222
2223 if (argc > 1)
2224 return KDB_ARGCOUNT;
2225
2226 if (argc) {
2227 if (strcmp(argv[1], "R") == 0) {
2228 p = KDB_TSK(kdb_initial_cpu);
2229 } else {
2230 diag = kdbgetularg(argv[1], &val);
2231 if (diag)
2232 return KDB_BADINT;
2233
2234 p = find_task_by_pid_ns((pid_t)val, &init_pid_ns);
2235 if (!p) {
2236 kdb_printf("No task with pid=%d\n", (pid_t)val);
2237 return 0;
2238 }
2239 }
2240 kdb_set_current_task(p);
2241 }
2242 kdb_printf("KDB current process is %s(pid=%d)\n",
2243 kdb_current_task->comm,
2244 kdb_current_task->pid);
2245
2246 return 0;
2247}
2248
2249/*
2250 * kdb_ll - This function implements the 'll' command which follows a
2251 * linked list and executes an arbitrary command for each
2252 * element.
2253 */
2254static int kdb_ll(int argc, const char **argv)
2255{
2256 int diag;
2257 unsigned long addr;
2258 long offset = 0;
2259 unsigned long va;
2260 unsigned long linkoffset;
2261 int nextarg;
2262 const char *command;
2263
2264 if (argc != 3)
2265 return KDB_ARGCOUNT;
2266
2267 nextarg = 1;
2268 diag = kdbgetaddrarg(argc, argv, &nextarg, &addr, &offset, NULL);
2269 if (diag)
2270 return diag;
2271
2272 diag = kdbgetularg(argv[2], &linkoffset);
2273 if (diag)
2274 return diag;
2275
2276 /*
2277	 * Use the starting address as the first element in the list,
2278	 * and assume that the list ends with a null pointer.
2280 */
2281
2282 va = addr;
2283 command = kdb_strdup(argv[3], GFP_KDB);
2284 if (!command) {
2285 kdb_printf("%s: cannot duplicate command\n", __func__);
2286 return 0;
2287 }
2288 /* Recursive use of kdb_parse, do not use argv after this point */
2289 argv = NULL;
2290
2291 while (va) {
2292 char buf[80];
2293
2294 if (KDB_FLAG(CMD_INTERRUPT))
2295			break;
2296
2297 sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va);
2298 diag = kdb_parse(buf);
2299 if (diag)
2300			break;
2301
2302 addr = va + linkoffset;
2303 if (kdb_getword(&va, addr, sizeof(va)))
2304			break;
2305 }
2306 kfree(command);
2307
2308	return diag;
2309}
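/*
 * Illustrative use of 'll' (addresses here are made up): for a
 * NULL-terminated singly linked list whose next pointer sits at byte
 * offset 8 of each node, the command line
 *
 *	ll 0xffff880012345678 8 md
 *
 * runs "md <element address>" for the node at 0xffff880012345678 and
 * then for every node reached through the pointer at element+8,
 * stopping when a NULL link is found.
 */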
2310
2311static int kdb_kgdb(int argc, const char **argv)
2312{
2313 return KDB_CMD_KGDB;
2314}
2315
2316/*
2317 * kdb_help - This function implements the 'help' and '?' commands.
2318 */
2319static int kdb_help(int argc, const char **argv)
2320{
2321 kdbtab_t *kt;
2322 int i;
2323
2324 kdb_printf("%-15.15s %-20.20s %s\n", "Command", "Usage", "Description");
2325 kdb_printf("-----------------------------"
2326 "-----------------------------\n");
2327 for_each_kdbcmd(kt, i) {
2328 if (kt->cmd_name)
2329 kdb_printf("%-15.15s %-20.20s %s\n", kt->cmd_name,
2330 kt->cmd_usage, kt->cmd_help);
2331 if (KDB_FLAG(CMD_INTERRUPT))
2332 return 0;
2333 }
2334 return 0;
2335}
2336
2337/*
2338 * kdb_kill - This function implements the 'kill' commands.
2339 */
2340static int kdb_kill(int argc, const char **argv)
2341{
2342 long sig, pid;
2343 char *endp;
2344 struct task_struct *p;
2345 struct siginfo info;
2346
2347 if (argc != 2)
2348 return KDB_ARGCOUNT;
2349
2350 sig = simple_strtol(argv[1], &endp, 0);
2351 if (*endp)
2352 return KDB_BADINT;
2353 if (sig >= 0) {
2354		kdb_printf("Invalid signal parameter <-signal>\n");
2355 return 0;
2356 }
2357 sig = -sig;
2358
2359 pid = simple_strtol(argv[2], &endp, 0);
2360 if (*endp)
2361 return KDB_BADINT;
2362 if (pid <= 0) {
2363		kdb_printf("Process ID must be greater than 0.\n");
2364 return 0;
2365 }
2366
2367 /* Find the process. */
2368 p = find_task_by_pid_ns(pid, &init_pid_ns);
2369 if (!p) {
2370		kdb_printf("The specified process could not be found.\n");
2371 return 0;
2372 }
2373 p = p->group_leader;
2374 info.si_signo = sig;
2375 info.si_errno = 0;
2376 info.si_code = SI_USER;
2377 info.si_pid = pid; /* same capabilities as process being signalled */
2378 info.si_uid = 0; /* kdb has root authority */
2379 kdb_send_sig_info(p, &info);
2380 return 0;
2381}
2382
2383struct kdb_tm {
2384 int tm_sec; /* seconds */
2385 int tm_min; /* minutes */
2386 int tm_hour; /* hours */
2387 int tm_mday; /* day of the month */
2388 int tm_mon; /* month */
2389 int tm_year; /* year */
2390};
2391
2392static void kdb_gmtime(struct timespec *tv, struct kdb_tm *tm)
2393{
2394	/* This will work from 1970 to 2099; 2100 is not a leap year, so the
	 * simple 4-year rule used here fails after 2099 */
2395 static int mon_day[] = { 31, 29, 31, 30, 31, 30, 31,
2396 31, 30, 31, 30, 31 };
2397 memset(tm, 0, sizeof(*tm));
2398 tm->tm_sec = tv->tv_sec % (24 * 60 * 60);
2399 tm->tm_mday = tv->tv_sec / (24 * 60 * 60) +
2400 (2 * 365 + 1); /* shift base from 1970 to 1968 */
2401 tm->tm_min = tm->tm_sec / 60 % 60;
2402 tm->tm_hour = tm->tm_sec / 60 / 60;
2403 tm->tm_sec = tm->tm_sec % 60;
2404 tm->tm_year = 68 + 4*(tm->tm_mday / (4*365+1));
2405 tm->tm_mday %= (4*365+1);
2406 mon_day[1] = 29;
2407 while (tm->tm_mday >= mon_day[tm->tm_mon]) {
2408 tm->tm_mday -= mon_day[tm->tm_mon];
2409 if (++tm->tm_mon == 12) {
2410 tm->tm_mon = 0;
2411 ++tm->tm_year;
2412 mon_day[1] = 28;
2413 }
2414 }
2415 ++tm->tm_mday;
2416}
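/*
 * Worked example for the conversion above: tv->tv_sec == 0 yields
 * tm_mday = 731 (the base is shifted to 1968-01-01), tm_year = 68,
 * and the month loop then consumes the leap year 1968 and the normal
 * year 1969, leaving 1970-01-01 00:00:00 as expected for the epoch.
 */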
2417
2418/*
2419 * Most of this code has been lifted from kernel/timer.c::sys_sysinfo().
2420 * I cannot call that code directly from kdb; it has an unconditional
2421 * cli()/sti() and calls routines that take locks which can stop the debugger.
2422 */
2423static void kdb_sysinfo(struct sysinfo *val)
2424{
2425 struct timespec uptime;
2426 do_posix_clock_monotonic_gettime(&uptime);
2427 memset(val, 0, sizeof(*val));
2428 val->uptime = uptime.tv_sec;
2429 val->loads[0] = avenrun[0];
2430 val->loads[1] = avenrun[1];
2431 val->loads[2] = avenrun[2];
2432 val->procs = nr_threads-1;
2433 si_meminfo(val);
2434
2435 return;
2436}
2437
2438/*
2439 * kdb_summary - This function implements the 'summary' command.
2440 */
2441static int kdb_summary(int argc, const char **argv)
2442{
2443 struct kdb_tm tm;
2444 struct sysinfo val;
2445
2446 if (argc)
2447 return KDB_ARGCOUNT;
2448
2449 kdb_printf("sysname %s\n", init_uts_ns.name.sysname);
2450 kdb_printf("release %s\n", init_uts_ns.name.release);
2451 kdb_printf("version %s\n", init_uts_ns.name.version);
2452 kdb_printf("machine %s\n", init_uts_ns.name.machine);
2453 kdb_printf("nodename %s\n", init_uts_ns.name.nodename);
2454 kdb_printf("domainname %s\n", init_uts_ns.name.domainname);
2455 kdb_printf("ccversion %s\n", __stringify(CCVERSION));
2456
2457 kdb_gmtime(&xtime, &tm);
2458 kdb_printf("date %04d-%02d-%02d %02d:%02d:%02d "
2459 "tz_minuteswest %d\n",
2460 1900+tm.tm_year, tm.tm_mon+1, tm.tm_mday,
2461 tm.tm_hour, tm.tm_min, tm.tm_sec,
2462 sys_tz.tz_minuteswest);
2463
2464 kdb_sysinfo(&val);
2465 kdb_printf("uptime ");
2466 if (val.uptime > (24*60*60)) {
2467 int days = val.uptime / (24*60*60);
2468 val.uptime %= (24*60*60);
2469 kdb_printf("%d day%s ", days, days == 1 ? "" : "s");
2470 }
2471 kdb_printf("%02ld:%02ld\n", val.uptime/(60*60), (val.uptime/60)%60);
2472
2473 /* lifted from fs/proc/proc_misc.c::loadavg_read_proc() */
2474
2475#define LOAD_INT(x) ((x) >> FSHIFT)
2476#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
2477 kdb_printf("load avg %ld.%02ld %ld.%02ld %ld.%02ld\n",
2478 LOAD_INT(val.loads[0]), LOAD_FRAC(val.loads[0]),
2479 LOAD_INT(val.loads[1]), LOAD_FRAC(val.loads[1]),
2480 LOAD_INT(val.loads[2]), LOAD_FRAC(val.loads[2]));
2481#undef LOAD_INT
2482#undef LOAD_FRAC
2483 /* Display in kilobytes */
2484#define K(x) ((x) << (PAGE_SHIFT - 10))
2485 kdb_printf("\nMemTotal: %8lu kB\nMemFree: %8lu kB\n"
2486 "Buffers: %8lu kB\n",
2487		   K(val.totalram), K(val.freeram), K(val.bufferram));
2488 return 0;
2489}
2490
2491/*
2492 * kdb_per_cpu - This function implements the 'per_cpu' command.
2493 */
2494static int kdb_per_cpu(int argc, const char **argv)
2495{
2496 char buf[256], fmtstr[64];
2497 kdb_symtab_t symtab;
2498 cpumask_t suppress = CPU_MASK_NONE;
2499 int cpu, diag;
2500 unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
2501
2502 if (argc < 1 || argc > 3)
2503 return KDB_ARGCOUNT;
2504
2505 snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
2506 if (!kdbgetsymval(buf, &symtab)) {
2507 kdb_printf("%s is not a per_cpu variable\n", argv[1]);
2508 return KDB_BADADDR;
2509 }
2510 if (argc >= 2) {
2511 diag = kdbgetularg(argv[2], &bytesperword);
2512 if (diag)
2513 return diag;
2514 }
2515 if (!bytesperword)
2516 bytesperword = KDB_WORD_SIZE;
2517 else if (bytesperword > KDB_WORD_SIZE)
2518 return KDB_BADWIDTH;
2519 sprintf(fmtstr, "%%0%dlx ", (int)(2*bytesperword));
2520 if (argc >= 3) {
2521 diag = kdbgetularg(argv[3], &whichcpu);
2522 if (diag)
2523 return diag;
2524 if (!cpu_online(whichcpu)) {
2525 kdb_printf("cpu %ld is not online\n", whichcpu);
2526 return KDB_BADCPUNUM;
2527 }
2528 }
2529
2530 /* Most architectures use __per_cpu_offset[cpu], some use
2531 * __per_cpu_offset(cpu), smp has no __per_cpu_offset.
2532 */
2533#ifdef __per_cpu_offset
2534#define KDB_PCU(cpu) __per_cpu_offset(cpu)
2535#else
2536#ifdef CONFIG_SMP
2537#define KDB_PCU(cpu) __per_cpu_offset[cpu]
2538#else
2539#define KDB_PCU(cpu) 0
2540#endif
2541#endif
2542
2543 for_each_online_cpu(cpu) {
2544 if (whichcpu != ~0UL && whichcpu != cpu)
2545 continue;
2546 addr = symtab.sym_start + KDB_PCU(cpu);
2547 diag = kdb_getword(&val, addr, bytesperword);
2548 if (diag) {
2549 kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
2550 "read, diag=%d\n", cpu, addr, diag);
2551 continue;
2552 }
2553#ifdef CONFIG_SMP
2554 if (!val) {
2555 cpu_set(cpu, suppress);
2556 continue;
2557 }
2558#endif /* CONFIG_SMP */
2559 kdb_printf("%5d ", cpu);
2560 kdb_md_line(fmtstr, addr,
2561 bytesperword == KDB_WORD_SIZE,
2562 1, bytesperword, 1, 1, 0);
2563 }
2564 if (cpus_weight(suppress) == 0)
2565 return 0;
2566 kdb_printf("Zero suppressed cpu(s):");
2567 for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
2568 cpu = next_cpu(cpu, suppress)) {
2569 kdb_printf(" %d", cpu);
2570 if (cpu == num_possible_cpus() - 1 ||
2571 next_cpu(cpu, suppress) != cpu + 1)
2572 continue;
2573 while (cpu < num_possible_cpus() &&
2574 next_cpu(cpu, suppress) == cpu + 1)
2575 ++cpu;
2576 kdb_printf("-%d", cpu);
2577 }
2578 kdb_printf("\n");
2579
2580#undef KDB_PCU
2581
2582 return 0;
2583}
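/*
 * Illustrative command lines (the per-cpu variable name is made up):
 * for a DEFINE_PER_CPU(unsigned long, foo), "per_cpu foo" dumps the
 * per_cpu__foo instance of every online cpu, suppressing cpus whose
 * copy is zero, while
 *
 *	per_cpu foo 8 2
 *
 * prints only the 8-byte value on cpu 2 (assuming a 64-bit kernel and
 * that cpu 2 is online).
 */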
2584
2585/*
2586 * display help for the use of cmd | grep pattern
2587 */
2588static int kdb_grep_help(int argc, const char **argv)
2589{
2590 kdb_printf("Usage of cmd args | grep pattern:\n");
2591 kdb_printf(" Any command's output may be filtered through an ");
2592 kdb_printf("emulated 'pipe'.\n");
2593	kdb_printf("  'grep' is just a keyword.\n");
2594 kdb_printf(" The pattern may include a very limited set of "
2595 "metacharacters:\n");
2596 kdb_printf(" pattern or ^pattern or pattern$ or ^pattern$\n");
2597 kdb_printf(" And if there are spaces in the pattern, you may "
2598 "quote it:\n");
2599 kdb_printf(" \"pat tern\" or \"^pat tern\" or \"pat tern$\""
2600 " or \"^pat tern$\"\n");
2601 return 0;
2602}
2603
2604/*
2605 * kdb_register_repeat - This function is used to register a kernel
2606 * debugger command.
2607 * Inputs:
2608 * cmd Command name
2609 * func Function to execute the command
2610 * usage A simple usage string showing arguments
2611 * help A simple help string describing command
2612 * repeat Does the command auto repeat on enter?
2613 * Returns:
2614 * zero for success, one if a duplicate command.
2615 */
2616#define kdb_command_extend 50 /* arbitrary */
2617int kdb_register_repeat(char *cmd,
2618 kdb_func_t func,
2619 char *usage,
2620 char *help,
2621 short minlen,
2622 kdb_repeat_t repeat)
2623{
2624 int i;
2625 kdbtab_t *kp;
2626
2627 /*
2628 * Brute force method to determine duplicates
2629 */
2630 for_each_kdbcmd(kp, i) {
2631 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2632 kdb_printf("Duplicate kdb command registered: "
2633 "%s, func %p help %s\n", cmd, func, help);
2634 return 1;
2635 }
2636 }
2637
2638 /*
2639 * Insert command into first available location in table
2640 */
2641 for_each_kdbcmd(kp, i) {
2642 if (kp->cmd_name == NULL)
2643 break;
2644 }
2645
2646 if (i >= kdb_max_commands) {
2647 kdbtab_t *new = kmalloc((kdb_max_commands - KDB_BASE_CMD_MAX +
2648 kdb_command_extend) * sizeof(*new), GFP_KDB);
2649 if (!new) {
2650 kdb_printf("Could not allocate new kdb_command "
2651 "table\n");
2652 return 1;
2653 }
2654 if (kdb_commands) {
2655 memcpy(new, kdb_commands,
2656 kdb_max_commands * sizeof(*new));
2657 kfree(kdb_commands);
2658 }
2659 memset(new + kdb_max_commands, 0,
2660 kdb_command_extend * sizeof(*new));
2661 kdb_commands = new;
2662 kp = kdb_commands + kdb_max_commands;
2663 kdb_max_commands += kdb_command_extend;
2664 }
2665
2666 kp->cmd_name = cmd;
2667 kp->cmd_func = func;
2668 kp->cmd_usage = usage;
2669 kp->cmd_help = help;
2670 kp->cmd_flags = 0;
2671 kp->cmd_minlen = minlen;
2672 kp->cmd_repeat = repeat;
2673
2674 return 0;
2675}
2676
2677/*
2678 * kdb_register - Compatibility register function for commands that do
2679 * not need to specify a repeat state. Equivalent to
2680 * kdb_register_repeat with KDB_REPEAT_NONE.
2681 * Inputs:
2682 * cmd Command name
2683 * func Function to execute the command
2684 * usage A simple usage string showing arguments
2685 * help A simple help string describing command
2686 * Returns:
2687 * zero for success, one if a duplicate command.
2688 */
2689int kdb_register(char *cmd,
2690 kdb_func_t func,
2691 char *usage,
2692 char *help,
2693 short minlen)
2694{
2695 return kdb_register_repeat(cmd, func, usage, help, minlen,
2696 KDB_REPEAT_NONE);
2697}
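/*
 * Example registration (illustrative only; kdb_hello and the "hello"
 * command are not part of this file): a loadable module that provides
 * a handler with the kdb_func_t signature could hook into kdb like
 * this, and remove itself again with kdb_unregister("hello"):
 *
 *	static int kdb_hello(int argc, const char **argv)
 *	{
 *		kdb_printf("hello from kdb\n");
 *		return 0;
 *	}
 *
 *	kdb_register("hello", kdb_hello, "", "Print a test message", 0);
 */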
2698
2699/*
2700 * kdb_unregister - This function is used to unregister a kernel
2701 * debugger command. It is generally called when a module which
2702 * implements kdb commands is unloaded.
2703 * Inputs:
2704 * cmd Command name
2705 * Returns:
2706 * zero for success, one command not registered.
2707 */
2708int kdb_unregister(char *cmd)
2709{
2710 int i;
2711 kdbtab_t *kp;
2712
2713 /*
2714 * find the command.
2715 */
2716 for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
2717 if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
2718 kp->cmd_name = NULL;
2719 return 0;
2720 }
2721 }
2722
2723 /* Couldn't find it. */
2724 return 1;
2725}
2726
2727/* Initialize the kdb command table. */
2728static void __init kdb_inittab(void)
2729{
2730 int i;
2731 kdbtab_t *kp;
2732
2733 for_each_kdbcmd(kp, i)
2734 kp->cmd_name = NULL;
2735
2736 kdb_register_repeat("md", kdb_md, "<vaddr>",
2737 "Display Memory Contents, also mdWcN, e.g. md8c1", 1,
2738 KDB_REPEAT_NO_ARGS);
2739 kdb_register_repeat("mdr", kdb_md, "<vaddr> <bytes>",
2740 "Display Raw Memory", 0, KDB_REPEAT_NO_ARGS);
2741 kdb_register_repeat("mdp", kdb_md, "<paddr> <bytes>",
2742 "Display Physical Memory", 0, KDB_REPEAT_NO_ARGS);
2743 kdb_register_repeat("mds", kdb_md, "<vaddr>",
2744 "Display Memory Symbolically", 0, KDB_REPEAT_NO_ARGS);
2745 kdb_register_repeat("mm", kdb_mm, "<vaddr> <contents>",
2746 "Modify Memory Contents", 0, KDB_REPEAT_NO_ARGS);
2747 kdb_register_repeat("go", kdb_go, "[<vaddr>]",
2748 "Continue Execution", 1, KDB_REPEAT_NONE);
2749 kdb_register_repeat("rd", kdb_rd, "",
2750 "Display Registers", 0, KDB_REPEAT_NONE);
2751 kdb_register_repeat("rm", kdb_rm, "<reg> <contents>",
2752 "Modify Registers", 0, KDB_REPEAT_NONE);
2753 kdb_register_repeat("ef", kdb_ef, "<vaddr>",
2754 "Display exception frame", 0, KDB_REPEAT_NONE);
2755 kdb_register_repeat("bt", kdb_bt, "[<vaddr>]",
2756 "Stack traceback", 1, KDB_REPEAT_NONE);
2757 kdb_register_repeat("btp", kdb_bt, "<pid>",
2758 "Display stack for process <pid>", 0, KDB_REPEAT_NONE);
2759 kdb_register_repeat("bta", kdb_bt, "[DRSTCZEUIMA]",
2760 "Display stack all processes", 0, KDB_REPEAT_NONE);
2761 kdb_register_repeat("btc", kdb_bt, "",
2762 "Backtrace current process on each cpu", 0, KDB_REPEAT_NONE);
2763 kdb_register_repeat("btt", kdb_bt, "<vaddr>",
2764 "Backtrace process given its struct task address", 0,
2765 KDB_REPEAT_NONE);
2766 kdb_register_repeat("ll", kdb_ll, "<first-element> <linkoffset> <cmd>",
2767 "Execute cmd for each element in linked list", 0, KDB_REPEAT_NONE);
2768 kdb_register_repeat("env", kdb_env, "",
2769 "Show environment variables", 0, KDB_REPEAT_NONE);
2770 kdb_register_repeat("set", kdb_set, "",
2771 "Set environment variables", 0, KDB_REPEAT_NONE);
2772 kdb_register_repeat("help", kdb_help, "",
2773 "Display Help Message", 1, KDB_REPEAT_NONE);
2774 kdb_register_repeat("?", kdb_help, "",
2775 "Display Help Message", 0, KDB_REPEAT_NONE);
2776 kdb_register_repeat("cpu", kdb_cpu, "<cpunum>",
2777 "Switch to new cpu", 0, KDB_REPEAT_NONE);
2778 kdb_register_repeat("kgdb", kdb_kgdb, "",
2779 "Enter kgdb mode", 0, KDB_REPEAT_NONE);
2780 kdb_register_repeat("ps", kdb_ps, "[<flags>|A]",
2781 "Display active task list", 0, KDB_REPEAT_NONE);
2782 kdb_register_repeat("pid", kdb_pid, "<pidnum>",
2783 "Switch to another task", 0, KDB_REPEAT_NONE);
2784 kdb_register_repeat("reboot", kdb_reboot, "",
2785 "Reboot the machine immediately", 0, KDB_REPEAT_NONE);
2786#if defined(CONFIG_MODULES)
2787 kdb_register_repeat("lsmod", kdb_lsmod, "",
2788 "List loaded kernel modules", 0, KDB_REPEAT_NONE);
2789#endif
2790#if defined(CONFIG_MAGIC_SYSRQ)
2791 kdb_register_repeat("sr", kdb_sr, "<key>",
2792 "Magic SysRq key", 0, KDB_REPEAT_NONE);
2793#endif
2794#if defined(CONFIG_PRINTK)
2795 kdb_register_repeat("dmesg", kdb_dmesg, "[lines]",
2796 "Display syslog buffer", 0, KDB_REPEAT_NONE);
2797#endif
2798 kdb_register_repeat("defcmd", kdb_defcmd, "name \"usage\" \"help\"",
2799 "Define a set of commands, down to endefcmd", 0, KDB_REPEAT_NONE);
2800 kdb_register_repeat("kill", kdb_kill, "<-signal> <pid>",
2801 "Send a signal to a process", 0, KDB_REPEAT_NONE);
2802 kdb_register_repeat("summary", kdb_summary, "",
2803 "Summarize the system", 4, KDB_REPEAT_NONE);
2804 kdb_register_repeat("per_cpu", kdb_per_cpu, "",
2805 "Display per_cpu variables", 3, KDB_REPEAT_NONE);
2806 kdb_register_repeat("grephelp", kdb_grep_help, "",
2807 "Display help on | grep", 0, KDB_REPEAT_NONE);
2808}
2809
2810/* Execute any commands defined in kdb_cmds. */
2811static void __init kdb_cmd_init(void)
2812{
2813 int i, diag;
2814 for (i = 0; kdb_cmds[i]; ++i) {
2815 diag = kdb_parse(kdb_cmds[i]);
2816 if (diag)
2817 kdb_printf("kdb command %s failed, kdb diag %d\n",
2818 kdb_cmds[i], diag);
2819 }
2820 if (defcmd_in_progress) {
2821 kdb_printf("Incomplete 'defcmd' set, forcing endefcmd\n");
2822 kdb_parse("endefcmd");
2823 }
2824}
2825
2826/* Initialize kdb_printf, breakpoint tables and kdb state */
2827void __init kdb_init(int lvl)
2828{
2829 static int kdb_init_lvl = KDB_NOT_INITIALIZED;
2830 int i;
2831
2832 if (kdb_init_lvl == KDB_INIT_FULL || lvl <= kdb_init_lvl)
2833 return;
2834 for (i = kdb_init_lvl; i < lvl; i++) {
2835 switch (i) {
2836 case KDB_NOT_INITIALIZED:
2837 kdb_inittab(); /* Initialize Command Table */
2838 kdb_initbptab(); /* Initialize Breakpoints */
2839 break;
2840 case KDB_INIT_EARLY:
2841 kdb_cmd_init(); /* Build kdb_cmds tables */
2842 break;
2843 }
2844 }
2845 kdb_init_lvl = lvl;
2846}
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
new file mode 100644
index 000000000000..97d3ba69775d
--- /dev/null
+++ b/kernel/debug/kdb/kdb_private.h
@@ -0,0 +1,300 @@
1#ifndef _KDBPRIVATE_H
2#define _KDBPRIVATE_H
3
4/*
5 * Kernel Debugger Architecture Independent Private Headers
6 *
7 * This file is subject to the terms and conditions of the GNU General Public
8 * License. See the file "COPYING" in the main directory of this archive
9 * for more details.
10 *
11 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
12 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
13 */
14
15#include <linux/kgdb.h>
16#include "../debug_core.h"
17
18/* Kernel Debugger Error codes. Must not overlap with command codes. */
19#define KDB_NOTFOUND (-1)
20#define KDB_ARGCOUNT (-2)
21#define KDB_BADWIDTH (-3)
22#define KDB_BADRADIX (-4)
23#define KDB_NOTENV (-5)
24#define KDB_NOENVVALUE (-6)
25#define KDB_NOTIMP (-7)
26#define KDB_ENVFULL (-8)
27#define KDB_ENVBUFFULL (-9)
28#define KDB_TOOMANYBPT (-10)
29#define KDB_TOOMANYDBREGS (-11)
30#define KDB_DUPBPT (-12)
31#define KDB_BPTNOTFOUND (-13)
32#define KDB_BADMODE (-14)
33#define KDB_BADINT (-15)
34#define KDB_INVADDRFMT (-16)
35#define KDB_BADREG (-17)
36#define KDB_BADCPUNUM (-18)
37#define KDB_BADLENGTH (-19)
38#define KDB_NOBP (-20)
39#define KDB_BADADDR (-21)
40
41/* Kernel Debugger Command codes. Must not overlap with error codes. */
42#define KDB_CMD_GO (-1001)
43#define KDB_CMD_CPU (-1002)
44#define KDB_CMD_SS (-1003)
45#define KDB_CMD_SSB (-1004)
46#define KDB_CMD_KGDB (-1005)
47#define KDB_CMD_KGDB2 (-1006)
48
49/* Internal debug flags */
50#define KDB_DEBUG_FLAG_BP 0x0002 /* Breakpoint subsystem debug */
51#define KDB_DEBUG_FLAG_BB_SUMM 0x0004 /* Basic block analysis, summary only */
52#define KDB_DEBUG_FLAG_AR 0x0008 /* Activation record, generic */
53#define KDB_DEBUG_FLAG_ARA 0x0010 /* Activation record, arch specific */
54#define KDB_DEBUG_FLAG_BB 0x0020 /* All basic block analysis */
55#define KDB_DEBUG_FLAG_STATE 0x0040 /* State flags */
56#define KDB_DEBUG_FLAG_MASK 0xffff /* All debug flags */
57#define KDB_DEBUG_FLAG_SHIFT 16 /* Shift factor for dbflags */
58
59#define KDB_DEBUG(flag) (kdb_flags & \
60 (KDB_DEBUG_FLAG_##flag << KDB_DEBUG_FLAG_SHIFT))
61#define KDB_DEBUG_STATE(text, value) if (KDB_DEBUG(STATE)) \
62 kdb_print_state(text, value)
63
64#if BITS_PER_LONG == 32
65
66#define KDB_PLATFORM_ENV "BYTESPERWORD=4"
67
68#define kdb_machreg_fmt "0x%lx"
69#define kdb_machreg_fmt0 "0x%08lx"
70#define kdb_bfd_vma_fmt "0x%lx"
71#define kdb_bfd_vma_fmt0 "0x%08lx"
72#define kdb_elfw_addr_fmt "0x%x"
73#define kdb_elfw_addr_fmt0 "0x%08x"
74#define kdb_f_count_fmt "%d"
75
76#elif BITS_PER_LONG == 64
77
78#define KDB_PLATFORM_ENV "BYTESPERWORD=8"
79
80#define kdb_machreg_fmt "0x%lx"
81#define kdb_machreg_fmt0 "0x%016lx"
82#define kdb_bfd_vma_fmt "0x%lx"
83#define kdb_bfd_vma_fmt0 "0x%016lx"
84#define kdb_elfw_addr_fmt "0x%x"
85#define kdb_elfw_addr_fmt0 "0x%016x"
86#define kdb_f_count_fmt "%ld"
87
88#endif
89
90/*
91 * KDB_MAXBPT describes the total number of breakpoints
92 * supported by this architecture.
93 */
94#define KDB_MAXBPT 16
95
96/* Maximum number of arguments to a function */
97#define KDB_MAXARGS 16
98
99typedef enum {
100 KDB_REPEAT_NONE = 0, /* Do not repeat this command */
101 KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
102 KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
103} kdb_repeat_t;
104
105typedef int (*kdb_func_t)(int, const char **);
106
107/* Symbol table format returned by kallsyms. */
108typedef struct __ksymtab {
109 unsigned long value; /* Address of symbol */
110 const char *mod_name; /* Module containing symbol or
111 * "kernel" */
112 unsigned long mod_start;
113 unsigned long mod_end;
114 const char *sec_name; /* Section containing symbol */
115 unsigned long sec_start;
116 unsigned long sec_end;
117 const char *sym_name; /* Full symbol name, including
118 * any version */
119 unsigned long sym_start;
120 unsigned long sym_end;
121 } kdb_symtab_t;
122extern int kallsyms_symbol_next(char *prefix_name, int flag);
123extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
124
125/* Exported Symbols for kernel loadable modules to use. */
126extern int kdb_register(char *, kdb_func_t, char *, char *, short);
127extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
128 short, kdb_repeat_t);
129extern int kdb_unregister(char *);
130
131extern int kdb_getarea_size(void *, unsigned long, size_t);
132extern int kdb_putarea_size(unsigned long, void *, size_t);
133
134/*
135 * Like get_user and put_user, kdb_getarea and kdb_putarea take variable
136 * names, not pointers. The underlying *_size functions take pointers.
137 */
138#define kdb_getarea(x, addr) kdb_getarea_size(&(x), addr, sizeof((x)))
139#define kdb_putarea(addr, x) kdb_putarea_size(addr, &(x), sizeof((x)))
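/* For instance (illustrative, with addr being any kernel virtual address):
 *
 *	unsigned long val;
 *	if (kdb_getarea(val, addr) == 0)
 *		kdb_printf("0x%lx\n", val);
 *
 * reads sizeof(val) bytes from addr into val and returns a kdb
 * diagnostic instead of faulting when the address is bad.
 */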
140
141extern int kdb_getphysword(unsigned long *word,
142 unsigned long addr, size_t size);
143extern int kdb_getword(unsigned long *, unsigned long, size_t);
144extern int kdb_putword(unsigned long, unsigned long, size_t);
145
146extern int kdbgetularg(const char *, unsigned long *);
147extern int kdb_set(int, const char **);
148extern char *kdbgetenv(const char *);
149extern int kdbgetintenv(const char *, int *);
150extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
151 long *, char **);
152extern int kdbgetsymval(const char *, kdb_symtab_t *);
153extern int kdbnearsym(unsigned long, kdb_symtab_t *);
154extern void kdbnearsym_cleanup(void);
155extern char *kdb_strdup(const char *str, gfp_t type);
156extern void kdb_symbol_print(unsigned long, const kdb_symtab_t *, unsigned int);
157
158/* Routine for debugging the debugger state. */
159extern void kdb_print_state(const char *, int);
160
161extern int kdb_state;
162#define KDB_STATE_KDB 0x00000001 /* Cpu is inside kdb */
163#define KDB_STATE_LEAVING 0x00000002 /* Cpu is leaving kdb */
164#define KDB_STATE_CMD 0x00000004 /* Running a kdb command */
165#define KDB_STATE_KDB_CONTROL 0x00000008 /* This cpu is under
166 * kdb control */
167#define KDB_STATE_HOLD_CPU 0x00000010 /* Hold this cpu inside kdb */
168#define KDB_STATE_DOING_SS 0x00000020 /* Doing ss command */
169#define KDB_STATE_DOING_SSB 0x00000040 /* Doing ssb command,
170 * DOING_SS is also set */
171#define KDB_STATE_SSBPT 0x00000080 /* Install breakpoint
172 * after one ss, independent of
173 * DOING_SS */
174#define KDB_STATE_REENTRY 0x00000100 /* Valid re-entry into kdb */
175#define KDB_STATE_SUPPRESS 0x00000200 /* Suppress error messages */
176#define KDB_STATE_PAGER 0x00000400 /* pager is available */
177#define KDB_STATE_GO_SWITCH 0x00000800 /* go is switching
178 * back to initial cpu */
179#define KDB_STATE_PRINTF_LOCK 0x00001000 /* Holds kdb_printf lock */
180#define KDB_STATE_WAIT_IPI 0x00002000 /* Waiting for kdb_ipi() NMI */
181#define KDB_STATE_RECURSE 0x00004000 /* Recursive entry to kdb */
182#define KDB_STATE_IP_ADJUSTED 0x00008000 /* Restart IP has been
183 * adjusted */
184#define KDB_STATE_GO1 0x00010000 /* go only releases one cpu */
185#define KDB_STATE_KEYBOARD 0x00020000 /* kdb entered via
186 * keyboard on this cpu */
187#define KDB_STATE_KEXEC 0x00040000 /* kexec issued */
188#define KDB_STATE_DOING_KGDB 0x00080000 /* kgdb enter now issued */
189#define KDB_STATE_DOING_KGDB2 0x00100000 /* kgdb enter now issued */
190#define KDB_STATE_KGDB_TRANS 0x00200000 /* Transition to kgdb */
191#define KDB_STATE_ARCH 0xff000000 /* Reserved for arch
192 * specific use */
193
194#define KDB_STATE(flag) (kdb_state & KDB_STATE_##flag)
195#define KDB_STATE_SET(flag) ((void)(kdb_state |= KDB_STATE_##flag))
196#define KDB_STATE_CLEAR(flag) ((void)(kdb_state &= ~KDB_STATE_##flag))
197
198extern int kdb_nextline; /* Current number of lines displayed */
199
200typedef struct _kdb_bp {
201 unsigned long bp_addr; /* Address breakpoint is present at */
202 unsigned int bp_free:1; /* This entry is available */
203 unsigned int bp_enabled:1; /* Breakpoint is active in register */
204 unsigned int bp_type:4; /* Uses hardware register */
205 unsigned int bp_installed:1; /* Breakpoint is installed */
206 unsigned int bp_delay:1; /* Do delayed bp handling */
207 unsigned int bp_delayed:1; /* Delayed breakpoint */
208 unsigned int bph_length; /* HW break length */
209} kdb_bp_t;
210
211#ifdef CONFIG_KGDB_KDB
212extern kdb_bp_t kdb_breakpoints[/* KDB_MAXBPT */];
213
214/* The KDB shell command table */
215typedef struct _kdbtab {
216 char *cmd_name; /* Command name */
217 kdb_func_t cmd_func; /* Function to execute command */
218 char *cmd_usage; /* Usage String for this command */
219 char *cmd_help; /* Help message for this command */
220 short cmd_flags; /* Parsing flags */
221 short cmd_minlen; /* Minimum legal # command
222 * chars required */
223 kdb_repeat_t cmd_repeat; /* Does command auto repeat on enter? */
224} kdbtab_t;
225
226extern int kdb_bt(int, const char **); /* KDB display back trace */
227
228/* KDB breakpoint management functions */
229extern void kdb_initbptab(void);
230extern void kdb_bp_install(struct pt_regs *);
231extern void kdb_bp_remove(void);
232
233typedef enum {
234 KDB_DB_BPT, /* Breakpoint */
235 KDB_DB_SS, /* Single-step trap */
236 KDB_DB_SSB, /* Single step to branch */
237 KDB_DB_SSBPT, /* Single step over breakpoint */
238 KDB_DB_NOBPT /* Spurious breakpoint */
239} kdb_dbtrap_t;
240
241extern int kdb_main_loop(kdb_reason_t, kdb_reason_t,
242 int, kdb_dbtrap_t, struct pt_regs *);
243
244/* Miscellaneous functions and data areas */
245extern int kdb_grepping_flag;
246extern char kdb_grep_string[];
247extern int kdb_grep_leading;
248extern int kdb_grep_trailing;
249extern char *kdb_cmds[];
250extern void kdb_syslog_data(char *syslog_data[]);
251extern unsigned long kdb_task_state_string(const char *);
252extern char kdb_task_state_char (const struct task_struct *);
253extern unsigned long kdb_task_state(const struct task_struct *p,
254 unsigned long mask);
255extern void kdb_ps_suppressed(void);
256extern void kdb_ps1(const struct task_struct *p);
257extern void kdb_print_nameval(const char *name, unsigned long val);
258extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
259extern void kdb_meminfo_proc_show(void);
260extern const char *kdb_walk_kallsyms(loff_t *pos);
261extern char *kdb_getstr(char *, size_t, char *);
262
263/* Defines for kdb_symbol_print */
264#define KDB_SP_SPACEB 0x0001 /* Space before string */
265#define KDB_SP_SPACEA 0x0002 /* Space after string */
266#define KDB_SP_PAREN 0x0004 /* Parenthesis around string */
267#define KDB_SP_VALUE 0x0008 /* Print the value of the address */
268#define KDB_SP_SYMSIZE 0x0010 /* Print the size of the symbol */
269#define KDB_SP_NEWLINE 0x0020 /* Newline after string */
270#define KDB_SP_DEFAULT (KDB_SP_VALUE|KDB_SP_PAREN)
271
272#define KDB_TSK(cpu) kgdb_info[cpu].task
273#define KDB_TSKREGS(cpu) kgdb_info[cpu].debuggerinfo
274
275extern struct task_struct *kdb_curr_task(int);
276
277#define kdb_task_has_cpu(p) (task_curr(p))
278
279/* Simplify coexistence with NPTL */
280#define kdb_do_each_thread(g, p) do_each_thread(g, p)
281#define kdb_while_each_thread(g, p) while_each_thread(g, p)
282
283#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
284
285extern void *debug_kmalloc(size_t size, gfp_t flags);
286extern void debug_kfree(void *);
287extern void debug_kusage(void);
288
289extern void kdb_set_current_task(struct task_struct *);
290extern struct task_struct *kdb_current_task;
291#ifdef CONFIG_MODULES
292extern struct list_head *kdb_modules;
293#endif /* CONFIG_MODULES */
294
295extern char kdb_prompt_str[];
296
297#define KDB_WORD_SIZE ((int)sizeof(unsigned long))
298
299#endif /* CONFIG_KGDB_KDB */
300#endif /* !_KDBPRIVATE_H */
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
new file mode 100644
index 000000000000..45344d5c53dd
--- /dev/null
+++ b/kernel/debug/kdb/kdb_support.c
@@ -0,0 +1,927 @@
1/*
2 * Kernel Debugger Architecture Independent Support Functions
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (c) 1999-2004 Silicon Graphics, Inc. All Rights Reserved.
9 * Copyright (c) 2009 Wind River Systems, Inc. All Rights Reserved.
10 * 03/02/13 added new 2.5 kallsyms <xavier.bru@bull.net>
11 */
12
13#include <stdarg.h>
14#include <linux/types.h>
15#include <linux/sched.h>
16#include <linux/mm.h>
17#include <linux/kallsyms.h>
18#include <linux/stddef.h>
19#include <linux/vmalloc.h>
20#include <linux/ptrace.h>
21#include <linux/module.h>
22#include <linux/highmem.h>
23#include <linux/hardirq.h>
24#include <linux/delay.h>
25#include <linux/uaccess.h>
26#include <linux/kdb.h>
27#include <linux/slab.h>
28#include "kdb_private.h"
29
30/*
31 * kdbgetsymval - Return the address of the given symbol.
32 *
33 * Parameters:
34 * symname Character string containing symbol name
35 * symtab Structure to receive results
36 * Returns:
37 * 0 Symbol not found, symtab zero filled
38 * 1 Symbol mapped to module/symbol/section, data in symtab
39 */
40int kdbgetsymval(const char *symname, kdb_symtab_t *symtab)
41{
42 if (KDB_DEBUG(AR))
43 kdb_printf("kdbgetsymval: symname=%s, symtab=%p\n", symname,
44 symtab);
45 memset(symtab, 0, sizeof(*symtab));
46 symtab->sym_start = kallsyms_lookup_name(symname);
47 if (symtab->sym_start) {
48 if (KDB_DEBUG(AR))
49 kdb_printf("kdbgetsymval: returns 1, "
50 "symtab->sym_start=0x%lx\n",
51 symtab->sym_start);
52 return 1;
53 }
54 if (KDB_DEBUG(AR))
55 kdb_printf("kdbgetsymval: returns 0\n");
56 return 0;
57}
58EXPORT_SYMBOL(kdbgetsymval);
59
60static char *kdb_name_table[100]; /* arbitrary size */
61
62/*
63 * kdbnearsym - Return the name of the symbol with the nearest address
64 * less than 'addr'.
65 *
66 * Parameters:
67 * addr Address to check for symbol near
68 * symtab Structure to receive results
69 * Returns:
70 * 0 No sections contain this address, symtab zero filled
71 * 1 Address mapped to module/symbol/section, data in symtab
72 * Remarks:
73 * 2.6 kallsyms has a "feature" where it unpacks the name into a
74 * string. If that string is reused before the caller expects it
75 * then the caller sees its string change without warning. To
76 * avoid cluttering up the main kdb code with lots of kdb_strdup,
77 * tests and kfree calls, kdbnearsym maintains an LRU list of the
78 * last few unique strings. The list is sized large enough to
79 * hold active strings, no kdb caller of kdbnearsym makes more
80 * than ~20 later calls before using a saved value.
81 */
82int kdbnearsym(unsigned long addr, kdb_symtab_t *symtab)
83{
84 int ret = 0;
85 unsigned long symbolsize;
86 unsigned long offset;
87#define knt1_size 128 /* must be >= kallsyms table size */
88 char *knt1 = NULL;
89
90 if (KDB_DEBUG(AR))
91 kdb_printf("kdbnearsym: addr=0x%lx, symtab=%p\n", addr, symtab);
92 memset(symtab, 0, sizeof(*symtab));
93
94 if (addr < 4096)
95 goto out;
96 knt1 = debug_kmalloc(knt1_size, GFP_ATOMIC);
97 if (!knt1) {
98 kdb_printf("kdbnearsym: addr=0x%lx cannot kmalloc knt1\n",
99 addr);
100 goto out;
101 }
102	symtab->sym_name = kallsyms_lookup(addr, &symbolsize, &offset,
103 (char **)(&symtab->mod_name), knt1);
104 if (offset > 8*1024*1024) {
105 symtab->sym_name = NULL;
106 addr = offset = symbolsize = 0;
107 }
108 symtab->sym_start = addr - offset;
109 symtab->sym_end = symtab->sym_start + symbolsize;
110 ret = symtab->sym_name != NULL && *(symtab->sym_name) != '\0';
111
112 if (ret) {
113 int i;
114 /* Another 2.6 kallsyms "feature". Sometimes the sym_name is
115 * set but the buffer passed into kallsyms_lookup is not used,
116 * so it contains garbage. The caller has to work out which
117 * buffer needs to be saved.
118 *
119 * What was Rusty smoking when he wrote that code?
120 */
121 if (symtab->sym_name != knt1) {
122 strncpy(knt1, symtab->sym_name, knt1_size);
123 knt1[knt1_size-1] = '\0';
124 }
125 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
126 if (kdb_name_table[i] &&
127 strcmp(kdb_name_table[i], knt1) == 0)
128 break;
129 }
130 if (i >= ARRAY_SIZE(kdb_name_table)) {
131 debug_kfree(kdb_name_table[0]);
132 memcpy(kdb_name_table, kdb_name_table+1,
133 sizeof(kdb_name_table[0]) *
134 (ARRAY_SIZE(kdb_name_table)-1));
135 } else {
136 debug_kfree(knt1);
137 knt1 = kdb_name_table[i];
138 memcpy(kdb_name_table+i, kdb_name_table+i+1,
139 sizeof(kdb_name_table[0]) *
140 (ARRAY_SIZE(kdb_name_table)-i-1));
141 }
142 i = ARRAY_SIZE(kdb_name_table) - 1;
143 kdb_name_table[i] = knt1;
144 symtab->sym_name = kdb_name_table[i];
145 knt1 = NULL;
146 }
147
148 if (symtab->mod_name == NULL)
149 symtab->mod_name = "kernel";
150 if (KDB_DEBUG(AR))
151 kdb_printf("kdbnearsym: returns %d symtab->sym_start=0x%lx, "
152 "symtab->mod_name=%p, symtab->sym_name=%p (%s)\n", ret,
153 symtab->sym_start, symtab->mod_name, symtab->sym_name,
154 symtab->sym_name);
155
156out:
157 debug_kfree(knt1);
158 return ret;
159}
160
161void kdbnearsym_cleanup(void)
162{
163 int i;
164 for (i = 0; i < ARRAY_SIZE(kdb_name_table); ++i) {
165 if (kdb_name_table[i]) {
166 debug_kfree(kdb_name_table[i]);
167 kdb_name_table[i] = NULL;
168 }
169 }
170}
171
172static char ks_namebuf[KSYM_NAME_LEN+1], ks_namebuf_prev[KSYM_NAME_LEN+1];
173
174/*
175 * kallsyms_symbol_complete
176 *
177 * Parameters:
178 * prefix_name prefix of a symbol name to lookup
179 * max_len maximum length that can be returned
180 * Returns:
181 * Number of symbols which match the given prefix.
182 * Notes:
183 * prefix_name is changed to contain the longest unique prefix that
184 * starts with this prefix (tab completion).
185 */
186int kallsyms_symbol_complete(char *prefix_name, int max_len)
187{
188 loff_t pos = 0;
189 int prefix_len = strlen(prefix_name), prev_len = 0;
190 int i, number = 0;
191 const char *name;
192
193 while ((name = kdb_walk_kallsyms(&pos))) {
194 if (strncmp(name, prefix_name, prefix_len) == 0) {
195 strcpy(ks_namebuf, name);
196 /* Work out the longest name that matches the prefix */
197 if (++number == 1) {
198 prev_len = min_t(int, max_len-1,
199 strlen(ks_namebuf));
200 memcpy(ks_namebuf_prev, ks_namebuf, prev_len);
201 ks_namebuf_prev[prev_len] = '\0';
202 continue;
203 }
204 for (i = 0; i < prev_len; i++) {
205 if (ks_namebuf[i] != ks_namebuf_prev[i]) {
206 prev_len = i;
207 ks_namebuf_prev[i] = '\0';
208 break;
209 }
210 }
211 }
212 }
213 if (prev_len > prefix_len)
214 memcpy(prefix_name, ks_namebuf_prev, prev_len+1);
215 return number;
216}
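/*
 * For example (hypothetical symbol set): if the only symbols matching
 * the prefix "kdb_pr" were kdb_printf and kdb_print_state, the call
 * would rewrite prefix_name to the longest common prefix "kdb_print"
 * and return 2.
 */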
217
218/*
219 * kallsyms_symbol_next
220 *
221 * Parameters:
222 * prefix_name prefix of a symbol name to lookup
223 * flag 0 means search from the head, 1 means continue search.
224 * Returns:
225 * 1 if a symbol matches the given prefix.
226 * 0 if no string found
227 */
228int kallsyms_symbol_next(char *prefix_name, int flag)
229{
230 int prefix_len = strlen(prefix_name);
231 static loff_t pos;
232 const char *name;
233
234 if (!flag)
235 pos = 0;
236
237 while ((name = kdb_walk_kallsyms(&pos))) {
238 if (strncmp(name, prefix_name, prefix_len) == 0) {
239 strncpy(prefix_name, name, strlen(name)+1);
240 return 1;
241 }
242 }
243 return 0;
244}
245
246/*
247 * kdb_symbol_print - Standard method for printing a symbol name and offset.
248 * Inputs:
249 * addr Address to be printed.
250 * symtab Address of symbol data, if NULL this routine does its
251 * own lookup.
252 * punc Punctuation for string, bit field.
253 * Remarks:
254 * The string and its punctuation is only printed if the address
255 * is inside the kernel, except that the value is always printed
256 * when requested.
257 */
258void kdb_symbol_print(unsigned long addr, const kdb_symtab_t *symtab_p,
259 unsigned int punc)
260{
261 kdb_symtab_t symtab, *symtab_p2;
262 if (symtab_p) {
263 symtab_p2 = (kdb_symtab_t *)symtab_p;
264 } else {
265 symtab_p2 = &symtab;
266 kdbnearsym(addr, symtab_p2);
267 }
268 if (!(symtab_p2->sym_name || (punc & KDB_SP_VALUE)))
269 return;
270 if (punc & KDB_SP_SPACEB)
271 kdb_printf(" ");
272 if (punc & KDB_SP_VALUE)
273 kdb_printf(kdb_machreg_fmt0, addr);
274 if (symtab_p2->sym_name) {
275 if (punc & KDB_SP_VALUE)
276 kdb_printf(" ");
277 if (punc & KDB_SP_PAREN)
278 kdb_printf("(");
279 if (strcmp(symtab_p2->mod_name, "kernel"))
280 kdb_printf("[%s]", symtab_p2->mod_name);
281 kdb_printf("%s", symtab_p2->sym_name);
282 if (addr != symtab_p2->sym_start)
283 kdb_printf("+0x%lx", addr - symtab_p2->sym_start);
284 if (punc & KDB_SP_SYMSIZE)
285 kdb_printf("/0x%lx",
286 symtab_p2->sym_end - symtab_p2->sym_start);
287 if (punc & KDB_SP_PAREN)
288 kdb_printf(")");
289 }
290 if (punc & KDB_SP_SPACEA)
291 kdb_printf(" ");
292 if (punc & KDB_SP_NEWLINE)
293 kdb_printf("\n");
294}
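/*
 * With KDB_SP_DEFAULT (value plus parentheses) an in-kernel address is
 * rendered roughly as
 *
 *	0xffffffff8105a3c0 (schedule+0x10)
 *
 * where the address, symbol and offset shown here are only
 * illustrative.
 */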
295
296/*
297 * kdb_strdup - kdb equivalent of strdup, for disasm code.
298 * Inputs:
299 * str The string to duplicate.
300 * type Flags to kmalloc for the new string.
301 * Returns:
302 * Address of the new string, NULL if storage could not be allocated.
303 * Remarks:
304 * This is not in lib/string.c because it uses kmalloc which is not
305 * available when string.o is used in boot loaders.
306 */
307char *kdb_strdup(const char *str, gfp_t type)
308{
309 int n = strlen(str)+1;
310 char *s = kmalloc(n, type);
311 if (!s)
312 return NULL;
313 return strcpy(s, str);
314}
315
316/*
317 * kdb_getarea_size - Read an area of data. The kdb equivalent of
318 * copy_from_user, with kdb messages for invalid addresses.
319 * Inputs:
320 * res Pointer to the area to receive the result.
321 * addr Address of the area to copy.
322 * size Size of the area.
323 * Returns:
324 * 0 for success, < 0 for error.
325 */
326int kdb_getarea_size(void *res, unsigned long addr, size_t size)
327{
328 int ret = probe_kernel_read((char *)res, (char *)addr, size);
329 if (ret) {
330 if (!KDB_STATE(SUPPRESS)) {
331 kdb_printf("kdb_getarea: Bad address 0x%lx\n", addr);
332 KDB_STATE_SET(SUPPRESS);
333 }
334 ret = KDB_BADADDR;
335 } else {
336 KDB_STATE_CLEAR(SUPPRESS);
337 }
338 return ret;
339}
340
341/*
342 * kdb_putarea_size - Write an area of data. The kdb equivalent of
343 * copy_to_user, with kdb messages for invalid addresses.
344 * Inputs:
345 * addr Address of the area to write to.
346 * res Pointer to the area holding the data.
347 * size Size of the area.
348 * Returns:
349 * 0 for success, < 0 for error.
350 */
351int kdb_putarea_size(unsigned long addr, void *res, size_t size)
352{
353	int ret = probe_kernel_write((char *)addr, (char *)res, size);
354 if (ret) {
355 if (!KDB_STATE(SUPPRESS)) {
356 kdb_printf("kdb_putarea: Bad address 0x%lx\n", addr);
357 KDB_STATE_SET(SUPPRESS);
358 }
359 ret = KDB_BADADDR;
360 } else {
361 KDB_STATE_CLEAR(SUPPRESS);
362 }
363 return ret;
364}
365
366/*
367 * kdb_getphys - Read data from a physical address. Validate the
368 * address is in range, use kmap_atomic() to get data
369 * similar to kdb_getarea() - but for phys addresses
370 * Inputs:
371 * res Pointer to the word to receive the result
372 * addr Physical address of the area to copy
373 * size Size of the area
374 * Returns:
375 * 0 for success, < 0 for error.
376 */
377static int kdb_getphys(void *res, unsigned long addr, size_t size)
378{
379 unsigned long pfn;
380 void *vaddr;
381 struct page *page;
382
383 pfn = (addr >> PAGE_SHIFT);
384 if (!pfn_valid(pfn))
385 return 1;
386 page = pfn_to_page(pfn);
387 vaddr = kmap_atomic(page, KM_KDB);
388 memcpy(res, vaddr + (addr & (PAGE_SIZE - 1)), size);
389 kunmap_atomic(vaddr, KM_KDB);
390
391 return 0;
392}
393
394/*
395 * kdb_getphysword
396 * Inputs:
397 * word Pointer to the word to receive the result.
398 * addr Address of the area to copy.
399 * size Size of the area.
400 * Returns:
401 * 0 for success, < 0 for error.
402 */
403int kdb_getphysword(unsigned long *word, unsigned long addr, size_t size)
404{
405 int diag;
406 __u8 w1;
407 __u16 w2;
408 __u32 w4;
409 __u64 w8;
410 *word = 0; /* Default value if addr or size is invalid */
411
412 switch (size) {
413 case 1:
414 diag = kdb_getphys(&w1, addr, sizeof(w1));
415 if (!diag)
416 *word = w1;
417 break;
418 case 2:
419 diag = kdb_getphys(&w2, addr, sizeof(w2));
420 if (!diag)
421 *word = w2;
422 break;
423 case 4:
424 diag = kdb_getphys(&w4, addr, sizeof(w4));
425 if (!diag)
426 *word = w4;
427 break;
428 case 8:
429 if (size <= sizeof(*word)) {
430 diag = kdb_getphys(&w8, addr, sizeof(w8));
431 if (!diag)
432 *word = w8;
433 break;
434 }
435 /* drop through */
436 default:
437 diag = KDB_BADWIDTH;
438 kdb_printf("kdb_getphysword: bad width %ld\n", (long) size);
439 }
440 return diag;
441}
442
443/*
444 * kdb_getword - Read a binary value. Unlike kdb_getarea, this treats
445 * data as numbers.
446 * Inputs:
447 * word Pointer to the word to receive the result.
448 * addr Address of the area to copy.
449 * size Size of the area.
450 * Returns:
451 * 0 for success, < 0 for error.
452 */
453int kdb_getword(unsigned long *word, unsigned long addr, size_t size)
454{
455 int diag;
456 __u8 w1;
457 __u16 w2;
458 __u32 w4;
459 __u64 w8;
460 *word = 0; /* Default value if addr or size is invalid */
461 switch (size) {
462 case 1:
463 diag = kdb_getarea(w1, addr);
464 if (!diag)
465 *word = w1;
466 break;
467 case 2:
468 diag = kdb_getarea(w2, addr);
469 if (!diag)
470 *word = w2;
471 break;
472 case 4:
473 diag = kdb_getarea(w4, addr);
474 if (!diag)
475 *word = w4;
476 break;
477 case 8:
478 if (size <= sizeof(*word)) {
479 diag = kdb_getarea(w8, addr);
480 if (!diag)
481 *word = w8;
482 break;
483 }
484 /* drop through */
485 default:
486 diag = KDB_BADWIDTH;
487 kdb_printf("kdb_getword: bad width %ld\n", (long) size);
488 }
489 return diag;
490}
491
492/*
493 * kdb_putword - Write a binary value. Unlike kdb_putarea, this
494 * treats data as numbers.
495 * Inputs:
496 * addr Address of the area to write to.
497 * word The value to set.
498 * size Size of the area.
499 * Returns:
500 * 0 for success, < 0 for error.
501 */
502int kdb_putword(unsigned long addr, unsigned long word, size_t size)
503{
504 int diag;
505 __u8 w1;
506 __u16 w2;
507 __u32 w4;
508 __u64 w8;
509 switch (size) {
510 case 1:
511 w1 = word;
512 diag = kdb_putarea(addr, w1);
513 break;
514 case 2:
515 w2 = word;
516 diag = kdb_putarea(addr, w2);
517 break;
518 case 4:
519 w4 = word;
520 diag = kdb_putarea(addr, w4);
521 break;
522 case 8:
523 if (size <= sizeof(word)) {
524 w8 = word;
525 diag = kdb_putarea(addr, w8);
526 break;
527 }
528 /* drop through */
529 default:
530 diag = KDB_BADWIDTH;
531 kdb_printf("kdb_putword: bad width %ld\n", (long) size);
532 }
533 return diag;
534}
535
536/*
537 * kdb_task_state_string - Convert a string containing any of the
538 * letters DRSTCZEUIMA to a mask for the process state field and
539 * return the value. If no argument is supplied, return the mask
540 * that corresponds to environment variable PS, DRSTCZEU by
541 * default.
542 * Inputs:
543 * s String to convert
544 * Returns:
545 * Mask for process state.
546 * Notes:
547 * The mask folds data from several sources into a single long value, so
548 * be careful not to overlap the bits. TASK_* bits are in the LSB,
549 * special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
550 * is no overlap between TASK_* and EXIT_* but that may not always be
551 * true, so EXIT_* bits are shifted left 16 bits before being stored in
552 * the mask.
553 */
554
555/* unrunnable is < 0 */
556#define UNRUNNABLE (1UL << (8*sizeof(unsigned long) - 1))
557#define RUNNING (1UL << (8*sizeof(unsigned long) - 2))
558#define IDLE (1UL << (8*sizeof(unsigned long) - 3))
559#define DAEMON (1UL << (8*sizeof(unsigned long) - 4))
560
561unsigned long kdb_task_state_string(const char *s)
562{
563 long res = 0;
564 if (!s) {
565 s = kdbgetenv("PS");
566 if (!s)
567 s = "DRSTCZEU"; /* default value for ps */
568 }
569 while (*s) {
570 switch (*s) {
571 case 'D':
572 res |= TASK_UNINTERRUPTIBLE;
573 break;
574 case 'R':
575 res |= RUNNING;
576 break;
577 case 'S':
578 res |= TASK_INTERRUPTIBLE;
579 break;
580 case 'T':
581 res |= TASK_STOPPED;
582 break;
583 case 'C':
584 res |= TASK_TRACED;
585 break;
586 case 'Z':
587 res |= EXIT_ZOMBIE << 16;
588 break;
589 case 'E':
590 res |= EXIT_DEAD << 16;
591 break;
592 case 'U':
593 res |= UNRUNNABLE;
594 break;
595 case 'I':
596 res |= IDLE;
597 break;
598 case 'M':
599 res |= DAEMON;
600 break;
601 case 'A':
602 res = ~0UL;
603 break;
604 default:
605 kdb_printf("%s: unknown flag '%c' ignored\n",
606 __func__, *s);
607 break;
608 }
609 ++s;
610 }
611 return res;
612}
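/*
 * As an example of the packing described above, the string "RZ" yields
 * RUNNING | (EXIT_ZOMBIE << 16): the pseudo-state flag sits in the
 * high bits while the shifted EXIT_* value stays clear of the TASK_*
 * bits in the low part of the word.
 */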
613
614/*
615 * kdb_task_state_char - Return the character that represents the task state.
616 * Inputs:
617 * p struct task for the process
618 * Returns:
619 * One character to represent the task state.
620 */
621char kdb_task_state_char (const struct task_struct *p)
622{
623 int cpu;
624 char state;
625 unsigned long tmp;
626
627 if (!p || probe_kernel_read(&tmp, (char *)p, sizeof(unsigned long)))
628 return 'E';
629
630 cpu = kdb_process_cpu(p);
631 state = (p->state == 0) ? 'R' :
632 (p->state < 0) ? 'U' :
633 (p->state & TASK_UNINTERRUPTIBLE) ? 'D' :
634 (p->state & TASK_STOPPED) ? 'T' :
635 (p->state & TASK_TRACED) ? 'C' :
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) {
640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
643 if (cpu != kdb_initial_cpu)
644 state = 'I'; /* idle task */
645 }
646 } else if (!p->mm && state == 'S') {
647 state = 'M'; /* sleeping system daemon */
648 }
649 return state;
650}
651
652/*
653 * kdb_task_state - Return true if a process has the desired state
654 * given by the mask.
655 * Inputs:
656 * p struct task for the process
657 * mask mask from kdb_task_state_string to select processes
658 * Returns:
659 * True if the process matches at least one criteria defined by the mask.
660 */
661unsigned long kdb_task_state(const struct task_struct *p, unsigned long mask)
662{
663 char state[] = { kdb_task_state_char(p), '\0' };
664 return (mask & kdb_task_state_string(state)) != 0;
665}
666
667/*
668 * kdb_print_nameval - Print a name and its value, converting the
669 * value to a symbol lookup if possible.
670 * Inputs:
671 * name field name to print
672 * val value of field
673 */
674void kdb_print_nameval(const char *name, unsigned long val)
675{
676 kdb_symtab_t symtab;
677 kdb_printf(" %-11.11s ", name);
678 if (kdbnearsym(val, &symtab))
679 kdb_symbol_print(val, &symtab,
680 KDB_SP_VALUE|KDB_SP_SYMSIZE|KDB_SP_NEWLINE);
681 else
682 kdb_printf("0x%lx\n", val);
683}
684
685/* Last ditch allocator for debugging, so we can still debug even when
686 * the GFP_ATOMIC pool has been exhausted. The algorithms are tuned
687 * for space usage, not for speed. One smallish memory pool, the free
688 * chain is always in ascending address order to allow coalescing,
689 * allocations are done in brute force best fit.
690 */
691
692struct debug_alloc_header {
693 u32 next; /* offset of next header from start of pool */
694 u32 size;
695 void *caller;
696};
697
698/* The memory returned by this allocator must be aligned, which means
699 * so must the header size. Do not assume that sizeof(struct
700 * debug_alloc_header) is a multiple of the alignment, explicitly
701 * calculate the overhead of this header, including the alignment.
702 * The rest of this code must not use sizeof() on any header or
703 * pointer to a header.
704 */
705#define dah_align 8
706#define dah_overhead ALIGN(sizeof(struct debug_alloc_header), dah_align)
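/* On common configurations this works out to 16 bytes: the header is
 * 16 bytes on 64-bit builds (4 + 4 + 8) and 12 bytes on 32-bit builds,
 * and ALIGN(12, 8) rounds the latter up to 16 as well.
 */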
707
708static u64 debug_alloc_pool_aligned[256*1024/dah_align]; /* 256K pool */
709static char *debug_alloc_pool = (char *)debug_alloc_pool_aligned;
710static u32 dah_first, dah_first_call = 1, dah_used, dah_used_max;
711
712/* Locking is awkward. The debug code is called from all contexts,
713 * including non maskable interrupts. A normal spinlock is not safe
714 * in NMI context. Try to get the debug allocator lock, if it cannot
715 * be obtained after a second then give up. If the lock could not be
716 * previously obtained on this cpu then only try once.
717 *
718 * sparse has no annotation for "this function _sometimes_ acquires a
719 * lock", so fudge the acquire/release notation.
720 */
721static DEFINE_SPINLOCK(dap_lock);
722static int get_dap_lock(void)
723 __acquires(dap_lock)
724{
725 static int dap_locked = -1;
726 int count;
727 if (dap_locked == smp_processor_id())
728 count = 1;
729 else
730 count = 1000;
731 while (1) {
732 if (spin_trylock(&dap_lock)) {
733 dap_locked = -1;
734 return 1;
735 }
736 if (!count--)
737 break;
738 udelay(1000);
739 }
740 dap_locked = smp_processor_id();
741 __acquire(dap_lock);
742 return 0;
743}
744
745void *debug_kmalloc(size_t size, gfp_t flags)
746{
747 unsigned int rem, h_offset;
748 struct debug_alloc_header *best, *bestprev, *prev, *h;
749 void *p = NULL;
750 if (!get_dap_lock()) {
751 __release(dap_lock); /* we never actually got it */
752 return NULL;
753 }
754 h = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
755 if (dah_first_call) {
756 h->size = sizeof(debug_alloc_pool_aligned) - dah_overhead;
757 dah_first_call = 0;
758 }
759 size = ALIGN(size, dah_align);
760 prev = best = bestprev = NULL;
761 while (1) {
762 if (h->size >= size && (!best || h->size < best->size)) {
763 best = h;
764 bestprev = prev;
765 if (h->size == size)
766 break;
767 }
768 if (!h->next)
769 break;
770 prev = h;
771 h = (struct debug_alloc_header *)(debug_alloc_pool + h->next);
772 }
773 if (!best)
774 goto out;
775 rem = best->size - size;
776 /* The pool must always contain at least one header */
777 if (best->next == 0 && bestprev == NULL && rem < dah_overhead)
778 goto out;
779 if (rem >= dah_overhead) {
780 best->size = size;
781 h_offset = ((char *)best - debug_alloc_pool) +
782 dah_overhead + best->size;
783 h = (struct debug_alloc_header *)(debug_alloc_pool + h_offset);
784 h->size = rem - dah_overhead;
785 h->next = best->next;
786 } else
787 h_offset = best->next;
788 best->caller = __builtin_return_address(0);
789 dah_used += best->size;
790 dah_used_max = max(dah_used, dah_used_max);
791 if (bestprev)
792 bestprev->next = h_offset;
793 else
794 dah_first = h_offset;
795 p = (char *)best + dah_overhead;
796 memset(p, POISON_INUSE, best->size - 1);
797 *((char *)p + best->size - 1) = POISON_END;
798out:
799 spin_unlock(&dap_lock);
800 return p;
801}
802
803void debug_kfree(void *p)
804{
805 struct debug_alloc_header *h;
806 unsigned int h_offset;
807 if (!p)
808 return;
809 if ((char *)p < debug_alloc_pool ||
810 (char *)p >= debug_alloc_pool + sizeof(debug_alloc_pool_aligned)) {
811 kfree(p);
812 return;
813 }
814 if (!get_dap_lock()) {
815 __release(dap_lock); /* we never actually got it */
816 return; /* memory leak, cannot be helped */
817 }
818 h = (struct debug_alloc_header *)((char *)p - dah_overhead);
819 memset(p, POISON_FREE, h->size - 1);
820 *((char *)p + h->size - 1) = POISON_END;
821 h->caller = NULL;
822 dah_used -= h->size;
823 h_offset = (char *)h - debug_alloc_pool;
824 if (h_offset < dah_first) {
825 h->next = dah_first;
826 dah_first = h_offset;
827 } else {
828 struct debug_alloc_header *prev;
829 unsigned int prev_offset;
830 prev = (struct debug_alloc_header *)(debug_alloc_pool +
831 dah_first);
832 while (1) {
833 if (!prev->next || prev->next > h_offset)
834 break;
835 prev = (struct debug_alloc_header *)
836 (debug_alloc_pool + prev->next);
837 }
838 prev_offset = (char *)prev - debug_alloc_pool;
839 if (prev_offset + dah_overhead + prev->size == h_offset) {
840 prev->size += dah_overhead + h->size;
841 memset(h, POISON_FREE, dah_overhead - 1);
842 *((char *)h + dah_overhead - 1) = POISON_END;
843 h = prev;
844 h_offset = prev_offset;
845 } else {
846 h->next = prev->next;
847 prev->next = h_offset;
848 }
849 }
850 if (h_offset + dah_overhead + h->size == h->next) {
851 struct debug_alloc_header *next;
852 next = (struct debug_alloc_header *)
853 (debug_alloc_pool + h->next);
854 h->size += dah_overhead + next->size;
855 h->next = next->next;
856 memset(next, POISON_FREE, dah_overhead - 1);
857 *((char *)next + dah_overhead - 1) = POISON_END;
858 }
859 spin_unlock(&dap_lock);
860}
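
A hedged usage sketch of the pair above (the helper name, buffer size and message are hypothetical, and kdb's private declarations are assumed to be in scope): callers must tolerate a NULL return, since debug_kmalloc() fails rather than sleeps when the pool or the lock is unavailable, and debug_kfree() also accepts ordinary kmalloc() pointers and hands them back to kfree().

static void kdb_scratch_example(void)		/* hypothetical helper */
{
	char *buf = debug_kmalloc(256, GFP_ATOMIC);

	if (!buf) {
		/* Pool exhausted or lock unavailable: degrade gracefully. */
		kdb_printf("out of debug memory, output truncated\n");
		return;
	}
	snprintf(buf, 256, "scratch buffer at %p\n", buf);
	kdb_printf("%s", buf);
	debug_kfree(buf);
}
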
861
862void debug_kusage(void)
863{
864 struct debug_alloc_header *h_free, *h_used;
865#ifdef CONFIG_IA64
866 /* FIXME: using dah for ia64 unwind always results in a memory leak.
867 * Fix that memory leak first, then set debug_kusage_one_time = 1 for
868 * all architectures.
869 */
870 static int debug_kusage_one_time;
871#else
872 static int debug_kusage_one_time = 1;
873#endif
874 if (!get_dap_lock()) {
875 __release(dap_lock); /* we never actually got it */
876 return;
877 }
878 h_free = (struct debug_alloc_header *)(debug_alloc_pool + dah_first);
879 if (dah_first == 0 &&
880 (h_free->size == sizeof(debug_alloc_pool_aligned) - dah_overhead ||
881 dah_first_call))
882 goto out;
883 if (!debug_kusage_one_time)
884 goto out;
885 debug_kusage_one_time = 0;
886 kdb_printf("%s: debug_kmalloc memory leak dah_first %d\n",
887 __func__, dah_first);
888 if (dah_first) {
889 h_used = (struct debug_alloc_header *)debug_alloc_pool;
890 kdb_printf("%s: h_used %p size %d\n", __func__, h_used,
891 h_used->size);
892 }
893 do {
894 h_used = (struct debug_alloc_header *)
895 ((char *)h_free + dah_overhead + h_free->size);
896 kdb_printf("%s: h_used %p size %d caller %p\n",
897 __func__, h_used, h_used->size, h_used->caller);
898 h_free = (struct debug_alloc_header *)
899 (debug_alloc_pool + h_free->next);
900 } while (h_free->next);
901 h_used = (struct debug_alloc_header *)
902 ((char *)h_free + dah_overhead + h_free->size);
903 if ((char *)h_used - debug_alloc_pool !=
904 sizeof(debug_alloc_pool_aligned))
905 kdb_printf("%s: h_used %p size %d caller %p\n",
906 __func__, h_used, h_used->size, h_used->caller);
907out:
908 spin_unlock(&dap_lock);
909}
910
911/* Maintain a small stack of kdb_flags to allow recursion without disturbing
912 * the global kdb state.
913 */
914
915static int kdb_flags_stack[4], kdb_flags_index;
916
917void kdb_save_flags(void)
918{
919 BUG_ON(kdb_flags_index >= ARRAY_SIZE(kdb_flags_stack));
920 kdb_flags_stack[kdb_flags_index++] = kdb_flags;
921}
922
923void kdb_restore_flags(void)
924{
925 BUG_ON(kdb_flags_index <= 0);
926 kdb_flags = kdb_flags_stack[--kdb_flags_index];
927}
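
The intended pattern is a strictly nested save/modify/restore pair, at most four levels deep (the size of kdb_flags_stack). A minimal sketch; the wrapper name is hypothetical and the actual flag manipulation is elided because it depends on kdb internals not shown here.

static void kdb_nested_example(void)		/* hypothetical */
{
	kdb_save_flags();
	/* ... temporarily adjust kdb_flags for the nested operation ... */
	kdb_restore_flags();		/* must pair with the save above */
}
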
diff --git a/kernel/early_res.c b/kernel/early_res.c
index 3cb2c661bb78..7bfae887f211 100644
--- a/kernel/early_res.c
+++ b/kernel/early_res.c
@@ -7,6 +7,8 @@
7#include <linux/bootmem.h> 7#include <linux/bootmem.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/early_res.h> 9#include <linux/early_res.h>
10#include <linux/slab.h>
11#include <linux/kmemleak.h>
10 12
11/* 13/*
12 * Early reserved memory areas. 14 * Early reserved memory areas.
@@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end)
319 struct early_res *r; 321 struct early_res *r;
320 int i; 322 int i;
321 323
324 kmemleak_free_part(__va(start), end - start);
325
322 i = find_overlapped_early(start, end); 326 i = find_overlapped_early(start, end);
323 r = &early_res[i]; 327 r = &early_res[i];
324 if (i >= max_early_res || r->end != end || r->start != start) 328 if (i >= max_early_res || r->end != end || r->start != start)
@@ -333,6 +337,14 @@ void __init free_early_partial(u64 start, u64 end)
333 struct early_res *r; 337 struct early_res *r;
334 int i; 338 int i;
335 339
340 kmemleak_free_part(__va(start), end - start);
341
342 if (start == end)
343 return;
344
345 if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end))
346 return;
347
336try_next: 348try_next:
337 i = find_overlapped_early(start, end); 349 i = find_overlapped_early(start, end);
338 if (i >= max_early_res) 350 if (i >= max_early_res)
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c35452cadded..dd62f8e714ca 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain;
27static DEFINE_RWLOCK(exec_domains_lock); 27static DEFINE_RWLOCK(exec_domains_lock);
28 28
29 29
30static u_long ident_map[32] = { 30static unsigned long ident_map[32] = {
31 0, 1, 2, 3, 4, 5, 6, 7, 31 0, 1, 2, 3, 4, 5, 6, 7,
32 8, 9, 10, 11, 12, 13, 14, 15, 32 8, 9, 10, 11, 12, 13, 14, 15,
33 16, 17, 18, 19, 20, 21, 22, 23, 33 16, 17, 18, 19, 20, 21, 22, 23,
@@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp)
56} 56}
57 57
58static struct exec_domain * 58static struct exec_domain *
59lookup_exec_domain(u_long personality) 59lookup_exec_domain(unsigned int personality)
60{ 60{
61 struct exec_domain * ep; 61 unsigned int pers = personality(personality);
62 u_long pers = personality(personality); 62 struct exec_domain *ep;
63 63
64 read_lock(&exec_domains_lock); 64 read_lock(&exec_domains_lock);
65 for (ep = exec_domains; ep; ep = ep->next) { 65 for (ep = exec_domains; ep; ep = ep->next) {
@@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality)
70 70
71#ifdef CONFIG_MODULES 71#ifdef CONFIG_MODULES
72 read_unlock(&exec_domains_lock); 72 read_unlock(&exec_domains_lock);
73 request_module("personality-%ld", pers); 73 request_module("personality-%d", pers);
74 read_lock(&exec_domains_lock); 74 read_lock(&exec_domains_lock);
75 75
76 for (ep = exec_domains; ep; ep = ep->next) { 76 for (ep = exec_domains; ep; ep = ep->next) {
@@ -135,7 +135,7 @@ unregister:
135} 135}
136 136
137int 137int
138__set_personality(u_long personality) 138__set_personality(unsigned int personality)
139{ 139{
140 struct exec_domain *ep, *oep; 140 struct exec_domain *ep, *oep;
141 141
@@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void)
188module_init(proc_execdomains_init); 188module_init(proc_execdomains_init);
189#endif 189#endif
190 190
191SYSCALL_DEFINE1(personality, u_long, personality) 191SYSCALL_DEFINE1(personality, unsigned int, personality)
192{ 192{
193 u_long old = current->personality; 193 unsigned int old = current->personality;
194 194
195 if (personality != 0xffffffff) { 195 if (personality != 0xffffffff) {
196 set_personality(personality); 196 set_personality(personality);
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality)
198 return -EINVAL; 198 return -EINVAL;
199 } 199 }
200 200
201 return (long)old; 201 return old;
202} 202}
203 203
204 204
diff --git a/kernel/exit.c b/kernel/exit.c
index cce59cb5ee6a..ceffc67b564a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -55,15 +55,14 @@
55#include <asm/unistd.h> 55#include <asm/unistd.h>
56#include <asm/pgtable.h> 56#include <asm/pgtable.h>
57#include <asm/mmu_context.h> 57#include <asm/mmu_context.h>
58#include "cred-internals.h"
59 58
60static void exit_mm(struct task_struct * tsk); 59static void exit_mm(struct task_struct * tsk);
61 60
62static void __unhash_process(struct task_struct *p) 61static void __unhash_process(struct task_struct *p, bool group_dead)
63{ 62{
64 nr_threads--; 63 nr_threads--;
65 detach_pid(p, PIDTYPE_PID); 64 detach_pid(p, PIDTYPE_PID);
66 if (thread_group_leader(p)) { 65 if (group_dead) {
67 detach_pid(p, PIDTYPE_PGID); 66 detach_pid(p, PIDTYPE_PGID);
68 detach_pid(p, PIDTYPE_SID); 67 detach_pid(p, PIDTYPE_SID);
69 68
@@ -80,10 +79,9 @@ static void __unhash_process(struct task_struct *p)
80static void __exit_signal(struct task_struct *tsk) 79static void __exit_signal(struct task_struct *tsk)
81{ 80{
82 struct signal_struct *sig = tsk->signal; 81 struct signal_struct *sig = tsk->signal;
82 bool group_dead = thread_group_leader(tsk);
83 struct sighand_struct *sighand; 83 struct sighand_struct *sighand;
84 84 struct tty_struct *uninitialized_var(tty);
85 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count));
87 85
88 sighand = rcu_dereference_check(tsk->sighand, 86 sighand = rcu_dereference_check(tsk->sighand,
89 rcu_read_lock_held() || 87 rcu_read_lock_held() ||
@@ -91,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk)
91 spin_lock(&sighand->siglock); 89 spin_lock(&sighand->siglock);
92 90
93 posix_cpu_timers_exit(tsk); 91 posix_cpu_timers_exit(tsk);
94 if (atomic_dec_and_test(&sig->count)) 92 if (group_dead) {
95 posix_cpu_timers_exit_group(tsk); 93 posix_cpu_timers_exit_group(tsk);
96 else { 94 tty = sig->tty;
95 sig->tty = NULL;
96 } else {
97 /* 97 /*
98 * If there is any task waiting for the group exit 98 * If there is any task waiting for the group exit
99 * then notify it: 99 * then notify it:
100 */ 100 */
101 if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) 101 if (sig->notify_count > 0 && !--sig->notify_count)
102 wake_up_process(sig->group_exit_task); 102 wake_up_process(sig->group_exit_task);
103 103
104 if (tsk == sig->curr_target) 104 if (tsk == sig->curr_target)
@@ -124,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk)
124 sig->oublock += task_io_get_oublock(tsk); 124 sig->oublock += task_io_get_oublock(tsk);
125 task_io_accounting_add(&sig->ioac, &tsk->ioac); 125 task_io_accounting_add(&sig->ioac, &tsk->ioac);
126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 126 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
127 sig = NULL; /* Marker for below. */
128 } 127 }
129 128
130 __unhash_process(tsk); 129 sig->nr_threads--;
130 __unhash_process(tsk, group_dead);
131 131
132 /* 132 /*
133 * Do this under ->siglock, we can race with another thread 133 * Do this under ->siglock, we can race with another thread
134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. 134 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
135 */ 135 */
136 flush_sigqueue(&tsk->pending); 136 flush_sigqueue(&tsk->pending);
137
138 tsk->signal = NULL;
139 tsk->sighand = NULL; 137 tsk->sighand = NULL;
140 spin_unlock(&sighand->siglock); 138 spin_unlock(&sighand->siglock);
141 139
142 __cleanup_sighand(sighand); 140 __cleanup_sighand(sighand);
143 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
144 if (sig) { 142 if (group_dead) {
145 flush_sigqueue(&sig->shared_pending); 143 flush_sigqueue(&sig->shared_pending);
146 taskstats_tgid_free(sig); 144 tty_kref_put(tty);
147 /*
148 * Make sure ->signal can't go away under rq->lock,
149 * see account_group_exec_runtime().
150 */
151 task_rq_unlock_wait(tsk);
152 __cleanup_signal(sig);
153 } 145 }
154} 146}
155 147
@@ -857,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
857 849
858 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; 850 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
859 851
860 /* mt-exec, de_thread() is waiting for us */ 852 /* mt-exec, de_thread() is waiting for group leader */
861 if (thread_group_leader(tsk) && 853 if (unlikely(tsk->signal->notify_count < 0))
862 tsk->signal->group_exit_task &&
863 tsk->signal->notify_count < 0)
864 wake_up_process(tsk->signal->group_exit_task); 854 wake_up_process(tsk->signal->group_exit_task);
865
866 write_unlock_irq(&tasklist_lock); 855 write_unlock_irq(&tasklist_lock);
867 856
868 tracehook_report_death(tsk, signal, cookie, group_dead); 857 tracehook_report_death(tsk, signal, cookie, group_dead);
@@ -953,7 +942,8 @@ NORET_TYPE void do_exit(long code)
953 942
954 acct_update_integrals(tsk); 943 acct_update_integrals(tsk);
955 /* sync mm's RSS info before statistics gathering */ 944 /* sync mm's RSS info before statistics gathering */
956 sync_mm_rss(tsk, tsk->mm); 945 if (tsk->mm)
946 sync_mm_rss(tsk, tsk->mm);
957 group_dead = atomic_dec_and_test(&tsk->signal->live); 947 group_dead = atomic_dec_and_test(&tsk->signal->live);
958 if (group_dead) { 948 if (group_dead) {
959 hrtimer_cancel(&tsk->signal->real_timer); 949 hrtimer_cancel(&tsk->signal->real_timer);
@@ -1002,8 +992,10 @@ NORET_TYPE void do_exit(long code)
1002 992
1003 exit_notify(tsk, group_dead); 993 exit_notify(tsk, group_dead);
1004#ifdef CONFIG_NUMA 994#ifdef CONFIG_NUMA
995 task_lock(tsk);
1005 mpol_put(tsk->mempolicy); 996 mpol_put(tsk->mempolicy);
1006 tsk->mempolicy = NULL; 997 tsk->mempolicy = NULL;
998 task_unlock(tsk);
1007#endif 999#endif
1008#ifdef CONFIG_FUTEX 1000#ifdef CONFIG_FUTEX
1009 if (unlikely(current->pi_state_cache)) 1001 if (unlikely(current->pi_state_cache))
diff --git a/kernel/fork.c b/kernel/fork.c
index 4799c5f0e6d0..b6cce14ba047 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk)
165} 165}
166EXPORT_SYMBOL(free_task); 166EXPORT_SYMBOL(free_task);
167 167
168static inline void free_signal_struct(struct signal_struct *sig)
169{
170 taskstats_tgid_free(sig);
171 kmem_cache_free(signal_cachep, sig);
172}
173
174static inline void put_signal_struct(struct signal_struct *sig)
175{
176 if (atomic_dec_and_test(&sig->sigcnt))
177 free_signal_struct(sig);
178}
179
168void __put_task_struct(struct task_struct *tsk) 180void __put_task_struct(struct task_struct *tsk)
169{ 181{
170 WARN_ON(!tsk->exit_state); 182 WARN_ON(!tsk->exit_state);
@@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk)
173 185
174 exit_creds(tsk); 186 exit_creds(tsk);
175 delayacct_tsk_free(tsk); 187 delayacct_tsk_free(tsk);
188 put_signal_struct(tsk->signal);
176 189
177 if (!profile_handoff_task(tsk)) 190 if (!profile_handoff_task(tsk))
178 free_task(tsk); 191 free_task(tsk);
@@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
864 if (!sig) 877 if (!sig)
865 return -ENOMEM; 878 return -ENOMEM;
866 879
867 atomic_set(&sig->count, 1); 880 sig->nr_threads = 1;
868 atomic_set(&sig->live, 1); 881 atomic_set(&sig->live, 1);
882 atomic_set(&sig->sigcnt, 1);
869 init_waitqueue_head(&sig->wait_chldexit); 883 init_waitqueue_head(&sig->wait_chldexit);
870 if (clone_flags & CLONE_NEWPID) 884 if (clone_flags & CLONE_NEWPID)
871 sig->flags |= SIGNAL_UNKILLABLE; 885 sig->flags |= SIGNAL_UNKILLABLE;
@@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
889 return 0; 903 return 0;
890} 904}
891 905
892void __cleanup_signal(struct signal_struct *sig)
893{
894 thread_group_cputime_free(sig);
895 tty_kref_put(sig->tty);
896 kmem_cache_free(signal_cachep, sig);
897}
898
899static void copy_flags(unsigned long clone_flags, struct task_struct *p) 906static void copy_flags(unsigned long clone_flags, struct task_struct *p)
900{ 907{
901 unsigned long new_flags = p->flags; 908 unsigned long new_flags = p->flags;
@@ -1052,6 +1059,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1052 p->prev_utime = cputime_zero; 1059 p->prev_utime = cputime_zero;
1053 p->prev_stime = cputime_zero; 1060 p->prev_stime = cputime_zero;
1054#endif 1061#endif
1062#if defined(SPLIT_RSS_COUNTING)
1063 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
1064#endif
1055 1065
1056 p->default_timer_slack_ns = current->timer_slack_ns; 1066 p->default_timer_slack_ns = current->timer_slack_ns;
1057 1067
@@ -1109,10 +1119,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1109 p->memcg_batch.memcg = NULL; 1119 p->memcg_batch.memcg = NULL;
1110#endif 1120#endif
1111 1121
1112 p->bts = NULL;
1113
1114 p->stack_start = stack_start;
1115
1116 /* Perform scheduler related setup. Assign this task to a CPU. */ 1122 /* Perform scheduler related setup. Assign this task to a CPU. */
1117 sched_fork(p, clone_flags); 1123 sched_fork(p, clone_flags);
1118 1124
@@ -1246,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1246 } 1252 }
1247 1253
1248 if (clone_flags & CLONE_THREAD) { 1254 if (clone_flags & CLONE_THREAD) {
1249 atomic_inc(&current->signal->count); 1255 current->signal->nr_threads++;
1250 atomic_inc(&current->signal->live); 1256 atomic_inc(&current->signal->live);
1257 atomic_inc(&current->signal->sigcnt);
1251 p->group_leader = current->group_leader; 1258 p->group_leader = current->group_leader;
1252 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1259 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1253 } 1260 }
@@ -1260,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1260 p->nsproxy->pid_ns->child_reaper = p; 1267 p->nsproxy->pid_ns->child_reaper = p;
1261 1268
1262 p->signal->leader_pid = pid; 1269 p->signal->leader_pid = pid;
1263 tty_kref_put(p->signal->tty);
1264 p->signal->tty = tty_kref_get(current->signal->tty); 1270 p->signal->tty = tty_kref_get(current->signal->tty);
1265 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1271 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1266 attach_pid(p, PIDTYPE_SID, task_session(current)); 1272 attach_pid(p, PIDTYPE_SID, task_session(current));
@@ -1293,7 +1299,7 @@ bad_fork_cleanup_mm:
1293 mmput(p->mm); 1299 mmput(p->mm);
1294bad_fork_cleanup_signal: 1300bad_fork_cleanup_signal:
1295 if (!(clone_flags & CLONE_THREAD)) 1301 if (!(clone_flags & CLONE_THREAD))
1296 __cleanup_signal(p->signal); 1302 free_signal_struct(p->signal);
1297bad_fork_cleanup_sighand: 1303bad_fork_cleanup_sighand:
1298 __cleanup_sighand(p->sighand); 1304 __cleanup_sighand(p->sighand);
1299bad_fork_cleanup_fs: 1305bad_fork_cleanup_fs:
@@ -1328,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re
1328 return regs; 1334 return regs;
1329} 1335}
1330 1336
1337static inline void init_idle_pids(struct pid_link *links)
1338{
1339 enum pid_type type;
1340
1341 for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) {
1342 INIT_HLIST_NODE(&links[type].node); /* not really needed */
1343 links[type].pid = &init_struct_pid;
1344 }
1345}
1346
1331struct task_struct * __cpuinit fork_idle(int cpu) 1347struct task_struct * __cpuinit fork_idle(int cpu)
1332{ 1348{
1333 struct task_struct *task; 1349 struct task_struct *task;
@@ -1335,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1335 1351
1336 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1352 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1337 &init_struct_pid, 0); 1353 &init_struct_pid, 0);
1338 if (!IS_ERR(task)) 1354 if (!IS_ERR(task)) {
1355 init_idle_pids(task->pids);
1339 init_idle(task, cpu); 1356 init_idle(task, cpu);
1357 }
1340 1358
1341 return task; 1359 return task;
1342} 1360}
@@ -1508,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr)
1508 *flags_ptr |= CLONE_SIGHAND; 1526 *flags_ptr |= CLONE_SIGHAND;
1509 1527
1510 /* 1528 /*
1511 * If unsharing signal handlers and the task was created
1512 * using CLONE_THREAD, then must unshare the thread
1513 */
1514 if ((*flags_ptr & CLONE_SIGHAND) &&
1515 (atomic_read(&current->signal->count) > 1))
1516 *flags_ptr |= CLONE_THREAD;
1517
1518 /*
1519 * If unsharing namespace, must also unshare filesystem information. 1529 * If unsharing namespace, must also unshare filesystem information.
1520 */ 1530 */
1521 if (*flags_ptr & CLONE_NEWNS) 1531 if (*flags_ptr & CLONE_NEWNS)
diff --git a/kernel/futex.c b/kernel/futex.c
index e7a35f1039e7..6a3a5fa1526d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state)
429static struct task_struct * futex_find_get_task(pid_t pid) 429static struct task_struct * futex_find_get_task(pid_t pid)
430{ 430{
431 struct task_struct *p; 431 struct task_struct *p;
432 const struct cred *cred = current_cred(), *pcred;
433 432
434 rcu_read_lock(); 433 rcu_read_lock();
435 p = find_task_by_vpid(pid); 434 p = find_task_by_vpid(pid);
436 if (!p) { 435 if (p)
437 p = ERR_PTR(-ESRCH); 436 get_task_struct(p);
438 } else {
439 pcred = __task_cred(p);
440 if (cred->euid != pcred->euid &&
441 cred->euid != pcred->uid)
442 p = ERR_PTR(-ESRCH);
443 else
444 get_task_struct(p);
445 }
446 437
447 rcu_read_unlock(); 438 rcu_read_unlock();
448 439
@@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
564 if (!pid) 555 if (!pid)
565 return -ESRCH; 556 return -ESRCH;
566 p = futex_find_get_task(pid); 557 p = futex_find_get_task(pid);
567 if (IS_ERR(p)) 558 if (!p)
568 return PTR_ERR(p); 559 return -ESRCH;
569 560
570 /* 561 /*
571 * We need to look at the task state flags to figure out, 562 * We need to look at the task state flags to figure out,
diff --git a/kernel/groups.c b/kernel/groups.c
index 2b45b2ee3964..53b1916c9492 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -164,12 +164,6 @@ int groups_search(const struct group_info *group_info, gid_t grp)
164 */ 164 */
165int set_groups(struct cred *new, struct group_info *group_info) 165int set_groups(struct cred *new, struct group_info *group_info)
166{ 166{
167 int retval;
168
169 retval = security_task_setgroups(group_info);
170 if (retval)
171 return retval;
172
173 put_group_info(new->group_info); 167 put_group_info(new->group_info);
174 groups_sort(group_info); 168 groups_sort(group_info);
175 get_group_info(group_info); 169 get_group_info(group_info);
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0086628b6e97..5c69e996bd0f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
89 89
90 do { 90 do {
91 seq = read_seqbegin(&xtime_lock); 91 seq = read_seqbegin(&xtime_lock);
92 xts = current_kernel_time(); 92 xts = __current_kernel_time();
93 tom = wall_to_monotonic; 93 tom = wall_to_monotonic;
94 } while (read_seqretry(&xtime_lock, seq)); 94 } while (read_seqretry(&xtime_lock, seq));
95 95
@@ -1749,35 +1749,15 @@ void __init hrtimers_init(void)
1749} 1749}
1750 1750
1751/** 1751/**
1752 * schedule_hrtimeout_range - sleep until timeout 1752 * schedule_hrtimeout_range_clock - sleep until timeout
1753 * @expires: timeout value (ktime_t) 1753 * @expires: timeout value (ktime_t)
1754 * @delta: slack in expires timeout (ktime_t) 1754 * @delta: slack in expires timeout (ktime_t)
1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL 1755 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1756 * 1756 * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
1757 * Make the current task sleep until the given expiry time has
1758 * elapsed. The routine will return immediately unless
1759 * the current task state has been set (see set_current_state()).
1760 *
1761 * The @delta argument gives the kernel the freedom to schedule the
1762 * actual wakeup to a time that is both power and performance friendly.
1763 * The kernel give the normal best effort behavior for "@expires+@delta",
1764 * but may decide to fire the timer earlier, but no earlier than @expires.
1765 *
1766 * You can set the task state as follows -
1767 *
1768 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1769 * pass before the routine returns.
1770 *
1771 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1772 * delivered to the current task.
1773 *
1774 * The current task state is guaranteed to be TASK_RUNNING when this
1775 * routine returns.
1776 *
1777 * Returns 0 when the timer has expired otherwise -EINTR
1778 */ 1757 */
1779int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, 1758int __sched
1780 const enum hrtimer_mode mode) 1759schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1760 const enum hrtimer_mode mode, int clock)
1781{ 1761{
1782 struct hrtimer_sleeper t; 1762 struct hrtimer_sleeper t;
1783 1763
@@ -1799,7 +1779,7 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1799 return -EINTR; 1779 return -EINTR;
1800 } 1780 }
1801 1781
1802 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode); 1782 hrtimer_init_on_stack(&t.timer, clock, mode);
1803 hrtimer_set_expires_range_ns(&t.timer, *expires, delta); 1783 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1804 1784
1805 hrtimer_init_sleeper(&t, current); 1785 hrtimer_init_sleeper(&t, current);
@@ -1818,6 +1798,41 @@ int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1818 1798
1819 return !t.task ? 0 : -EINTR; 1799 return !t.task ? 0 : -EINTR;
1820} 1800}
1801
1802/**
1803 * schedule_hrtimeout_range - sleep until timeout
1804 * @expires: timeout value (ktime_t)
1805 * @delta: slack in expires timeout (ktime_t)
1806 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1807 *
1808 * Make the current task sleep until the given expiry time has
1809 * elapsed. The routine will return immediately unless
1810 * the current task state has been set (see set_current_state()).
1811 *
1812 * The @delta argument gives the kernel the freedom to schedule the
1813 * actual wakeup to a time that is both power and performance friendly.
1814 * The kernel give the normal best effort behavior for "@expires+@delta",
1815 * but may decide to fire the timer earlier, but no earlier than @expires.
1816 *
1817 * You can set the task state as follows -
1818 *
1819 * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to
1820 * pass before the routine returns.
1821 *
1822 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1823 * delivered to the current task.
1824 *
1825 * The current task state is guaranteed to be TASK_RUNNING when this
1826 * routine returns.
1827 *
1828 * Returns 0 when the timer has expired otherwise -EINTR
1829 */
1830int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1831 const enum hrtimer_mode mode)
1832{
1833 return schedule_hrtimeout_range_clock(expires, delta, mode,
1834 CLOCK_MONOTONIC);
1835}
1821EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); 1836EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1822 1837
1823/** 1838/**
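
Callers are unchanged by this refactoring: schedule_hrtimeout_range() keeps its old behaviour and simply forwards to schedule_hrtimeout_range_clock() with CLOCK_MONOTONIC, so a CLOCK_REALTIME user can share the same body. A minimal illustrative caller (the helper name, the 10 ms timeout and the 100 us slack are arbitrary):

static int wait_up_to_10ms(void)		/* hypothetical */
{
	ktime_t expires = ktime_set(0, 10 * NSEC_PER_MSEC);

	set_current_state(TASK_INTERRUPTIBLE);
	/*
	 * 100us of slack lets the wakeup be coalesced with others.
	 * Returns 0 if the timer expired, -EINTR if a signal arrived first.
	 */
	return schedule_hrtimeout_range(&expires, 100 * NSEC_PER_USEC,
					HRTIMER_MODE_REL);
}
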
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 03808ed342a6..7a56b22e0602 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -40,23 +40,29 @@
40#include <linux/percpu.h> 40#include <linux/percpu.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/slab.h>
43#include <linux/cpu.h> 44#include <linux/cpu.h>
44#include <linux/smp.h> 45#include <linux/smp.h>
45 46
46#include <linux/hw_breakpoint.h> 47#include <linux/hw_breakpoint.h>
47 48
49
48/* 50/*
49 * Constraints data 51 * Constraints data
50 */ 52 */
51 53
52/* Number of pinned cpu breakpoints in a cpu */ 54/* Number of pinned cpu breakpoints in a cpu */
53static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); 55static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
54 56
55/* Number of pinned task breakpoints in a cpu */ 57/* Number of pinned task breakpoints in a cpu */
56static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); 58static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
57 59
58/* Number of non-pinned cpu/task breakpoints in a cpu */ 60/* Number of non-pinned cpu/task breakpoints in a cpu */
59static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); 61static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
62
63static int nr_slots[TYPE_MAX];
64
65static int constraints_initialized;
60 66
61/* Gather the number of total pinned and un-pinned bp in a cpuset */ 67/* Gather the number of total pinned and un-pinned bp in a cpuset */
62struct bp_busy_slots { 68struct bp_busy_slots {
@@ -67,16 +73,29 @@ struct bp_busy_slots {
67/* Serialize accesses to the above constraints */ 73/* Serialize accesses to the above constraints */
68static DEFINE_MUTEX(nr_bp_mutex); 74static DEFINE_MUTEX(nr_bp_mutex);
69 75
76__weak int hw_breakpoint_weight(struct perf_event *bp)
77{
78 return 1;
79}
80
81static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
82{
83 if (bp->attr.bp_type & HW_BREAKPOINT_RW)
84 return TYPE_DATA;
85
86 return TYPE_INST;
87}
88
70/* 89/*
71 * Report the maximum number of pinned breakpoints a task 90 * Report the maximum number of pinned breakpoints a task
72 * have in this cpu 91 * have in this cpu
73 */ 92 */
74static unsigned int max_task_bp_pinned(int cpu) 93static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
75{ 94{
76 int i; 95 int i;
77 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 96 unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
78 97
79 for (i = HBP_NUM -1; i >= 0; i--) { 98 for (i = nr_slots[type] - 1; i >= 0; i--) {
80 if (tsk_pinned[i] > 0) 99 if (tsk_pinned[i] > 0)
81 return i + 1; 100 return i + 1;
82 } 101 }
@@ -84,7 +103,7 @@ static unsigned int max_task_bp_pinned(int cpu)
84 return 0; 103 return 0;
85} 104}
86 105
87static int task_bp_pinned(struct task_struct *tsk) 106static int task_bp_pinned(struct task_struct *tsk, enum bp_type_idx type)
88{ 107{
89 struct perf_event_context *ctx = tsk->perf_event_ctxp; 108 struct perf_event_context *ctx = tsk->perf_event_ctxp;
90 struct list_head *list; 109 struct list_head *list;
@@ -105,7 +124,8 @@ static int task_bp_pinned(struct task_struct *tsk)
105 */ 124 */
106 list_for_each_entry(bp, list, event_entry) { 125 list_for_each_entry(bp, list, event_entry) {
107 if (bp->attr.type == PERF_TYPE_BREAKPOINT) 126 if (bp->attr.type == PERF_TYPE_BREAKPOINT)
108 count++; 127 if (find_slot_idx(bp) == type)
128 count += hw_breakpoint_weight(bp);
109 } 129 }
110 130
111 raw_spin_unlock_irqrestore(&ctx->lock, flags); 131 raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -118,18 +138,19 @@ static int task_bp_pinned(struct task_struct *tsk)
118 * a given cpu (cpu > -1) or in all of them (cpu = -1). 138 * a given cpu (cpu > -1) or in all of them (cpu = -1).
119 */ 139 */
120static void 140static void
121fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) 141fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
142 enum bp_type_idx type)
122{ 143{
123 int cpu = bp->cpu; 144 int cpu = bp->cpu;
124 struct task_struct *tsk = bp->ctx->task; 145 struct task_struct *tsk = bp->ctx->task;
125 146
126 if (cpu >= 0) { 147 if (cpu >= 0) {
127 slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); 148 slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
128 if (!tsk) 149 if (!tsk)
129 slots->pinned += max_task_bp_pinned(cpu); 150 slots->pinned += max_task_bp_pinned(cpu, type);
130 else 151 else
131 slots->pinned += task_bp_pinned(tsk); 152 slots->pinned += task_bp_pinned(tsk, type);
132 slots->flexible = per_cpu(nr_bp_flexible, cpu); 153 slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
133 154
134 return; 155 return;
135 } 156 }
@@ -137,16 +158,16 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
137 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
138 unsigned int nr; 159 unsigned int nr;
139 160
140 nr = per_cpu(nr_cpu_bp_pinned, cpu); 161 nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
141 if (!tsk) 162 if (!tsk)
142 nr += max_task_bp_pinned(cpu); 163 nr += max_task_bp_pinned(cpu, type);
143 else 164 else
144 nr += task_bp_pinned(tsk); 165 nr += task_bp_pinned(tsk, type);
145 166
146 if (nr > slots->pinned) 167 if (nr > slots->pinned)
147 slots->pinned = nr; 168 slots->pinned = nr;
148 169
149 nr = per_cpu(nr_bp_flexible, cpu); 170 nr = per_cpu(nr_bp_flexible[type], cpu);
150 171
151 if (nr > slots->flexible) 172 if (nr > slots->flexible)
152 slots->flexible = nr; 173 slots->flexible = nr;
@@ -154,31 +175,49 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp)
154} 175}
155 176
156/* 177/*
178 * For now, continue to consider flexible as pinned, until we can
179 * ensure no flexible event can ever be scheduled before a pinned event
180 * in a same cpu.
181 */
182static void
183fetch_this_slot(struct bp_busy_slots *slots, int weight)
184{
185 slots->pinned += weight;
186}
187
188/*
157 * Add a pinned breakpoint for the given task in our constraint table 189 * Add a pinned breakpoint for the given task in our constraint table
158 */ 190 */
159static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) 191static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable,
192 enum bp_type_idx type, int weight)
160{ 193{
161 unsigned int *tsk_pinned; 194 unsigned int *tsk_pinned;
162 int count = 0; 195 int old_count = 0;
196 int old_idx = 0;
197 int idx = 0;
163 198
164 count = task_bp_pinned(tsk); 199 old_count = task_bp_pinned(tsk, type);
200 old_idx = old_count - 1;
201 idx = old_idx + weight;
165 202
166 tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); 203 tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
167 if (enable) { 204 if (enable) {
168 tsk_pinned[count]++; 205 tsk_pinned[idx]++;
169 if (count > 0) 206 if (old_count > 0)
170 tsk_pinned[count-1]--; 207 tsk_pinned[old_idx]--;
171 } else { 208 } else {
172 tsk_pinned[count]--; 209 tsk_pinned[idx]--;
173 if (count > 0) 210 if (old_count > 0)
174 tsk_pinned[count-1]++; 211 tsk_pinned[old_idx]++;
175 } 212 }
176} 213}
177 214
178/* 215/*
179 * Add/remove the given breakpoint in our constraint table 216 * Add/remove the given breakpoint in our constraint table
180 */ 217 */
181static void toggle_bp_slot(struct perf_event *bp, bool enable) 218static void
219toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
220 int weight)
182{ 221{
183 int cpu = bp->cpu; 222 int cpu = bp->cpu;
184 struct task_struct *tsk = bp->ctx->task; 223 struct task_struct *tsk = bp->ctx->task;
@@ -186,20 +225,20 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
186 /* Pinned counter task profiling */ 225 /* Pinned counter task profiling */
187 if (tsk) { 226 if (tsk) {
188 if (cpu >= 0) { 227 if (cpu >= 0) {
189 toggle_bp_task_slot(tsk, cpu, enable); 228 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
190 return; 229 return;
191 } 230 }
192 231
193 for_each_online_cpu(cpu) 232 for_each_online_cpu(cpu)
194 toggle_bp_task_slot(tsk, cpu, enable); 233 toggle_bp_task_slot(tsk, cpu, enable, type, weight);
195 return; 234 return;
196 } 235 }
197 236
198 /* Pinned counter cpu profiling */ 237 /* Pinned counter cpu profiling */
199 if (enable) 238 if (enable)
200 per_cpu(nr_cpu_bp_pinned, bp->cpu)++; 239 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
201 else 240 else
202 per_cpu(nr_cpu_bp_pinned, bp->cpu)--; 241 per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
203} 242}
204 243
205/* 244/*
@@ -246,14 +285,29 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable)
246static int __reserve_bp_slot(struct perf_event *bp) 285static int __reserve_bp_slot(struct perf_event *bp)
247{ 286{
248 struct bp_busy_slots slots = {0}; 287 struct bp_busy_slots slots = {0};
288 enum bp_type_idx type;
289 int weight;
249 290
250 fetch_bp_busy_slots(&slots, bp); 291 /* We couldn't initialize breakpoint constraints on boot */
292 if (!constraints_initialized)
293 return -ENOMEM;
294
295 /* Basic checks */
296 if (bp->attr.bp_type == HW_BREAKPOINT_EMPTY ||
297 bp->attr.bp_type == HW_BREAKPOINT_INVALID)
298 return -EINVAL;
299
300 type = find_slot_idx(bp);
301 weight = hw_breakpoint_weight(bp);
302
303 fetch_bp_busy_slots(&slots, bp, type);
304 fetch_this_slot(&slots, weight);
251 305
252 /* Flexible counters need to keep at least one slot */ 306 /* Flexible counters need to keep at least one slot */
253 if (slots.pinned + (!!slots.flexible) == HBP_NUM) 307 if (slots.pinned + (!!slots.flexible) > nr_slots[type])
254 return -ENOSPC; 308 return -ENOSPC;
255 309
256 toggle_bp_slot(bp, true); 310 toggle_bp_slot(bp, true, type, weight);
257 311
258 return 0; 312 return 0;
259} 313}
@@ -273,7 +327,12 @@ int reserve_bp_slot(struct perf_event *bp)
273 327
274static void __release_bp_slot(struct perf_event *bp) 328static void __release_bp_slot(struct perf_event *bp)
275{ 329{
276 toggle_bp_slot(bp, false); 330 enum bp_type_idx type;
331 int weight;
332
333 type = find_slot_idx(bp);
334 weight = hw_breakpoint_weight(bp);
335 toggle_bp_slot(bp, false, type, weight);
277} 336}
278 337
279void release_bp_slot(struct perf_event *bp) 338void release_bp_slot(struct perf_event *bp)
@@ -308,6 +367,28 @@ int dbg_release_bp_slot(struct perf_event *bp)
308 return 0; 367 return 0;
309} 368}
310 369
370static int validate_hw_breakpoint(struct perf_event *bp)
371{
372 int ret;
373
374 ret = arch_validate_hwbkpt_settings(bp);
375 if (ret)
376 return ret;
377
378 if (arch_check_bp_in_kernelspace(bp)) {
379 if (bp->attr.exclude_kernel)
380 return -EINVAL;
381 /*
382 * Don't let unprivileged users set a breakpoint in the trap
383 * path to avoid trap recursion attacks.
384 */
385 if (!capable(CAP_SYS_ADMIN))
386 return -EPERM;
387 }
388
389 return 0;
390}
391
311int register_perf_hw_breakpoint(struct perf_event *bp) 392int register_perf_hw_breakpoint(struct perf_event *bp)
312{ 393{
313 int ret; 394 int ret;
@@ -316,17 +397,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
316 if (ret) 397 if (ret)
317 return ret; 398 return ret;
318 399
319 /* 400 ret = validate_hw_breakpoint(bp);
320 * Ptrace breakpoints can be temporary perf events only
321 * meant to reserve a slot. In this case, it is created disabled and
322 * we don't want to check the params right now (as we put a null addr)
323 * But perf tools create events as disabled and we want to check
324 * the params for them.
325 * This is a quick hack that will be removed soon, once we remove
326 * the tmp breakpoints from ptrace
327 */
328 if (!bp->attr.disabled || !bp->overflow_handler)
329 ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task);
330 401
331 /* if arch_validate_hwbkpt_settings() fails then release bp slot */ 402 /* if arch_validate_hwbkpt_settings() fails then release bp slot */
332 if (ret) 403 if (ret)
@@ -373,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
373 if (attr->disabled) 444 if (attr->disabled)
374 goto end; 445 goto end;
375 446
376 err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); 447 err = validate_hw_breakpoint(bp);
377 if (!err) 448 if (!err)
378 perf_event_enable(bp); 449 perf_event_enable(bp);
379 450
@@ -480,7 +551,36 @@ static struct notifier_block hw_breakpoint_exceptions_nb = {
480 551
481static int __init init_hw_breakpoint(void) 552static int __init init_hw_breakpoint(void)
482{ 553{
554 unsigned int **task_bp_pinned;
555 int cpu, err_cpu;
556 int i;
557
558 for (i = 0; i < TYPE_MAX; i++)
559 nr_slots[i] = hw_breakpoint_slots(i);
560
561 for_each_possible_cpu(cpu) {
562 for (i = 0; i < TYPE_MAX; i++) {
563 task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
564 *task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
565 GFP_KERNEL);
566 if (!*task_bp_pinned)
567 goto err_alloc;
568 }
569 }
570
571 constraints_initialized = 1;
572
483 return register_die_notifier(&hw_breakpoint_exceptions_nb); 573 return register_die_notifier(&hw_breakpoint_exceptions_nb);
574
575 err_alloc:
576 for_each_possible_cpu(err_cpu) {
577 if (err_cpu == cpu)
578 break;
579 for (i = 0; i < TYPE_MAX; i++)
580 kfree(per_cpu(nr_task_bp_pinned[i], cpu));
581 }
582
583 return -ENOMEM;
484} 584}
485core_initcall(init_hw_breakpoint); 585core_initcall(init_hw_breakpoint);
486 586
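
The reservation rule after this change is easiest to see in isolation: the request's weight is added to the busiest pinned count, one slot is held back whenever any flexible breakpoint exists, and the total must still fit in the per-type slot count. A standalone sketch of that check with made-up numbers; it mirrors the final test in __reserve_bp_slot() but is not kernel code.

#include <stdio.h>

static int reservation_fits(int pinned, int flexible, int weight, int nr_slots)
{
	pinned += weight;			/* fetch_this_slot() */
	return pinned + !!flexible <= nr_slots;	/* keep room for flexible */
}

int main(void)
{
	/* Say 4 data slots, 3 already pinned, one flexible user, weight 1. */
	printf("%d\n", reservation_fits(3, 1, 1, 4));	/* 0 -> -ENOSPC */
	printf("%d\n", reservation_fits(2, 1, 1, 4));	/* 1 -> reservation ok */
	return 0;
}
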
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 42ec11b2af8a..b7091d5ca2f8 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
359 if (desc->chip->ack) 359 if (desc->chip->ack)
360 desc->chip->ack(irq); 360 desc->chip->ack(irq);
361 } 361 }
362 desc->status |= IRQ_MASKED;
363}
364
365static inline void mask_irq(struct irq_desc *desc, int irq)
366{
367 if (desc->chip->mask) {
368 desc->chip->mask(irq);
369 desc->status |= IRQ_MASKED;
370 }
371}
372
373static inline void unmask_irq(struct irq_desc *desc, int irq)
374{
375 if (desc->chip->unmask) {
376 desc->chip->unmask(irq);
377 desc->status &= ~IRQ_MASKED;
378 }
362} 379}
363 380
364/* 381/*
@@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
484 raw_spin_lock(&desc->lock); 501 raw_spin_lock(&desc->lock);
485 desc->status &= ~IRQ_INPROGRESS; 502 desc->status &= ~IRQ_INPROGRESS;
486 503
487 if (unlikely(desc->status & IRQ_ONESHOT)) 504 if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
488 desc->status |= IRQ_MASKED; 505 unmask_irq(desc, irq);
489 else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
490 desc->chip->unmask(irq);
491out_unlock: 506out_unlock:
492 raw_spin_unlock(&desc->lock); 507 raw_spin_unlock(&desc->lock);
493} 508}
@@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
524 action = desc->action; 539 action = desc->action;
525 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 540 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
526 desc->status |= IRQ_PENDING; 541 desc->status |= IRQ_PENDING;
527 if (desc->chip->mask) 542 mask_irq(desc, irq);
528 desc->chip->mask(irq);
529 goto out; 543 goto out;
530 } 544 }
531 545
@@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
593 irqreturn_t action_ret; 607 irqreturn_t action_ret;
594 608
595 if (unlikely(!action)) { 609 if (unlikely(!action)) {
596 desc->chip->mask(irq); 610 mask_irq(desc, irq);
597 goto out_unlock; 611 goto out_unlock;
598 } 612 }
599 613
@@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
605 if (unlikely((desc->status & 619 if (unlikely((desc->status &
606 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == 620 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
607 (IRQ_PENDING | IRQ_MASKED))) { 621 (IRQ_PENDING | IRQ_MASKED))) {
608 desc->chip->unmask(irq); 622 unmask_irq(desc, irq);
609 desc->status &= ~IRQ_MASKED;
610 } 623 }
611 624
612 desc->status &= ~IRQ_PENDING; 625 desc->status &= ~IRQ_PENDING;
@@ -716,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
716 __set_irq_handler(irq, handle, 0, name); 729 __set_irq_handler(irq, handle, 0, name);
717} 730}
718 731
719void __init set_irq_noprobe(unsigned int irq) 732void set_irq_noprobe(unsigned int irq)
720{ 733{
721 struct irq_desc *desc = irq_to_desc(irq); 734 struct irq_desc *desc = irq_to_desc(irq);
722 unsigned long flags; 735 unsigned long flags;
@@ -731,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq)
731 raw_spin_unlock_irqrestore(&desc->lock, flags); 744 raw_spin_unlock_irqrestore(&desc->lock, flags);
732} 745}
733 746
734void __init set_irq_probe(unsigned int irq) 747void set_irq_probe(unsigned int irq)
735{ 748{
736 struct irq_desc *desc = irq_to_desc(irq); 749 struct irq_desc *desc = irq_to_desc(irq);
737 unsigned long flags; 750 unsigned long flags;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 76d5a671bfe1..27e5c6911223 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -370,9 +370,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
370 irqreturn_t ret, retval = IRQ_NONE; 370 irqreturn_t ret, retval = IRQ_NONE;
371 unsigned int status = 0; 371 unsigned int status = 0;
372 372
373 if (!(action->flags & IRQF_DISABLED))
374 local_irq_enable_in_hardirq();
375
376 do { 373 do {
377 trace_irq_handler_entry(irq, action); 374 trace_irq_handler_entry(irq, action);
378 ret = action->handler(irq, action->dev_id); 375 ret = action->handler(irq, action->dev_id);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index eb6078ca60c7..e1497481fe8a 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -138,6 +138,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
138 return 0; 138 return 0;
139} 139}
140 140
141int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
142{
143 struct irq_desc *desc = irq_to_desc(irq);
144 unsigned long flags;
145
146 if (!desc)
147 return -EINVAL;
148
149 raw_spin_lock_irqsave(&desc->lock, flags);
150 desc->affinity_hint = m;
151 raw_spin_unlock_irqrestore(&desc->lock, flags);
152
153 return 0;
154}
155EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
156
141#ifndef CONFIG_AUTO_IRQ_AFFINITY 157#ifndef CONFIG_AUTO_IRQ_AFFINITY
142/* 158/*
143 * Generic version of the affinity autoselector. 159 * Generic version of the affinity autoselector.
@@ -382,6 +398,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
382{ 398{
383 struct irq_desc *desc = irq_to_desc(irq); 399 struct irq_desc *desc = irq_to_desc(irq);
384 struct irqaction *action; 400 struct irqaction *action;
401 unsigned long flags;
385 402
386 if (!desc) 403 if (!desc)
387 return 0; 404 return 0;
@@ -389,11 +406,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
389 if (desc->status & IRQ_NOREQUEST) 406 if (desc->status & IRQ_NOREQUEST)
390 return 0; 407 return 0;
391 408
409 raw_spin_lock_irqsave(&desc->lock, flags);
392 action = desc->action; 410 action = desc->action;
393 if (action) 411 if (action)
394 if (irqflags & action->flags & IRQF_SHARED) 412 if (irqflags & action->flags & IRQF_SHARED)
395 action = NULL; 413 action = NULL;
396 414
415 raw_spin_unlock_irqrestore(&desc->lock, flags);
416
397 return !action; 417 return !action;
398} 418}
399 419
@@ -436,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
436 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ 456 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
437 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); 457 desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
438 desc->status |= flags; 458 desc->status |= flags;
459
460 if (chip != desc->chip)
461 irq_chip_set_defaults(desc->chip);
439 } 462 }
440 463
441 return ret; 464 return ret;
@@ -483,8 +506,26 @@ static int irq_wait_for_interrupt(struct irqaction *action)
483 */ 506 */
484static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) 507static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
485{ 508{
509again:
486 chip_bus_lock(irq, desc); 510 chip_bus_lock(irq, desc);
487 raw_spin_lock_irq(&desc->lock); 511 raw_spin_lock_irq(&desc->lock);
512
513 /*
514 * Implausible though it may be we need to protect us against
515 * the following scenario:
516 *
517 * The thread is faster done than the hard interrupt handler
518 * on the other CPU. If we unmask the irq line then the
519 * interrupt can come in again and masks the line, leaves due
520 * to IRQ_INPROGRESS and the irq line is masked forever.
521 */
522 if (unlikely(desc->status & IRQ_INPROGRESS)) {
523 raw_spin_unlock_irq(&desc->lock);
524 chip_bus_sync_unlock(irq, desc);
525 cpu_relax();
526 goto again;
527 }
528
488 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { 529 if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
489 desc->status &= ~IRQ_MASKED; 530 desc->status &= ~IRQ_MASKED;
490 desc->chip->unmask(irq); 531 desc->chip->unmask(irq);
@@ -884,6 +925,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
884 desc->chip->disable(irq); 925 desc->chip->disable(irq);
885 } 926 }
886 927
928#ifdef CONFIG_SMP
929 /* make sure affinity_hint is cleaned up */
930 if (WARN_ON_ONCE(desc->affinity_hint))
931 desc->affinity_hint = NULL;
932#endif
933
887 raw_spin_unlock_irqrestore(&desc->lock, flags); 934 raw_spin_unlock_irqrestore(&desc->lock, flags);
888 935
889 unregister_handler_proc(irq, action); 936 unregister_handler_proc(irq, action);
@@ -995,7 +1042,6 @@ EXPORT_SYMBOL(free_irq);
995 * Flags: 1042 * Flags:
996 * 1043 *
997 * IRQF_SHARED Interrupt is shared 1044 * IRQF_SHARED Interrupt is shared
998 * IRQF_DISABLED Disable local interrupts while processing
999 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 1045 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
1000 * IRQF_TRIGGER_* Specify active edge(s) or level 1046 * IRQF_TRIGGER_* Specify active edge(s) or level
1001 * 1047 *
@@ -1009,25 +1055,6 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1009 int retval; 1055 int retval;
1010 1056
1011 /* 1057 /*
1012 * handle_IRQ_event() always ignores IRQF_DISABLED except for
1013 * the _first_ irqaction (sigh). That can cause oopsing, but
1014 * the behavior is classified as "will not fix" so we need to
1015 * start nudging drivers away from using that idiom.
1016 */
1017 if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
1018 (IRQF_SHARED|IRQF_DISABLED)) {
1019 pr_warning(
1020 "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
1021 irq, devname);
1022 }
1023
1024#ifdef CONFIG_LOCKDEP
1025 /*
1026 * Lockdep wants atomic interrupt handlers:
1027 */
1028 irqflags |= IRQF_DISABLED;
1029#endif
1030 /*
1031 * Sanity-check: shared interrupts must pass in a real dev-ID, 1058 * Sanity-check: shared interrupts must pass in a real dev-ID,
1032 * otherwise we'll have trouble later trying to figure out 1059 * otherwise we'll have trouble later trying to figure out
1033 * which interrupt is which (messes up the interrupt freeing 1060 * which interrupt is which (messes up the interrupt freeing
@@ -1088,3 +1115,40 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1088 return retval; 1115 return retval;
1089} 1116}
1090EXPORT_SYMBOL(request_threaded_irq); 1117EXPORT_SYMBOL(request_threaded_irq);
1118
1119/**
1120 * request_any_context_irq - allocate an interrupt line
1121 * @irq: Interrupt line to allocate
1122 * @handler: Function to be called when the IRQ occurs.
1123 * Threaded handler for threaded interrupts.
1124 * @flags: Interrupt type flags
1125 * @name: An ascii name for the claiming device
1126 * @dev_id: A cookie passed back to the handler function
1127 *
1128 * This call allocates interrupt resources and enables the
1129 * interrupt line and IRQ handling. It selects either a
1130 * hardirq or threaded handling method depending on the
1131 * context.
1132 *
1133 * On failure, it returns a negative value. On success,
1134 * it returns either IRQC_IS_HARDIRQ or IRQC_IS_NESTED.
1135 */
1136int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1137 unsigned long flags, const char *name, void *dev_id)
1138{
1139 struct irq_desc *desc = irq_to_desc(irq);
1140 int ret;
1141
1142 if (!desc)
1143 return -EINVAL;
1144
1145 if (desc->status & IRQ_NESTED_THREAD) {
1146 ret = request_threaded_irq(irq, NULL, handler,
1147 flags, name, dev_id);
1148 return !ret ? IRQC_IS_NESTED : ret;
1149 }
1150
1151 ret = request_irq(irq, handler, flags, name, dev_id);
1152 return !ret ? IRQC_IS_HARDIRQ : ret;
1153}
1154EXPORT_SYMBOL_GPL(request_any_context_irq);
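
A hedged driver-side sketch of the new helper (the device name, handler and trigger flag are hypothetical): the caller treats any negative return as failure and otherwise does not need to care whether it ended up with a hardirq or a nested-threaded handler.

#include <linux/interrupt.h>

static irqreturn_t mydev_irq(int irq, void *dev_id)	/* hypothetical */
{
	/* ... acknowledge the device ... */
	return IRQ_HANDLED;
}

static int mydev_setup_irq(unsigned int irq, void *dev)
{
	int ret;

	ret = request_any_context_irq(irq, mydev_irq, IRQF_TRIGGER_LOW,
				      "mydev", dev);
	if (ret < 0)
		return ret;		/* could not allocate the line */

	/* ret is IRQC_IS_HARDIRQ or IRQC_IS_NESTED; either way it is live. */
	return 0;
}
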
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 963559dbd858..65d3845665ac 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -6,6 +6,7 @@
6 */ 6 */
7 7
8#include <linux/irq.h> 8#include <linux/irq.h>
9#include <linux/slab.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6f50eccc79c0..09a2ee540bd2 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/gfp.h>
10#include <linux/proc_fs.h> 11#include <linux/proc_fs.h>
11#include <linux/seq_file.h> 12#include <linux/seq_file.h>
12#include <linux/interrupt.h> 13#include <linux/interrupt.h>
@@ -31,6 +32,27 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
31 return 0; 32 return 0;
32} 33}
33 34
35static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
36{
37 struct irq_desc *desc = irq_to_desc((long)m->private);
38 unsigned long flags;
39 cpumask_var_t mask;
40
41 if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
42 return -ENOMEM;
43
44 raw_spin_lock_irqsave(&desc->lock, flags);
45 if (desc->affinity_hint)
46 cpumask_copy(mask, desc->affinity_hint);
47 raw_spin_unlock_irqrestore(&desc->lock, flags);
48
49 seq_cpumask(m, mask);
50 seq_putc(m, '\n');
51 free_cpumask_var(mask);
52
53 return 0;
54}
55
34#ifndef is_affinity_mask_valid 56#ifndef is_affinity_mask_valid
35#define is_affinity_mask_valid(val) 1 57#define is_affinity_mask_valid(val) 1
36#endif 58#endif
@@ -83,6 +105,11 @@ static int irq_affinity_proc_open(struct inode *inode, struct file *file)
83 return single_open(file, irq_affinity_proc_show, PDE(inode)->data); 105 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
84} 106}
85 107
108static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
109{
110 return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
111}
112
86static const struct file_operations irq_affinity_proc_fops = { 113static const struct file_operations irq_affinity_proc_fops = {
87 .open = irq_affinity_proc_open, 114 .open = irq_affinity_proc_open,
88 .read = seq_read, 115 .read = seq_read,
@@ -91,6 +118,13 @@ static const struct file_operations irq_affinity_proc_fops = {
91 .write = irq_affinity_proc_write, 118 .write = irq_affinity_proc_write,
92}; 119};
93 120
121static const struct file_operations irq_affinity_hint_proc_fops = {
122 .open = irq_affinity_hint_proc_open,
123 .read = seq_read,
124 .llseek = seq_lseek,
125 .release = single_release,
126};
127
94static int default_affinity_show(struct seq_file *m, void *v) 128static int default_affinity_show(struct seq_file *m, void *v)
95{ 129{
96 seq_cpumask(m, irq_default_affinity); 130 seq_cpumask(m, irq_default_affinity);
@@ -146,6 +180,26 @@ static const struct file_operations default_affinity_proc_fops = {
146 .release = single_release, 180 .release = single_release,
147 .write = default_affinity_write, 181 .write = default_affinity_write,
148}; 182};
183
184static int irq_node_proc_show(struct seq_file *m, void *v)
185{
186 struct irq_desc *desc = irq_to_desc((long) m->private);
187
188 seq_printf(m, "%d\n", desc->node);
189 return 0;
190}
191
192static int irq_node_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, irq_node_proc_show, PDE(inode)->data);
195}
196
197static const struct file_operations irq_node_proc_fops = {
198 .open = irq_node_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
149#endif 203#endif
150 204
151static int irq_spurious_proc_show(struct seq_file *m, void *v) 205static int irq_spurious_proc_show(struct seq_file *m, void *v)
@@ -230,6 +284,13 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
230 /* create /proc/irq/<irq>/smp_affinity */ 284 /* create /proc/irq/<irq>/smp_affinity */
231 proc_create_data("smp_affinity", 0600, desc->dir, 285 proc_create_data("smp_affinity", 0600, desc->dir,
232 &irq_affinity_proc_fops, (void *)(long)irq); 286 &irq_affinity_proc_fops, (void *)(long)irq);
287
288 /* create /proc/irq/<irq>/affinity_hint */
289 proc_create_data("affinity_hint", 0400, desc->dir,
290 &irq_affinity_hint_proc_fops, (void *)(long)irq);
291
292 proc_create_data("node", 0444, desc->dir,
293 &irq_node_proc_fops, (void *)(long)irq);
233#endif 294#endif
234 295
235 proc_create_data("spurious", 0444, desc->dir, 296 proc_create_data("spurious", 0444, desc->dir,
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 8e5288a8a355..6f6d091b5757 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -16,11 +16,13 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/seq_file.h> 17#include <linux/seq_file.h>
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/kdb.h>
19#include <linux/err.h> 20#include <linux/err.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/sched.h> /* for cond_resched */ 22#include <linux/sched.h> /* for cond_resched */
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/ctype.h> 24#include <linux/ctype.h>
25#include <linux/slab.h>
24 26
25#include <asm/sections.h> 27#include <asm/sections.h>
26 28
@@ -515,6 +517,26 @@ static int kallsyms_open(struct inode *inode, struct file *file)
515 return ret; 517 return ret;
516} 518}
517 519
520#ifdef CONFIG_KGDB_KDB
521const char *kdb_walk_kallsyms(loff_t *pos)
522{
523 static struct kallsym_iter kdb_walk_kallsyms_iter;
524 if (*pos == 0) {
525 memset(&kdb_walk_kallsyms_iter, 0,
526 sizeof(kdb_walk_kallsyms_iter));
527 reset_iter(&kdb_walk_kallsyms_iter, 0);
528 }
529 while (1) {
530 if (!update_iter(&kdb_walk_kallsyms_iter, *pos))
531 return NULL;
532 ++*pos;
533 /* Some debugging symbols have no name. Ignore them. */
534 if (kdb_walk_kallsyms_iter.name[0])
535 return kdb_walk_kallsyms_iter.name;
536 }
537}
538#endif /* CONFIG_KGDB_KDB */
539
518static const struct file_operations kallsyms_operations = { 540static const struct file_operations kallsyms_operations = {
519 .open = kallsyms_open, 541 .open = kallsyms_open,
520 .read = seq_read, 542 .read = seq_read,
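The kdb_walk_kallsyms() iterator exported above is consumed by the kdb symbol-matching code with a simple cursor convention: start with *pos == 0 and keep calling until NULL comes back. A hedged, simplified caller is sketched below; it is not the actual kdb implementation, just an illustration of the contract:

#include <linux/types.h>
#include <linux/string.h>

extern const char *kdb_walk_kallsyms(loff_t *pos);

/* Illustrative helper: return 1 if any kernel symbol name starts
 * with 'prefix'.  Each call to kdb_walk_kallsyms() advances the
 * cursor and returns the next non-empty symbol name, or NULL when
 * the table is exhausted. */
static int symbol_prefix_exists(const char *prefix)
{
	loff_t pos = 0;
	const char *name;
	size_t len = strlen(prefix);

	while ((name = kdb_walk_kallsyms(&pos)) != NULL) {
		if (!strncmp(name, prefix, len))
			return 1;
	}
	return 0;
}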
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 87ebe8adc474..131b1703936f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs)
1089 1089
1090size_t crash_get_memory_size(void) 1090size_t crash_get_memory_size(void)
1091{ 1091{
1092 size_t size; 1092 size_t size = 0;
1093 mutex_lock(&kexec_mutex); 1093 mutex_lock(&kexec_mutex);
1094 size = crashk_res.end - crashk_res.start + 1; 1094 if (crashk_res.end != crashk_res.start)
1095 size = crashk_res.end - crashk_res.start + 1;
1095 mutex_unlock(&kexec_mutex); 1096 mutex_unlock(&kexec_mutex);
1096 return size; 1097 return size;
1097} 1098}
@@ -1134,11 +1135,9 @@ int crash_shrink_memory(unsigned long new_size)
1134 1135
1135 free_reserved_phys_range(end, crashk_res.end); 1136 free_reserved_phys_range(end, crashk_res.end);
1136 1137
1137 if (start == end) { 1138 if ((start == end) && (crashk_res.parent != NULL))
1138 crashk_res.end = end;
1139 release_resource(&crashk_res); 1139 release_resource(&crashk_res);
1140 } else 1140 crashk_res.end = end - 1;
1141 crashk_res.end = end - 1;
1142 1141
1143unlock: 1142unlock:
1144 mutex_unlock(&kexec_mutex); 1143 mutex_unlock(&kexec_mutex);
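Both hunks above hinge on the resource convention that a struct resource covers the inclusive range [start, end], so its size is end - start + 1; with no crash kernel reserved, start == end == 0 and the unguarded formula would report one byte, which the added check now maps to zero. A small stand-alone illustration of that arithmetic (struct res is a stand-in for struct resource):

#include <stdio.h>

struct res { unsigned long start, end; };	/* stand-in for struct resource */

static unsigned long res_size(const struct res *r)
{
	if (r->end == r->start)			/* nothing reserved */
		return 0;
	return r->end - r->start + 1;		/* inclusive range */
}

int main(void)
{
	struct res none   = { 0, 0 };
	struct res crashk = { 0x04000000UL, 0x07ffffffUL };	/* 64 MiB window */

	printf("unreserved: %lu bytes\n", res_size(&none));
	printf("reserved:   %lu bytes\n", res_size(&crashk));
	return 0;
}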
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
deleted file mode 100644
index 761fdd2b3034..000000000000
--- a/kernel/kgdb.c
+++ /dev/null
@@ -1,1763 +0,0 @@
1/*
2 * KGDB stub.
3 *
4 * Maintainer: Jason Wessel <jason.wessel@windriver.com>
5 *
6 * Copyright (C) 2000-2001 VERITAS Software Corporation.
7 * Copyright (C) 2002-2004 Timesys Corporation
8 * Copyright (C) 2003-2004 Amit S. Kale <amitkale@linsyssoft.com>
9 * Copyright (C) 2004 Pavel Machek <pavel@suse.cz>
10 * Copyright (C) 2004-2006 Tom Rini <trini@kernel.crashing.org>
11 * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd.
12 * Copyright (C) 2005-2008 Wind River Systems, Inc.
13 * Copyright (C) 2007 MontaVista Software, Inc.
14 * Copyright (C) 2008 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
15 *
16 * Contributors at various stages not listed above:
17 * Jason Wessel ( jason.wessel@windriver.com )
18 * George Anzinger <george@mvista.com>
19 * Anurekh Saxena (anurekh.saxena@timesys.com)
20 * Lake Stevens Instrument Division (Glenn Engel)
21 * Jim Kingdon, Cygnus Support.
22 *
23 * Original KGDB stub: David Grothe <dave@gcom.com>,
24 * Tigran Aivazian <tigran@sco.com>
25 *
26 * This file is licensed under the terms of the GNU General Public License
27 * version 2. This program is licensed "as is" without any warranty of any
28 * kind, whether express or implied.
29 */
30#include <linux/pid_namespace.h>
31#include <linux/clocksource.h>
32#include <linux/interrupt.h>
33#include <linux/spinlock.h>
34#include <linux/console.h>
35#include <linux/threads.h>
36#include <linux/uaccess.h>
37#include <linux/kernel.h>
38#include <linux/module.h>
39#include <linux/ptrace.h>
40#include <linux/reboot.h>
41#include <linux/string.h>
42#include <linux/delay.h>
43#include <linux/sched.h>
44#include <linux/sysrq.h>
45#include <linux/init.h>
46#include <linux/kgdb.h>
47#include <linux/pid.h>
48#include <linux/smp.h>
49#include <linux/mm.h>
50
51#include <asm/cacheflush.h>
52#include <asm/byteorder.h>
53#include <asm/atomic.h>
54#include <asm/system.h>
55#include <asm/unaligned.h>
56
57static int kgdb_break_asap;
58
59#define KGDB_MAX_THREAD_QUERY 17
60struct kgdb_state {
61 int ex_vector;
62 int signo;
63 int err_code;
64 int cpu;
65 int pass_exception;
66 unsigned long thr_query;
67 unsigned long threadid;
68 long kgdb_usethreadid;
69 struct pt_regs *linux_regs;
70};
71
72static struct debuggerinfo_struct {
73 void *debuggerinfo;
74 struct task_struct *task;
75} kgdb_info[NR_CPUS];
76
77/**
78 * kgdb_connected - Is a host GDB connected to us?
79 */
80int kgdb_connected;
81EXPORT_SYMBOL_GPL(kgdb_connected);
82
83/* All the KGDB handlers are installed */
84static int kgdb_io_module_registered;
85
86/* Guard for recursive entry */
87static int exception_level;
88
89static struct kgdb_io *kgdb_io_ops;
90static DEFINE_SPINLOCK(kgdb_registration_lock);
91
92/* kgdb console driver is loaded */
93static int kgdb_con_registered;
94/* determine if kgdb console output should be used */
95static int kgdb_use_con;
96
97static int __init opt_kgdb_con(char *str)
98{
99 kgdb_use_con = 1;
100 return 0;
101}
102
103early_param("kgdbcon", opt_kgdb_con);
104
105module_param(kgdb_use_con, int, 0644);
106
107/*
108 * Holds information about breakpoints in a kernel. These breakpoints are
109 * added and removed by gdb.
110 */
111static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
112 [0 ... KGDB_MAX_BREAKPOINTS-1] = { .state = BP_UNDEFINED }
113};
114
115/*
116 * The CPU# of the active CPU, or -1 if none:
117 */
118atomic_t kgdb_active = ATOMIC_INIT(-1);
119
120/*
121 * We use NR_CPUs not PERCPU, in case kgdb is used to debug early
122 * bootup code (which might not have percpu set up yet):
123 */
124static atomic_t passive_cpu_wait[NR_CPUS];
125static atomic_t cpu_in_kgdb[NR_CPUS];
126atomic_t kgdb_setting_breakpoint;
127
128struct task_struct *kgdb_usethread;
129struct task_struct *kgdb_contthread;
130
131int kgdb_single_step;
132pid_t kgdb_sstep_pid;
133
134/* Our I/O buffers. */
135static char remcom_in_buffer[BUFMAX];
136static char remcom_out_buffer[BUFMAX];
137
138/* Storage for the registers, in GDB format. */
139static unsigned long gdb_regs[(NUMREGBYTES +
140 sizeof(unsigned long) - 1) /
141 sizeof(unsigned long)];
142
143/* to keep track of the CPU which is doing the single stepping*/
144atomic_t kgdb_cpu_doing_single_step = ATOMIC_INIT(-1);
145
146/*
147 * If you are debugging a problem where roundup (the collection of
148 * all other CPUs) is a problem [this should be extremely rare],
149 * then use the nokgdbroundup option to avoid roundup. In that case
150 * the other CPUs might interfere with your debugging context, so
151 * use this with care:
152 */
153static int kgdb_do_roundup = 1;
154
155static int __init opt_nokgdbroundup(char *str)
156{
157 kgdb_do_roundup = 0;
158
159 return 0;
160}
161
162early_param("nokgdbroundup", opt_nokgdbroundup);
163
164/*
165 * Finally, some KGDB code :-)
166 */
167
168/*
169 * Weak aliases for breakpoint management,
170 * can be overridden by architectures when needed:
171 */
172int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
173{
174 int err;
175
176 err = probe_kernel_read(saved_instr, (char *)addr, BREAK_INSTR_SIZE);
177 if (err)
178 return err;
179
180 return probe_kernel_write((char *)addr, arch_kgdb_ops.gdb_bpt_instr,
181 BREAK_INSTR_SIZE);
182}
183
184int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
185{
186 return probe_kernel_write((char *)addr,
187 (char *)bundle, BREAK_INSTR_SIZE);
188}
189
190int __weak kgdb_validate_break_address(unsigned long addr)
191{
192 char tmp_variable[BREAK_INSTR_SIZE];
193 int err;
194 /* Validate setting the breakpoint and then removing it. If the
195 * remove fails, the kernel needs to emit a bad message because we
196 * are in deep trouble, not being able to put things back the way we
197 * found them.
198 */
199 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
200 if (err)
201 return err;
202 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
203 if (err)
204 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
205 "memory destroyed at: %lx", addr);
206 return err;
207}
208
209unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
210{
211 return instruction_pointer(regs);
212}
213
214int __weak kgdb_arch_init(void)
215{
216 return 0;
217}
218
219int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
220{
221 return 0;
222}
223
224void __weak
225kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
226{
227 return;
228}
229
230/**
231 * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
232 * @regs: Current &struct pt_regs.
233 *
234 * This function will be called if the particular architecture must
235 * disable hardware debugging while it is processing gdb packets or
236 * handling an exception.
237 */
238void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
239{
240}
241
242/*
243 * GDB remote protocol parser:
244 */
245
246static int hex(char ch)
247{
248 if ((ch >= 'a') && (ch <= 'f'))
249 return ch - 'a' + 10;
250 if ((ch >= '0') && (ch <= '9'))
251 return ch - '0';
252 if ((ch >= 'A') && (ch <= 'F'))
253 return ch - 'A' + 10;
254 return -1;
255}
256
257/* scan for the sequence $<data>#<checksum> */
258static void get_packet(char *buffer)
259{
260 unsigned char checksum;
261 unsigned char xmitcsum;
262 int count;
263 char ch;
264
265 do {
266 /*
267 * Spin and wait around for the start character, ignore all
268 * other characters:
269 */
270 while ((ch = (kgdb_io_ops->read_char())) != '$')
271 /* nothing */;
272
273 kgdb_connected = 1;
274 checksum = 0;
275 xmitcsum = -1;
276
277 count = 0;
278
279 /*
280 * now, read until a # or end of buffer is found:
281 */
282 while (count < (BUFMAX - 1)) {
283 ch = kgdb_io_ops->read_char();
284 if (ch == '#')
285 break;
286 checksum = checksum + ch;
287 buffer[count] = ch;
288 count = count + 1;
289 }
290 buffer[count] = 0;
291
292 if (ch == '#') {
293 xmitcsum = hex(kgdb_io_ops->read_char()) << 4;
294 xmitcsum += hex(kgdb_io_ops->read_char());
295
296 if (checksum != xmitcsum)
297 /* failed checksum */
298 kgdb_io_ops->write_char('-');
299 else
300 /* successful transfer */
301 kgdb_io_ops->write_char('+');
302 if (kgdb_io_ops->flush)
303 kgdb_io_ops->flush();
304 }
305 } while (checksum != xmitcsum);
306}
307
308/*
309 * Send the packet in buffer.
310 * Check for gdb connection if asked for.
311 */
312static void put_packet(char *buffer)
313{
314 unsigned char checksum;
315 int count;
316 char ch;
317
318 /*
319 * $<packet info>#<checksum>.
320 */
321 while (1) {
322 kgdb_io_ops->write_char('$');
323 checksum = 0;
324 count = 0;
325
326 while ((ch = buffer[count])) {
327 kgdb_io_ops->write_char(ch);
328 checksum += ch;
329 count++;
330 }
331
332 kgdb_io_ops->write_char('#');
333 kgdb_io_ops->write_char(hex_asc_hi(checksum));
334 kgdb_io_ops->write_char(hex_asc_lo(checksum));
335 if (kgdb_io_ops->flush)
336 kgdb_io_ops->flush();
337
338 /* Now see what we get in reply. */
339 ch = kgdb_io_ops->read_char();
340
341 if (ch == 3)
342 ch = kgdb_io_ops->read_char();
343
344 /* If we get an ACK, we are done. */
345 if (ch == '+')
346 return;
347
348 /*
349 * If we get the start of another packet, this means
350 * that GDB is attempting to reconnect. We will NAK
351 * the packet being sent, and stop trying to send this
352 * packet.
353 */
354 if (ch == '$') {
355 kgdb_io_ops->write_char('-');
356 if (kgdb_io_ops->flush)
357 kgdb_io_ops->flush();
358 return;
359 }
360 }
361}
362
363/*
364 * Convert the memory pointed to by mem into hex, placing result in buf.
365 * Return a pointer to the last char put in buf (null). May return an error.
366 */
367int kgdb_mem2hex(char *mem, char *buf, int count)
368{
369 char *tmp;
370 int err;
371
372 /*
373 * We use the upper half of buf as an intermediate buffer for the
374 * raw memory copy. Hex conversion will work against this one.
375 */
376 tmp = buf + count;
377
378 err = probe_kernel_read(tmp, mem, count);
379 if (!err) {
380 while (count > 0) {
381 buf = pack_hex_byte(buf, *tmp);
382 tmp++;
383 count--;
384 }
385
386 *buf = 0;
387 }
388
389 return err;
390}
391
392/*
393 * Copy the binary array pointed to by buf into mem. Fix $, #, and
394 * 0x7d escaped with 0x7d. Return a pointer to the character after
395 * the last byte written.
396 */
397static int kgdb_ebin2mem(char *buf, char *mem, int count)
398{
399 int err = 0;
400 char c;
401
402 while (count-- > 0) {
403 c = *buf++;
404 if (c == 0x7d)
405 c = *buf++ ^ 0x20;
406
407 err = probe_kernel_write(mem, &c, 1);
408 if (err)
409 break;
410
411 mem++;
412 }
413
414 return err;
415}
416
417/*
418 * Convert the hex array pointed to by buf into binary to be placed in mem.
419 * Return a pointer to the character AFTER the last byte written.
420 * May return an error.
421 */
422int kgdb_hex2mem(char *buf, char *mem, int count)
423{
424 char *tmp_raw;
425 char *tmp_hex;
426
427 /*
428 * We use the upper half of buf as an intermediate buffer for the
429 * raw memory that is converted from hex.
430 */
431 tmp_raw = buf + count * 2;
432
433 tmp_hex = tmp_raw - 1;
434 while (tmp_hex >= buf) {
435 tmp_raw--;
436 *tmp_raw = hex(*tmp_hex--);
437 *tmp_raw |= hex(*tmp_hex--) << 4;
438 }
439
440 return probe_kernel_write(mem, tmp_raw, count);
441}
442
443/*
444 * While we find nice hex chars, build a long_val.
445 * Return number of chars processed.
446 */
447int kgdb_hex2long(char **ptr, unsigned long *long_val)
448{
449 int hex_val;
450 int num = 0;
451 int negate = 0;
452
453 *long_val = 0;
454
455 if (**ptr == '-') {
456 negate = 1;
457 (*ptr)++;
458 }
459 while (**ptr) {
460 hex_val = hex(**ptr);
461 if (hex_val < 0)
462 break;
463
464 *long_val = (*long_val << 4) | hex_val;
465 num++;
466 (*ptr)++;
467 }
468
469 if (negate)
470 *long_val = -*long_val;
471
472 return num;
473}
474
475/* Write memory due to an 'M' or 'X' packet. */
476static int write_mem_msg(int binary)
477{
478 char *ptr = &remcom_in_buffer[1];
479 unsigned long addr;
480 unsigned long length;
481 int err;
482
483 if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' &&
484 kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') {
485 if (binary)
486 err = kgdb_ebin2mem(ptr, (char *)addr, length);
487 else
488 err = kgdb_hex2mem(ptr, (char *)addr, length);
489 if (err)
490 return err;
491 if (CACHE_FLUSH_IS_SAFE)
492 flush_icache_range(addr, addr + length);
493 return 0;
494 }
495
496 return -EINVAL;
497}
498
499static void error_packet(char *pkt, int error)
500{
501 error = -error;
502 pkt[0] = 'E';
503 pkt[1] = hex_asc[(error / 10)];
504 pkt[2] = hex_asc[(error % 10)];
505 pkt[3] = '\0';
506}
507
508/*
509 * Thread ID accessors. We represent a flat TID space to GDB, where
510 * the per CPU idle threads (which under Linux all have PID 0) are
511 * remapped to negative TIDs.
512 */
513
514#define BUF_THREAD_ID_SIZE 16
515
516static char *pack_threadid(char *pkt, unsigned char *id)
517{
518 char *limit;
519
520 limit = pkt + BUF_THREAD_ID_SIZE;
521 while (pkt < limit)
522 pkt = pack_hex_byte(pkt, *id++);
523
524 return pkt;
525}
526
527static void int_to_threadref(unsigned char *id, int value)
528{
529 unsigned char *scan;
530 int i = 4;
531
532 scan = (unsigned char *)id;
533 while (i--)
534 *scan++ = 0;
535 put_unaligned_be32(value, scan);
536}
537
538static struct task_struct *getthread(struct pt_regs *regs, int tid)
539{
540 /*
541 * Non-positive TIDs are remapped to the cpu shadow information
542 */
543 if (tid == 0 || tid == -1)
544 tid = -atomic_read(&kgdb_active) - 2;
545 if (tid < -1 && tid > -NR_CPUS - 2) {
546 if (kgdb_info[-tid - 2].task)
547 return kgdb_info[-tid - 2].task;
548 else
549 return idle_task(-tid - 2);
550 }
551 if (tid <= 0) {
552 printk(KERN_ERR "KGDB: Internal thread select error\n");
553 dump_stack();
554 return NULL;
555 }
556
557 /*
558 * find_task_by_pid_ns() does not take the tasklist lock anymore
559 * but is nicely RCU locked - hence is a pretty resilient
560 * thing to use:
561 */
562 return find_task_by_pid_ns(tid, &init_pid_ns);
563}
564
565/*
566 * CPU debug state control:
567 */
568
569#ifdef CONFIG_SMP
570static void kgdb_wait(struct pt_regs *regs)
571{
572 unsigned long flags;
573 int cpu;
574
575 local_irq_save(flags);
576 cpu = raw_smp_processor_id();
577 kgdb_info[cpu].debuggerinfo = regs;
578 kgdb_info[cpu].task = current;
579 /*
580 * Make sure the above info reaches the primary CPU before
581 * our cpu_in_kgdb[] flag setting does:
582 */
583 smp_wmb();
584 atomic_set(&cpu_in_kgdb[cpu], 1);
585
586 /* Disable any cpu specific hw breakpoints */
587 kgdb_disable_hw_debug(regs);
588
589 /* Wait till primary CPU is done with debugging */
590 while (atomic_read(&passive_cpu_wait[cpu]))
591 cpu_relax();
592
593 kgdb_info[cpu].debuggerinfo = NULL;
594 kgdb_info[cpu].task = NULL;
595
596 /* fix up hardware debug registers on local cpu */
597 if (arch_kgdb_ops.correct_hw_break)
598 arch_kgdb_ops.correct_hw_break();
599
600 /* Signal the primary CPU that we are done: */
601 atomic_set(&cpu_in_kgdb[cpu], 0);
602 touch_softlockup_watchdog_sync();
603 clocksource_touch_watchdog();
604 local_irq_restore(flags);
605}
606#endif
607
608/*
609 * Some architectures need cache flushes when we set/clear a
610 * breakpoint:
611 */
612static void kgdb_flush_swbreak_addr(unsigned long addr)
613{
614 if (!CACHE_FLUSH_IS_SAFE)
615 return;
616
617 if (current->mm && current->mm->mmap_cache) {
618 flush_cache_range(current->mm->mmap_cache,
619 addr, addr + BREAK_INSTR_SIZE);
620 }
621 /* Force flush instruction cache if it was outside the mm */
622 flush_icache_range(addr, addr + BREAK_INSTR_SIZE);
623}
624
625/*
626 * SW breakpoint management:
627 */
628static int kgdb_activate_sw_breakpoints(void)
629{
630 unsigned long addr;
631 int error;
632 int ret = 0;
633 int i;
634
635 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
636 if (kgdb_break[i].state != BP_SET)
637 continue;
638
639 addr = kgdb_break[i].bpt_addr;
640 error = kgdb_arch_set_breakpoint(addr,
641 kgdb_break[i].saved_instr);
642 if (error) {
643 ret = error;
644 printk(KERN_INFO "KGDB: BP install failed: %lx", addr);
645 continue;
646 }
647
648 kgdb_flush_swbreak_addr(addr);
649 kgdb_break[i].state = BP_ACTIVE;
650 }
651 return ret;
652}
653
654static int kgdb_set_sw_break(unsigned long addr)
655{
656 int err = kgdb_validate_break_address(addr);
657 int breakno = -1;
658 int i;
659
660 if (err)
661 return err;
662
663 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
664 if ((kgdb_break[i].state == BP_SET) &&
665 (kgdb_break[i].bpt_addr == addr))
666 return -EEXIST;
667 }
668 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
669 if (kgdb_break[i].state == BP_REMOVED &&
670 kgdb_break[i].bpt_addr == addr) {
671 breakno = i;
672 break;
673 }
674 }
675
676 if (breakno == -1) {
677 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
678 if (kgdb_break[i].state == BP_UNDEFINED) {
679 breakno = i;
680 break;
681 }
682 }
683 }
684
685 if (breakno == -1)
686 return -E2BIG;
687
688 kgdb_break[breakno].state = BP_SET;
689 kgdb_break[breakno].type = BP_BREAKPOINT;
690 kgdb_break[breakno].bpt_addr = addr;
691
692 return 0;
693}
694
695static int kgdb_deactivate_sw_breakpoints(void)
696{
697 unsigned long addr;
698 int error;
699 int ret = 0;
700 int i;
701
702 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
703 if (kgdb_break[i].state != BP_ACTIVE)
704 continue;
705 addr = kgdb_break[i].bpt_addr;
706 error = kgdb_arch_remove_breakpoint(addr,
707 kgdb_break[i].saved_instr);
708 if (error) {
709 printk(KERN_INFO "KGDB: BP remove failed: %lx\n", addr);
710 ret = error;
711 }
712
713 kgdb_flush_swbreak_addr(addr);
714 kgdb_break[i].state = BP_SET;
715 }
716 return ret;
717}
718
719static int kgdb_remove_sw_break(unsigned long addr)
720{
721 int i;
722
723 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
724 if ((kgdb_break[i].state == BP_SET) &&
725 (kgdb_break[i].bpt_addr == addr)) {
726 kgdb_break[i].state = BP_REMOVED;
727 return 0;
728 }
729 }
730 return -ENOENT;
731}
732
733int kgdb_isremovedbreak(unsigned long addr)
734{
735 int i;
736
737 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
738 if ((kgdb_break[i].state == BP_REMOVED) &&
739 (kgdb_break[i].bpt_addr == addr))
740 return 1;
741 }
742 return 0;
743}
744
745static int remove_all_break(void)
746{
747 unsigned long addr;
748 int error;
749 int i;
750
751 /* Clear memory breakpoints. */
752 for (i = 0; i < KGDB_MAX_BREAKPOINTS; i++) {
753 if (kgdb_break[i].state != BP_ACTIVE)
754 goto setundefined;
755 addr = kgdb_break[i].bpt_addr;
756 error = kgdb_arch_remove_breakpoint(addr,
757 kgdb_break[i].saved_instr);
758 if (error)
759 printk(KERN_ERR "KGDB: breakpoint remove failed: %lx\n",
760 addr);
761setundefined:
762 kgdb_break[i].state = BP_UNDEFINED;
763 }
764
765 /* Clear hardware breakpoints. */
766 if (arch_kgdb_ops.remove_all_hw_break)
767 arch_kgdb_ops.remove_all_hw_break();
768
769 return 0;
770}
771
772/*
773 * Remap normal tasks to their real PID,
774 * CPU shadow threads are mapped to -CPU - 2
775 */
776static inline int shadow_pid(int realpid)
777{
778 if (realpid)
779 return realpid;
780
781 return -raw_smp_processor_id() - 2;
782}
783
784static char gdbmsgbuf[BUFMAX + 1];
785
786static void kgdb_msg_write(const char *s, int len)
787{
788 char *bufptr;
789 int wcount;
790 int i;
791
792 /* 'O'utput */
793 gdbmsgbuf[0] = 'O';
794
795 /* Fill and send buffers... */
796 while (len > 0) {
797 bufptr = gdbmsgbuf + 1;
798
799 /* Calculate how many this time */
800 if ((len << 1) > (BUFMAX - 2))
801 wcount = (BUFMAX - 2) >> 1;
802 else
803 wcount = len;
804
805 /* Pack in hex chars */
806 for (i = 0; i < wcount; i++)
807 bufptr = pack_hex_byte(bufptr, s[i]);
808 *bufptr = '\0';
809
810 /* Move up */
811 s += wcount;
812 len -= wcount;
813
814 /* Write packet */
815 put_packet(gdbmsgbuf);
816 }
817}
818
819/*
820 * Return true if there is a valid kgdb I/O module. Also if no
821 * debugger is attached a message can be printed to the console about
822 * waiting for the debugger to attach.
823 *
824 * The print_wait argument is only to be true when called from inside
825 * the core kgdb_handle_exception, because it will wait for the
826 * debugger to attach.
827 */
828static int kgdb_io_ready(int print_wait)
829{
830 if (!kgdb_io_ops)
831 return 0;
832 if (kgdb_connected)
833 return 1;
834 if (atomic_read(&kgdb_setting_breakpoint))
835 return 1;
836 if (print_wait)
837 printk(KERN_CRIT "KGDB: Waiting for remote debugger\n");
838 return 1;
839}
840
841/*
842 * All the functions that start with gdb_cmd are the various
843 * operations to implement the handlers for the gdbserial protocol
844 * where KGDB is communicating with an external debugger
845 */
846
847/* Handle the '?' status packets */
848static void gdb_cmd_status(struct kgdb_state *ks)
849{
850 /*
851 * We know that this packet is only sent
852 * during initial connect. So to be safe,
853 * we clear out our breakpoints now in case
854 * GDB is reconnecting.
855 */
856 remove_all_break();
857
858 remcom_out_buffer[0] = 'S';
859 pack_hex_byte(&remcom_out_buffer[1], ks->signo);
860}
861
862/* Handle the 'g' get registers request */
863static void gdb_cmd_getregs(struct kgdb_state *ks)
864{
865 struct task_struct *thread;
866 void *local_debuggerinfo;
867 int i;
868
869 thread = kgdb_usethread;
870 if (!thread) {
871 thread = kgdb_info[ks->cpu].task;
872 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
873 } else {
874 local_debuggerinfo = NULL;
875 for_each_online_cpu(i) {
876 /*
877 * Try to find the task on some other CPU
878 * (or possibly this one). If we do not
879 * find the matching task, then we try
880 * to approximate the results.
881 */
882 if (thread == kgdb_info[i].task)
883 local_debuggerinfo = kgdb_info[i].debuggerinfo;
884 }
885 }
886
887 /*
888 * All threads that don't have debuggerinfo should be
889 * in schedule() sleeping, since all other CPUs
890 * are in kgdb_wait, and thus have debuggerinfo.
891 */
892 if (local_debuggerinfo) {
893 pt_regs_to_gdb_regs(gdb_regs, local_debuggerinfo);
894 } else {
895 /*
896 * Pull stuff saved during switch_to; nothing
897 * else is accessible (or even particularly
898 * relevant).
899 *
900 * This should be enough for a stack trace.
901 */
902 sleeping_thread_to_gdb_regs(gdb_regs, thread);
903 }
904 kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, NUMREGBYTES);
905}
906
907/* Handle the 'G' set registers request */
908static void gdb_cmd_setregs(struct kgdb_state *ks)
909{
910 kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, NUMREGBYTES);
911
912 if (kgdb_usethread && kgdb_usethread != current) {
913 error_packet(remcom_out_buffer, -EINVAL);
914 } else {
915 gdb_regs_to_pt_regs(gdb_regs, ks->linux_regs);
916 strcpy(remcom_out_buffer, "OK");
917 }
918}
919
920/* Handle the 'm' memory read bytes */
921static void gdb_cmd_memread(struct kgdb_state *ks)
922{
923 char *ptr = &remcom_in_buffer[1];
924 unsigned long length;
925 unsigned long addr;
926 int err;
927
928 if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' &&
929 kgdb_hex2long(&ptr, &length) > 0) {
930 err = kgdb_mem2hex((char *)addr, remcom_out_buffer, length);
931 if (err)
932 error_packet(remcom_out_buffer, err);
933 } else {
934 error_packet(remcom_out_buffer, -EINVAL);
935 }
936}
937
938/* Handle the 'M' memory write bytes */
939static void gdb_cmd_memwrite(struct kgdb_state *ks)
940{
941 int err = write_mem_msg(0);
942
943 if (err)
944 error_packet(remcom_out_buffer, err);
945 else
946 strcpy(remcom_out_buffer, "OK");
947}
948
949/* Handle the 'X' memory binary write bytes */
950static void gdb_cmd_binwrite(struct kgdb_state *ks)
951{
952 int err = write_mem_msg(1);
953
954 if (err)
955 error_packet(remcom_out_buffer, err);
956 else
957 strcpy(remcom_out_buffer, "OK");
958}
959
960/* Handle the 'D' or 'k', detach or kill packets */
961static void gdb_cmd_detachkill(struct kgdb_state *ks)
962{
963 int error;
964
965 /* The detach case */
966 if (remcom_in_buffer[0] == 'D') {
967 error = remove_all_break();
968 if (error < 0) {
969 error_packet(remcom_out_buffer, error);
970 } else {
971 strcpy(remcom_out_buffer, "OK");
972 kgdb_connected = 0;
973 }
974 put_packet(remcom_out_buffer);
975 } else {
976 /*
977 * Assume the kill case, with no exit code checking,
978 * trying to force detach the debugger:
979 */
980 remove_all_break();
981 kgdb_connected = 0;
982 }
983}
984
985/* Handle the 'R' reboot packets */
986static int gdb_cmd_reboot(struct kgdb_state *ks)
987{
988 /* For now, only honor R0 */
989 if (strcmp(remcom_in_buffer, "R0") == 0) {
990 printk(KERN_CRIT "Executing emergency reboot\n");
991 strcpy(remcom_out_buffer, "OK");
992 put_packet(remcom_out_buffer);
993
994 /*
995 * Execution should not return from
996 * machine_emergency_restart()
997 */
998 machine_emergency_restart();
999 kgdb_connected = 0;
1000
1001 return 1;
1002 }
1003 return 0;
1004}
1005
1006/* Handle the 'q' query packets */
1007static void gdb_cmd_query(struct kgdb_state *ks)
1008{
1009 struct task_struct *g;
1010 struct task_struct *p;
1011 unsigned char thref[8];
1012 char *ptr;
1013 int i;
1014 int cpu;
1015 int finished = 0;
1016
1017 switch (remcom_in_buffer[1]) {
1018 case 's':
1019 case 'f':
1020 if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) {
1021 error_packet(remcom_out_buffer, -EINVAL);
1022 break;
1023 }
1024
1025 i = 0;
1026 remcom_out_buffer[0] = 'm';
1027 ptr = remcom_out_buffer + 1;
1028 if (remcom_in_buffer[1] == 'f') {
1029 /* Each cpu is a shadow thread */
1030 for_each_online_cpu(cpu) {
1031 ks->thr_query = 0;
1032 int_to_threadref(thref, -cpu - 2);
1033 pack_threadid(ptr, thref);
1034 ptr += BUF_THREAD_ID_SIZE;
1035 *(ptr++) = ',';
1036 i++;
1037 }
1038 }
1039
1040 do_each_thread(g, p) {
1041 if (i >= ks->thr_query && !finished) {
1042 int_to_threadref(thref, p->pid);
1043 pack_threadid(ptr, thref);
1044 ptr += BUF_THREAD_ID_SIZE;
1045 *(ptr++) = ',';
1046 ks->thr_query++;
1047 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1048 finished = 1;
1049 }
1050 i++;
1051 } while_each_thread(g, p);
1052
1053 *(--ptr) = '\0';
1054 break;
1055
1056 case 'C':
1057 /* Current thread id */
1058 strcpy(remcom_out_buffer, "QC");
1059 ks->threadid = shadow_pid(current->pid);
1060 int_to_threadref(thref, ks->threadid);
1061 pack_threadid(remcom_out_buffer + 2, thref);
1062 break;
1063 case 'T':
1064 if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) {
1065 error_packet(remcom_out_buffer, -EINVAL);
1066 break;
1067 }
1068 ks->threadid = 0;
1069 ptr = remcom_in_buffer + 17;
1070 kgdb_hex2long(&ptr, &ks->threadid);
1071 if (!getthread(ks->linux_regs, ks->threadid)) {
1072 error_packet(remcom_out_buffer, -EINVAL);
1073 break;
1074 }
1075 if ((int)ks->threadid > 0) {
1076 kgdb_mem2hex(getthread(ks->linux_regs,
1077 ks->threadid)->comm,
1078 remcom_out_buffer, 16);
1079 } else {
1080 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1081
1082 sprintf(tmpstr, "shadowCPU%d",
1083 (int)(-ks->threadid - 2));
1084 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1085 }
1086 break;
1087 }
1088}
1089
1090/* Handle the 'H' task query packets */
1091static void gdb_cmd_task(struct kgdb_state *ks)
1092{
1093 struct task_struct *thread;
1094 char *ptr;
1095
1096 switch (remcom_in_buffer[1]) {
1097 case 'g':
1098 ptr = &remcom_in_buffer[2];
1099 kgdb_hex2long(&ptr, &ks->threadid);
1100 thread = getthread(ks->linux_regs, ks->threadid);
1101 if (!thread && ks->threadid > 0) {
1102 error_packet(remcom_out_buffer, -EINVAL);
1103 break;
1104 }
1105 kgdb_usethread = thread;
1106 ks->kgdb_usethreadid = ks->threadid;
1107 strcpy(remcom_out_buffer, "OK");
1108 break;
1109 case 'c':
1110 ptr = &remcom_in_buffer[2];
1111 kgdb_hex2long(&ptr, &ks->threadid);
1112 if (!ks->threadid) {
1113 kgdb_contthread = NULL;
1114 } else {
1115 thread = getthread(ks->linux_regs, ks->threadid);
1116 if (!thread && ks->threadid > 0) {
1117 error_packet(remcom_out_buffer, -EINVAL);
1118 break;
1119 }
1120 kgdb_contthread = thread;
1121 }
1122 strcpy(remcom_out_buffer, "OK");
1123 break;
1124 }
1125}
1126
1127/* Handle the 'T' thread query packets */
1128static void gdb_cmd_thread(struct kgdb_state *ks)
1129{
1130 char *ptr = &remcom_in_buffer[1];
1131 struct task_struct *thread;
1132
1133 kgdb_hex2long(&ptr, &ks->threadid);
1134 thread = getthread(ks->linux_regs, ks->threadid);
1135 if (thread)
1136 strcpy(remcom_out_buffer, "OK");
1137 else
1138 error_packet(remcom_out_buffer, -EINVAL);
1139}
1140
1141/* Handle the 'z' or 'Z' breakpoint remove or set packets */
1142static void gdb_cmd_break(struct kgdb_state *ks)
1143{
1144 /*
1145 * Since GDB-5.3, it's been drafted that '0' is a software
1146 * breakpoint, '1' is a hardware breakpoint, so let's do that.
1147 */
1148 char *bpt_type = &remcom_in_buffer[1];
1149 char *ptr = &remcom_in_buffer[2];
1150 unsigned long addr;
1151 unsigned long length;
1152 int error = 0;
1153
1154 if (arch_kgdb_ops.set_hw_breakpoint && *bpt_type >= '1') {
1155 /* Unsupported */
1156 if (*bpt_type > '4')
1157 return;
1158 } else {
1159 if (*bpt_type != '0' && *bpt_type != '1')
1160 /* Unsupported. */
1161 return;
1162 }
1163
1164 /*
1165 * Test if this is a hardware breakpoint, and
1166 * if we support it:
1167 */
1168 if (*bpt_type == '1' && !(arch_kgdb_ops.flags & KGDB_HW_BREAKPOINT))
1169 /* Unsupported. */
1170 return;
1171
1172 if (*(ptr++) != ',') {
1173 error_packet(remcom_out_buffer, -EINVAL);
1174 return;
1175 }
1176 if (!kgdb_hex2long(&ptr, &addr)) {
1177 error_packet(remcom_out_buffer, -EINVAL);
1178 return;
1179 }
1180 if (*(ptr++) != ',' ||
1181 !kgdb_hex2long(&ptr, &length)) {
1182 error_packet(remcom_out_buffer, -EINVAL);
1183 return;
1184 }
1185
1186 if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0')
1187 error = kgdb_set_sw_break(addr);
1188 else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0')
1189 error = kgdb_remove_sw_break(addr);
1190 else if (remcom_in_buffer[0] == 'Z')
1191 error = arch_kgdb_ops.set_hw_breakpoint(addr,
1192 (int)length, *bpt_type - '0');
1193 else if (remcom_in_buffer[0] == 'z')
1194 error = arch_kgdb_ops.remove_hw_breakpoint(addr,
1195 (int) length, *bpt_type - '0');
1196
1197 if (error == 0)
1198 strcpy(remcom_out_buffer, "OK");
1199 else
1200 error_packet(remcom_out_buffer, error);
1201}
1202
1203/* Handle the 'C' signal / exception passing packets */
1204static int gdb_cmd_exception_pass(struct kgdb_state *ks)
1205{
1206 /* C09 == pass exception
1207 * C15 == detach kgdb, pass exception
1208 */
1209 if (remcom_in_buffer[1] == '0' && remcom_in_buffer[2] == '9') {
1210
1211 ks->pass_exception = 1;
1212 remcom_in_buffer[0] = 'c';
1213
1214 } else if (remcom_in_buffer[1] == '1' && remcom_in_buffer[2] == '5') {
1215
1216 ks->pass_exception = 1;
1217 remcom_in_buffer[0] = 'D';
1218 remove_all_break();
1219 kgdb_connected = 0;
1220 return 1;
1221
1222 } else {
1223 kgdb_msg_write("KGDB only knows signal 9 (pass)"
1224 " and 15 (pass and disconnect)\n"
1225 "Executing a continue without signal passing\n", 0);
1226 remcom_in_buffer[0] = 'c';
1227 }
1228
1229 /* Indicate fall through */
1230 return -1;
1231}
1232
1233/*
1234 * This function performs all gdbserial command processing
1235 */
1236static int gdb_serial_stub(struct kgdb_state *ks)
1237{
1238 int error = 0;
1239 int tmp;
1240
1241 /* Clear the out buffer. */
1242 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1243
1244 if (kgdb_connected) {
1245 unsigned char thref[8];
1246 char *ptr;
1247
1248 /* Reply to host that an exception has occurred */
1249 ptr = remcom_out_buffer;
1250 *ptr++ = 'T';
1251 ptr = pack_hex_byte(ptr, ks->signo);
1252 ptr += strlen(strcpy(ptr, "thread:"));
1253 int_to_threadref(thref, shadow_pid(current->pid));
1254 ptr = pack_threadid(ptr, thref);
1255 *ptr++ = ';';
1256 put_packet(remcom_out_buffer);
1257 }
1258
1259 kgdb_usethread = kgdb_info[ks->cpu].task;
1260 ks->kgdb_usethreadid = shadow_pid(kgdb_info[ks->cpu].task->pid);
1261 ks->pass_exception = 0;
1262
1263 while (1) {
1264 error = 0;
1265
1266 /* Clear the out buffer. */
1267 memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
1268
1269 get_packet(remcom_in_buffer);
1270
1271 switch (remcom_in_buffer[0]) {
1272 case '?': /* gdbserial status */
1273 gdb_cmd_status(ks);
1274 break;
1275 case 'g': /* return the value of the CPU registers */
1276 gdb_cmd_getregs(ks);
1277 break;
1278 case 'G': /* set the value of the CPU registers - return OK */
1279 gdb_cmd_setregs(ks);
1280 break;
1281 case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */
1282 gdb_cmd_memread(ks);
1283 break;
1284 case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1285 gdb_cmd_memwrite(ks);
1286 break;
1287 case 'X': /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */
1288 gdb_cmd_binwrite(ks);
1289 break;
1290 /* kill or detach. KGDB should treat this like a
1291 * continue.
1292 */
1293 case 'D': /* Debugger detach */
1294 case 'k': /* Debugger detach via kill */
1295 gdb_cmd_detachkill(ks);
1296 goto default_handle;
1297 case 'R': /* Reboot */
1298 if (gdb_cmd_reboot(ks))
1299 goto default_handle;
1300 break;
1301 case 'q': /* query command */
1302 gdb_cmd_query(ks);
1303 break;
1304 case 'H': /* task related */
1305 gdb_cmd_task(ks);
1306 break;
1307 case 'T': /* Query thread status */
1308 gdb_cmd_thread(ks);
1309 break;
1310 case 'z': /* Break point remove */
1311 case 'Z': /* Break point set */
1312 gdb_cmd_break(ks);
1313 break;
1314 case 'C': /* Exception passing */
1315 tmp = gdb_cmd_exception_pass(ks);
1316 if (tmp > 0)
1317 goto default_handle;
1318 if (tmp == 0)
1319 break;
1320 /* Fall through on tmp < 0 */
1321 case 'c': /* Continue packet */
1322 case 's': /* Single step packet */
1323 if (kgdb_contthread && kgdb_contthread != current) {
1324 /* Can't switch threads in kgdb */
1325 error_packet(remcom_out_buffer, -EINVAL);
1326 break;
1327 }
1328 kgdb_activate_sw_breakpoints();
1329 /* Fall through to default processing */
1330 default:
1331default_handle:
1332 error = kgdb_arch_handle_exception(ks->ex_vector,
1333 ks->signo,
1334 ks->err_code,
1335 remcom_in_buffer,
1336 remcom_out_buffer,
1337 ks->linux_regs);
1338 /*
1339 * Leave cmd processing on error, detach,
1340 * kill, continue, or single step.
1341 */
1342 if (error >= 0 || remcom_in_buffer[0] == 'D' ||
1343 remcom_in_buffer[0] == 'k') {
1344 error = 0;
1345 goto kgdb_exit;
1346 }
1347
1348 }
1349
1350 /* reply to the request */
1351 put_packet(remcom_out_buffer);
1352 }
1353
1354kgdb_exit:
1355 if (ks->pass_exception)
1356 error = 1;
1357 return error;
1358}
1359
1360static int kgdb_reenter_check(struct kgdb_state *ks)
1361{
1362 unsigned long addr;
1363
1364 if (atomic_read(&kgdb_active) != raw_smp_processor_id())
1365 return 0;
1366
1367 /* Panic on recursive debugger calls: */
1368 exception_level++;
1369 addr = kgdb_arch_pc(ks->ex_vector, ks->linux_regs);
1370 kgdb_deactivate_sw_breakpoints();
1371
1372 /*
1373 * If the breakpoint was removed cleanly at the place the
1374 * exception occurred, try to recover and print a warning to the end
1375 * user because the user planted a breakpoint in a place that
1376 * KGDB needs in order to function.
1377 */
1378 if (kgdb_remove_sw_break(addr) == 0) {
1379 exception_level = 0;
1380 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1381 kgdb_activate_sw_breakpoints();
1382 printk(KERN_CRIT "KGDB: re-enter error: breakpoint removed %lx\n",
1383 addr);
1384 WARN_ON_ONCE(1);
1385
1386 return 1;
1387 }
1388 remove_all_break();
1389 kgdb_skipexception(ks->ex_vector, ks->linux_regs);
1390
1391 if (exception_level > 1) {
1392 dump_stack();
1393 panic("Recursive entry to debugger");
1394 }
1395
1396 printk(KERN_CRIT "KGDB: re-enter exception: ALL breakpoints killed\n");
1397 dump_stack();
1398 panic("Recursive entry to debugger");
1399
1400 return 1;
1401}
1402
1403/*
1404 * kgdb_handle_exception() - main entry point from a kernel exception
1405 *
1406 * Locking hierarchy:
1407 * interface locks, if any (begin_session)
1408 * kgdb lock (kgdb_active)
1409 */
1410int
1411kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
1412{
1413 struct kgdb_state kgdb_var;
1414 struct kgdb_state *ks = &kgdb_var;
1415 unsigned long flags;
1416 int sstep_tries = 100;
1417 int error = 0;
1418 int i, cpu;
1419
1420 ks->cpu = raw_smp_processor_id();
1421 ks->ex_vector = evector;
1422 ks->signo = signo;
1423 ks->ex_vector = evector;
1424 ks->err_code = ecode;
1425 ks->kgdb_usethreadid = 0;
1426 ks->linux_regs = regs;
1427
1428 if (kgdb_reenter_check(ks))
1429 return 0; /* Ouch, double exception ! */
1430
1431acquirelock:
1432 /*
1433 * Interrupts will be restored by the 'trap return' code, except when
1434 * single stepping.
1435 */
1436 local_irq_save(flags);
1437
1438 cpu = raw_smp_processor_id();
1439
1440 /*
1441 * Acquire the kgdb_active lock:
1442 */
1443 while (atomic_cmpxchg(&kgdb_active, -1, cpu) != -1)
1444 cpu_relax();
1445
1446 /*
1447 * For single stepping, try to only enter on the processor
1448 * that was single stepping. To guard against a deadlock, the
1449 * kernel will only try for the value of sstep_tries before
1450 * giving up and continuing on.
1451 */
1452 if (atomic_read(&kgdb_cpu_doing_single_step) != -1 &&
1453 (kgdb_info[cpu].task &&
1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1455 atomic_set(&kgdb_active, -1);
1456 touch_softlockup_watchdog_sync();
1457 clocksource_touch_watchdog();
1458 local_irq_restore(flags);
1459
1460 goto acquirelock;
1461 }
1462
1463 if (!kgdb_io_ready(1)) {
1464 error = 1;
1465 goto kgdb_restore; /* No I/O connection, so resume the system */
1466 }
1467
1468 /*
1469 * Don't enter if we have hit a removed breakpoint.
1470 */
1471 if (kgdb_skipexception(ks->ex_vector, ks->linux_regs))
1472 goto kgdb_restore;
1473
1474 /* Call the I/O driver's pre_exception routine */
1475 if (kgdb_io_ops->pre_exception)
1476 kgdb_io_ops->pre_exception();
1477
1478 kgdb_info[ks->cpu].debuggerinfo = ks->linux_regs;
1479 kgdb_info[ks->cpu].task = current;
1480
1481 kgdb_disable_hw_debug(ks->linux_regs);
1482
1483 /*
1484 * Get the passive CPU lock which will hold all the non-primary
1485 * CPUs in a spin state while the debugger is active
1486 */
1487 if (!kgdb_single_step) {
1488 for (i = 0; i < NR_CPUS; i++)
1489 atomic_set(&passive_cpu_wait[i], 1);
1490 }
1491
1492 /*
1493 * spin_lock code is good enough as a barrier so we don't
1494 * need one here:
1495 */
1496 atomic_set(&cpu_in_kgdb[ks->cpu], 1);
1497
1498#ifdef CONFIG_SMP
1499 /* Signal the other CPUs to enter kgdb_wait() */
1500 if ((!kgdb_single_step) && kgdb_do_roundup)
1501 kgdb_roundup_cpus(flags);
1502#endif
1503
1504 /*
1505 * Wait for the other CPUs to be notified and be waiting for us:
1506 */
1507 for_each_online_cpu(i) {
1508 while (!atomic_read(&cpu_in_kgdb[i]))
1509 cpu_relax();
1510 }
1511
1512 /*
1513 * At this point the primary processor is completely
1514 * in the debugger and all secondary CPUs are quiescent
1515 */
1516 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1517 kgdb_deactivate_sw_breakpoints();
1518 kgdb_single_step = 0;
1519 kgdb_contthread = current;
1520 exception_level = 0;
1521
1522 /* Talk to debugger with gdbserial protocol */
1523 error = gdb_serial_stub(ks);
1524
1525 /* Call the I/O driver's post_exception routine */
1526 if (kgdb_io_ops->post_exception)
1527 kgdb_io_ops->post_exception();
1528
1529 kgdb_info[ks->cpu].debuggerinfo = NULL;
1530 kgdb_info[ks->cpu].task = NULL;
1531 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1532
1533 if (!kgdb_single_step) {
1534 for (i = NR_CPUS-1; i >= 0; i--)
1535 atomic_set(&passive_cpu_wait[i], 0);
1536 /*
1537 * Wait till all the CPUs have quit
1538 * from the debugger.
1539 */
1540 for_each_online_cpu(i) {
1541 while (atomic_read(&cpu_in_kgdb[i]))
1542 cpu_relax();
1543 }
1544 }
1545
1546kgdb_restore:
1547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
1548 int sstep_cpu = atomic_read(&kgdb_cpu_doing_single_step);
1549 if (kgdb_info[sstep_cpu].task)
1550 kgdb_sstep_pid = kgdb_info[sstep_cpu].task->pid;
1551 else
1552 kgdb_sstep_pid = 0;
1553 }
1554 /* Free kgdb_active */
1555 atomic_set(&kgdb_active, -1);
1556 touch_softlockup_watchdog_sync();
1557 clocksource_touch_watchdog();
1558 local_irq_restore(flags);
1559
1560 return error;
1561}
1562
1563int kgdb_nmicallback(int cpu, void *regs)
1564{
1565#ifdef CONFIG_SMP
1566 if (!atomic_read(&cpu_in_kgdb[cpu]) &&
1567 atomic_read(&kgdb_active) != cpu &&
1568 atomic_read(&cpu_in_kgdb[atomic_read(&kgdb_active)])) {
1569 kgdb_wait((struct pt_regs *)regs);
1570 return 0;
1571 }
1572#endif
1573 return 1;
1574}
1575
1576static void kgdb_console_write(struct console *co, const char *s,
1577 unsigned count)
1578{
1579 unsigned long flags;
1580
1581 /* If we're debugging, or KGDB has not connected, don't try
1582 * and print. */
1583 if (!kgdb_connected || atomic_read(&kgdb_active) != -1)
1584 return;
1585
1586 local_irq_save(flags);
1587 kgdb_msg_write(s, count);
1588 local_irq_restore(flags);
1589}
1590
1591static struct console kgdbcons = {
1592 .name = "kgdb",
1593 .write = kgdb_console_write,
1594 .flags = CON_PRINTBUFFER | CON_ENABLED,
1595 .index = -1,
1596};
1597
1598#ifdef CONFIG_MAGIC_SYSRQ
1599static void sysrq_handle_gdb(int key, struct tty_struct *tty)
1600{
1601 if (!kgdb_io_ops) {
1602 printk(KERN_CRIT "ERROR: No KGDB I/O module available\n");
1603 return;
1604 }
1605 if (!kgdb_connected)
1606 printk(KERN_CRIT "Entering KGDB\n");
1607
1608 kgdb_breakpoint();
1609}
1610
1611static struct sysrq_key_op sysrq_gdb_op = {
1612 .handler = sysrq_handle_gdb,
1613 .help_msg = "debug(G)",
1614 .action_msg = "DEBUG",
1615};
1616#endif
1617
1618static void kgdb_register_callbacks(void)
1619{
1620 if (!kgdb_io_module_registered) {
1621 kgdb_io_module_registered = 1;
1622 kgdb_arch_init();
1623#ifdef CONFIG_MAGIC_SYSRQ
1624 register_sysrq_key('g', &sysrq_gdb_op);
1625#endif
1626 if (kgdb_use_con && !kgdb_con_registered) {
1627 register_console(&kgdbcons);
1628 kgdb_con_registered = 1;
1629 }
1630 }
1631}
1632
1633static void kgdb_unregister_callbacks(void)
1634{
1635 /*
1636 * When this routine is called KGDB should unregister from the
1637 * panic handler and clean up, making sure it is not handling any
1638 * break exceptions at the time.
1639 */
1640 if (kgdb_io_module_registered) {
1641 kgdb_io_module_registered = 0;
1642 kgdb_arch_exit();
1643#ifdef CONFIG_MAGIC_SYSRQ
1644 unregister_sysrq_key('g', &sysrq_gdb_op);
1645#endif
1646 if (kgdb_con_registered) {
1647 unregister_console(&kgdbcons);
1648 kgdb_con_registered = 0;
1649 }
1650 }
1651}
1652
1653static void kgdb_initial_breakpoint(void)
1654{
1655 kgdb_break_asap = 0;
1656
1657 printk(KERN_CRIT "kgdb: Waiting for connection from remote gdb...\n");
1658 kgdb_breakpoint();
1659}
1660
1661/**
1662 * kgdb_register_io_module - register KGDB IO module
1663 * @new_kgdb_io_ops: the io ops vector
1664 *
1665 * Register it with the KGDB core.
1666 */
1667int kgdb_register_io_module(struct kgdb_io *new_kgdb_io_ops)
1668{
1669 int err;
1670
1671 spin_lock(&kgdb_registration_lock);
1672
1673 if (kgdb_io_ops) {
1674 spin_unlock(&kgdb_registration_lock);
1675
1676 printk(KERN_ERR "kgdb: Another I/O driver is already "
1677 "registered with KGDB.\n");
1678 return -EBUSY;
1679 }
1680
1681 if (new_kgdb_io_ops->init) {
1682 err = new_kgdb_io_ops->init();
1683 if (err) {
1684 spin_unlock(&kgdb_registration_lock);
1685 return err;
1686 }
1687 }
1688
1689 kgdb_io_ops = new_kgdb_io_ops;
1690
1691 spin_unlock(&kgdb_registration_lock);
1692
1693 printk(KERN_INFO "kgdb: Registered I/O driver %s.\n",
1694 new_kgdb_io_ops->name);
1695
1696 /* Arm KGDB now. */
1697 kgdb_register_callbacks();
1698
1699 if (kgdb_break_asap)
1700 kgdb_initial_breakpoint();
1701
1702 return 0;
1703}
1704EXPORT_SYMBOL_GPL(kgdb_register_io_module);
1705
1706/**
1707 * kgdb_unregister_io_module - unregister KGDB IO module
1708 * @old_kgdb_io_ops: the io ops vector
1709 *
1710 * Unregister it with the KGDB core.
1711 */
1712void kgdb_unregister_io_module(struct kgdb_io *old_kgdb_io_ops)
1713{
1714 BUG_ON(kgdb_connected);
1715
1716 /*
1717 * KGDB is no longer able to communicate out, so
1718 * unregister our callbacks and reset state.
1719 */
1720 kgdb_unregister_callbacks();
1721
1722 spin_lock(&kgdb_registration_lock);
1723
1724 WARN_ON_ONCE(kgdb_io_ops != old_kgdb_io_ops);
1725 kgdb_io_ops = NULL;
1726
1727 spin_unlock(&kgdb_registration_lock);
1728
1729 printk(KERN_INFO
1730 "kgdb: Unregistered I/O driver %s, debugger disabled.\n",
1731 old_kgdb_io_ops->name);
1732}
1733EXPORT_SYMBOL_GPL(kgdb_unregister_io_module);
1734
1735/**
1736 * kgdb_breakpoint - generate breakpoint exception
1737 *
1738 * This function will generate a breakpoint exception. It is used at the
1739 * beginning of a program to sync up with a debugger and can be used
1740 * otherwise as a quick means to stop program execution and "break" into
1741 * the debugger.
1742 */
1743void kgdb_breakpoint(void)
1744{
1745 atomic_set(&kgdb_setting_breakpoint, 1);
1746 wmb(); /* Sync point before breakpoint */
1747 arch_kgdb_breakpoint();
1748 wmb(); /* Sync point after breakpoint */
1749 atomic_set(&kgdb_setting_breakpoint, 0);
1750}
1751EXPORT_SYMBOL_GPL(kgdb_breakpoint);
1752
1753static int __init opt_kgdb_wait(char *str)
1754{
1755 kgdb_break_asap = 1;
1756
1757 if (kgdb_io_module_registered)
1758 kgdb_initial_breakpoint();
1759
1760 return 0;
1761}
1762
1763early_param("kgdbwait", opt_kgdb_wait);
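The get_packet()/put_packet() pair in the file removed above implements the framing layer of the GDB remote serial protocol: a payload travels as $<data>#<checksum>, where the checksum is the byte-wise sum of the payload modulo 256 written as two hex digits, and the receiver answers '+' (ACK) or '-' (NAK, resend). A small user-space sketch of the same framing rule, with an illustrative helper name:

#include <stdio.h>
#include <string.h>

/* Frame 'payload' as a GDB remote-protocol packet: $<data>#<checksum>.
 * The checksum is the modulo-256 sum of the payload bytes, emitted as
 * two lowercase hex digits -- the same rule put_packet() applies. */
static int gdb_frame_packet(const char *payload, char *out, size_t outlen)
{
	unsigned char csum = 0;
	size_t i, len = strlen(payload);

	if (outlen < len + 5)		/* '$' + data + '#' + 2 hex + NUL */
		return -1;

	out[0] = '$';
	for (i = 0; i < len; i++) {
		out[i + 1] = payload[i];
		csum += (unsigned char)payload[i];
	}
	sprintf(out + len + 1, "#%02x", csum);
	return 0;
}

int main(void)
{
	char pkt[64];

	if (!gdb_frame_packet("g", pkt, sizeof(pkt)))	/* 'g': read registers */
		puts(pkt);				/* prints "$g#67" */
	return 0;
}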
diff --git a/kernel/kmod.c b/kernel/kmod.c
index bf0e231d9702..6e9b19667a8d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...)
116 116
117 trace_module_request(module_name, wait, _RET_IP_); 117 trace_module_request(module_name, wait, _RET_IP_);
118 118
119 ret = call_usermodehelper(modprobe_path, argv, envp, 119 ret = call_usermodehelper_fns(modprobe_path, argv, envp,
120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); 120 wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC,
121 NULL, NULL, NULL);
122
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return ret; 124 return ret;
123} 125}
124EXPORT_SYMBOL(__request_module); 126EXPORT_SYMBOL(__request_module);
125#endif /* CONFIG_MODULES */ 127#endif /* CONFIG_MODULES */
126 128
127struct subprocess_info {
128 struct work_struct work;
129 struct completion *complete;
130 struct cred *cred;
131 char *path;
132 char **argv;
133 char **envp;
134 enum umh_wait wait;
135 int retval;
136 struct file *stdin;
137 void (*cleanup)(char **argv, char **envp);
138};
139
140/* 129/*
141 * This is the task which runs the usermode application 130 * This is the task which runs the usermode application
142 */ 131 */
@@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data)
145 struct subprocess_info *sub_info = data; 134 struct subprocess_info *sub_info = data;
146 int retval; 135 int retval;
147 136
148 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
149
150 /* Unblock all signals */
151 spin_lock_irq(&current->sighand->siglock); 137 spin_lock_irq(&current->sighand->siglock);
152 flush_signal_handlers(current, 1); 138 flush_signal_handlers(current, 1);
153 sigemptyset(&current->blocked);
154 recalc_sigpending();
155 spin_unlock_irq(&current->sighand->siglock); 139 spin_unlock_irq(&current->sighand->siglock);
156 140
157 /* Install the credentials */
158 commit_creds(sub_info->cred);
159 sub_info->cred = NULL;
160
161 /* Install input pipe when needed */
162 if (sub_info->stdin) {
163 struct files_struct *f = current->files;
164 struct fdtable *fdt;
165 /* no races because files should be private here */
166 sys_close(0);
167 fd_install(0, sub_info->stdin);
168 spin_lock(&f->file_lock);
169 fdt = files_fdtable(f);
170 FD_SET(0, fdt->open_fds);
171 FD_CLR(0, fdt->close_on_exec);
172 spin_unlock(&f->file_lock);
173
174 /* and disallow core files too */
175 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0};
176 }
177
178 /* We can run anywhere, unlike our parent keventd(). */ 141 /* We can run anywhere, unlike our parent keventd(). */
179 set_cpus_allowed_ptr(current, cpu_all_mask); 142 set_cpus_allowed_ptr(current, cpu_all_mask);
180 143
@@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data)
184 */ 147 */
185 set_user_nice(current, 0); 148 set_user_nice(current, 0);
186 149
150 if (sub_info->init) {
151 retval = sub_info->init(sub_info);
152 if (retval)
153 goto fail;
154 }
155
187 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); 156 retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp);
188 157
189 /* Exec failed? */ 158 /* Exec failed? */
159fail:
190 sub_info->retval = retval; 160 sub_info->retval = retval;
191 do_exit(0); 161 do_exit(0);
192} 162}
@@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data)
194void call_usermodehelper_freeinfo(struct subprocess_info *info) 164void call_usermodehelper_freeinfo(struct subprocess_info *info)
195{ 165{
196 if (info->cleanup) 166 if (info->cleanup)
197 (*info->cleanup)(info->argv, info->envp); 167 (*info->cleanup)(info);
198 if (info->cred)
199 put_cred(info->cred);
200 kfree(info); 168 kfree(info);
201} 169}
202EXPORT_SYMBOL(call_usermodehelper_freeinfo); 170EXPORT_SYMBOL(call_usermodehelper_freeinfo);
@@ -207,16 +175,16 @@ static int wait_for_helper(void *data)
207 struct subprocess_info *sub_info = data; 175 struct subprocess_info *sub_info = data;
208 pid_t pid; 176 pid_t pid;
209 177
210 /* Install a handler: if SIGCLD isn't handled sys_wait4 won't 178 /* If SIGCLD is ignored sys_wait4 won't populate the status. */
211 * populate the status, but will return -ECHILD. */ 179 spin_lock_irq(&current->sighand->siglock);
212 allow_signal(SIGCHLD); 180 current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL;
181 spin_unlock_irq(&current->sighand->siglock);
213 182
214 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); 183 pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD);
215 if (pid < 0) { 184 if (pid < 0) {
216 sub_info->retval = pid; 185 sub_info->retval = pid;
217 } else { 186 } else {
218 int ret; 187 int ret = -ECHILD;
219
220 /* 188 /*
221 * Normally it is bogus to call wait4() from in-kernel because 189 * Normally it is bogus to call wait4() from in-kernel because
222 * wait4() wants to write the exit code to a userspace address. 190 * wait4() wants to write the exit code to a userspace address.
@@ -237,10 +205,7 @@ static int wait_for_helper(void *data)
237 sub_info->retval = ret; 205 sub_info->retval = ret;
238 } 206 }
239 207
240 if (sub_info->wait == UMH_NO_WAIT) 208 complete(sub_info->complete);
241 call_usermodehelper_freeinfo(sub_info);
242 else
243 complete(sub_info->complete);
244 return 0; 209 return 0;
245} 210}
246 211
@@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work)
249{ 214{
250 struct subprocess_info *sub_info = 215 struct subprocess_info *sub_info =
251 container_of(work, struct subprocess_info, work); 216 container_of(work, struct subprocess_info, work);
252 pid_t pid;
253 enum umh_wait wait = sub_info->wait; 217 enum umh_wait wait = sub_info->wait;
254 218 pid_t pid;
255 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
256 219
257 /* CLONE_VFORK: wait until the usermode helper has execve'd 220 /* CLONE_VFORK: wait until the usermode helper has execve'd
258 * successfully. We need the data structures to stay around 221 * successfully. We need the data structures to stay around
259 * until that is done. */ 222 * until that is done. */
260 if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) 223 if (wait == UMH_WAIT_PROC)
261 pid = kernel_thread(wait_for_helper, sub_info, 224 pid = kernel_thread(wait_for_helper, sub_info,
262 CLONE_FS | CLONE_FILES | SIGCHLD); 225 CLONE_FS | CLONE_FILES | SIGCHLD);
263 else 226 else
@@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work)
266 229
267 switch (wait) { 230 switch (wait) {
268 case UMH_NO_WAIT: 231 case UMH_NO_WAIT:
232 call_usermodehelper_freeinfo(sub_info);
269 break; 233 break;
270 234
271 case UMH_WAIT_PROC: 235 case UMH_WAIT_PROC:
272 if (pid > 0) 236 if (pid > 0)
273 break; 237 break;
274 sub_info->retval = pid;
275 /* FALLTHROUGH */ 238 /* FALLTHROUGH */
276
277 case UMH_WAIT_EXEC: 239 case UMH_WAIT_EXEC:
240 if (pid < 0)
241 sub_info->retval = pid;
278 complete(sub_info->complete); 242 complete(sub_info->complete);
279 } 243 }
280} 244}
@@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
376 sub_info->path = path; 340 sub_info->path = path;
377 sub_info->argv = argv; 341 sub_info->argv = argv;
378 sub_info->envp = envp; 342 sub_info->envp = envp;
379 sub_info->cred = prepare_usermodehelper_creds();
380 if (!sub_info->cred) {
381 kfree(sub_info);
382 return NULL;
383 }
384
385 out: 343 out:
386 return sub_info; 344 return sub_info;
387} 345}
388EXPORT_SYMBOL(call_usermodehelper_setup); 346EXPORT_SYMBOL(call_usermodehelper_setup);
389 347
390/** 348/**
391 * call_usermodehelper_setkeys - set the session keys for usermode helper 349 * call_usermodehelper_setfns - set a cleanup/init function
392 * @info: a subprocess_info returned by call_usermodehelper_setup
393 * @session_keyring: the session keyring for the process
394 */
395void call_usermodehelper_setkeys(struct subprocess_info *info,
396 struct key *session_keyring)
397{
398#ifdef CONFIG_KEYS
399 struct thread_group_cred *tgcred = info->cred->tgcred;
400 key_put(tgcred->session_keyring);
401 tgcred->session_keyring = key_get(session_keyring);
402#else
403 BUG();
404#endif
405}
406EXPORT_SYMBOL(call_usermodehelper_setkeys);
407
408/**
409 * call_usermodehelper_setcleanup - set a cleanup function
410 * @info: a subprocess_info returned by call_usermodehelper_setup 350 * @info: a subprocess_info returned by call_usermodehelper_setup
411 * @cleanup: a cleanup function 351 * @cleanup: a cleanup function
352 * @init: an init function
353 * @data: arbitrary context sensitive data
412 * 354 *
413 * The cleanup function is just befor ethe subprocess_info is about to 355 * The init function is used to customize the helper process prior to
356 * exec. A non-zero return code causes the process to error out, exit,
357 * and return the failure to the calling process
358 *
 359 * The cleanup function is just before the subprocess_info is about to
414 * be freed. This can be used for freeing the argv and envp. The 360 * be freed. This can be used for freeing the argv and envp. The
415 * Function must be runnable in either a process context or the 361 * Function must be runnable in either a process context or the
416 * context in which call_usermodehelper_exec is called. 362 * context in which call_usermodehelper_exec is called.
417 */ 363 */
418void call_usermodehelper_setcleanup(struct subprocess_info *info, 364void call_usermodehelper_setfns(struct subprocess_info *info,
419 void (*cleanup)(char **argv, char **envp)) 365 int (*init)(struct subprocess_info *info),
366 void (*cleanup)(struct subprocess_info *info),
367 void *data)
420{ 368{
421 info->cleanup = cleanup; 369 info->cleanup = cleanup;
370 info->init = init;
371 info->data = data;
422} 372}
423EXPORT_SYMBOL(call_usermodehelper_setcleanup); 373EXPORT_SYMBOL(call_usermodehelper_setfns);
424
425/**
426 * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin
427 * @sub_info: a subprocess_info returned by call_usermodehelper_setup
428 * @filp: set to the write-end of a pipe
429 *
430 * This constructs a pipe, and sets the read end to be the stdin of the
431 * subprocess, and returns the write-end in *@filp.
432 */
433int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
434 struct file **filp)
435{
436 struct file *f;
437
438 f = create_write_pipe(0);
439 if (IS_ERR(f))
440 return PTR_ERR(f);
441 *filp = f;
442
443 f = create_read_pipe(f, 0);
444 if (IS_ERR(f)) {
445 free_write_pipe(*filp);
446 return PTR_ERR(f);
447 }
448 sub_info->stdin = f;
449
450 return 0;
451}
452EXPORT_SYMBOL(call_usermodehelper_stdinpipe);
453 374
454/** 375/**
455 * call_usermodehelper_exec - start a usermode application 376 * call_usermodehelper_exec - start a usermode application
@@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
469 DECLARE_COMPLETION_ONSTACK(done); 390 DECLARE_COMPLETION_ONSTACK(done);
470 int retval = 0; 391 int retval = 0;
471 392
472 BUG_ON(atomic_read(&sub_info->cred->usage) != 1);
473 validate_creds(sub_info->cred);
474
475 helper_lock(); 393 helper_lock();
476 if (sub_info->path[0] == '\0') 394 if (sub_info->path[0] == '\0')
477 goto out; 395 goto out;
@@ -498,41 +416,6 @@ unlock:
498} 416}
499EXPORT_SYMBOL(call_usermodehelper_exec); 417EXPORT_SYMBOL(call_usermodehelper_exec);
500 418
501/**
502 * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin
503 * @path: path to usermode executable
504 * @argv: arg vector for process
505 * @envp: environment for process
506 * @filp: set to the write-end of a pipe
507 *
508 * This is a simple wrapper which executes a usermode-helper function
509 * with a pipe as stdin. It is implemented entirely in terms of
510 * lower-level call_usermodehelper_* functions.
511 */
512int call_usermodehelper_pipe(char *path, char **argv, char **envp,
513 struct file **filp)
514{
515 struct subprocess_info *sub_info;
516 int ret;
517
518 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
519 if (sub_info == NULL)
520 return -ENOMEM;
521
522 ret = call_usermodehelper_stdinpipe(sub_info, filp);
523 if (ret < 0) {
524 call_usermodehelper_freeinfo(sub_info);
525 return ret;
526 }
527
528 ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC);
529 if (ret < 0) /* Failed to execute helper, close pipe */
530 filp_close(*filp, NULL);
531
532 return ret;
533}
534EXPORT_SYMBOL(call_usermodehelper_pipe);
535
536void __init usermodehelper_init(void) 419void __init usermodehelper_init(void)
537{ 420{
538 khelper_wq = create_singlethread_workqueue("khelper"); 421 khelper_wq = create_singlethread_workqueue("khelper");
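
The kmod.c hunks above drop call_usermodehelper_setkeys(), call_usermodehelper_stdinpipe() and call_usermodehelper_pipe(), and route callers through the new call_usermodehelper_setfns(). Below is a minimal sketch (not part of the patch) of how a caller might drive the reworked API; the helper path, argv/envp and the my_umh_init()/my_umh_cleanup() callbacks are purely illustrative.

/*
 * Hypothetical caller of the post-patch usermode-helper API: build the
 * subprocess_info, attach init/cleanup callbacks plus context data with
 * call_usermodehelper_setfns(), then exec and wait for the exec step only.
 */
#include <linux/kmod.h>
#include <linux/slab.h>

static int my_umh_init(struct subprocess_info *info)
{
        /* Runs in the helper task just before exec; non-zero aborts it. */
        return 0;
}

static void my_umh_cleanup(struct subprocess_info *info)
{
        kfree(info->data);              /* free the context attached below */
}

static int run_my_helper(void)
{
        char *argv[] = { "/sbin/my-helper", "--oneshot", NULL };
        char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
        struct subprocess_info *info;
        void *ctx = kstrdup("example context", GFP_KERNEL);

        if (!ctx)
                return -ENOMEM;
        info = call_usermodehelper_setup(argv[0], argv, envp, GFP_KERNEL);
        if (!info) {
                kfree(ctx);
                return -ENOMEM;
        }
        call_usermodehelper_setfns(info, my_umh_init, my_umh_cleanup, ctx);
        return call_usermodehelper_exec(info, UMH_WAIT_EXEC);
}

Per the doc comment added above, the cleanup callback runs just before the subprocess_info is freed, which is where the attached context buffer is released in this sketch.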
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 0ed46f3e51e9..282035f3ae96 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1588,6 +1588,72 @@ static void __kprobes kill_kprobe(struct kprobe *p)
1588 arch_remove_kprobe(p); 1588 arch_remove_kprobe(p);
1589} 1589}
1590 1590
1591/* Disable one kprobe */
1592int __kprobes disable_kprobe(struct kprobe *kp)
1593{
1594 int ret = 0;
1595 struct kprobe *p;
1596
1597 mutex_lock(&kprobe_mutex);
1598
1599 /* Check whether specified probe is valid. */
1600 p = __get_valid_kprobe(kp);
1601 if (unlikely(p == NULL)) {
1602 ret = -EINVAL;
1603 goto out;
1604 }
1605
1606 /* If the probe is already disabled (or gone), just return */
1607 if (kprobe_disabled(kp))
1608 goto out;
1609
1610 kp->flags |= KPROBE_FLAG_DISABLED;
1611 if (p != kp)
1612 /* When kp != p, p is always enabled. */
1613 try_to_disable_aggr_kprobe(p);
1614
1615 if (!kprobes_all_disarmed && kprobe_disabled(p))
1616 disarm_kprobe(p);
1617out:
1618 mutex_unlock(&kprobe_mutex);
1619 return ret;
1620}
1621EXPORT_SYMBOL_GPL(disable_kprobe);
1622
1623/* Enable one kprobe */
1624int __kprobes enable_kprobe(struct kprobe *kp)
1625{
1626 int ret = 0;
1627 struct kprobe *p;
1628
1629 mutex_lock(&kprobe_mutex);
1630
1631 /* Check whether specified probe is valid. */
1632 p = __get_valid_kprobe(kp);
1633 if (unlikely(p == NULL)) {
1634 ret = -EINVAL;
1635 goto out;
1636 }
1637
1638 if (kprobe_gone(kp)) {
1639 /* This kprobe has gone, we couldn't enable it. */
1640 ret = -EINVAL;
1641 goto out;
1642 }
1643
1644 if (p != kp)
1645 kp->flags &= ~KPROBE_FLAG_DISABLED;
1646
1647 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1648 p->flags &= ~KPROBE_FLAG_DISABLED;
1649 arm_kprobe(p);
1650 }
1651out:
1652 mutex_unlock(&kprobe_mutex);
1653 return ret;
1654}
1655EXPORT_SYMBOL_GPL(enable_kprobe);
1656
1591void __kprobes dump_kprobe(struct kprobe *kp) 1657void __kprobes dump_kprobe(struct kprobe *kp)
1592{ 1658{
1593 printk(KERN_WARNING "Dumping kprobe:\n"); 1659 printk(KERN_WARNING "Dumping kprobe:\n");
@@ -1805,72 +1871,6 @@ static const struct file_operations debugfs_kprobes_operations = {
1805 .release = seq_release, 1871 .release = seq_release,
1806}; 1872};
1807 1873
1808/* Disable one kprobe */
1809int __kprobes disable_kprobe(struct kprobe *kp)
1810{
1811 int ret = 0;
1812 struct kprobe *p;
1813
1814 mutex_lock(&kprobe_mutex);
1815
1816 /* Check whether specified probe is valid. */
1817 p = __get_valid_kprobe(kp);
1818 if (unlikely(p == NULL)) {
1819 ret = -EINVAL;
1820 goto out;
1821 }
1822
1823 /* If the probe is already disabled (or gone), just return */
1824 if (kprobe_disabled(kp))
1825 goto out;
1826
1827 kp->flags |= KPROBE_FLAG_DISABLED;
1828 if (p != kp)
1829 /* When kp != p, p is always enabled. */
1830 try_to_disable_aggr_kprobe(p);
1831
1832 if (!kprobes_all_disarmed && kprobe_disabled(p))
1833 disarm_kprobe(p);
1834out:
1835 mutex_unlock(&kprobe_mutex);
1836 return ret;
1837}
1838EXPORT_SYMBOL_GPL(disable_kprobe);
1839
1840/* Enable one kprobe */
1841int __kprobes enable_kprobe(struct kprobe *kp)
1842{
1843 int ret = 0;
1844 struct kprobe *p;
1845
1846 mutex_lock(&kprobe_mutex);
1847
1848 /* Check whether specified probe is valid. */
1849 p = __get_valid_kprobe(kp);
1850 if (unlikely(p == NULL)) {
1851 ret = -EINVAL;
1852 goto out;
1853 }
1854
1855 if (kprobe_gone(kp)) {
1856 /* This kprobe has gone, we couldn't enable it. */
1857 ret = -EINVAL;
1858 goto out;
1859 }
1860
1861 if (p != kp)
1862 kp->flags &= ~KPROBE_FLAG_DISABLED;
1863
1864 if (!kprobes_all_disarmed && kprobe_disabled(p)) {
1865 p->flags &= ~KPROBE_FLAG_DISABLED;
1866 arm_kprobe(p);
1867 }
1868out:
1869 mutex_unlock(&kprobe_mutex);
1870 return ret;
1871}
1872EXPORT_SYMBOL_GPL(enable_kprobe);
1873
1874static void __kprobes arm_all_kprobes(void) 1874static void __kprobes arm_all_kprobes(void)
1875{ 1875{
1876 struct hlist_head *head; 1876 struct hlist_head *head;
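
The kprobes.c change above moves disable_kprobe() and enable_kprobe() out of the debugfs-only region so they are always built and exported. The following is a hypothetical module sketch (not from the patch) of toggling a registered probe with them; the probed symbol and the handler are illustrative only.

#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        pr_info("kprobe hit at %s\n", p->symbol_name);
        return 0;
}

static struct kprobe my_kp = {
        .symbol_name = "do_fork",       /* illustrative target symbol */
        .pre_handler = my_pre_handler,
};

static int __init my_probe_init(void)
{
        int ret = register_kprobe(&my_kp);

        if (ret)
                return ret;
        /* Park the probe without tearing it down... */
        ret = disable_kprobe(&my_kp);
        /* ...and re-arm it when tracing should resume. */
        if (!ret)
                ret = enable_kprobe(&my_kp);
        if (ret)
                unregister_kprobe(&my_kp);
        return ret;
}

static void __exit my_probe_exit(void)
{
        unregister_kprobe(&my_kp);
}

module_init(my_probe_init);
module_exit(my_probe_exit);
MODULE_LICENSE("GPL");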
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 21fe3c426948..0b624e791805 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -138,7 +138,8 @@ extern const void __start_notes __attribute__((weak));
138extern const void __stop_notes __attribute__((weak)); 138extern const void __stop_notes __attribute__((weak));
139#define notes_size (&__stop_notes - &__start_notes) 139#define notes_size (&__stop_notes - &__start_notes)
140 140
141static ssize_t notes_read(struct kobject *kobj, struct bin_attribute *bin_attr, 141static ssize_t notes_read(struct file *filp, struct kobject *kobj,
142 struct bin_attribute *bin_attr,
142 char *buf, loff_t off, size_t count) 143 char *buf, loff_t off, size_t count)
143{ 144{
144 memcpy(buf, &__start_notes + off, count); 145 memcpy(buf, &__start_notes + off, count);
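
This hunk, like the module_notes_read() hunk further down, reflects the sysfs change that passes the opening struct file * into bin_attribute read callbacks. A hedged sketch of a driver-side callback updated to the widened prototype (the attribute name and backing buffer are made up):

#include <linux/sysfs.h>
#include <linux/stat.h>
#include <linux/string.h>

static char my_blob[64] = "example payload";

/* bin_attribute read callback using the new prototype: struct file * first. */
static ssize_t my_blob_read(struct file *filp, struct kobject *kobj,
                            struct bin_attribute *attr,
                            char *buf, loff_t off, size_t count)
{
        if (off >= sizeof(my_blob))
                return 0;
        if (off + count > sizeof(my_blob))
                count = sizeof(my_blob) - off;
        memcpy(buf, my_blob + off, count);
        return count;
}

static struct bin_attribute my_blob_attr = {
        .attr   = { .name = "my_blob", .mode = S_IRUGO },
        .size   = sizeof(my_blob),
        .read   = my_blob_read,
};

Registration with sysfs_create_bin_file() is unchanged; only the callback signature gains the extra argument.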
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 82ed0ea15194..83911c780175 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -219,7 +219,7 @@ int kthreadd(void *unused)
219 set_task_comm(tsk, "kthreadd"); 219 set_task_comm(tsk, "kthreadd");
220 ignore_signals(tsk); 220 ignore_signals(tsk);
221 set_cpus_allowed_ptr(tsk, cpu_all_mask); 221 set_cpus_allowed_ptr(tsk, cpu_all_mask);
222 set_mems_allowed(node_possible_map); 222 set_mems_allowed(node_states[N_HIGH_MEMORY]);
223 223
224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 224 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
225 225
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ca07c5c0c914..877fb306d415 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -56,7 +56,6 @@
56#include <linux/module.h> 56#include <linux/module.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/slab.h>
60#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
61 60
62static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_SPINLOCK(latency_lock);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c927a549db2c..54286798c37b 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -43,6 +43,7 @@
43#include <linux/ftrace.h> 43#include <linux/ftrace.h>
44#include <linux/stringify.h> 44#include <linux/stringify.h>
45#include <linux/bitops.h> 45#include <linux/bitops.h>
46#include <linux/gfp.h>
46 47
47#include <asm/sections.h> 48#include <asm/sections.h>
48 49
@@ -430,20 +431,7 @@ static struct stack_trace lockdep_init_trace = {
430/* 431/*
431 * Various lockdep statistics: 432 * Various lockdep statistics:
432 */ 433 */
433atomic_t chain_lookup_hits; 434DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
434atomic_t chain_lookup_misses;
435atomic_t hardirqs_on_events;
436atomic_t hardirqs_off_events;
437atomic_t redundant_hardirqs_on;
438atomic_t redundant_hardirqs_off;
439atomic_t softirqs_on_events;
440atomic_t softirqs_off_events;
441atomic_t redundant_softirqs_on;
442atomic_t redundant_softirqs_off;
443atomic_t nr_unused_locks;
444atomic_t nr_cyclic_checks;
445atomic_t nr_find_usage_forwards_checks;
446atomic_t nr_find_usage_backwards_checks;
447#endif 435#endif
448 436
449/* 437/*
@@ -582,9 +570,6 @@ static int static_obj(void *obj)
582 unsigned long start = (unsigned long) &_stext, 570 unsigned long start = (unsigned long) &_stext,
583 end = (unsigned long) &_end, 571 end = (unsigned long) &_end,
584 addr = (unsigned long) obj; 572 addr = (unsigned long) obj;
585#ifdef CONFIG_SMP
586 int i;
587#endif
588 573
589 /* 574 /*
590 * static variable? 575 * static variable?
@@ -595,24 +580,16 @@ static int static_obj(void *obj)
595 if (arch_is_kernel_data(addr)) 580 if (arch_is_kernel_data(addr))
596 return 1; 581 return 1;
597 582
598#ifdef CONFIG_SMP
599 /* 583 /*
600 * percpu var? 584 * in-kernel percpu var?
601 */ 585 */
602 for_each_possible_cpu(i) { 586 if (is_kernel_percpu_address(addr))
603 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i); 587 return 1;
604 end = (unsigned long) &__per_cpu_start + PERCPU_ENOUGH_ROOM
605 + per_cpu_offset(i);
606
607 if ((addr >= start) && (addr < end))
608 return 1;
609 }
610#endif
611 588
612 /* 589 /*
613 * module var? 590 * module static or percpu var?
614 */ 591 */
615 return is_module_address(addr); 592 return is_module_address(addr) || is_module_percpu_address(addr);
616} 593}
617 594
618/* 595/*
@@ -758,7 +735,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
758 return NULL; 735 return NULL;
759 } 736 }
760 class = lock_classes + nr_lock_classes++; 737 class = lock_classes + nr_lock_classes++;
761 debug_atomic_inc(&nr_unused_locks); 738 debug_atomic_inc(nr_unused_locks);
762 class->key = key; 739 class->key = key;
763 class->name = lock->name; 740 class->name = lock->name;
764 class->subclass = subclass; 741 class->subclass = subclass;
@@ -828,7 +805,8 @@ static struct lock_list *alloc_list_entry(void)
828 * Add a new dependency to the head of the list: 805 * Add a new dependency to the head of the list:
829 */ 806 */
830static int add_lock_to_list(struct lock_class *class, struct lock_class *this, 807static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
831 struct list_head *head, unsigned long ip, int distance) 808 struct list_head *head, unsigned long ip,
809 int distance, struct stack_trace *trace)
832{ 810{
833 struct lock_list *entry; 811 struct lock_list *entry;
834 /* 812 /*
@@ -839,11 +817,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
839 if (!entry) 817 if (!entry)
840 return 0; 818 return 0;
841 819
842 if (!save_trace(&entry->trace))
843 return 0;
844
845 entry->class = this; 820 entry->class = this;
846 entry->distance = distance; 821 entry->distance = distance;
822 entry->trace = *trace;
847 /* 823 /*
848 * Since we never remove from the dependency list, the list can 824 * Since we never remove from the dependency list, the list can
849 * be walked lockless by other CPUs, it's only allocation 825 * be walked lockless by other CPUs, it's only allocation
@@ -1215,7 +1191,7 @@ check_noncircular(struct lock_list *root, struct lock_class *target,
1215{ 1191{
1216 int result; 1192 int result;
1217 1193
1218 debug_atomic_inc(&nr_cyclic_checks); 1194 debug_atomic_inc(nr_cyclic_checks);
1219 1195
1220 result = __bfs_forwards(root, target, class_equal, target_entry); 1196 result = __bfs_forwards(root, target, class_equal, target_entry);
1221 1197
@@ -1252,7 +1228,7 @@ find_usage_forwards(struct lock_list *root, enum lock_usage_bit bit,
1252{ 1228{
1253 int result; 1229 int result;
1254 1230
1255 debug_atomic_inc(&nr_find_usage_forwards_checks); 1231 debug_atomic_inc(nr_find_usage_forwards_checks);
1256 1232
1257 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry); 1233 result = __bfs_forwards(root, (void *)bit, usage_match, target_entry);
1258 1234
@@ -1275,7 +1251,7 @@ find_usage_backwards(struct lock_list *root, enum lock_usage_bit bit,
1275{ 1251{
1276 int result; 1252 int result;
1277 1253
1278 debug_atomic_inc(&nr_find_usage_backwards_checks); 1254 debug_atomic_inc(nr_find_usage_backwards_checks);
1279 1255
1280 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry); 1256 result = __bfs_backwards(root, (void *)bit, usage_match, target_entry);
1281 1257
@@ -1645,12 +1621,20 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1645 */ 1621 */
1646static int 1622static int
1647check_prev_add(struct task_struct *curr, struct held_lock *prev, 1623check_prev_add(struct task_struct *curr, struct held_lock *prev,
1648 struct held_lock *next, int distance) 1624 struct held_lock *next, int distance, int trylock_loop)
1649{ 1625{
1650 struct lock_list *entry; 1626 struct lock_list *entry;
1651 int ret; 1627 int ret;
1652 struct lock_list this; 1628 struct lock_list this;
1653 struct lock_list *uninitialized_var(target_entry); 1629 struct lock_list *uninitialized_var(target_entry);
1630 /*
1631 * Static variable, serialized by the graph_lock().
1632 *
1633 * We use this static variable to save the stack trace in case
1634 * we call into this function multiple times due to encountering
1635 * trylocks in the held lock stack.
1636 */
1637 static struct stack_trace trace;
1654 1638
1655 /* 1639 /*
1656 * Prove that the new <prev> -> <next> dependency would not 1640 * Prove that the new <prev> -> <next> dependency would not
@@ -1698,20 +1682,23 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1698 } 1682 }
1699 } 1683 }
1700 1684
1685 if (!trylock_loop && !save_trace(&trace))
1686 return 0;
1687
1701 /* 1688 /*
1702 * Ok, all validations passed, add the new lock 1689 * Ok, all validations passed, add the new lock
1703 * to the previous lock's dependency list: 1690 * to the previous lock's dependency list:
1704 */ 1691 */
1705 ret = add_lock_to_list(hlock_class(prev), hlock_class(next), 1692 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1706 &hlock_class(prev)->locks_after, 1693 &hlock_class(prev)->locks_after,
1707 next->acquire_ip, distance); 1694 next->acquire_ip, distance, &trace);
1708 1695
1709 if (!ret) 1696 if (!ret)
1710 return 0; 1697 return 0;
1711 1698
1712 ret = add_lock_to_list(hlock_class(next), hlock_class(prev), 1699 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1713 &hlock_class(next)->locks_before, 1700 &hlock_class(next)->locks_before,
1714 next->acquire_ip, distance); 1701 next->acquire_ip, distance, &trace);
1715 if (!ret) 1702 if (!ret)
1716 return 0; 1703 return 0;
1717 1704
@@ -1741,6 +1728,7 @@ static int
1741check_prevs_add(struct task_struct *curr, struct held_lock *next) 1728check_prevs_add(struct task_struct *curr, struct held_lock *next)
1742{ 1729{
1743 int depth = curr->lockdep_depth; 1730 int depth = curr->lockdep_depth;
1731 int trylock_loop = 0;
1744 struct held_lock *hlock; 1732 struct held_lock *hlock;
1745 1733
1746 /* 1734 /*
@@ -1766,7 +1754,8 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1766 * added: 1754 * added:
1767 */ 1755 */
1768 if (hlock->read != 2) { 1756 if (hlock->read != 2) {
1769 if (!check_prev_add(curr, hlock, next, distance)) 1757 if (!check_prev_add(curr, hlock, next,
1758 distance, trylock_loop))
1770 return 0; 1759 return 0;
1771 /* 1760 /*
1772 * Stop after the first non-trylock entry, 1761 * Stop after the first non-trylock entry,
@@ -1789,6 +1778,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
1789 if (curr->held_locks[depth].irq_context != 1778 if (curr->held_locks[depth].irq_context !=
1790 curr->held_locks[depth-1].irq_context) 1779 curr->held_locks[depth-1].irq_context)
1791 break; 1780 break;
1781 trylock_loop = 1;
1792 } 1782 }
1793 return 1; 1783 return 1;
1794out_bug: 1784out_bug:
@@ -1835,7 +1825,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1835 list_for_each_entry(chain, hash_head, entry) { 1825 list_for_each_entry(chain, hash_head, entry) {
1836 if (chain->chain_key == chain_key) { 1826 if (chain->chain_key == chain_key) {
1837cache_hit: 1827cache_hit:
1838 debug_atomic_inc(&chain_lookup_hits); 1828 debug_atomic_inc(chain_lookup_hits);
1839 if (very_verbose(class)) 1829 if (very_verbose(class))
1840 printk("\nhash chain already cached, key: " 1830 printk("\nhash chain already cached, key: "
1841 "%016Lx tail class: [%p] %s\n", 1831 "%016Lx tail class: [%p] %s\n",
@@ -1900,7 +1890,7 @@ cache_hit:
1900 chain_hlocks[chain->base + j] = class - lock_classes; 1890 chain_hlocks[chain->base + j] = class - lock_classes;
1901 } 1891 }
1902 list_add_tail_rcu(&chain->entry, hash_head); 1892 list_add_tail_rcu(&chain->entry, hash_head);
1903 debug_atomic_inc(&chain_lookup_misses); 1893 debug_atomic_inc(chain_lookup_misses);
1904 inc_chains(); 1894 inc_chains();
1905 1895
1906 return 1; 1896 return 1;
@@ -2321,7 +2311,12 @@ void trace_hardirqs_on_caller(unsigned long ip)
2321 return; 2311 return;
2322 2312
2323 if (unlikely(curr->hardirqs_enabled)) { 2313 if (unlikely(curr->hardirqs_enabled)) {
2324 debug_atomic_inc(&redundant_hardirqs_on); 2314 /*
2315 * Neither irq nor preemption are disabled here
 2316 * so this is racy by nature but losing one hit
2317 * in a stat is not a big deal.
2318 */
2319 __debug_atomic_inc(redundant_hardirqs_on);
2325 return; 2320 return;
2326 } 2321 }
2327 /* we'll do an OFF -> ON transition: */ 2322 /* we'll do an OFF -> ON transition: */
@@ -2348,7 +2343,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
2348 2343
2349 curr->hardirq_enable_ip = ip; 2344 curr->hardirq_enable_ip = ip;
2350 curr->hardirq_enable_event = ++curr->irq_events; 2345 curr->hardirq_enable_event = ++curr->irq_events;
2351 debug_atomic_inc(&hardirqs_on_events); 2346 debug_atomic_inc(hardirqs_on_events);
2352} 2347}
2353EXPORT_SYMBOL(trace_hardirqs_on_caller); 2348EXPORT_SYMBOL(trace_hardirqs_on_caller);
2354 2349
@@ -2380,9 +2375,9 @@ void trace_hardirqs_off_caller(unsigned long ip)
2380 curr->hardirqs_enabled = 0; 2375 curr->hardirqs_enabled = 0;
2381 curr->hardirq_disable_ip = ip; 2376 curr->hardirq_disable_ip = ip;
2382 curr->hardirq_disable_event = ++curr->irq_events; 2377 curr->hardirq_disable_event = ++curr->irq_events;
2383 debug_atomic_inc(&hardirqs_off_events); 2378 debug_atomic_inc(hardirqs_off_events);
2384 } else 2379 } else
2385 debug_atomic_inc(&redundant_hardirqs_off); 2380 debug_atomic_inc(redundant_hardirqs_off);
2386} 2381}
2387EXPORT_SYMBOL(trace_hardirqs_off_caller); 2382EXPORT_SYMBOL(trace_hardirqs_off_caller);
2388 2383
@@ -2406,7 +2401,7 @@ void trace_softirqs_on(unsigned long ip)
2406 return; 2401 return;
2407 2402
2408 if (curr->softirqs_enabled) { 2403 if (curr->softirqs_enabled) {
2409 debug_atomic_inc(&redundant_softirqs_on); 2404 debug_atomic_inc(redundant_softirqs_on);
2410 return; 2405 return;
2411 } 2406 }
2412 2407
@@ -2416,7 +2411,7 @@ void trace_softirqs_on(unsigned long ip)
2416 curr->softirqs_enabled = 1; 2411 curr->softirqs_enabled = 1;
2417 curr->softirq_enable_ip = ip; 2412 curr->softirq_enable_ip = ip;
2418 curr->softirq_enable_event = ++curr->irq_events; 2413 curr->softirq_enable_event = ++curr->irq_events;
2419 debug_atomic_inc(&softirqs_on_events); 2414 debug_atomic_inc(softirqs_on_events);
2420 /* 2415 /*
2421 * We are going to turn softirqs on, so set the 2416 * We are going to turn softirqs on, so set the
2422 * usage bit for all held locks, if hardirqs are 2417 * usage bit for all held locks, if hardirqs are
@@ -2446,10 +2441,10 @@ void trace_softirqs_off(unsigned long ip)
2446 curr->softirqs_enabled = 0; 2441 curr->softirqs_enabled = 0;
2447 curr->softirq_disable_ip = ip; 2442 curr->softirq_disable_ip = ip;
2448 curr->softirq_disable_event = ++curr->irq_events; 2443 curr->softirq_disable_event = ++curr->irq_events;
2449 debug_atomic_inc(&softirqs_off_events); 2444 debug_atomic_inc(softirqs_off_events);
2450 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2445 DEBUG_LOCKS_WARN_ON(!softirq_count());
2451 } else 2446 } else
2452 debug_atomic_inc(&redundant_softirqs_off); 2447 debug_atomic_inc(redundant_softirqs_off);
2453} 2448}
2454 2449
2455static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) 2450static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
@@ -2654,7 +2649,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2654 return 0; 2649 return 0;
2655 break; 2650 break;
2656 case LOCK_USED: 2651 case LOCK_USED:
2657 debug_atomic_dec(&nr_unused_locks); 2652 debug_atomic_dec(nr_unused_locks);
2658 break; 2653 break;
2659 default: 2654 default:
2660 if (!debug_locks_off_graph_unlock()) 2655 if (!debug_locks_off_graph_unlock())
@@ -2716,6 +2711,8 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2716} 2711}
2717EXPORT_SYMBOL_GPL(lockdep_init_map); 2712EXPORT_SYMBOL_GPL(lockdep_init_map);
2718 2713
2714struct lock_class_key __lockdep_no_validate__;
2715
2719/* 2716/*
2720 * This gets called for every mutex_lock*()/spin_lock*() operation. 2717 * This gets called for every mutex_lock*()/spin_lock*() operation.
2721 * We maintain the dependency maps and validate the locking attempt: 2718 * We maintain the dependency maps and validate the locking attempt:
@@ -2750,6 +2747,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2750 return 0; 2747 return 0;
2751 } 2748 }
2752 2749
2750 if (lock->key == &__lockdep_no_validate__)
2751 check = 1;
2752
2753 if (!subclass) 2753 if (!subclass)
2754 class = lock->class_cache; 2754 class = lock->class_cache;
2755 /* 2755 /*
@@ -2760,7 +2760,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2760 if (!class) 2760 if (!class)
2761 return 0; 2761 return 0;
2762 } 2762 }
2763 debug_atomic_inc((atomic_t *)&class->ops); 2763 atomic_inc((atomic_t *)&class->ops);
2764 if (very_verbose(class)) { 2764 if (very_verbose(class)) {
2765 printk("\nacquire class [%p] %s", class->key, class->name); 2765 printk("\nacquire class [%p] %s", class->key, class->name);
2766 if (class->name_version > 1) 2766 if (class->name_version > 1)
@@ -3237,7 +3237,7 @@ void lock_release(struct lockdep_map *lock, int nested,
3237 raw_local_irq_save(flags); 3237 raw_local_irq_save(flags);
3238 check_flags(flags); 3238 check_flags(flags);
3239 current->lockdep_recursion = 1; 3239 current->lockdep_recursion = 1;
3240 trace_lock_release(lock, nested, ip); 3240 trace_lock_release(lock, ip);
3241 __lock_release(lock, nested, ip); 3241 __lock_release(lock, nested, ip);
3242 current->lockdep_recursion = 0; 3242 current->lockdep_recursion = 0;
3243 raw_local_irq_restore(flags); 3243 raw_local_irq_restore(flags);
@@ -3390,7 +3390,7 @@ found_it:
3390 hlock->holdtime_stamp = now; 3390 hlock->holdtime_stamp = now;
3391 } 3391 }
3392 3392
3393 trace_lock_acquired(lock, ip, waittime); 3393 trace_lock_acquired(lock, ip);
3394 3394
3395 stats = get_lock_stats(hlock_class(hlock)); 3395 stats = get_lock_stats(hlock_class(hlock));
3396 if (waittime) { 3396 if (waittime) {
@@ -3811,8 +3811,11 @@ void lockdep_rcu_dereference(const char *file, const int line)
3811{ 3811{
3812 struct task_struct *curr = current; 3812 struct task_struct *curr = current;
3813 3813
3814#ifndef CONFIG_PROVE_RCU_REPEATEDLY
3814 if (!debug_locks_off()) 3815 if (!debug_locks_off())
3815 return; 3816 return;
 3817#endif /* #ifndef CONFIG_PROVE_RCU_REPEATEDLY */
3818 /* Note: the following can be executed concurrently, so be careful. */
3816 printk("\n===================================================\n"); 3819 printk("\n===================================================\n");
3817 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 3820 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n");
3818 printk( "---------------------------------------------------\n"); 3821 printk( "---------------------------------------------------\n");
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index a2ee95ad1313..4f560cfedc8f 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -110,30 +110,60 @@ lockdep_count_backward_deps(struct lock_class *class)
110#endif 110#endif
111 111
112#ifdef CONFIG_DEBUG_LOCKDEP 112#ifdef CONFIG_DEBUG_LOCKDEP
113
114#include <asm/local.h>
113/* 115/*
114 * Various lockdep statistics: 116 * Various lockdep statistics.
117 * We want them per cpu as they are often accessed in fast path
118 * and we want to avoid too much cache bouncing.
115 */ 119 */
116extern atomic_t chain_lookup_hits; 120struct lockdep_stats {
117extern atomic_t chain_lookup_misses; 121 int chain_lookup_hits;
118extern atomic_t hardirqs_on_events; 122 int chain_lookup_misses;
119extern atomic_t hardirqs_off_events; 123 int hardirqs_on_events;
120extern atomic_t redundant_hardirqs_on; 124 int hardirqs_off_events;
121extern atomic_t redundant_hardirqs_off; 125 int redundant_hardirqs_on;
122extern atomic_t softirqs_on_events; 126 int redundant_hardirqs_off;
123extern atomic_t softirqs_off_events; 127 int softirqs_on_events;
124extern atomic_t redundant_softirqs_on; 128 int softirqs_off_events;
125extern atomic_t redundant_softirqs_off; 129 int redundant_softirqs_on;
126extern atomic_t nr_unused_locks; 130 int redundant_softirqs_off;
127extern atomic_t nr_cyclic_checks; 131 int nr_unused_locks;
128extern atomic_t nr_cyclic_check_recursions; 132 int nr_cyclic_checks;
129extern atomic_t nr_find_usage_forwards_checks; 133 int nr_cyclic_check_recursions;
130extern atomic_t nr_find_usage_forwards_recursions; 134 int nr_find_usage_forwards_checks;
131extern atomic_t nr_find_usage_backwards_checks; 135 int nr_find_usage_forwards_recursions;
132extern atomic_t nr_find_usage_backwards_recursions; 136 int nr_find_usage_backwards_checks;
133# define debug_atomic_inc(ptr) atomic_inc(ptr) 137 int nr_find_usage_backwards_recursions;
134# define debug_atomic_dec(ptr) atomic_dec(ptr) 138};
135# define debug_atomic_read(ptr) atomic_read(ptr) 139
140DECLARE_PER_CPU(struct lockdep_stats, lockdep_stats);
141
142#define __debug_atomic_inc(ptr) \
143 this_cpu_inc(lockdep_stats.ptr);
144
145#define debug_atomic_inc(ptr) { \
146 WARN_ON_ONCE(!irqs_disabled()); \
147 __this_cpu_inc(lockdep_stats.ptr); \
148}
149
150#define debug_atomic_dec(ptr) { \
151 WARN_ON_ONCE(!irqs_disabled()); \
152 __this_cpu_dec(lockdep_stats.ptr); \
153}
154
155#define debug_atomic_read(ptr) ({ \
156 struct lockdep_stats *__cpu_lockdep_stats; \
157 unsigned long long __total = 0; \
158 int __cpu; \
159 for_each_possible_cpu(__cpu) { \
160 __cpu_lockdep_stats = &per_cpu(lockdep_stats, __cpu); \
161 __total += __cpu_lockdep_stats->ptr; \
162 } \
163 __total; \
164})
136#else 165#else
166# define __debug_atomic_inc(ptr) do { } while (0)
137# define debug_atomic_inc(ptr) do { } while (0) 167# define debug_atomic_inc(ptr) do { } while (0)
138# define debug_atomic_dec(ptr) do { } while (0) 168# define debug_atomic_dec(ptr) do { } while (0)
139# define debug_atomic_read(ptr) 0 169# define debug_atomic_read(ptr) 0
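
With this hunk the lockdep statistics live in a per-cpu struct lockdep_stats: writers bump their own CPU's counter on the fast path and debug_atomic_read() folds all CPUs together when the numbers are reported. A simplified sketch of the same pattern for a hypothetical counter (foo_events and the helper names are made up):

#include <linux/percpu.h>
#include <linux/cpumask.h>

struct my_stats {
        int foo_events;
};
static DEFINE_PER_CPU(struct my_stats, my_stats);

/* Fast path: bump this CPU's slot, no atomics and no cache-line bouncing. */
static inline void count_foo_event(void)
{
        this_cpu_inc(my_stats.foo_events);
}

/* Slow path (e.g. a /proc read): fold every CPU's slot into one total. */
static unsigned long long read_foo_events(void)
{
        unsigned long long total = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                total += per_cpu(my_stats, cpu).foo_events;
        return total;
}

The lockdep_proc.c hunk below switches its printouts to %llu for the same reason: the folded total is now an unsigned long long rather than an atomic_t read.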
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index d4aba4f3584c..59b76c8ce9d7 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -184,34 +184,34 @@ static const struct file_operations proc_lockdep_chains_operations = {
184static void lockdep_stats_debug_show(struct seq_file *m) 184static void lockdep_stats_debug_show(struct seq_file *m)
185{ 185{
186#ifdef CONFIG_DEBUG_LOCKDEP 186#ifdef CONFIG_DEBUG_LOCKDEP
187 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events), 187 unsigned long long hi1 = debug_atomic_read(hardirqs_on_events),
188 hi2 = debug_atomic_read(&hardirqs_off_events), 188 hi2 = debug_atomic_read(hardirqs_off_events),
189 hr1 = debug_atomic_read(&redundant_hardirqs_on), 189 hr1 = debug_atomic_read(redundant_hardirqs_on),
190 hr2 = debug_atomic_read(&redundant_hardirqs_off), 190 hr2 = debug_atomic_read(redundant_hardirqs_off),
191 si1 = debug_atomic_read(&softirqs_on_events), 191 si1 = debug_atomic_read(softirqs_on_events),
192 si2 = debug_atomic_read(&softirqs_off_events), 192 si2 = debug_atomic_read(softirqs_off_events),
193 sr1 = debug_atomic_read(&redundant_softirqs_on), 193 sr1 = debug_atomic_read(redundant_softirqs_on),
194 sr2 = debug_atomic_read(&redundant_softirqs_off); 194 sr2 = debug_atomic_read(redundant_softirqs_off);
195 195
196 seq_printf(m, " chain lookup misses: %11u\n", 196 seq_printf(m, " chain lookup misses: %11llu\n",
197 debug_atomic_read(&chain_lookup_misses)); 197 debug_atomic_read(chain_lookup_misses));
198 seq_printf(m, " chain lookup hits: %11u\n", 198 seq_printf(m, " chain lookup hits: %11llu\n",
199 debug_atomic_read(&chain_lookup_hits)); 199 debug_atomic_read(chain_lookup_hits));
200 seq_printf(m, " cyclic checks: %11u\n", 200 seq_printf(m, " cyclic checks: %11llu\n",
201 debug_atomic_read(&nr_cyclic_checks)); 201 debug_atomic_read(nr_cyclic_checks));
202 seq_printf(m, " find-mask forwards checks: %11u\n", 202 seq_printf(m, " find-mask forwards checks: %11llu\n",
203 debug_atomic_read(&nr_find_usage_forwards_checks)); 203 debug_atomic_read(nr_find_usage_forwards_checks));
204 seq_printf(m, " find-mask backwards checks: %11u\n", 204 seq_printf(m, " find-mask backwards checks: %11llu\n",
205 debug_atomic_read(&nr_find_usage_backwards_checks)); 205 debug_atomic_read(nr_find_usage_backwards_checks));
206 206
207 seq_printf(m, " hardirq on events: %11u\n", hi1); 207 seq_printf(m, " hardirq on events: %11llu\n", hi1);
208 seq_printf(m, " hardirq off events: %11u\n", hi2); 208 seq_printf(m, " hardirq off events: %11llu\n", hi2);
209 seq_printf(m, " redundant hardirq ons: %11u\n", hr1); 209 seq_printf(m, " redundant hardirq ons: %11llu\n", hr1);
210 seq_printf(m, " redundant hardirq offs: %11u\n", hr2); 210 seq_printf(m, " redundant hardirq offs: %11llu\n", hr2);
211 seq_printf(m, " softirq on events: %11u\n", si1); 211 seq_printf(m, " softirq on events: %11llu\n", si1);
212 seq_printf(m, " softirq off events: %11u\n", si2); 212 seq_printf(m, " softirq off events: %11llu\n", si2);
213 seq_printf(m, " redundant softirq ons: %11u\n", sr1); 213 seq_printf(m, " redundant softirq ons: %11llu\n", sr1);
214 seq_printf(m, " redundant softirq offs: %11u\n", sr2); 214 seq_printf(m, " redundant softirq offs: %11llu\n", sr2);
215#endif 215#endif
216} 216}
217 217
@@ -263,7 +263,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
263#endif 263#endif
264 } 264 }
265#ifdef CONFIG_DEBUG_LOCKDEP 265#ifdef CONFIG_DEBUG_LOCKDEP
266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 266 DEBUG_LOCKS_WARN_ON(debug_atomic_read(nr_unused_locks) != nr_unused);
267#endif 267#endif
268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n", 268 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
269 nr_lock_classes, MAX_LOCKDEP_KEYS); 269 nr_lock_classes, MAX_LOCKDEP_KEYS);
diff --git a/kernel/module.c b/kernel/module.c
index c968d3606dca..6c562828c85c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -59,8 +59,6 @@
59#define CREATE_TRACE_POINTS 59#define CREATE_TRACE_POINTS
60#include <trace/events/module.h> 60#include <trace/events/module.h>
61 61
62EXPORT_TRACEPOINT_SYMBOL(module_get);
63
64#if 0 62#if 0
65#define DEBUGP printk 63#define DEBUGP printk
66#else 64#else
@@ -74,11 +72,19 @@ EXPORT_TRACEPOINT_SYMBOL(module_get);
74/* If this is set, the section belongs in the init part of the module */ 72/* If this is set, the section belongs in the init part of the module */
75#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 73#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
76 74
77/* List of modules, protected by module_mutex or preempt_disable 75/*
76 * Mutex protects:
77 * 1) List of modules (also safely readable with preempt_disable),
78 * 2) module_use links,
79 * 3) module_addr_min/module_addr_max.
78 * (delete uses stop_machine/add uses RCU list operations). */ 80 * (delete uses stop_machine/add uses RCU list operations). */
79DEFINE_MUTEX(module_mutex); 81DEFINE_MUTEX(module_mutex);
80EXPORT_SYMBOL_GPL(module_mutex); 82EXPORT_SYMBOL_GPL(module_mutex);
81static LIST_HEAD(modules); 83static LIST_HEAD(modules);
84#ifdef CONFIG_KGDB_KDB
85struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
86#endif /* CONFIG_KGDB_KDB */
87
82 88
83/* Block module loading/unloading? */ 89/* Block module loading/unloading? */
84int modules_disabled = 0; 90int modules_disabled = 0;
@@ -88,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
88 94
89static BLOCKING_NOTIFIER_HEAD(module_notify_list); 95static BLOCKING_NOTIFIER_HEAD(module_notify_list);
90 96
91/* Bounds of module allocation, for speeding __module_address */ 97/* Bounds of module allocation, for speeding __module_address.
98 * Protected by module_mutex. */
92static unsigned long module_addr_min = -1UL, module_addr_max = 0; 99static unsigned long module_addr_min = -1UL, module_addr_max = 0;
93 100
94int register_module_notifier(struct notifier_block * nb) 101int register_module_notifier(struct notifier_block * nb)
@@ -178,8 +185,6 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
178extern const struct kernel_symbol __stop___ksymtab_gpl[]; 185extern const struct kernel_symbol __stop___ksymtab_gpl[];
179extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 186extern const struct kernel_symbol __start___ksymtab_gpl_future[];
180extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 187extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
181extern const struct kernel_symbol __start___ksymtab_gpl_future[];
182extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
183extern const unsigned long __start___kcrctab[]; 188extern const unsigned long __start___kcrctab[];
184extern const unsigned long __start___kcrctab_gpl[]; 189extern const unsigned long __start___kcrctab_gpl[];
185extern const unsigned long __start___kcrctab_gpl_future[]; 190extern const unsigned long __start___kcrctab_gpl_future[];
@@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms,
329} 334}
330 335
331/* Find a symbol and return it, along with, (optional) crc and 336/* Find a symbol and return it, along with, (optional) crc and
332 * (optional) module which owns it */ 337 * (optional) module which owns it. Needs preempt disabled or module_mutex. */
333const struct kernel_symbol *find_symbol(const char *name, 338const struct kernel_symbol *find_symbol(const char *name,
334 struct module **owner, 339 struct module **owner,
335 const unsigned long **crc, 340 const unsigned long **crc,
@@ -370,54 +375,98 @@ EXPORT_SYMBOL_GPL(find_module);
370 375
371#ifdef CONFIG_SMP 376#ifdef CONFIG_SMP
372 377
373static void *percpu_modalloc(unsigned long size, unsigned long align, 378static inline void __percpu *mod_percpu(struct module *mod)
374 const char *name)
375{ 379{
376 void *ptr; 380 return mod->percpu;
381}
377 382
383static int percpu_modalloc(struct module *mod,
384 unsigned long size, unsigned long align)
385{
378 if (align > PAGE_SIZE) { 386 if (align > PAGE_SIZE) {
379 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", 387 printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
380 name, align, PAGE_SIZE); 388 mod->name, align, PAGE_SIZE);
381 align = PAGE_SIZE; 389 align = PAGE_SIZE;
382 } 390 }
383 391
384 ptr = __alloc_reserved_percpu(size, align); 392 mod->percpu = __alloc_reserved_percpu(size, align);
385 if (!ptr) 393 if (!mod->percpu) {
386 printk(KERN_WARNING 394 printk(KERN_WARNING
387 "Could not allocate %lu bytes percpu data\n", size); 395 "Could not allocate %lu bytes percpu data\n", size);
388 return ptr; 396 return -ENOMEM;
397 }
398 mod->percpu_size = size;
399 return 0;
389} 400}
390 401
391static void percpu_modfree(void *freeme) 402static void percpu_modfree(struct module *mod)
392{ 403{
393 free_percpu(freeme); 404 free_percpu(mod->percpu);
394} 405}
395 406
396static unsigned int find_pcpusec(Elf_Ehdr *hdr, 407static unsigned int find_pcpusec(Elf_Ehdr *hdr,
397 Elf_Shdr *sechdrs, 408 Elf_Shdr *sechdrs,
398 const char *secstrings) 409 const char *secstrings)
399{ 410{
400 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 411 return find_sec(hdr, sechdrs, secstrings, ".data..percpu");
401} 412}
402 413
403static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) 414static void percpu_modcopy(struct module *mod,
415 const void *from, unsigned long size)
404{ 416{
405 int cpu; 417 int cpu;
406 418
407 for_each_possible_cpu(cpu) 419 for_each_possible_cpu(cpu)
408 memcpy(pcpudest + per_cpu_offset(cpu), from, size); 420 memcpy(per_cpu_ptr(mod->percpu, cpu), from, size);
421}
422
423/**
424 * is_module_percpu_address - test whether address is from module static percpu
425 * @addr: address to test
426 *
427 * Test whether @addr belongs to module static percpu area.
428 *
429 * RETURNS:
430 * %true if @addr is from module static percpu area
431 */
432bool is_module_percpu_address(unsigned long addr)
433{
434 struct module *mod;
435 unsigned int cpu;
436
437 preempt_disable();
438
439 list_for_each_entry_rcu(mod, &modules, list) {
440 if (!mod->percpu_size)
441 continue;
442 for_each_possible_cpu(cpu) {
443 void *start = per_cpu_ptr(mod->percpu, cpu);
444
445 if ((void *)addr >= start &&
446 (void *)addr < start + mod->percpu_size) {
447 preempt_enable();
448 return true;
449 }
450 }
451 }
452
453 preempt_enable();
454 return false;
409} 455}
410 456
411#else /* ... !CONFIG_SMP */ 457#else /* ... !CONFIG_SMP */
412 458
413static inline void *percpu_modalloc(unsigned long size, unsigned long align, 459static inline void __percpu *mod_percpu(struct module *mod)
414 const char *name)
415{ 460{
416 return NULL; 461 return NULL;
417} 462}
418static inline void percpu_modfree(void *pcpuptr) 463static inline int percpu_modalloc(struct module *mod,
464 unsigned long size, unsigned long align)
465{
466 return -ENOMEM;
467}
468static inline void percpu_modfree(struct module *mod)
419{ 469{
420 BUG();
421} 470}
422static inline unsigned int find_pcpusec(Elf_Ehdr *hdr, 471static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
423 Elf_Shdr *sechdrs, 472 Elf_Shdr *sechdrs,
@@ -425,12 +474,16 @@ static inline unsigned int find_pcpusec(Elf_Ehdr *hdr,
425{ 474{
426 return 0; 475 return 0;
427} 476}
428static inline void percpu_modcopy(void *pcpudst, const void *src, 477static inline void percpu_modcopy(struct module *mod,
429 unsigned long size) 478 const void *from, unsigned long size)
430{ 479{
431 /* pcpusec should be 0, and size of that section should be 0. */ 480 /* pcpusec should be 0, and size of that section should be 0. */
432 BUG_ON(size != 0); 481 BUG_ON(size != 0);
433} 482}
483bool is_module_percpu_address(unsigned long addr)
484{
485 return false;
486}
434 487
435#endif /* CONFIG_SMP */ 488#endif /* CONFIG_SMP */
436 489
@@ -467,35 +520,34 @@ MODINFO_ATTR(srcversion);
467static char last_unloaded_module[MODULE_NAME_LEN+1]; 520static char last_unloaded_module[MODULE_NAME_LEN+1];
468 521
469#ifdef CONFIG_MODULE_UNLOAD 522#ifdef CONFIG_MODULE_UNLOAD
523
524EXPORT_TRACEPOINT_SYMBOL(module_get);
525
470/* Init the unload section of the module. */ 526/* Init the unload section of the module. */
471static void module_unload_init(struct module *mod) 527static void module_unload_init(struct module *mod)
472{ 528{
473 int cpu; 529 int cpu;
474 530
475 INIT_LIST_HEAD(&mod->modules_which_use_me); 531 INIT_LIST_HEAD(&mod->source_list);
476 for_each_possible_cpu(cpu) 532 INIT_LIST_HEAD(&mod->target_list);
477 per_cpu_ptr(mod->refptr, cpu)->count = 0; 533 for_each_possible_cpu(cpu) {
534 per_cpu_ptr(mod->refptr, cpu)->incs = 0;
535 per_cpu_ptr(mod->refptr, cpu)->decs = 0;
536 }
478 537
479 /* Hold reference count during initialization. */ 538 /* Hold reference count during initialization. */
480 __this_cpu_write(mod->refptr->count, 1); 539 __this_cpu_write(mod->refptr->incs, 1);
481 /* Backwards compatibility macros put refcount during init. */ 540 /* Backwards compatibility macros put refcount during init. */
482 mod->waiter = current; 541 mod->waiter = current;
483} 542}
484 543
485/* modules using other modules */
486struct module_use
487{
488 struct list_head list;
489 struct module *module_which_uses;
490};
491
492/* Does a already use b? */ 544/* Does a already use b? */
493static int already_uses(struct module *a, struct module *b) 545static int already_uses(struct module *a, struct module *b)
494{ 546{
495 struct module_use *use; 547 struct module_use *use;
496 548
497 list_for_each_entry(use, &b->modules_which_use_me, list) { 549 list_for_each_entry(use, &b->source_list, source_list) {
498 if (use->module_which_uses == a) { 550 if (use->source == a) {
499 DEBUGP("%s uses %s!\n", a->name, b->name); 551 DEBUGP("%s uses %s!\n", a->name, b->name);
500 return 1; 552 return 1;
501 } 553 }
@@ -504,62 +556,68 @@ static int already_uses(struct module *a, struct module *b)
504 return 0; 556 return 0;
505} 557}
506 558
507/* Module a uses b */ 559/*
508int use_module(struct module *a, struct module *b) 560 * Module a uses b
561 * - we add 'a' as a "source", 'b' as a "target" of module use
562 * - the module_use is added to the list of 'b' sources (so
563 * 'b' can walk the list to see who sourced them), and of 'a'
564 * targets (so 'a' can see what modules it targets).
565 */
566static int add_module_usage(struct module *a, struct module *b)
509{ 567{
510 struct module_use *use; 568 struct module_use *use;
511 int no_warn, err;
512 569
513 if (b == NULL || already_uses(a, b)) return 1; 570 DEBUGP("Allocating new usage for %s.\n", a->name);
571 use = kmalloc(sizeof(*use), GFP_ATOMIC);
572 if (!use) {
573 printk(KERN_WARNING "%s: out of memory loading\n", a->name);
574 return -ENOMEM;
575 }
576
577 use->source = a;
578 use->target = b;
579 list_add(&use->source_list, &b->source_list);
580 list_add(&use->target_list, &a->target_list);
581 return 0;
582}
514 583
515 /* If we're interrupted or time out, we fail. */ 584/* Module a uses b: caller needs module_mutex() */
516 if (wait_event_interruptible_timeout( 585int ref_module(struct module *a, struct module *b)
517 module_wq, (err = strong_try_module_get(b)) != -EBUSY, 586{
518 30 * HZ) <= 0) { 587 int err;
519 printk("%s: gave up waiting for init of module %s.\n", 588
520 a->name, b->name); 589 if (b == NULL || already_uses(a, b))
521 return 0; 590 return 0;
522 }
523 591
524 /* If strong_try_module_get() returned a different error, we fail. */ 592 /* If module isn't available, we fail. */
593 err = strong_try_module_get(b);
525 if (err) 594 if (err)
526 return 0; 595 return err;
527 596
528 DEBUGP("Allocating new usage for %s.\n", a->name); 597 err = add_module_usage(a, b);
529 use = kmalloc(sizeof(*use), GFP_ATOMIC); 598 if (err) {
530 if (!use) {
531 printk("%s: out of memory loading\n", a->name);
532 module_put(b); 599 module_put(b);
533 return 0; 600 return err;
534 } 601 }
535 602 return 0;
536 use->module_which_uses = a;
537 list_add(&use->list, &b->modules_which_use_me);
538 no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
539 return 1;
540} 603}
541EXPORT_SYMBOL_GPL(use_module); 604EXPORT_SYMBOL_GPL(ref_module);
542 605
543/* Clear the unload stuff of the module. */ 606/* Clear the unload stuff of the module. */
544static void module_unload_free(struct module *mod) 607static void module_unload_free(struct module *mod)
545{ 608{
546 struct module *i; 609 struct module_use *use, *tmp;
547
548 list_for_each_entry(i, &modules, list) {
549 struct module_use *use;
550 610
551 list_for_each_entry(use, &i->modules_which_use_me, list) { 611 mutex_lock(&module_mutex);
552 if (use->module_which_uses == mod) { 612 list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) {
553 DEBUGP("%s unusing %s\n", mod->name, i->name); 613 struct module *i = use->target;
554 module_put(i); 614 DEBUGP("%s unusing %s\n", mod->name, i->name);
555 list_del(&use->list); 615 module_put(i);
556 kfree(use); 616 list_del(&use->source_list);
557 sysfs_remove_link(i->holders_dir, mod->name); 617 list_del(&use->target_list);
558 /* There can be at most one match. */ 618 kfree(use);
559 break;
560 }
561 }
562 } 619 }
620 mutex_unlock(&module_mutex);
563} 621}
564 622
565#ifdef CONFIG_MODULE_FORCE_UNLOAD 623#ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -616,12 +674,28 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
616 674
617unsigned int module_refcount(struct module *mod) 675unsigned int module_refcount(struct module *mod)
618{ 676{
619 unsigned int total = 0; 677 unsigned int incs = 0, decs = 0;
620 int cpu; 678 int cpu;
621 679
622 for_each_possible_cpu(cpu) 680 for_each_possible_cpu(cpu)
623 total += per_cpu_ptr(mod->refptr, cpu)->count; 681 decs += per_cpu_ptr(mod->refptr, cpu)->decs;
624 return total; 682 /*
683 * ensure the incs are added up after the decs.
684 * module_put ensures incs are visible before decs with smp_wmb.
685 *
686 * This 2-count scheme avoids the situation where the refcount
687 * for CPU0 is read, then CPU0 increments the module refcount,
688 * then CPU1 drops that refcount, then the refcount for CPU1 is
689 * read. We would record a decrement but not its corresponding
690 * increment so we would see a low count (disaster).
691 *
692 * Rare situation? But module_refcount can be preempted, and we
693 * might be tallying up 4096+ CPUs. So it is not impossible.
694 */
695 smp_rmb();
696 for_each_possible_cpu(cpu)
697 incs += per_cpu_ptr(mod->refptr, cpu)->incs;
698 return incs - decs;
625} 699}
626EXPORT_SYMBOL(module_refcount); 700EXPORT_SYMBOL(module_refcount);
627 701
@@ -657,16 +731,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
657 return -EFAULT; 731 return -EFAULT;
658 name[MODULE_NAME_LEN-1] = '\0'; 732 name[MODULE_NAME_LEN-1] = '\0';
659 733
660 /* Create stop_machine threads since free_module relies on 734 if (mutex_lock_interruptible(&module_mutex) != 0)
661 * a non-failing stop_machine call. */ 735 return -EINTR;
662 ret = stop_machine_create();
663 if (ret)
664 return ret;
665
666 if (mutex_lock_interruptible(&module_mutex) != 0) {
667 ret = -EINTR;
668 goto out_stop;
669 }
670 736
671 mod = find_module(name); 737 mod = find_module(name);
672 if (!mod) { 738 if (!mod) {
@@ -674,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
674 goto out; 740 goto out;
675 } 741 }
676 742
677 if (!list_empty(&mod->modules_which_use_me)) { 743 if (!list_empty(&mod->source_list)) {
678 /* Other modules depend on us: get rid of them first. */ 744 /* Other modules depend on us: get rid of them first. */
679 ret = -EWOULDBLOCK; 745 ret = -EWOULDBLOCK;
680 goto out; 746 goto out;
@@ -718,16 +784,14 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
718 blocking_notifier_call_chain(&module_notify_list, 784 blocking_notifier_call_chain(&module_notify_list,
719 MODULE_STATE_GOING, mod); 785 MODULE_STATE_GOING, mod);
720 async_synchronize_full(); 786 async_synchronize_full();
721 mutex_lock(&module_mutex); 787
722 /* Store the name of the last unloaded module for diagnostic purposes */ 788 /* Store the name of the last unloaded module for diagnostic purposes */
723 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 789 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
724 ddebug_remove_module(mod->name);
725 free_module(mod);
726 790
727 out: 791 free_module(mod);
792 return 0;
793out:
728 mutex_unlock(&module_mutex); 794 mutex_unlock(&module_mutex);
729out_stop:
730 stop_machine_destroy();
731 return ret; 795 return ret;
732} 796}
733 797
@@ -740,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
740 804
741 /* Always include a trailing , so userspace can differentiate 805 /* Always include a trailing , so userspace can differentiate
742 between this and the old multi-field proc format. */ 806 between this and the old multi-field proc format. */
743 list_for_each_entry(use, &mod->modules_which_use_me, list) { 807 list_for_each_entry(use, &mod->source_list, source_list) {
744 printed_something = 1; 808 printed_something = 1;
745 seq_printf(m, "%s,", use->module_which_uses->name); 809 seq_printf(m, "%s,", use->source->name);
746 } 810 }
747 811
748 if (mod->init != NULL && mod->exit == NULL) { 812 if (mod->init != NULL && mod->exit == NULL) {
@@ -798,10 +862,10 @@ void module_put(struct module *module)
798{ 862{
799 if (module) { 863 if (module) {
800 preempt_disable(); 864 preempt_disable();
801 __this_cpu_dec(module->refptr->count); 865 smp_wmb(); /* see comment in module_refcount */
866 __this_cpu_inc(module->refptr->decs);
802 867
803 trace_module_put(module, _RET_IP_, 868 trace_module_put(module, _RET_IP_);
804 __this_cpu_read(module->refptr->count));
805 /* Maybe they're waiting for us to drop reference? */ 869 /* Maybe they're waiting for us to drop reference? */
806 if (unlikely(!module_is_live(module))) 870 if (unlikely(!module_is_live(module)))
807 wake_up_process(module->waiter); 871 wake_up_process(module->waiter);
@@ -821,11 +885,11 @@ static inline void module_unload_free(struct module *mod)
821{ 885{
822} 886}
823 887
824int use_module(struct module *a, struct module *b) 888int ref_module(struct module *a, struct module *b)
825{ 889{
826 return strong_try_module_get(b) == 0; 890 return strong_try_module_get(b);
827} 891}
828EXPORT_SYMBOL_GPL(use_module); 892EXPORT_SYMBOL_GPL(ref_module);
829 893
830static inline void module_unload_init(struct module *mod) 894static inline void module_unload_init(struct module *mod)
831{ 895{
@@ -942,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
942{ 1006{
943 const unsigned long *crc; 1007 const unsigned long *crc;
944 1008
1009 /* Since this should be found in kernel (which can't be removed),
1010 * no locking is necessary. */
945 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, 1011 if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL,
946 &crc, true, false)) 1012 &crc, true, false))
947 BUG(); 1013 BUG();
@@ -984,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic,
984} 1050}
985#endif /* CONFIG_MODVERSIONS */ 1051#endif /* CONFIG_MODVERSIONS */
986 1052
987/* Resolve a symbol for this module. I.e. if we find one, record usage. 1053/* Resolve a symbol for this module. I.e. if we find one, record usage. */
988 Must be holding module_mutex. */
989static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, 1054static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
990 unsigned int versindex, 1055 unsigned int versindex,
991 const char *name, 1056 const char *name,
992 struct module *mod) 1057 struct module *mod,
1058 char ownername[])
993{ 1059{
994 struct module *owner; 1060 struct module *owner;
995 const struct kernel_symbol *sym; 1061 const struct kernel_symbol *sym;
996 const unsigned long *crc; 1062 const unsigned long *crc;
1063 int err;
997 1064
1065 mutex_lock(&module_mutex);
998 sym = find_symbol(name, &owner, &crc, 1066 sym = find_symbol(name, &owner, &crc,
999 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); 1067 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1000 /* use_module can fail due to OOM, 1068 if (!sym)
1001 or module initialization or unloading */ 1069 goto unlock;
1002 if (sym) { 1070
1003 if (!check_version(sechdrs, versindex, name, mod, crc, owner) 1071 if (!check_version(sechdrs, versindex, name, mod, crc, owner)) {
1004 || !use_module(mod, owner)) 1072 sym = ERR_PTR(-EINVAL);
1005 sym = NULL; 1073 goto getname;
1006 } 1074 }
1075
1076 err = ref_module(mod, owner);
1077 if (err) {
1078 sym = ERR_PTR(err);
1079 goto getname;
1080 }
1081
1082getname:
1083 /* We must make copy under the lock if we failed to get ref. */
1084 strncpy(ownername, module_name(owner), MODULE_NAME_LEN);
1085unlock:
1086 mutex_unlock(&module_mutex);
1007 return sym; 1087 return sym;
1008} 1088}
1009 1089
1090static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs,
1091 unsigned int versindex,
1092 const char *name,
1093 struct module *mod)
1094{
1095 const struct kernel_symbol *ksym;
1096 char ownername[MODULE_NAME_LEN];
1097
1098 if (wait_event_interruptible_timeout(module_wq,
1099 !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name,
1100 mod, ownername)) ||
1101 PTR_ERR(ksym) != -EBUSY,
1102 30 * HZ) <= 0) {
1103 printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n",
1104 mod->name, ownername);
1105 }
1106 return ksym;
1107}
1108
1010/* 1109/*
1011 * /sys/module/foo/sections stuff 1110 * /sys/module/foo/sections stuff
1012 * J. Corbet <corbet@lwn.net> 1111 * J. Corbet <corbet@lwn.net>
@@ -1125,7 +1224,7 @@ struct module_notes_attrs {
1125 struct bin_attribute attrs[0]; 1224 struct bin_attribute attrs[0];
1126}; 1225};
1127 1226
1128static ssize_t module_notes_read(struct kobject *kobj, 1227static ssize_t module_notes_read(struct file *filp, struct kobject *kobj,
1129 struct bin_attribute *bin_attr, 1228 struct bin_attribute *bin_attr,
1130 char *buf, loff_t pos, size_t count) 1229 char *buf, loff_t pos, size_t count)
1131{ 1230{
@@ -1236,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod)
1236#endif 1335#endif
1237 1336
1238#ifdef CONFIG_SYSFS 1337#ifdef CONFIG_SYSFS
1239int module_add_modinfo_attrs(struct module *mod) 1338static void add_usage_links(struct module *mod)
1339{
1340#ifdef CONFIG_MODULE_UNLOAD
1341 struct module_use *use;
1342 int nowarn;
1343
1344 mutex_lock(&module_mutex);
1345 list_for_each_entry(use, &mod->target_list, target_list) {
1346 nowarn = sysfs_create_link(use->target->holders_dir,
1347 &mod->mkobj.kobj, mod->name);
1348 }
1349 mutex_unlock(&module_mutex);
1350#endif
1351}
1352
1353static void del_usage_links(struct module *mod)
1354{
1355#ifdef CONFIG_MODULE_UNLOAD
1356 struct module_use *use;
1357
1358 mutex_lock(&module_mutex);
1359 list_for_each_entry(use, &mod->target_list, target_list)
1360 sysfs_remove_link(use->target->holders_dir, mod->name);
1361 mutex_unlock(&module_mutex);
1362#endif
1363}
1364
1365static int module_add_modinfo_attrs(struct module *mod)
1240{ 1366{
1241 struct module_attribute *attr; 1367 struct module_attribute *attr;
1242 struct module_attribute *temp_attr; 1368 struct module_attribute *temp_attr;
@@ -1262,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod)
1262 return error; 1388 return error;
1263} 1389}
1264 1390
1265void module_remove_modinfo_attrs(struct module *mod) 1391static void module_remove_modinfo_attrs(struct module *mod)
1266{ 1392{
1267 struct module_attribute *attr; 1393 struct module_attribute *attr;
1268 int i; 1394 int i;
@@ -1278,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1278 kfree(mod->modinfo_attrs); 1404 kfree(mod->modinfo_attrs);
1279} 1405}
1280 1406
1281int mod_sysfs_init(struct module *mod) 1407static int mod_sysfs_init(struct module *mod)
1282{ 1408{
1283 int err; 1409 int err;
1284 struct kobject *kobj; 1410 struct kobject *kobj;
@@ -1312,12 +1438,16 @@ out:
1312 return err; 1438 return err;
1313} 1439}
1314 1440
1315int mod_sysfs_setup(struct module *mod, 1441static int mod_sysfs_setup(struct module *mod,
1316 struct kernel_param *kparam, 1442 struct kernel_param *kparam,
1317 unsigned int num_params) 1443 unsigned int num_params)
1318{ 1444{
1319 int err; 1445 int err;
1320 1446
1447 err = mod_sysfs_init(mod);
1448 if (err)
1449 goto out;
1450
1321 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); 1451 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1322 if (!mod->holders_dir) { 1452 if (!mod->holders_dir) {
1323 err = -ENOMEM; 1453 err = -ENOMEM;
@@ -1332,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod,
1332 if (err) 1462 if (err)
1333 goto out_unreg_param; 1463 goto out_unreg_param;
1334 1464
1465 add_usage_links(mod);
1466
1335 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); 1467 kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
1336 return 0; 1468 return 0;
1337 1469
@@ -1341,6 +1473,7 @@ out_unreg_holders:
1341 kobject_put(mod->holders_dir); 1473 kobject_put(mod->holders_dir);
1342out_unreg: 1474out_unreg:
1343 kobject_put(&mod->mkobj.kobj); 1475 kobject_put(&mod->mkobj.kobj);
1476out:
1344 return err; 1477 return err;
1345} 1478}
1346 1479
@@ -1351,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod)
1351 1484
1352#else /* CONFIG_SYSFS */ 1485#else /* CONFIG_SYSFS */
1353 1486
1487static inline int mod_sysfs_init(struct module *mod)
1488{
1489 return 0;
1490}
1491
1492static inline int mod_sysfs_setup(struct module *mod,
1493 struct kernel_param *kparam,
1494 unsigned int num_params)
1495{
1496 return 0;
1497}
1498
1499static inline int module_add_modinfo_attrs(struct module *mod)
1500{
1501 return 0;
1502}
1503
1504static inline void module_remove_modinfo_attrs(struct module *mod)
1505{
1506}
1507
1354static void mod_sysfs_fini(struct module *mod) 1508static void mod_sysfs_fini(struct module *mod)
1355{ 1509{
1356} 1510}
1357 1511
1512static void del_usage_links(struct module *mod)
1513{
1514}
1515
1358#endif /* CONFIG_SYSFS */ 1516#endif /* CONFIG_SYSFS */
1359 1517
1360static void mod_kobject_remove(struct module *mod) 1518static void mod_kobject_remove(struct module *mod)
1361{ 1519{
1520 del_usage_links(mod);
1362 module_remove_modinfo_attrs(mod); 1521 module_remove_modinfo_attrs(mod);
1363 module_param_sysfs_remove(mod); 1522 module_param_sysfs_remove(mod);
1364 kobject_put(mod->mkobj.drivers_dir); 1523 kobject_put(mod->mkobj.drivers_dir);
@@ -1377,17 +1536,22 @@ static int __unlink_module(void *_mod)
1377 return 0; 1536 return 0;
1378} 1537}
1379 1538
1380/* Free a module, remove from lists, etc (must hold module_mutex). */ 1539/* Free a module, remove from lists, etc. */
1381static void free_module(struct module *mod) 1540static void free_module(struct module *mod)
1382{ 1541{
1383 trace_module_free(mod); 1542 trace_module_free(mod);
1384 1543
1385 /* Delete from various lists */ 1544 /* Delete from various lists */
1545 mutex_lock(&module_mutex);
1386 stop_machine(__unlink_module, mod, NULL); 1546 stop_machine(__unlink_module, mod, NULL);
1547 mutex_unlock(&module_mutex);
1387 remove_notes_attrs(mod); 1548 remove_notes_attrs(mod);
1388 remove_sect_attrs(mod); 1549 remove_sect_attrs(mod);
1389 mod_kobject_remove(mod); 1550 mod_kobject_remove(mod);
1390 1551
1552 /* Remove dynamic debug info */
1553 ddebug_remove_module(mod->name);
1554
1391 /* Arch-specific cleanup. */ 1555 /* Arch-specific cleanup. */
1392 module_arch_cleanup(mod); 1556 module_arch_cleanup(mod);
1393 1557
@@ -1400,8 +1564,7 @@ static void free_module(struct module *mod)
1400 /* This may be NULL, but that's OK */ 1564 /* This may be NULL, but that's OK */
1401 module_free(mod, mod->module_init); 1565 module_free(mod, mod->module_init);
1402 kfree(mod->args); 1566 kfree(mod->args);
1403 if (mod->percpu) 1567 percpu_modfree(mod);
1404 percpu_modfree(mod->percpu);
1405#if defined(CONFIG_MODULE_UNLOAD) 1568#if defined(CONFIG_MODULE_UNLOAD)
1406 if (mod->refptr) 1569 if (mod->refptr)
1407 free_percpu(mod->refptr); 1570 free_percpu(mod->refptr);
@@ -1435,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get);
1435/* 1598/*
1436 * Ensure that an exported symbol [global namespace] does not already exist 1599 * Ensure that an exported symbol [global namespace] does not already exist
1437 * in the kernel or in some other module's exported symbol table. 1600 * in the kernel or in some other module's exported symbol table.
1601 *
1602 * You must hold the module_mutex.
1438 */ 1603 */
1439static int verify_export_symbols(struct module *mod) 1604static int verify_export_symbols(struct module *mod)
1440{ 1605{
@@ -1500,27 +1665,29 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1500 break; 1665 break;
1501 1666
1502 case SHN_UNDEF: 1667 case SHN_UNDEF:
1503 ksym = resolve_symbol(sechdrs, versindex, 1668 ksym = resolve_symbol_wait(sechdrs, versindex,
1504 strtab + sym[i].st_name, mod); 1669 strtab + sym[i].st_name,
1670 mod);
1505 /* Ok if resolved. */ 1671 /* Ok if resolved. */
1506 if (ksym) { 1672 if (ksym && !IS_ERR(ksym)) {
1507 sym[i].st_value = ksym->value; 1673 sym[i].st_value = ksym->value;
1508 break; 1674 break;
1509 } 1675 }
1510 1676
1511 /* Ok if weak. */ 1677 /* Ok if weak. */
1512 if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) 1678 if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
1513 break; 1679 break;
1514 1680
1515 printk(KERN_WARNING "%s: Unknown symbol %s\n", 1681 printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n",
1516 mod->name, strtab + sym[i].st_name); 1682 mod->name, strtab + sym[i].st_name,
1517 ret = -ENOENT; 1683 PTR_ERR(ksym));
1684 ret = PTR_ERR(ksym) ?: -ENOENT;
1518 break; 1685 break;
1519 1686
1520 default: 1687 default:
1521 /* Divert to percpu allocation if a percpu var. */ 1688 /* Divert to percpu allocation if a percpu var. */
1522 if (sym[i].st_shndx == pcpuindex) 1689 if (sym[i].st_shndx == pcpuindex)
1523 secbase = (unsigned long)mod->percpu; 1690 secbase = (unsigned long)mod_percpu(mod);
1524 else 1691 else
1525 secbase = sechdrs[sym[i].st_shndx].sh_addr; 1692 secbase = sechdrs[sym[i].st_shndx].sh_addr;
1526 sym[i].st_value += secbase; 1693 sym[i].st_value += secbase;
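The hunk above switches simplify_symbols() to the three-way return convention of resolve_symbol_wait(): NULL (symbol not found anywhere), ERR_PTR(err) (found, but the version check or ref_module() failed), or a valid pointer. A compact sketch of consuming such a value (illustrative only, not code from the patch):

#include <linux/module.h>
#include <linux/err.h>
#include <linux/errno.h>

/* Sketch: dispatch on a "NULL / ERR_PTR(err) / valid pointer" result. */
static int consume_symbol(const struct kernel_symbol *ksym)
{
	if (ksym && !IS_ERR(ksym))
		return 0;		/* resolved: ksym->value is usable */
	if (!ksym)
		return -ENOENT;		/* not exported anywhere */
	return PTR_ERR(ksym);		/* exported, but taking it failed */
}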
@@ -1897,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
1897#endif 2064#endif
1898} 2065}
1899 2066
2067static void dynamic_debug_remove(struct _ddebug *debug)
2068{
2069 if (debug)
2070 ddebug_remove_module(debug->modname);
2071}
2072
1900static void *module_alloc_update_bounds(unsigned long size) 2073static void *module_alloc_update_bounds(unsigned long size)
1901{ 2074{
1902 void *ret = module_alloc(size); 2075 void *ret = module_alloc(size);
1903 2076
1904 if (ret) { 2077 if (ret) {
2078 mutex_lock(&module_mutex);
1905 /* Update module bounds. */ 2079 /* Update module bounds. */
1906 if ((unsigned long)ret < module_addr_min) 2080 if ((unsigned long)ret < module_addr_min)
1907 module_addr_min = (unsigned long)ret; 2081 module_addr_min = (unsigned long)ret;
1908 if ((unsigned long)ret + size > module_addr_max) 2082 if ((unsigned long)ret + size > module_addr_max)
1909 module_addr_max = (unsigned long)ret + size; 2083 module_addr_max = (unsigned long)ret + size;
2084 mutex_unlock(&module_mutex);
1910 } 2085 }
1911 return ret; 2086 return ret;
1912} 2087}
@@ -1954,8 +2129,11 @@ static noinline struct module *load_module(void __user *umod,
1954 unsigned int modindex, versindex, infoindex, pcpuindex; 2129 unsigned int modindex, versindex, infoindex, pcpuindex;
1955 struct module *mod; 2130 struct module *mod;
1956 long err = 0; 2131 long err = 0;
1957 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 2132 void *ptr = NULL; /* Stops spurious gcc warning */
1958 unsigned long symoffs, stroffs, *strmap; 2133 unsigned long symoffs, stroffs, *strmap;
2134 void __percpu *percpu;
2135 struct _ddebug *debug = NULL;
2136 unsigned int num_debug = 0;
1959 2137
1960 mm_segment_t old_fs; 2138 mm_segment_t old_fs;
1961 2139
@@ -2080,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod,
2080 goto free_mod; 2258 goto free_mod;
2081 } 2259 }
2082 2260
2083 if (find_module(mod->name)) {
2084 err = -EEXIST;
2085 goto free_mod;
2086 }
2087
2088 mod->state = MODULE_STATE_COMING; 2261 mod->state = MODULE_STATE_COMING;
2089 2262
2090 /* Allow arches to frob section contents and sizes. */ 2263 /* Allow arches to frob section contents and sizes. */
@@ -2094,16 +2267,14 @@ static noinline struct module *load_module(void __user *umod,
2094 2267
2095 if (pcpuindex) { 2268 if (pcpuindex) {
2096 /* We have a special allocation for this section. */ 2269 /* We have a special allocation for this section. */
2097 percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, 2270 err = percpu_modalloc(mod, sechdrs[pcpuindex].sh_size,
2098 sechdrs[pcpuindex].sh_addralign, 2271 sechdrs[pcpuindex].sh_addralign);
2099 mod->name); 2272 if (err)
2100 if (!percpu) {
2101 err = -ENOMEM;
2102 goto free_mod; 2273 goto free_mod;
2103 }
2104 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 2274 sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
2105 mod->percpu = percpu;
2106 } 2275 }
2276 /* Keep this around for failure path. */
2277 percpu = mod_percpu(mod);
2107 2278
2108 /* Determine total sizes, and put offsets in sh_entsize. For now 2279 /* Determine total sizes, and put offsets in sh_entsize. For now
2109 this is done generically; there doesn't appear to be any 2280 this is done generically; there doesn't appear to be any
@@ -2177,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod,
2177 /* Now we've moved module, initialize linked lists, etc. */ 2348 /* Now we've moved module, initialize linked lists, etc. */
2178 module_unload_init(mod); 2349 module_unload_init(mod);
2179 2350
2180 /* add kobject, so we can reference it. */
2181 err = mod_sysfs_init(mod);
2182 if (err)
2183 goto free_unload;
2184
2185 /* Set up license info based on the info section */ 2351 /* Set up license info based on the info section */
2186 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 2352 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
2187 2353
@@ -2306,18 +2472,13 @@ static noinline struct module *load_module(void __user *umod,
2306 goto cleanup; 2472 goto cleanup;
2307 } 2473 }
2308 2474
2309 /* Find duplicate symbols */
2310 err = verify_export_symbols(mod);
2311 if (err < 0)
2312 goto cleanup;
2313
2314 /* Set up and sort exception table */ 2475 /* Set up and sort exception table */
2315 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", 2476 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2316 sizeof(*mod->extable), &mod->num_exentries); 2477 sizeof(*mod->extable), &mod->num_exentries);
2317 sort_extable(mod->extable, mod->extable + mod->num_exentries); 2478 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2318 2479
2319 /* Finally, copy percpu area over. */ 2480 /* Finally, copy percpu area over. */
2320 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2481 percpu_modcopy(mod, (void *)sechdrs[pcpuindex].sh_addr,
2321 sechdrs[pcpuindex].sh_size); 2482 sechdrs[pcpuindex].sh_size);
2322 2483
2323 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex, 2484 add_kallsyms(mod, sechdrs, hdr->e_shnum, symindex, strindex,
@@ -2325,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod,
2325 kfree(strmap); 2486 kfree(strmap);
2326 strmap = NULL; 2487 strmap = NULL;
2327 2488
2328 if (!mod->taints) { 2489 if (!mod->taints)
2329 struct _ddebug *debug;
2330 unsigned int num_debug;
2331
2332 debug = section_objs(hdr, sechdrs, secstrings, "__verbose", 2490 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2333 sizeof(*debug), &num_debug); 2491 sizeof(*debug), &num_debug);
2334 if (debug)
2335 dynamic_debug_setup(debug, num_debug);
2336 }
2337 2492
2338 err = module_finalize(hdr, sechdrs, mod); 2493 err = module_finalize(hdr, sechdrs, mod);
2339 if (err < 0) 2494 if (err < 0)
@@ -2369,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod,
2369 * function to insert in a way safe to concurrent readers. 2524 * function to insert in a way safe to concurrent readers.
2370 * The mutex protects against concurrent writers. 2525 * The mutex protects against concurrent writers.
2371 */ 2526 */
2527 mutex_lock(&module_mutex);
2528 if (find_module(mod->name)) {
2529 err = -EEXIST;
2530 goto unlock;
2531 }
2532
2533 if (debug)
2534 dynamic_debug_setup(debug, num_debug);
2535
2536 /* Find duplicate symbols */
2537 err = verify_export_symbols(mod);
2538 if (err < 0)
2539 goto ddebug;
2540
2372 list_add_rcu(&mod->list, &modules); 2541 list_add_rcu(&mod->list, &modules);
2542 mutex_unlock(&module_mutex);
2373 2543
2374 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); 2544 err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
2375 if (err < 0) 2545 if (err < 0)
@@ -2378,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod,
2378 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); 2548 err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
2379 if (err < 0) 2549 if (err < 0)
2380 goto unlink; 2550 goto unlink;
2551
2381 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2552 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2382 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2553 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2383 2554
@@ -2390,15 +2561,17 @@ static noinline struct module *load_module(void __user *umod,
2390 return mod; 2561 return mod;
2391 2562
2392 unlink: 2563 unlink:
2564 mutex_lock(&module_mutex);
2393 /* Unlink carefully: kallsyms could be walking list. */ 2565 /* Unlink carefully: kallsyms could be walking list. */
2394 list_del_rcu(&mod->list); 2566 list_del_rcu(&mod->list);
2567 ddebug:
2568 dynamic_debug_remove(debug);
2569 unlock:
2570 mutex_unlock(&module_mutex);
2395 synchronize_sched(); 2571 synchronize_sched();
2396 module_arch_cleanup(mod); 2572 module_arch_cleanup(mod);
2397 cleanup: 2573 cleanup:
2398 free_modinfo(mod); 2574 free_modinfo(mod);
2399 kobject_del(&mod->mkobj.kobj);
2400 kobject_put(&mod->mkobj.kobj);
2401 free_unload:
2402 module_unload_free(mod); 2575 module_unload_free(mod);
2403#if defined(CONFIG_MODULE_UNLOAD) 2576#if defined(CONFIG_MODULE_UNLOAD)
2404 free_percpu(mod->refptr); 2577 free_percpu(mod->refptr);
@@ -2409,8 +2582,7 @@ static noinline struct module *load_module(void __user *umod,
2409 module_free(mod, mod->module_core); 2582 module_free(mod, mod->module_core);
2410 /* mod will be freed with core. Don't access it beyond this line! */ 2583 /* mod will be freed with core. Don't access it beyond this line! */
2411 free_percpu: 2584 free_percpu:
2412 if (percpu) 2585 free_percpu(percpu);
2413 percpu_modfree(percpu);
2414 free_mod: 2586 free_mod:
2415 kfree(args); 2587 kfree(args);
2416 kfree(strmap); 2588 kfree(strmap);
@@ -2446,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2446 if (!capable(CAP_SYS_MODULE) || modules_disabled) 2618 if (!capable(CAP_SYS_MODULE) || modules_disabled)
2447 return -EPERM; 2619 return -EPERM;
2448 2620
2449 /* Only one module load at a time, please */
2450 if (mutex_lock_interruptible(&module_mutex) != 0)
2451 return -EINTR;
2452
2453 /* Do all the hard work */ 2621 /* Do all the hard work */
2454 mod = load_module(umod, len, uargs); 2622 mod = load_module(umod, len, uargs);
2455 if (IS_ERR(mod)) { 2623 if (IS_ERR(mod))
2456 mutex_unlock(&module_mutex);
2457 return PTR_ERR(mod); 2624 return PTR_ERR(mod);
2458 }
2459
2460 /* Drop lock so they can recurse */
2461 mutex_unlock(&module_mutex);
2462 2625
2463 blocking_notifier_call_chain(&module_notify_list, 2626 blocking_notifier_call_chain(&module_notify_list,
2464 MODULE_STATE_COMING, mod); 2627 MODULE_STATE_COMING, mod);
@@ -2475,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2475 module_put(mod); 2638 module_put(mod);
2476 blocking_notifier_call_chain(&module_notify_list, 2639 blocking_notifier_call_chain(&module_notify_list,
2477 MODULE_STATE_GOING, mod); 2640 MODULE_STATE_GOING, mod);
2478 mutex_lock(&module_mutex);
2479 free_module(mod); 2641 free_module(mod);
2480 mutex_unlock(&module_mutex);
2481 wake_up(&module_wq); 2642 wake_up(&module_wq);
2482 return ret; 2643 return ret;
2483 } 2644 }
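The load_module()/free_module() hunks above rely on the usual "writers serialize on a mutex, readers walk the list under RCU" pattern for the global module list. A generic sketch of that pattern, with illustrative names and only standard list/RCU APIs:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/mutex.h>

struct item {
	struct list_head list;
	int value;
};

static DEFINE_MUTEX(example_lock);	/* protects writers only */
static LIST_HEAD(example_list);

static void writer_add(struct item *it)
{
	mutex_lock(&example_lock);
	list_add_rcu(&it->list, &example_list);	/* safe against concurrent readers */
	mutex_unlock(&example_lock);
}

static int reader_sum(void)
{
	struct item *it;
	int sum = 0;

	rcu_read_lock();			/* readers never take the mutex */
	list_for_each_entry_rcu(it, &example_list, list)
		sum += it->value;
	rcu_read_unlock();

	return sum;
}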
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 632f04c57d82..4c0b7b3e6d2e 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
172 struct thread_info *owner; 172 struct thread_info *owner;
173 173
174 /* 174 /*
175 * If we own the BKL, then don't spin. The owner of
176 * the mutex might be waiting on us to release the BKL.
177 */
178 if (unlikely(current->lock_depth >= 0))
179 break;
180
181 /*
175 * If there's an owner, wait for it to either 182 * If there's an owner, wait for it to either
176 * release the lock or go to sleep. 183 * release the lock or go to sleep.
177 */ 184 */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 2ab67233ee8f..f74e6c00e26d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -13,6 +13,7 @@
13 * Pavel Emelianov <xemul@openvz.org> 13 * Pavel Emelianov <xemul@openvz.org>
14 */ 14 */
15 15
16#include <linux/slab.h>
16#include <linux/module.h> 17#include <linux/module.h>
17#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
18#include <linux/init_task.h> 19#include <linux/init_task.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index 93caf65ff57c..751019415d23 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -25,18 +25,20 @@
25#include <linux/padata.h> 25#include <linux/padata.h>
26#include <linux/mutex.h> 26#include <linux/mutex.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/slab.h>
29#include <linux/sysfs.h>
28#include <linux/rcupdate.h> 30#include <linux/rcupdate.h>
29 31
30#define MAX_SEQ_NR INT_MAX - NR_CPUS 32#define MAX_SEQ_NR (INT_MAX - NR_CPUS)
31#define MAX_OBJ_NUM 10000 * NR_CPUS 33#define MAX_OBJ_NUM 1000
32 34
33static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) 35static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
34{ 36{
35 int cpu, target_cpu; 37 int cpu, target_cpu;
36 38
37 target_cpu = cpumask_first(pd->cpumask); 39 target_cpu = cpumask_first(pd->cpumask.pcpu);
38 for (cpu = 0; cpu < cpu_index; cpu++) 40 for (cpu = 0; cpu < cpu_index; cpu++)
39 target_cpu = cpumask_next(target_cpu, pd->cpumask); 41 target_cpu = cpumask_next(target_cpu, pd->cpumask.pcpu);
40 42
41 return target_cpu; 43 return target_cpu;
42} 44}
@@ -52,26 +54,27 @@ static int padata_cpu_hash(struct padata_priv *padata)
52 * Hash the sequence numbers to the cpus by taking 54 * Hash the sequence numbers to the cpus by taking
53 * seq_nr mod. number of cpus in use. 55 * seq_nr mod. number of cpus in use.
54 */ 56 */
55 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); 57 cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask.pcpu);
56 58
57 return padata_index_to_cpu(pd, cpu_index); 59 return padata_index_to_cpu(pd, cpu_index);
58} 60}
59 61
60static void padata_parallel_worker(struct work_struct *work) 62static void padata_parallel_worker(struct work_struct *parallel_work)
61{ 63{
62 struct padata_queue *queue; 64 struct padata_parallel_queue *pqueue;
63 struct parallel_data *pd; 65 struct parallel_data *pd;
64 struct padata_instance *pinst; 66 struct padata_instance *pinst;
65 LIST_HEAD(local_list); 67 LIST_HEAD(local_list);
66 68
67 local_bh_disable(); 69 local_bh_disable();
68 queue = container_of(work, struct padata_queue, pwork); 70 pqueue = container_of(parallel_work,
69 pd = queue->pd; 71 struct padata_parallel_queue, work);
72 pd = pqueue->pd;
70 pinst = pd->pinst; 73 pinst = pd->pinst;
71 74
72 spin_lock(&queue->parallel.lock); 75 spin_lock(&pqueue->parallel.lock);
73 list_replace_init(&queue->parallel.list, &local_list); 76 list_replace_init(&pqueue->parallel.list, &local_list);
74 spin_unlock(&queue->parallel.lock); 77 spin_unlock(&pqueue->parallel.lock);
75 78
76 while (!list_empty(&local_list)) { 79 while (!list_empty(&local_list)) {
77 struct padata_priv *padata; 80 struct padata_priv *padata;
@@ -87,13 +90,13 @@ static void padata_parallel_worker(struct work_struct *work)
87 local_bh_enable(); 90 local_bh_enable();
88} 91}
89 92
90/* 93/**
91 * padata_do_parallel - padata parallelization function 94 * padata_do_parallel - padata parallelization function
92 * 95 *
93 * @pinst: padata instance 96 * @pinst: padata instance
94 * @padata: object to be parallelized 97 * @padata: object to be parallelized
95 * @cb_cpu: cpu the serialization callback function will run on, 98 * @cb_cpu: cpu the serialization callback function will run on,
96 * must be in the cpumask of padata. 99 * must be in the serial cpumask of padata(i.e. cpumask.cbcpu).
97 * 100 *
98 * The parallelization callback function will run with BHs off. 101 * The parallelization callback function will run with BHs off.
99 * Note: Every object which is parallelized by padata_do_parallel 102 * Note: Every object which is parallelized by padata_do_parallel
@@ -103,15 +106,18 @@ int padata_do_parallel(struct padata_instance *pinst,
103 struct padata_priv *padata, int cb_cpu) 106 struct padata_priv *padata, int cb_cpu)
104{ 107{
105 int target_cpu, err; 108 int target_cpu, err;
106 struct padata_queue *queue; 109 struct padata_parallel_queue *queue;
107 struct parallel_data *pd; 110 struct parallel_data *pd;
108 111
109 rcu_read_lock_bh(); 112 rcu_read_lock_bh();
110 113
111 pd = rcu_dereference(pinst->pd); 114 pd = rcu_dereference(pinst->pd);
112 115
113 err = 0; 116 err = -EINVAL;
114 if (!(pinst->flags & PADATA_INIT)) 117 if (!(pinst->flags & PADATA_INIT) || pinst->flags & PADATA_INVALID)
118 goto out;
119
120 if (!cpumask_test_cpu(cb_cpu, pd->cpumask.cbcpu))
115 goto out; 121 goto out;
116 122
117 err = -EBUSY; 123 err = -EBUSY;
@@ -121,11 +127,7 @@ int padata_do_parallel(struct padata_instance *pinst,
121 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) 127 if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM)
122 goto out; 128 goto out;
123 129
124 err = -EINVAL; 130 err = 0;
125 if (!cpumask_test_cpu(cb_cpu, pd->cpumask))
126 goto out;
127
128 err = -EINPROGRESS;
129 atomic_inc(&pd->refcnt); 131 atomic_inc(&pd->refcnt);
130 padata->pd = pd; 132 padata->pd = pd;
131 padata->cb_cpu = cb_cpu; 133 padata->cb_cpu = cb_cpu;
@@ -136,13 +138,13 @@ int padata_do_parallel(struct padata_instance *pinst,
136 padata->seq_nr = atomic_inc_return(&pd->seq_nr); 138 padata->seq_nr = atomic_inc_return(&pd->seq_nr);
137 139
138 target_cpu = padata_cpu_hash(padata); 140 target_cpu = padata_cpu_hash(padata);
139 queue = per_cpu_ptr(pd->queue, target_cpu); 141 queue = per_cpu_ptr(pd->pqueue, target_cpu);
140 142
141 spin_lock(&queue->parallel.lock); 143 spin_lock(&queue->parallel.lock);
142 list_add_tail(&padata->list, &queue->parallel.list); 144 list_add_tail(&padata->list, &queue->parallel.list);
143 spin_unlock(&queue->parallel.lock); 145 spin_unlock(&queue->parallel.lock);
144 146
145 queue_work_on(target_cpu, pinst->wq, &queue->pwork); 147 queue_work_on(target_cpu, pinst->wq, &queue->work);
146 148
147out: 149out:
148 rcu_read_unlock_bh(); 150 rcu_read_unlock_bh();
@@ -151,86 +153,72 @@ out:
151} 153}
152EXPORT_SYMBOL(padata_do_parallel); 154EXPORT_SYMBOL(padata_do_parallel);
153 155
156/*
157 * padata_get_next - Get the next object that needs serialization.
158 *
159 * Return values are:
160 *
161 * A pointer to the control struct of the next object that needs
162 * serialization, if present in one of the percpu reorder queues.
163 *
164 * NULL, if all percpu reorder queues are empty.
165 *
166 * -EINPROGRESS, if the next object that needs serialization will
167 * be parallel processed by another cpu and is not yet present in
168 * the cpu's reorder queue.
169 *
170 * -ENODATA, if this cpu has to do the parallel processing for
171 * the next object.
172 */
154static struct padata_priv *padata_get_next(struct parallel_data *pd) 173static struct padata_priv *padata_get_next(struct parallel_data *pd)
155{ 174{
156 int cpu, num_cpus, empty, calc_seq_nr; 175 int cpu, num_cpus;
157 int seq_nr, next_nr, overrun, next_overrun; 176 int next_nr, next_index;
158 struct padata_queue *queue, *next_queue; 177 struct padata_parallel_queue *queue, *next_queue;
159 struct padata_priv *padata; 178 struct padata_priv *padata;
160 struct padata_list *reorder; 179 struct padata_list *reorder;
161 180
162 empty = 0; 181 num_cpus = cpumask_weight(pd->cpumask.pcpu);
163 next_nr = -1;
164 next_overrun = 0;
165 next_queue = NULL;
166
167 num_cpus = cpumask_weight(pd->cpumask);
168
169 for_each_cpu(cpu, pd->cpumask) {
170 queue = per_cpu_ptr(pd->queue, cpu);
171 reorder = &queue->reorder;
172
173 /*
174 * Calculate the seq_nr of the object that should be
175 * next in this queue.
176 */
177 overrun = 0;
178 calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
179 + queue->cpu_index;
180
181 if (unlikely(calc_seq_nr > pd->max_seq_nr)) {
182 calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1;
183 overrun = 1;
184 }
185
186 if (!list_empty(&reorder->list)) {
187 padata = list_entry(reorder->list.next,
188 struct padata_priv, list);
189
190 seq_nr = padata->seq_nr;
191 BUG_ON(calc_seq_nr != seq_nr);
192 } else {
193 seq_nr = calc_seq_nr;
194 empty++;
195 }
196 182
197 if (next_nr < 0 || seq_nr < next_nr 183 /*
198 || (next_overrun && !overrun)) { 184 * Calculate the percpu reorder queue and the sequence
199 next_nr = seq_nr; 185 * number of the next object.
200 next_overrun = overrun; 186 */
201 next_queue = queue; 187 next_nr = pd->processed;
202 } 188 next_index = next_nr % num_cpus;
189 cpu = padata_index_to_cpu(pd, next_index);
190 next_queue = per_cpu_ptr(pd->pqueue, cpu);
191
192 if (unlikely(next_nr > pd->max_seq_nr)) {
193 next_nr = next_nr - pd->max_seq_nr - 1;
194 next_index = next_nr % num_cpus;
195 cpu = padata_index_to_cpu(pd, next_index);
196 next_queue = per_cpu_ptr(pd->pqueue, cpu);
197 pd->processed = 0;
203 } 198 }
204 199
205 padata = NULL; 200 padata = NULL;
206 201
207 if (empty == num_cpus)
208 goto out;
209
210 reorder = &next_queue->reorder; 202 reorder = &next_queue->reorder;
211 203
212 if (!list_empty(&reorder->list)) { 204 if (!list_empty(&reorder->list)) {
213 padata = list_entry(reorder->list.next, 205 padata = list_entry(reorder->list.next,
214 struct padata_priv, list); 206 struct padata_priv, list);
215 207
216 if (unlikely(next_overrun)) { 208 BUG_ON(next_nr != padata->seq_nr);
217 for_each_cpu(cpu, pd->cpumask) {
218 queue = per_cpu_ptr(pd->queue, cpu);
219 atomic_set(&queue->num_obj, 0);
220 }
221 }
222 209
223 spin_lock(&reorder->lock); 210 spin_lock(&reorder->lock);
224 list_del_init(&padata->list); 211 list_del_init(&padata->list);
225 atomic_dec(&pd->reorder_objects); 212 atomic_dec(&pd->reorder_objects);
226 spin_unlock(&reorder->lock); 213 spin_unlock(&reorder->lock);
227 214
228 atomic_inc(&next_queue->num_obj); 215 pd->processed++;
229 216
230 goto out; 217 goto out;
231 } 218 }
232 219
233 if (next_nr % num_cpus == next_queue->cpu_index) { 220 queue = per_cpu_ptr(pd->pqueue, smp_processor_id());
221 if (queue->cpu_index == next_queue->cpu_index) {
234 padata = ERR_PTR(-ENODATA); 222 padata = ERR_PTR(-ENODATA);
235 goto out; 223 goto out;
236 } 224 }
@@ -243,55 +231,90 @@ out:
243static void padata_reorder(struct parallel_data *pd) 231static void padata_reorder(struct parallel_data *pd)
244{ 232{
245 struct padata_priv *padata; 233 struct padata_priv *padata;
246 struct padata_queue *queue; 234 struct padata_serial_queue *squeue;
247 struct padata_instance *pinst = pd->pinst; 235 struct padata_instance *pinst = pd->pinst;
248 236
249try_again: 237 /*
238 * We need to ensure that only one cpu can work on dequeueing of
239 * the reorder queue the time. Calculating in which percpu reorder
240 * queue the next object will arrive takes some time. A spinlock
241 * would be highly contended. Also it is not clear in which order
242 * the objects arrive to the reorder queues. So a cpu could wait to
243 * get the lock just to notice that there is nothing to do at the
244 * moment. Therefore we use a trylock and let the holder of the lock
245 * care for all the objects enqueued during the holdtime of the lock.
246 */
250 if (!spin_trylock_bh(&pd->lock)) 247 if (!spin_trylock_bh(&pd->lock))
251 goto out; 248 return;
252 249
253 while (1) { 250 while (1) {
254 padata = padata_get_next(pd); 251 padata = padata_get_next(pd);
255 252
253 /*
254 * All reorder queues are empty, or the next object that needs
255 * serialization is parallel processed by another cpu and is
 256 * still on its way to the cpu's reorder queue, nothing to
257 * do for now.
258 */
256 if (!padata || PTR_ERR(padata) == -EINPROGRESS) 259 if (!padata || PTR_ERR(padata) == -EINPROGRESS)
257 break; 260 break;
258 261
262 /*
263 * This cpu has to do the parallel processing of the next
264 * object. It's waiting in the cpu's parallelization queue,
 265 * so exit immediately.
266 */
259 if (PTR_ERR(padata) == -ENODATA) { 267 if (PTR_ERR(padata) == -ENODATA) {
268 del_timer(&pd->timer);
260 spin_unlock_bh(&pd->lock); 269 spin_unlock_bh(&pd->lock);
261 goto out; 270 return;
262 } 271 }
263 272
264 queue = per_cpu_ptr(pd->queue, padata->cb_cpu); 273 squeue = per_cpu_ptr(pd->squeue, padata->cb_cpu);
265 274
266 spin_lock(&queue->serial.lock); 275 spin_lock(&squeue->serial.lock);
267 list_add_tail(&padata->list, &queue->serial.list); 276 list_add_tail(&padata->list, &squeue->serial.list);
268 spin_unlock(&queue->serial.lock); 277 spin_unlock(&squeue->serial.lock);
269 278
270 queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); 279 queue_work_on(padata->cb_cpu, pinst->wq, &squeue->work);
271 } 280 }
272 281
273 spin_unlock_bh(&pd->lock); 282 spin_unlock_bh(&pd->lock);
274 283
275 if (atomic_read(&pd->reorder_objects)) 284 /*
 276 goto try_again; 285 * The next object that needs serialization might have arrived at
 286 * the reorder queues in the meantime; we will be called again
 287 * from the timer function if no one else cares for it.
288 */
289 if (atomic_read(&pd->reorder_objects)
290 && !(pinst->flags & PADATA_RESET))
291 mod_timer(&pd->timer, jiffies + HZ);
292 else
293 del_timer(&pd->timer);
277 294
278out:
279 return; 295 return;
280} 296}
281 297
282static void padata_serial_worker(struct work_struct *work) 298static void padata_reorder_timer(unsigned long arg)
299{
300 struct parallel_data *pd = (struct parallel_data *)arg;
301
302 padata_reorder(pd);
303}
304
305static void padata_serial_worker(struct work_struct *serial_work)
283{ 306{
284 struct padata_queue *queue; 307 struct padata_serial_queue *squeue;
285 struct parallel_data *pd; 308 struct parallel_data *pd;
286 LIST_HEAD(local_list); 309 LIST_HEAD(local_list);
287 310
288 local_bh_disable(); 311 local_bh_disable();
289 queue = container_of(work, struct padata_queue, swork); 312 squeue = container_of(serial_work, struct padata_serial_queue, work);
290 pd = queue->pd; 313 pd = squeue->pd;
291 314
292 spin_lock(&queue->serial.lock); 315 spin_lock(&squeue->serial.lock);
293 list_replace_init(&queue->serial.list, &local_list); 316 list_replace_init(&squeue->serial.list, &local_list);
294 spin_unlock(&queue->serial.lock); 317 spin_unlock(&squeue->serial.lock);
295 318
296 while (!list_empty(&local_list)) { 319 while (!list_empty(&local_list)) {
297 struct padata_priv *padata; 320 struct padata_priv *padata;
@@ -307,7 +330,7 @@ static void padata_serial_worker(struct work_struct *work)
307 local_bh_enable(); 330 local_bh_enable();
308} 331}
309 332
310/* 333/**
311 * padata_do_serial - padata serialization function 334 * padata_do_serial - padata serialization function
312 * 335 *
313 * @padata: object to be serialized. 336 * @padata: object to be serialized.
@@ -318,18 +341,18 @@ static void padata_serial_worker(struct work_struct *work)
318void padata_do_serial(struct padata_priv *padata) 341void padata_do_serial(struct padata_priv *padata)
319{ 342{
320 int cpu; 343 int cpu;
321 struct padata_queue *queue; 344 struct padata_parallel_queue *pqueue;
322 struct parallel_data *pd; 345 struct parallel_data *pd;
323 346
324 pd = padata->pd; 347 pd = padata->pd;
325 348
326 cpu = get_cpu(); 349 cpu = get_cpu();
327 queue = per_cpu_ptr(pd->queue, cpu); 350 pqueue = per_cpu_ptr(pd->pqueue, cpu);
328 351
329 spin_lock(&queue->reorder.lock); 352 spin_lock(&pqueue->reorder.lock);
330 atomic_inc(&pd->reorder_objects); 353 atomic_inc(&pd->reorder_objects);
331 list_add_tail(&padata->list, &queue->reorder.list); 354 list_add_tail(&padata->list, &pqueue->reorder.list);
332 spin_unlock(&queue->reorder.lock); 355 spin_unlock(&pqueue->reorder.lock);
333 356
334 put_cpu(); 357 put_cpu();
335 358
@@ -337,55 +360,90 @@ void padata_do_serial(struct padata_priv *padata)
337} 360}
338EXPORT_SYMBOL(padata_do_serial); 361EXPORT_SYMBOL(padata_do_serial);
339 362
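A minimal user of the two entry points above, mirroring how pcrypt drives padata (sketch only; struct my_request, my_submit() and the callback names are illustrative, while the padata_priv ->parallel/->serial hooks and the padata_do_parallel()/padata_do_serial() signatures are the ones visible in this patch). Per the hunks above, padata_do_parallel() now returns 0 when the object was queued:

#include <linux/padata.h>
#include <linux/slab.h>
#include <linux/kernel.h>

struct my_request {
	struct padata_priv padata;	/* must be embedded in the caller's object */
	int payload;
};

static void my_parallel(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	req->payload *= 2;		/* the CPU-heavy work, runs with BHs off */
	padata_do_serial(padata);	/* hand the object back for in-order completion */
}

static void my_serial(struct padata_priv *padata)
{
	struct my_request *req = container_of(padata, struct my_request, padata);

	/* runs on the cb_cpu passed to padata_do_parallel(), in submission order */
	kfree(req);
}

static int my_submit(struct padata_instance *pinst, int cb_cpu, int value)
{
	struct my_request *req = kzalloc(sizeof(*req), GFP_ATOMIC);

	if (!req)
		return -ENOMEM;

	req->payload = value;
	req->padata.parallel = my_parallel;
	req->padata.serial = my_serial;

	/* cb_cpu must be in the instance's serial (cbcpu) cpumask */
	return padata_do_parallel(pinst, &req->padata, cb_cpu);
}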
340static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, 363static int padata_setup_cpumasks(struct parallel_data *pd,
341 const struct cpumask *cpumask) 364 const struct cpumask *pcpumask,
365 const struct cpumask *cbcpumask)
342{ 366{
343 int cpu, cpu_index, num_cpus; 367 if (!alloc_cpumask_var(&pd->cpumask.pcpu, GFP_KERNEL))
344 struct padata_queue *queue; 368 return -ENOMEM;
345 struct parallel_data *pd;
346 369
347 cpu_index = 0; 370 cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_active_mask);
371 if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
 372 free_cpumask_var(pd->cpumask.pcpu);
373 return -ENOMEM;
374 }
348 375
349 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); 376 cpumask_and(pd->cpumask.cbcpu, cbcpumask, cpu_active_mask);
350 if (!pd) 377 return 0;
351 goto err; 378}
352 379
353 pd->queue = alloc_percpu(struct padata_queue); 380static void __padata_list_init(struct padata_list *pd_list)
354 if (!pd->queue) 381{
355 goto err_free_pd; 382 INIT_LIST_HEAD(&pd_list->list);
383 spin_lock_init(&pd_list->lock);
384}
356 385
357 if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) 386/* Initialize all percpu queues used by serial workers */
358 goto err_free_queue; 387static void padata_init_squeues(struct parallel_data *pd)
388{
389 int cpu;
390 struct padata_serial_queue *squeue;
391
392 for_each_cpu(cpu, pd->cpumask.cbcpu) {
393 squeue = per_cpu_ptr(pd->squeue, cpu);
394 squeue->pd = pd;
395 __padata_list_init(&squeue->serial);
396 INIT_WORK(&squeue->work, padata_serial_worker);
397 }
398}
359 399
360 for_each_possible_cpu(cpu) { 400/* Initialize all percpu queues used by parallel workers */
361 queue = per_cpu_ptr(pd->queue, cpu); 401static void padata_init_pqueues(struct parallel_data *pd)
402{
403 int cpu_index, num_cpus, cpu;
404 struct padata_parallel_queue *pqueue;
362 405
363 queue->pd = pd; 406 cpu_index = 0;
407 for_each_cpu(cpu, pd->cpumask.pcpu) {
408 pqueue = per_cpu_ptr(pd->pqueue, cpu);
409 pqueue->pd = pd;
410 pqueue->cpu_index = cpu_index;
411 cpu_index++;
412
413 __padata_list_init(&pqueue->reorder);
414 __padata_list_init(&pqueue->parallel);
415 INIT_WORK(&pqueue->work, padata_parallel_worker);
416 atomic_set(&pqueue->num_obj, 0);
417 }
364 418
365 if (cpumask_test_cpu(cpu, cpumask) 419 num_cpus = cpumask_weight(pd->cpumask.pcpu);
366 && cpumask_test_cpu(cpu, cpu_active_mask)) { 420 pd->max_seq_nr = num_cpus ? (MAX_SEQ_NR / num_cpus) * num_cpus - 1 : 0;
367 queue->cpu_index = cpu_index; 421}
368 cpu_index++;
369 } else
370 queue->cpu_index = -1;
371 422
 372 INIT_LIST_HEAD(&queue->reorder.list); 423/* Allocate and initialize the internal cpumask dependent resources. */
373 INIT_LIST_HEAD(&queue->parallel.list); 424static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
374 INIT_LIST_HEAD(&queue->serial.list); 425 const struct cpumask *pcpumask,
375 spin_lock_init(&queue->reorder.lock); 426 const struct cpumask *cbcpumask)
376 spin_lock_init(&queue->parallel.lock); 427{
377 spin_lock_init(&queue->serial.lock); 428 struct parallel_data *pd;
378 429
379 INIT_WORK(&queue->pwork, padata_parallel_worker); 430 pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL);
380 INIT_WORK(&queue->swork, padata_serial_worker); 431 if (!pd)
381 atomic_set(&queue->num_obj, 0); 432 goto err;
382 }
383 433
384 cpumask_and(pd->cpumask, cpumask, cpu_active_mask); 434 pd->pqueue = alloc_percpu(struct padata_parallel_queue);
435 if (!pd->pqueue)
436 goto err_free_pd;
385 437
386 num_cpus = cpumask_weight(pd->cpumask); 438 pd->squeue = alloc_percpu(struct padata_serial_queue);
387 pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; 439 if (!pd->squeue)
440 goto err_free_pqueue;
441 if (padata_setup_cpumasks(pd, pcpumask, cbcpumask) < 0)
442 goto err_free_squeue;
388 443
444 padata_init_pqueues(pd);
445 padata_init_squeues(pd);
446 setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
389 atomic_set(&pd->seq_nr, -1); 447 atomic_set(&pd->seq_nr, -1);
390 atomic_set(&pd->reorder_objects, 0); 448 atomic_set(&pd->reorder_objects, 0);
391 atomic_set(&pd->refcnt, 0); 449 atomic_set(&pd->refcnt, 0);
@@ -394,8 +452,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
394 452
395 return pd; 453 return pd;
396 454
397err_free_queue: 455err_free_squeue:
398 free_percpu(pd->queue); 456 free_percpu(pd->squeue);
457err_free_pqueue:
458 free_percpu(pd->pqueue);
399err_free_pd: 459err_free_pd:
400 kfree(pd); 460 kfree(pd);
401err: 461err:
@@ -404,15 +464,63 @@ err:
404 464
405static void padata_free_pd(struct parallel_data *pd) 465static void padata_free_pd(struct parallel_data *pd)
406{ 466{
407 free_cpumask_var(pd->cpumask); 467 free_cpumask_var(pd->cpumask.pcpu);
408 free_percpu(pd->queue); 468 free_cpumask_var(pd->cpumask.cbcpu);
469 free_percpu(pd->pqueue);
470 free_percpu(pd->squeue);
409 kfree(pd); 471 kfree(pd);
410} 472}
411 473
474/* Flush all objects out of the padata queues. */
475static void padata_flush_queues(struct parallel_data *pd)
476{
477 int cpu;
478 struct padata_parallel_queue *pqueue;
479 struct padata_serial_queue *squeue;
480
481 for_each_cpu(cpu, pd->cpumask.pcpu) {
482 pqueue = per_cpu_ptr(pd->pqueue, cpu);
483 flush_work(&pqueue->work);
484 }
485
486 del_timer_sync(&pd->timer);
487
488 if (atomic_read(&pd->reorder_objects))
489 padata_reorder(pd);
490
491 for_each_cpu(cpu, pd->cpumask.cbcpu) {
492 squeue = per_cpu_ptr(pd->squeue, cpu);
493 flush_work(&squeue->work);
494 }
495
496 BUG_ON(atomic_read(&pd->refcnt) != 0);
497}
498
499static void __padata_start(struct padata_instance *pinst)
500{
501 pinst->flags |= PADATA_INIT;
502}
503
504static void __padata_stop(struct padata_instance *pinst)
505{
506 if (!(pinst->flags & PADATA_INIT))
507 return;
508
509 pinst->flags &= ~PADATA_INIT;
510
511 synchronize_rcu();
512
513 get_online_cpus();
514 padata_flush_queues(pinst->pd);
515 put_online_cpus();
516}
517
 518/* Replace the internal control structure with a new one. */
412static void padata_replace(struct padata_instance *pinst, 519static void padata_replace(struct padata_instance *pinst,
413 struct parallel_data *pd_new) 520 struct parallel_data *pd_new)
414{ 521{
415 struct parallel_data *pd_old = pinst->pd; 522 struct parallel_data *pd_old = pinst->pd;
523 int notification_mask = 0;
416 524
417 pinst->flags |= PADATA_RESET; 525 pinst->flags |= PADATA_RESET;
418 526
@@ -420,43 +528,162 @@ static void padata_replace(struct padata_instance *pinst,
420 528
421 synchronize_rcu(); 529 synchronize_rcu();
422 530
423 while (atomic_read(&pd_old->refcnt) != 0) 531 if (!cpumask_equal(pd_old->cpumask.pcpu, pd_new->cpumask.pcpu))
424 yield(); 532 notification_mask |= PADATA_CPU_PARALLEL;
425 533 if (!cpumask_equal(pd_old->cpumask.cbcpu, pd_new->cpumask.cbcpu))
426 flush_workqueue(pinst->wq); 534 notification_mask |= PADATA_CPU_SERIAL;
427 535
536 padata_flush_queues(pd_old);
428 padata_free_pd(pd_old); 537 padata_free_pd(pd_old);
429 538
539 if (notification_mask)
540 blocking_notifier_call_chain(&pinst->cpumask_change_notifier,
541 notification_mask,
542 &pd_new->cpumask);
543
430 pinst->flags &= ~PADATA_RESET; 544 pinst->flags &= ~PADATA_RESET;
431} 545}
432 546
433/* 547/**
434 * padata_set_cpumask - set the cpumask that padata should use 548 * padata_register_cpumask_notifier - Registers a notifier that will be called
549 * if either pcpu or cbcpu or both cpumasks change.
435 * 550 *
 436 * @pinst: padata instance 551 * @pinst: A pointer to padata instance
437 * @cpumask: the cpumask to use 552 * @nblock: A pointer to notifier block.
438 */ 553 */
439int padata_set_cpumask(struct padata_instance *pinst, 554int padata_register_cpumask_notifier(struct padata_instance *pinst,
440 cpumask_var_t cpumask) 555 struct notifier_block *nblock)
441{ 556{
442 struct parallel_data *pd; 557 return blocking_notifier_chain_register(&pinst->cpumask_change_notifier,
443 int err = 0; 558 nblock);
559}
560EXPORT_SYMBOL(padata_register_cpumask_notifier);
444 561
445 might_sleep(); 562/**
563 * padata_unregister_cpumask_notifier - Unregisters cpumask notifier
564 * registered earlier using padata_register_cpumask_notifier
565 *
566 * @pinst: A pointer to data instance.
567 * @nlock: A pointer to notifier block.
568 */
569int padata_unregister_cpumask_notifier(struct padata_instance *pinst,
570 struct notifier_block *nblock)
571{
572 return blocking_notifier_chain_unregister(
573 &pinst->cpumask_change_notifier,
574 nblock);
575}
576EXPORT_SYMBOL(padata_unregister_cpumask_notifier);
446 577
447 mutex_lock(&pinst->lock);
448 578
449 pd = padata_alloc_pd(pinst, cpumask); 579/* If cpumask contains no active cpu, we mark the instance as invalid. */
450 if (!pd) { 580static bool padata_validate_cpumask(struct padata_instance *pinst,
451 err = -ENOMEM; 581 const struct cpumask *cpumask)
452 goto out; 582{
583 if (!cpumask_intersects(cpumask, cpu_active_mask)) {
584 pinst->flags |= PADATA_INVALID;
585 return false;
586 }
587
588 pinst->flags &= ~PADATA_INVALID;
589 return true;
590}
591
592static int __padata_set_cpumasks(struct padata_instance *pinst,
593 cpumask_var_t pcpumask,
594 cpumask_var_t cbcpumask)
595{
596 int valid;
597 struct parallel_data *pd;
598
599 valid = padata_validate_cpumask(pinst, pcpumask);
600 if (!valid) {
601 __padata_stop(pinst);
602 goto out_replace;
453 } 603 }
454 604
455 cpumask_copy(pinst->cpumask, cpumask); 605 valid = padata_validate_cpumask(pinst, cbcpumask);
606 if (!valid)
607 __padata_stop(pinst);
608
609out_replace:
610 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
611 if (!pd)
612 return -ENOMEM;
613
614 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
615 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
456 616
457 padata_replace(pinst, pd); 617 padata_replace(pinst, pd);
458 618
619 if (valid)
620 __padata_start(pinst);
621
622 return 0;
623}
624
625/**
626 * padata_set_cpumasks - Set both parallel and serial cpumasks. The first
627 * one is used by parallel workers and the second one
 628 * by the workers doing serialization.
629 *
630 * @pinst: padata instance
631 * @pcpumask: the cpumask to use for parallel workers
 632 * @cbcpumask: the cpumask to use for serial workers
633 */
634int padata_set_cpumasks(struct padata_instance *pinst, cpumask_var_t pcpumask,
635 cpumask_var_t cbcpumask)
636{
637 int err;
638
639 mutex_lock(&pinst->lock);
640 get_online_cpus();
641
642 err = __padata_set_cpumasks(pinst, pcpumask, cbcpumask);
643
644 put_online_cpus();
645 mutex_unlock(&pinst->lock);
646
647 return err;
648
649}
650EXPORT_SYMBOL(padata_set_cpumasks);
651
652/**
 653 * padata_set_cpumask - Set the cpumask specified by @cpumask_type to the value
654 * equivalent to @cpumask.
655 *
656 * @pinst: padata instance
657 * @cpumask_type: PADATA_CPU_SERIAL or PADATA_CPU_PARALLEL corresponding
 658 * to the serial and parallel cpumasks respectively.
659 * @cpumask: the cpumask to use
660 */
661int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
662 cpumask_var_t cpumask)
663{
664 struct cpumask *serial_mask, *parallel_mask;
665 int err = -EINVAL;
666
667 mutex_lock(&pinst->lock);
668 get_online_cpus();
669
670 switch (cpumask_type) {
671 case PADATA_CPU_PARALLEL:
672 serial_mask = pinst->cpumask.cbcpu;
673 parallel_mask = cpumask;
674 break;
675 case PADATA_CPU_SERIAL:
676 parallel_mask = pinst->cpumask.pcpu;
677 serial_mask = cpumask;
678 break;
679 default:
680 goto out;
681 }
682
683 err = __padata_set_cpumasks(pinst, parallel_mask, serial_mask);
684
459out: 685out:
686 put_online_cpus();
460 mutex_unlock(&pinst->lock); 687 mutex_unlock(&pinst->lock);
461 688
462 return err; 689 return err;
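A sketch of driving the new cpumask interface from a padata user (illustrative names; only the padata_* calls from this patch and the standard cpumask/notifier APIs are real). The notifier fires after padata_replace() whenever the effective parallel and/or serial masks change:

#include <linux/padata.h>
#include <linux/notifier.h>
#include <linux/cpumask.h>
#include <linux/gfp.h>

static int my_cpumask_change(struct notifier_block *nb,
			     unsigned long mask, void *data)
{
	/* 'mask' is a combination of PADATA_CPU_PARALLEL / PADATA_CPU_SERIAL;
	 * 'data' points at the instance's new cpumasks (see padata_replace()). */
	return NOTIFY_OK;
}

static struct notifier_block my_nb = {
	.notifier_call = my_cpumask_change,
};

static int my_reconfigure(struct padata_instance *pinst)
{
	cpumask_var_t mask;
	int err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;
	cpumask_copy(mask, cpu_online_mask);

	err = padata_register_cpumask_notifier(pinst, &my_nb);
	if (!err)
		/* replace only the parallel workers' mask; the serial one is kept */
		err = padata_set_cpumask(pinst, PADATA_CPU_PARALLEL, mask);

	free_cpumask_var(mask);
	return err;
}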
@@ -468,32 +695,50 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
468 struct parallel_data *pd; 695 struct parallel_data *pd;
469 696
470 if (cpumask_test_cpu(cpu, cpu_active_mask)) { 697 if (cpumask_test_cpu(cpu, cpu_active_mask)) {
471 pd = padata_alloc_pd(pinst, pinst->cpumask); 698 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
699 pinst->cpumask.cbcpu);
472 if (!pd) 700 if (!pd)
473 return -ENOMEM; 701 return -ENOMEM;
474 702
475 padata_replace(pinst, pd); 703 padata_replace(pinst, pd);
704
705 if (padata_validate_cpumask(pinst, pinst->cpumask.pcpu) &&
706 padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
707 __padata_start(pinst);
476 } 708 }
477 709
478 return 0; 710 return 0;
479} 711}
480 712
481/* 713 /**
 482 * padata_add_cpu - add a cpu to the padata cpumask 714 * padata_add_cpu - add a cpu to one or both (parallel and serial)
715 * padata cpumasks.
483 * 716 *
484 * @pinst: padata instance 717 * @pinst: padata instance
485 * @cpu: cpu to add 718 * @cpu: cpu to add
 719 * @mask: bitmask of flags specifying to which cpumask @cpu should be added.
720 * The @mask may be any combination of the following flags:
721 * PADATA_CPU_SERIAL - serial cpumask
722 * PADATA_CPU_PARALLEL - parallel cpumask
486 */ 723 */
487int padata_add_cpu(struct padata_instance *pinst, int cpu) 724
725int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask)
488{ 726{
489 int err; 727 int err;
490 728
491 might_sleep(); 729 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
730 return -EINVAL;
492 731
493 mutex_lock(&pinst->lock); 732 mutex_lock(&pinst->lock);
494 733
495 cpumask_set_cpu(cpu, pinst->cpumask); 734 get_online_cpus();
735 if (mask & PADATA_CPU_SERIAL)
736 cpumask_set_cpu(cpu, pinst->cpumask.cbcpu);
737 if (mask & PADATA_CPU_PARALLEL)
738 cpumask_set_cpu(cpu, pinst->cpumask.pcpu);
739
496 err = __padata_add_cpu(pinst, cpu); 740 err = __padata_add_cpu(pinst, cpu);
741 put_online_cpus();
497 742
498 mutex_unlock(&pinst->lock); 743 mutex_unlock(&pinst->lock);
499 744
@@ -503,10 +748,16 @@ EXPORT_SYMBOL(padata_add_cpu);
503 748
504static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) 749static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
505{ 750{
506 struct parallel_data *pd; 751 struct parallel_data *pd = NULL;
507 752
508 if (cpumask_test_cpu(cpu, cpu_online_mask)) { 753 if (cpumask_test_cpu(cpu, cpu_online_mask)) {
509 pd = padata_alloc_pd(pinst, pinst->cpumask); 754
755 if (!padata_validate_cpumask(pinst, pinst->cpumask.pcpu) ||
756 !padata_validate_cpumask(pinst, pinst->cpumask.cbcpu))
757 __padata_stop(pinst);
758
759 pd = padata_alloc_pd(pinst, pinst->cpumask.pcpu,
760 pinst->cpumask.cbcpu);
510 if (!pd) 761 if (!pd)
511 return -ENOMEM; 762 return -ENOMEM;
512 763
@@ -516,22 +767,34 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
516 return 0; 767 return 0;
517} 768}
518 769
519/* 770 /**
 520 * padata_remove_cpu - remove a cpu from the padata cpumask 771 * padata_remove_cpu - remove a cpu from one or both (serial and parallel)
772 * padata cpumasks.
521 * 773 *
522 * @pinst: padata instance 774 * @pinst: padata instance
523 * @cpu: cpu to remove 775 * @cpu: cpu to remove
776 * @mask: bitmask specifying from which cpumask @cpu should be removed
777 * The @mask may be any combination of the following flags:
778 * PADATA_CPU_SERIAL - serial cpumask
779 * PADATA_CPU_PARALLEL - parallel cpumask
524 */ 780 */
525int padata_remove_cpu(struct padata_instance *pinst, int cpu) 781int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask)
526{ 782{
527 int err; 783 int err;
528 784
529 might_sleep(); 785 if (!(mask & (PADATA_CPU_SERIAL | PADATA_CPU_PARALLEL)))
786 return -EINVAL;
530 787
531 mutex_lock(&pinst->lock); 788 mutex_lock(&pinst->lock);
532 789
533 cpumask_clear_cpu(cpu, pinst->cpumask); 790 get_online_cpus();
791 if (mask & PADATA_CPU_SERIAL)
792 cpumask_clear_cpu(cpu, pinst->cpumask.cbcpu);
793 if (mask & PADATA_CPU_PARALLEL)
794 cpumask_clear_cpu(cpu, pinst->cpumask.pcpu);
795
534 err = __padata_remove_cpu(pinst, cpu); 796 err = __padata_remove_cpu(pinst, cpu);
797 put_online_cpus();
535 798
536 mutex_unlock(&pinst->lock); 799 mutex_unlock(&pinst->lock);
537 800
@@ -539,38 +802,52 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
539} 802}
540EXPORT_SYMBOL(padata_remove_cpu); 803EXPORT_SYMBOL(padata_remove_cpu);
541 804
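Sketch of the extended hotplug helpers above; the mask argument selects which of the two cpumasks is affected and may combine both flags (pinst is assumed to be a live instance):

#include <linux/padata.h>

/* Move cpu 3 from the parallel cpumask to the serial cpumask. */
static int my_shift_cpu(struct padata_instance *pinst)
{
	int err;

	err = padata_remove_cpu(pinst, 3, PADATA_CPU_PARALLEL);
	if (err)
		return err;

	return padata_add_cpu(pinst, 3, PADATA_CPU_SERIAL);
}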
542/* 805/**
543 * padata_start - start the parallel processing 806 * padata_start - start the parallel processing
544 * 807 *
545 * @pinst: padata instance to start 808 * @pinst: padata instance to start
546 */ 809 */
547void padata_start(struct padata_instance *pinst) 810int padata_start(struct padata_instance *pinst)
548{ 811{
549 might_sleep(); 812 int err = 0;
550 813
551 mutex_lock(&pinst->lock); 814 mutex_lock(&pinst->lock);
552 pinst->flags |= PADATA_INIT; 815
816 if (pinst->flags & PADATA_INVALID)
 817 err = -EINVAL;
818
819 __padata_start(pinst);
820
553 mutex_unlock(&pinst->lock); 821 mutex_unlock(&pinst->lock);
822
823 return err;
554} 824}
555EXPORT_SYMBOL(padata_start); 825EXPORT_SYMBOL(padata_start);
556 826
557/* 827/**
558 * padata_stop - stop the parallel processing 828 * padata_stop - stop the parallel processing
559 * 829 *
560 * @pinst: padata instance to stop 830 * @pinst: padata instance to stop
561 */ 831 */
562void padata_stop(struct padata_instance *pinst) 832void padata_stop(struct padata_instance *pinst)
563{ 833{
564 might_sleep();
565
566 mutex_lock(&pinst->lock); 834 mutex_lock(&pinst->lock);
567 pinst->flags &= ~PADATA_INIT; 835 __padata_stop(pinst);
568 mutex_unlock(&pinst->lock); 836 mutex_unlock(&pinst->lock);
569} 837}
570EXPORT_SYMBOL(padata_stop); 838EXPORT_SYMBOL(padata_stop);
571 839
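Instance lifecycle with the reworked start/stop above, as a sketch: padata_start() can now fail with -EINVAL when the cpumasks contain no active cpu. padata_alloc_possible() is introduced later in this patch; padata_free() and create_workqueue() are assumed from the existing padata/workqueue API rather than shown in these hunks:

#include <linux/padata.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;
static struct padata_instance *my_pinst;

static int my_init(void)
{
	int err;

	my_wq = create_workqueue("my_padata_wq");
	if (!my_wq)
		return -ENOMEM;

	/* both cpumasks default to cpu_possible_mask */
	my_pinst = padata_alloc_possible(my_wq);
	if (!my_pinst) {
		destroy_workqueue(my_wq);
		return -ENOMEM;
	}

	err = padata_start(my_pinst);	/* -EINVAL if the instance is marked PADATA_INVALID */
	if (err) {
		padata_free(my_pinst);
		destroy_workqueue(my_wq);
	}
	return err;
}

static void my_exit(void)
{
	padata_stop(my_pinst);
	padata_free(my_pinst);
	destroy_workqueue(my_wq);
}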
572static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, 840#ifdef CONFIG_HOTPLUG_CPU
573 unsigned long action, void *hcpu) 841
842static inline int pinst_has_cpu(struct padata_instance *pinst, int cpu)
843{
844 return cpumask_test_cpu(cpu, pinst->cpumask.pcpu) ||
845 cpumask_test_cpu(cpu, pinst->cpumask.cbcpu);
846}
847
848
849static int padata_cpu_callback(struct notifier_block *nfb,
850 unsigned long action, void *hcpu)
574{ 851{
575 int err; 852 int err;
576 struct padata_instance *pinst; 853 struct padata_instance *pinst;
@@ -581,29 +858,29 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
581 switch (action) { 858 switch (action) {
582 case CPU_ONLINE: 859 case CPU_ONLINE:
583 case CPU_ONLINE_FROZEN: 860 case CPU_ONLINE_FROZEN:
584 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 861 if (!pinst_has_cpu(pinst, cpu))
585 break; 862 break;
586 mutex_lock(&pinst->lock); 863 mutex_lock(&pinst->lock);
587 err = __padata_add_cpu(pinst, cpu); 864 err = __padata_add_cpu(pinst, cpu);
588 mutex_unlock(&pinst->lock); 865 mutex_unlock(&pinst->lock);
589 if (err) 866 if (err)
590 return NOTIFY_BAD; 867 return notifier_from_errno(err);
591 break; 868 break;
592 869
593 case CPU_DOWN_PREPARE: 870 case CPU_DOWN_PREPARE:
594 case CPU_DOWN_PREPARE_FROZEN: 871 case CPU_DOWN_PREPARE_FROZEN:
595 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 872 if (!pinst_has_cpu(pinst, cpu))
596 break; 873 break;
597 mutex_lock(&pinst->lock); 874 mutex_lock(&pinst->lock);
598 err = __padata_remove_cpu(pinst, cpu); 875 err = __padata_remove_cpu(pinst, cpu);
599 mutex_unlock(&pinst->lock); 876 mutex_unlock(&pinst->lock);
600 if (err) 877 if (err)
601 return NOTIFY_BAD; 878 return notifier_from_errno(err);
602 break; 879 break;
603 880
604 case CPU_UP_CANCELED: 881 case CPU_UP_CANCELED:
605 case CPU_UP_CANCELED_FROZEN: 882 case CPU_UP_CANCELED_FROZEN:
606 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 883 if (!pinst_has_cpu(pinst, cpu))
607 break; 884 break;
608 mutex_lock(&pinst->lock); 885 mutex_lock(&pinst->lock);
609 __padata_remove_cpu(pinst, cpu); 886 __padata_remove_cpu(pinst, cpu);
@@ -611,7 +888,7 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
611 888
612 case CPU_DOWN_FAILED: 889 case CPU_DOWN_FAILED:
613 case CPU_DOWN_FAILED_FROZEN: 890 case CPU_DOWN_FAILED_FROZEN:
614 if (!cpumask_test_cpu(cpu, pinst->cpumask)) 891 if (!pinst_has_cpu(pinst, cpu))
615 break; 892 break;
616 mutex_lock(&pinst->lock); 893 mutex_lock(&pinst->lock);
617 __padata_add_cpu(pinst, cpu); 894 __padata_add_cpu(pinst, cpu);
@@ -620,77 +897,239 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
620 897
621 return NOTIFY_OK; 898 return NOTIFY_OK;
622} 899}
900#endif
901
902static void __padata_free(struct padata_instance *pinst)
903{
904#ifdef CONFIG_HOTPLUG_CPU
905 unregister_hotcpu_notifier(&pinst->cpu_notifier);
906#endif
907
908 padata_stop(pinst);
909 padata_free_pd(pinst->pd);
910 free_cpumask_var(pinst->cpumask.pcpu);
911 free_cpumask_var(pinst->cpumask.cbcpu);
912 kfree(pinst);
913}
914
915#define kobj2pinst(_kobj) \
916 container_of(_kobj, struct padata_instance, kobj)
917#define attr2pentry(_attr) \
918 container_of(_attr, struct padata_sysfs_entry, attr)
919
920static void padata_sysfs_release(struct kobject *kobj)
921{
922 struct padata_instance *pinst = kobj2pinst(kobj);
923 __padata_free(pinst);
924}
925
926struct padata_sysfs_entry {
927 struct attribute attr;
928 ssize_t (*show)(struct padata_instance *, struct attribute *, char *);
929 ssize_t (*store)(struct padata_instance *, struct attribute *,
930 const char *, size_t);
931};
932
933static ssize_t show_cpumask(struct padata_instance *pinst,
934 struct attribute *attr, char *buf)
935{
936 struct cpumask *cpumask;
937 ssize_t len;
938
939 mutex_lock(&pinst->lock);
940 if (!strcmp(attr->name, "serial_cpumask"))
941 cpumask = pinst->cpumask.cbcpu;
942 else
943 cpumask = pinst->cpumask.pcpu;
944
945 len = bitmap_scnprintf(buf, PAGE_SIZE, cpumask_bits(cpumask),
946 nr_cpu_ids);
947 if (PAGE_SIZE - len < 2)
948 len = -EINVAL;
949 else
950 len += sprintf(buf + len, "\n");
951
952 mutex_unlock(&pinst->lock);
953 return len;
954}
955
956static ssize_t store_cpumask(struct padata_instance *pinst,
957 struct attribute *attr,
958 const char *buf, size_t count)
959{
960 cpumask_var_t new_cpumask;
961 ssize_t ret;
962 int mask_type;
963
964 if (!alloc_cpumask_var(&new_cpumask, GFP_KERNEL))
965 return -ENOMEM;
966
967 ret = bitmap_parse(buf, count, cpumask_bits(new_cpumask),
968 nr_cpumask_bits);
969 if (ret < 0)
970 goto out;
971
972 mask_type = !strcmp(attr->name, "serial_cpumask") ?
973 PADATA_CPU_SERIAL : PADATA_CPU_PARALLEL;
974 ret = padata_set_cpumask(pinst, mask_type, new_cpumask);
975 if (!ret)
976 ret = count;
977
978out:
979 free_cpumask_var(new_cpumask);
980 return ret;
981}
982
983#define PADATA_ATTR_RW(_name, _show_name, _store_name) \
984 static struct padata_sysfs_entry _name##_attr = \
985 __ATTR(_name, 0644, _show_name, _store_name)
986#define PADATA_ATTR_RO(_name, _show_name) \
987 static struct padata_sysfs_entry _name##_attr = \
988 __ATTR(_name, 0400, _show_name, NULL)
989
990PADATA_ATTR_RW(serial_cpumask, show_cpumask, store_cpumask);
991PADATA_ATTR_RW(parallel_cpumask, show_cpumask, store_cpumask);
623 992
624/* 993/*
625 * padata_alloc - allocate and initialize a padata instance 994 * Padata sysfs provides the following objects:
995 * serial_cpumask [RW] - cpumask for serial workers
996 * parallel_cpumask [RW] - cpumask for parallel workers
997 */
998static struct attribute *padata_default_attrs[] = {
999 &serial_cpumask_attr.attr,
1000 &parallel_cpumask_attr.attr,
1001 NULL,
1002};
1003
1004static ssize_t padata_sysfs_show(struct kobject *kobj,
1005 struct attribute *attr, char *buf)
1006{
1007 struct padata_instance *pinst;
1008 struct padata_sysfs_entry *pentry;
1009 ssize_t ret = -EIO;
1010
1011 pinst = kobj2pinst(kobj);
1012 pentry = attr2pentry(attr);
1013 if (pentry->show)
1014 ret = pentry->show(pinst, attr, buf);
1015
1016 return ret;
1017}
1018
1019static ssize_t padata_sysfs_store(struct kobject *kobj, struct attribute *attr,
1020 const char *buf, size_t count)
1021{
1022 struct padata_instance *pinst;
1023 struct padata_sysfs_entry *pentry;
1024 ssize_t ret = -EIO;
1025
1026 pinst = kobj2pinst(kobj);
1027 pentry = attr2pentry(attr);
1028 if (pentry->show)
1029 ret = pentry->store(pinst, attr, buf, count);
1030
1031 return ret;
1032}
1033
1034static const struct sysfs_ops padata_sysfs_ops = {
1035 .show = padata_sysfs_show,
1036 .store = padata_sysfs_store,
1037};
1038
1039static struct kobj_type padata_attr_type = {
1040 .sysfs_ops = &padata_sysfs_ops,
1041 .default_attrs = padata_default_attrs,
1042 .release = padata_sysfs_release,
1043};
1044
1045/**
1046 * padata_alloc_possible - Allocate and initialize padata instance.
1047 * Use the cpu_possible_mask for serial and
1048 * parallel workers.
626 * 1049 *
627 * @cpumask: cpumask that padata uses for parallelization
628 * @wq: workqueue to use for the allocated padata instance 1050 * @wq: workqueue to use for the allocated padata instance
629 */ 1051 */
630struct padata_instance *padata_alloc(const struct cpumask *cpumask, 1052struct padata_instance *padata_alloc_possible(struct workqueue_struct *wq)
631 struct workqueue_struct *wq) 1053{
1054 return padata_alloc(wq, cpu_possible_mask, cpu_possible_mask);
1055}
1056EXPORT_SYMBOL(padata_alloc_possible);
1057
1058/**
1059 * padata_alloc - allocate and initialize a padata instance and specify
1060 * cpumasks for serial and parallel workers.
1061 *
1062 * @wq: workqueue to use for the allocated padata instance
1063 * @pcpumask: cpumask that will be used for padata parallelization
1064 * @cbcpumask: cpumask that will be used for padata serialization
1065 */
1066struct padata_instance *padata_alloc(struct workqueue_struct *wq,
1067 const struct cpumask *pcpumask,
1068 const struct cpumask *cbcpumask)
632{ 1069{
633 int err;
634 struct padata_instance *pinst; 1070 struct padata_instance *pinst;
635 struct parallel_data *pd; 1071 struct parallel_data *pd = NULL;
636 1072
637 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); 1073 pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL);
638 if (!pinst) 1074 if (!pinst)
639 goto err; 1075 goto err;
640 1076
641 pd = padata_alloc_pd(pinst, cpumask); 1077 get_online_cpus();
642 if (!pd) 1078 if (!alloc_cpumask_var(&pinst->cpumask.pcpu, GFP_KERNEL))
1079 goto err_free_inst;
1080 if (!alloc_cpumask_var(&pinst->cpumask.cbcpu, GFP_KERNEL)) {
1081 free_cpumask_var(pinst->cpumask.pcpu);
643 goto err_free_inst; 1082 goto err_free_inst;
1083 }
1084 if (!padata_validate_cpumask(pinst, pcpumask) ||
1085 !padata_validate_cpumask(pinst, cbcpumask))
1086 goto err_free_masks;
644 1087
645 if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) 1088 pd = padata_alloc_pd(pinst, pcpumask, cbcpumask);
646 goto err_free_pd; 1089 if (!pd)
1090 goto err_free_masks;
647 1091
648 rcu_assign_pointer(pinst->pd, pd); 1092 rcu_assign_pointer(pinst->pd, pd);
649 1093
650 pinst->wq = wq; 1094 pinst->wq = wq;
651 1095
652 cpumask_copy(pinst->cpumask, cpumask); 1096 cpumask_copy(pinst->cpumask.pcpu, pcpumask);
1097 cpumask_copy(pinst->cpumask.cbcpu, cbcpumask);
653 1098
654 pinst->flags = 0; 1099 pinst->flags = 0;
655 1100
1101#ifdef CONFIG_HOTPLUG_CPU
656 pinst->cpu_notifier.notifier_call = padata_cpu_callback; 1102 pinst->cpu_notifier.notifier_call = padata_cpu_callback;
657 pinst->cpu_notifier.priority = 0; 1103 pinst->cpu_notifier.priority = 0;
658 err = register_hotcpu_notifier(&pinst->cpu_notifier); 1104 register_hotcpu_notifier(&pinst->cpu_notifier);
659 if (err) 1105#endif
660 goto err_free_cpumask; 1106
1107 put_online_cpus();
661 1108
1109 BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier);
1110 kobject_init(&pinst->kobj, &padata_attr_type);
662 mutex_init(&pinst->lock); 1111 mutex_init(&pinst->lock);
663 1112
664 return pinst; 1113 return pinst;
665 1114
666err_free_cpumask: 1115err_free_masks:
667 free_cpumask_var(pinst->cpumask); 1116 free_cpumask_var(pinst->cpumask.pcpu);
668err_free_pd: 1117 free_cpumask_var(pinst->cpumask.cbcpu);
669 padata_free_pd(pd);
670err_free_inst: 1118err_free_inst:
671 kfree(pinst); 1119 kfree(pinst);
1120 put_online_cpus();
672err: 1121err:
673 return NULL; 1122 return NULL;
674} 1123}
675EXPORT_SYMBOL(padata_alloc); 1124EXPORT_SYMBOL(padata_alloc);
676 1125
677/* 1126/**
678 * padata_free - free a padata instance 1127 * padata_free - free a padata instance
679 * 1128 *
680 * @ padata_inst: padata instance to free 1129 * @padata_inst: padata instance to free
681 */ 1130 */
682void padata_free(struct padata_instance *pinst) 1131void padata_free(struct padata_instance *pinst)
683{ 1132{
684 padata_stop(pinst); 1133 kobject_put(&pinst->kobj);
685
686 synchronize_rcu();
687
688 while (atomic_read(&pinst->pd->refcnt) != 0)
689 yield();
690
691 unregister_hotcpu_notifier(&pinst->cpu_notifier);
692 padata_free_pd(pinst->pd);
693 free_cpumask_var(pinst->cpumask);
694 kfree(pinst);
695} 1134}
696EXPORT_SYMBOL(padata_free); 1135EXPORT_SYMBOL(padata_free);
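Taken together, the reworked allocate/teardown API is used roughly like this by a client such as pcrypt. A hedged sketch, not code from this patch; padata_start() and the do_parallel/do_serial entry points live in parts of padata.c not shown in these hunks:

	#include <linux/padata.h>
	#include <linux/workqueue.h>

	static struct workqueue_struct *my_wq;
	static struct padata_instance *my_pinst;

	static int my_padata_init(void)
	{
		my_wq = create_workqueue("my_parallel_wq");
		if (!my_wq)
			return -ENOMEM;

		/* serial and parallel workers on all possible CPUs */
		my_pinst = padata_alloc_possible(my_wq);
		if (!my_pinst) {
			destroy_workqueue(my_wq);
			return -ENOMEM;
		}

		padata_start(my_pinst);	/* error handling trimmed */
		return 0;
	}

	static void my_padata_exit(void)
	{
		/* final kobject_put(); padata_sysfs_release() -> __padata_free()
		 * stops the instance and frees the cpumasks and parallel_data */
		padata_free(my_pinst);
		destroy_workqueue(my_wq);
	}

Note that padata_free() no longer tears the instance down synchronously itself: teardown now happens in the kobject release path once the sysfs files are gone.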
diff --git a/kernel/panic.c b/kernel/panic.c
index 13d966b4c14a..3b16cd93fa7d 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...)
87 */ 87 */
88 preempt_disable(); 88 preempt_disable();
89 89
90 console_verbose();
90 bust_spinlocks(1); 91 bust_spinlocks(1);
91 va_start(args, fmt); 92 va_start(args, fmt);
92 vsnprintf(buf, sizeof(buf), fmt, args); 93 vsnprintf(buf, sizeof(buf), fmt, args);
@@ -178,6 +179,7 @@ static const struct tnt tnts[] = {
178 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' }, 179 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
179 { TAINT_WARN, 'W', ' ' }, 180 { TAINT_WARN, 'W', ' ' },
180 { TAINT_CRAP, 'C', ' ' }, 181 { TAINT_CRAP, 'C', ' ' },
182 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
181}; 183};
182 184
183/** 185/**
@@ -194,6 +196,7 @@ static const struct tnt tnts[] = {
194 * 'A' - ACPI table overridden. 196 * 'A' - ACPI table overridden.
195 * 'W' - Taint on warning. 197 * 'W' - Taint on warning.
196 * 'C' - modules from drivers/staging are loaded. 198 * 'C' - modules from drivers/staging are loaded.
199 * 'I' - Working around severe firmware bug.
197 * 200 *
198 * The string is overwritten by the next call to print_tainted(). 201 * The string is overwritten by the next call to print_tainted().
199 */ 202 */
@@ -365,7 +368,8 @@ struct slowpath_args {
365 va_list args; 368 va_list args;
366}; 369};
367 370
368static void warn_slowpath_common(const char *file, int line, void *caller, struct slowpath_args *args) 371static void warn_slowpath_common(const char *file, int line, void *caller,
372 unsigned taint, struct slowpath_args *args)
369{ 373{
370 const char *board; 374 const char *board;
371 375
@@ -381,7 +385,7 @@ static void warn_slowpath_common(const char *file, int line, void *caller, struc
381 print_modules(); 385 print_modules();
382 dump_stack(); 386 dump_stack();
383 print_oops_end_marker(); 387 print_oops_end_marker();
384 add_taint(TAINT_WARN); 388 add_taint(taint);
385} 389}
386 390
387void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) 391void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
@@ -390,14 +394,29 @@ void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...)
390 394
391 args.fmt = fmt; 395 args.fmt = fmt;
392 va_start(args.args, fmt); 396 va_start(args.args, fmt);
393 warn_slowpath_common(file, line, __builtin_return_address(0), &args); 397 warn_slowpath_common(file, line, __builtin_return_address(0),
398 TAINT_WARN, &args);
394 va_end(args.args); 399 va_end(args.args);
395} 400}
396EXPORT_SYMBOL(warn_slowpath_fmt); 401EXPORT_SYMBOL(warn_slowpath_fmt);
397 402
403void warn_slowpath_fmt_taint(const char *file, int line,
404 unsigned taint, const char *fmt, ...)
405{
406 struct slowpath_args args;
407
408 args.fmt = fmt;
409 va_start(args.args, fmt);
410 warn_slowpath_common(file, line, __builtin_return_address(0),
411 taint, &args);
412 va_end(args.args);
413}
414EXPORT_SYMBOL(warn_slowpath_fmt_taint);
415
398void warn_slowpath_null(const char *file, int line) 416void warn_slowpath_null(const char *file, int line)
399{ 417{
400 warn_slowpath_common(file, line, __builtin_return_address(0), NULL); 418 warn_slowpath_common(file, line, __builtin_return_address(0),
419 TAINT_WARN, NULL);
401} 420}
402EXPORT_SYMBOL(warn_slowpath_null); 421EXPORT_SYMBOL(warn_slowpath_null);
403#endif 422#endif
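The new warn_slowpath_fmt_taint() is normally reached through the WARN_TAINT()/WARN_TAINT_ONCE() macros that the same series adds to include/asm-generic/bug.h. A hedged usage sketch for the TAINT_FIRMWARE_WORKAROUND flag introduced above (the frequency-fixup scenario is invented for illustration):

	#include <linux/bug.h>
	#include <linux/kernel.h>

	static unsigned long sanitize_fw_freq(unsigned long fw_khz)
	{
		if (WARN_TAINT_ONCE(fw_khz == 0, TAINT_FIRMWARE_WORKAROUND,
				    "firmware reported 0 kHz, using fallback\n"))
			return 100000;	/* made-up fallback value */

		return fw_khz;
	}

This warns like WARN_ONCE() but sets the 'I' taint instead of the generic 'W', so a later oops report shows that the kernel had to work around broken firmware.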
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 574ee58a3046..ff86c558af4c 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -15,6 +15,8 @@
15#include <linux/smp.h> 15#include <linux/smp.h>
16#include <linux/file.h> 16#include <linux/file.h>
17#include <linux/poll.h> 17#include <linux/poll.h>
18#include <linux/slab.h>
19#include <linux/hash.h>
18#include <linux/sysfs.h> 20#include <linux/sysfs.h>
19#include <linux/dcache.h> 21#include <linux/dcache.h>
20#include <linux/percpu.h> 22#include <linux/percpu.h>
@@ -81,14 +83,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
81void __weak hw_perf_disable(void) { barrier(); } 83void __weak hw_perf_disable(void) { barrier(); }
82void __weak hw_perf_enable(void) { barrier(); } 84void __weak hw_perf_enable(void) { barrier(); }
83 85
84int __weak
85hw_perf_group_sched_in(struct perf_event *group_leader,
86 struct perf_cpu_context *cpuctx,
87 struct perf_event_context *ctx)
88{
89 return 0;
90}
91
92void __weak perf_event_print_debug(void) { } 86void __weak perf_event_print_debug(void) { }
93 87
94static DEFINE_PER_CPU(int, perf_disable_count); 88static DEFINE_PER_CPU(int, perf_disable_count);
@@ -261,6 +255,18 @@ static void update_event_times(struct perf_event *event)
261 event->total_time_running = run_end - event->tstamp_running; 255 event->total_time_running = run_end - event->tstamp_running;
262} 256}
263 257
258/*
259 * Update total_time_enabled and total_time_running for all events in a group.
260 */
261static void update_group_times(struct perf_event *leader)
262{
263 struct perf_event *event;
264
265 update_event_times(leader);
266 list_for_each_entry(event, &leader->sibling_list, group_entry)
267 update_event_times(event);
268}
269
264static struct list_head * 270static struct list_head *
265ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) 271ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
266{ 272{
@@ -277,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
277static void 283static void
278list_add_event(struct perf_event *event, struct perf_event_context *ctx) 284list_add_event(struct perf_event *event, struct perf_event_context *ctx)
279{ 285{
280 struct perf_event *group_leader = event->group_leader; 286 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
287 event->attach_state |= PERF_ATTACH_CONTEXT;
281 288
282 /* 289 /*
283 * Depending on whether it is a standalone or sibling event, 290 * If we're a stand alone event or group leader, we go to the context
284 * add it straight to the context's event list, or to the group 291 * list, group events are kept attached to the group so that
285 * leader's sibling list: 292 * perf_group_detach can, at all times, locate all siblings.
286 */ 293 */
287 if (group_leader == event) { 294 if (event->group_leader == event) {
288 struct list_head *list; 295 struct list_head *list;
289 296
290 if (is_software_event(event)) 297 if (is_software_event(event))
@@ -292,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
292 299
293 list = ctx_group_list(event, ctx); 300 list = ctx_group_list(event, ctx);
294 list_add_tail(&event->group_entry, list); 301 list_add_tail(&event->group_entry, list);
295 } else {
296 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
297 !is_software_event(event))
298 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
299
300 list_add_tail(&event->group_entry, &group_leader->sibling_list);
301 group_leader->nr_siblings++;
302 } 302 }
303 303
304 list_add_rcu(&event->event_entry, &ctx->event_list); 304 list_add_rcu(&event->event_entry, &ctx->event_list);
@@ -307,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
307 ctx->nr_stat++; 307 ctx->nr_stat++;
308} 308}
309 309
310static void perf_group_attach(struct perf_event *event)
311{
312 struct perf_event *group_leader = event->group_leader;
313
314 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
315 event->attach_state |= PERF_ATTACH_GROUP;
316
317 if (group_leader == event)
318 return;
319
320 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
321 !is_software_event(event))
322 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
323
324 list_add_tail(&event->group_entry, &group_leader->sibling_list);
325 group_leader->nr_siblings++;
326}
327
310/* 328/*
311 * Remove a event from the lists for its context. 329 * Remove a event from the lists for its context.
312 * Must be called with ctx->mutex and ctx->lock held. 330 * Must be called with ctx->mutex and ctx->lock held.
@@ -314,21 +332,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
314static void 332static void
315list_del_event(struct perf_event *event, struct perf_event_context *ctx) 333list_del_event(struct perf_event *event, struct perf_event_context *ctx)
316{ 334{
317 struct perf_event *sibling, *tmp; 335 /*
318 336 * We can have double detach due to exit/hot-unplug + close.
319 if (list_empty(&event->group_entry)) 337 */
338 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
320 return; 339 return;
340
341 event->attach_state &= ~PERF_ATTACH_CONTEXT;
342
321 ctx->nr_events--; 343 ctx->nr_events--;
322 if (event->attr.inherit_stat) 344 if (event->attr.inherit_stat)
323 ctx->nr_stat--; 345 ctx->nr_stat--;
324 346
325 list_del_init(&event->group_entry);
326 list_del_rcu(&event->event_entry); 347 list_del_rcu(&event->event_entry);
327 348
328 if (event->group_leader != event) 349 if (event->group_leader == event)
329 event->group_leader->nr_siblings--; 350 list_del_init(&event->group_entry);
330 351
331 update_event_times(event); 352 update_group_times(event);
332 353
333 /* 354 /*
334 * If event was in error state, then keep it 355 * If event was in error state, then keep it
@@ -339,17 +360,41 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
339 */ 360 */
340 if (event->state > PERF_EVENT_STATE_OFF) 361 if (event->state > PERF_EVENT_STATE_OFF)
341 event->state = PERF_EVENT_STATE_OFF; 362 event->state = PERF_EVENT_STATE_OFF;
363}
364
365static void perf_group_detach(struct perf_event *event)
366{
367 struct perf_event *sibling, *tmp;
368 struct list_head *list = NULL;
369
370 /*
371 * We can have double detach due to exit/hot-unplug + close.
372 */
373 if (!(event->attach_state & PERF_ATTACH_GROUP))
374 return;
375
376 event->attach_state &= ~PERF_ATTACH_GROUP;
377
378 /*
379 * If this is a sibling, remove it from its group.
380 */
381 if (event->group_leader != event) {
382 list_del_init(&event->group_entry);
383 event->group_leader->nr_siblings--;
384 return;
385 }
386
387 if (!list_empty(&event->group_entry))
388 list = &event->group_entry;
342 389
343 /* 390 /*
344 * If this was a group event with sibling events then 391 * If this was a group event with sibling events then
345 * upgrade the siblings to singleton events by adding them 392 * upgrade the siblings to singleton events by adding them
346 * to the context list directly: 393 * to whatever list we are on.
347 */ 394 */
348 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 395 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
349 struct list_head *list; 396 if (list)
350 397 list_move_tail(&sibling->group_entry, list);
351 list = ctx_group_list(event, ctx);
352 list_move_tail(&sibling->group_entry, list);
353 sibling->group_leader = sibling; 398 sibling->group_leader = sibling;
354 399
355 /* Inherit group flags from the previous leader */ 400 /* Inherit group flags from the previous leader */
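The attach/detach split above hinges on two new state bits in struct perf_event, declared in include/linux/perf_event.h by the same change (values quoted from memory, so treat them as illustrative):

	/* event->attach_state bits */
	#define PERF_ATTACH_CONTEXT	0x01	/* on ctx->event_list */
	#define PERF_ATTACH_GROUP	0x02	/* on its leader's sibling_list */

Because exit/hot-unplug and the final close() can both try to tear an event down, list_del_event() and perf_group_detach() each test their own bit and return early on a double detach, instead of relying on the old list_empty(&event->group_entry) check that conflated context and group membership.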
@@ -504,18 +549,6 @@ retry:
504} 549}
505 550
506/* 551/*
507 * Update total_time_enabled and total_time_running for all events in a group.
508 */
509static void update_group_times(struct perf_event *leader)
510{
511 struct perf_event *event;
512
513 update_event_times(leader);
514 list_for_each_entry(event, &leader->sibling_list, group_entry)
515 update_event_times(event);
516}
517
518/*
519 * Cross CPU call to disable a performance event 552 * Cross CPU call to disable a performance event
520 */ 553 */
521static void __perf_event_disable(void *info) 554static void __perf_event_disable(void *info)
@@ -639,18 +672,26 @@ group_sched_in(struct perf_event *group_event,
639 struct perf_cpu_context *cpuctx, 672 struct perf_cpu_context *cpuctx,
640 struct perf_event_context *ctx) 673 struct perf_event_context *ctx)
641{ 674{
642 struct perf_event *event, *partial_group; 675 struct perf_event *event, *partial_group = NULL;
676 const struct pmu *pmu = group_event->pmu;
677 bool txn = false;
643 int ret; 678 int ret;
644 679
645 if (group_event->state == PERF_EVENT_STATE_OFF) 680 if (group_event->state == PERF_EVENT_STATE_OFF)
646 return 0; 681 return 0;
647 682
648 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx); 683 /* Check if group transaction available */
649 if (ret) 684 if (pmu->start_txn)
650 return ret < 0 ? ret : 0; 685 txn = true;
651 686
652 if (event_sched_in(group_event, cpuctx, ctx)) 687 if (txn)
688 pmu->start_txn(pmu);
689
690 if (event_sched_in(group_event, cpuctx, ctx)) {
691 if (txn)
692 pmu->cancel_txn(pmu);
653 return -EAGAIN; 693 return -EAGAIN;
694 }
654 695
655 /* 696 /*
656 * Schedule in siblings as one group (if any): 697 * Schedule in siblings as one group (if any):
@@ -662,7 +703,14 @@ group_sched_in(struct perf_event *group_event,
662 } 703 }
663 } 704 }
664 705
665 return 0; 706 if (!txn)
707 return 0;
708
709 ret = pmu->commit_txn(pmu);
710 if (!ret) {
711 pmu->cancel_txn(pmu);
712 return 0;
713 }
666 714
667group_error: 715group_error:
668 /* 716 /*
@@ -676,6 +724,9 @@ group_error:
676 } 724 }
677 event_sched_out(group_event, cpuctx, ctx); 725 event_sched_out(group_event, cpuctx, ctx);
678 726
727 if (txn)
728 pmu->cancel_txn(pmu);
729
679 return -EAGAIN; 730 return -EAGAIN;
680} 731}
681 732
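For PMUs that can validate a whole group against hardware constraints in one go, the three new hooks bracket the ->enable() calls made above. A minimal sketch of a driver opting in; the names are illustrative, and the real users in this series are the x86 and powerpc PMU code:

	#include <linux/perf_event.h>

	static void sketch_pmu_start_txn(const struct pmu *pmu)
	{
		/* from here on, ->enable() only collects events for the group */
	}

	static int sketch_pmu_commit_txn(const struct pmu *pmu)
	{
		/* schedule the collected events; 0 means the whole group fits */
		return 0;
	}

	static void sketch_pmu_cancel_txn(const struct pmu *pmu)
	{
		/* drop whatever was collected since start_txn() */
	}

	static const struct pmu sketch_pmu = {
		/* .enable/.disable/.read and friends omitted */
		.start_txn	= sketch_pmu_start_txn,
		.commit_txn	= sketch_pmu_commit_txn,
		.cancel_txn	= sketch_pmu_cancel_txn,
	};

group_sched_in() only enters transaction mode when ->start_txn is non-NULL, so PMUs that do not provide the hooks keep the old per-event scheduling behaviour.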
@@ -714,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event,
714 struct perf_event_context *ctx) 765 struct perf_event_context *ctx)
715{ 766{
716 list_add_event(event, ctx); 767 list_add_event(event, ctx);
768 perf_group_attach(event);
717 event->tstamp_enabled = ctx->time; 769 event->tstamp_enabled = ctx->time;
718 event->tstamp_running = ctx->time; 770 event->tstamp_running = ctx->time;
719 event->tstamp_stopped = ctx->time; 771 event->tstamp_stopped = ctx->time;
@@ -1164,11 +1216,9 @@ void perf_event_task_sched_out(struct task_struct *task,
1164 struct perf_event_context *ctx = task->perf_event_ctxp; 1216 struct perf_event_context *ctx = task->perf_event_ctxp;
1165 struct perf_event_context *next_ctx; 1217 struct perf_event_context *next_ctx;
1166 struct perf_event_context *parent; 1218 struct perf_event_context *parent;
1167 struct pt_regs *regs;
1168 int do_switch = 1; 1219 int do_switch = 1;
1169 1220
1170 regs = task_pt_regs(task); 1221 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
1171 perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
1172 1222
1173 if (likely(!ctx || !cpuctx->task_ctx)) 1223 if (likely(!ctx || !cpuctx->task_ctx))
1174 return; 1224 return;
@@ -1368,6 +1418,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1368 if (cpuctx->task_ctx == ctx) 1418 if (cpuctx->task_ctx == ctx)
1369 return; 1419 return;
1370 1420
1421 perf_disable();
1422
1371 /* 1423 /*
1372 * We want to keep the following priority order: 1424 * We want to keep the following priority order:
1373 * cpu pinned (that don't need to move), task pinned, 1425 * cpu pinned (that don't need to move), task pinned,
@@ -1380,6 +1432,8 @@ void perf_event_task_sched_in(struct task_struct *task)
1380 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); 1432 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1381 1433
1382 cpuctx->task_ctx = ctx; 1434 cpuctx->task_ctx = ctx;
1435
1436 perf_enable();
1383} 1437}
1384 1438
1385#define MAX_INTERRUPTS (~0ULL) 1439#define MAX_INTERRUPTS (~0ULL)
@@ -1453,6 +1507,9 @@ do { \
1453 divisor = nsec * frequency; 1507 divisor = nsec * frequency;
1454 } 1508 }
1455 1509
1510 if (!divisor)
1511 return dividend;
1512
1456 return div64_u64(dividend, divisor); 1513 return div64_u64(dividend, divisor);
1457} 1514}
1458 1515
@@ -1475,7 +1532,7 @@ static int perf_event_start(struct perf_event *event)
1475static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) 1532static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1476{ 1533{
1477 struct hw_perf_event *hwc = &event->hw; 1534 struct hw_perf_event *hwc = &event->hw;
1478 u64 period, sample_period; 1535 s64 period, sample_period;
1479 s64 delta; 1536 s64 delta;
1480 1537
1481 period = perf_calculate_period(event, nsec, count); 1538 period = perf_calculate_period(event, nsec, count);
@@ -1826,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head)
1826} 1883}
1827 1884
1828static void perf_pending_sync(struct perf_event *event); 1885static void perf_pending_sync(struct perf_event *event);
1886static void perf_mmap_data_put(struct perf_mmap_data *data);
1829 1887
1830static void free_event(struct perf_event *event) 1888static void free_event(struct perf_event *event)
1831{ 1889{
@@ -1841,9 +1899,9 @@ static void free_event(struct perf_event *event)
1841 atomic_dec(&nr_task_events); 1899 atomic_dec(&nr_task_events);
1842 } 1900 }
1843 1901
1844 if (event->output) { 1902 if (event->data) {
1845 fput(event->output->filp); 1903 perf_mmap_data_put(event->data);
1846 event->output = NULL; 1904 event->data = NULL;
1847 } 1905 }
1848 1906
1849 if (event->destroy) 1907 if (event->destroy)
@@ -1857,9 +1915,30 @@ int perf_event_release_kernel(struct perf_event *event)
1857{ 1915{
1858 struct perf_event_context *ctx = event->ctx; 1916 struct perf_event_context *ctx = event->ctx;
1859 1917
1918 /*
1919 * Remove from the PMU, can't get re-enabled since we got
1920 * here because the last ref went.
1921 */
1922 perf_event_disable(event);
1923
1860 WARN_ON_ONCE(ctx->parent_ctx); 1924 WARN_ON_ONCE(ctx->parent_ctx);
1861 mutex_lock(&ctx->mutex); 1925 /*
1862 perf_event_remove_from_context(event); 1926 * There are two ways this annotation is useful:
1927 *
1928 * 1) there is a lock recursion from perf_event_exit_task
1929 * see the comment there.
1930 *
1931 * 2) there is a lock-inversion with mmap_sem through
1932 * perf_event_read_group(), which takes faults while
1933 * holding ctx->mutex, however this is called after
1934 * the last filedesc died, so there is no possibility
1935 * to trigger the AB-BA case.
1936 */
1937 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
1938 raw_spin_lock_irq(&ctx->lock);
1939 perf_group_detach(event);
1940 list_del_event(event, ctx);
1941 raw_spin_unlock_irq(&ctx->lock);
1863 mutex_unlock(&ctx->mutex); 1942 mutex_unlock(&ctx->mutex);
1864 1943
1865 mutex_lock(&event->owner->perf_event_mutex); 1944 mutex_lock(&event->owner->perf_event_mutex);
@@ -2139,7 +2218,27 @@ unlock:
2139 return ret; 2218 return ret;
2140} 2219}
2141 2220
2142static int perf_event_set_output(struct perf_event *event, int output_fd); 2221static const struct file_operations perf_fops;
2222
2223static struct perf_event *perf_fget_light(int fd, int *fput_needed)
2224{
2225 struct file *file;
2226
2227 file = fget_light(fd, fput_needed);
2228 if (!file)
2229 return ERR_PTR(-EBADF);
2230
2231 if (file->f_op != &perf_fops) {
2232 fput_light(file, *fput_needed);
2233 *fput_needed = 0;
2234 return ERR_PTR(-EBADF);
2235 }
2236
2237 return file->private_data;
2238}
2239
2240static int perf_event_set_output(struct perf_event *event,
2241 struct perf_event *output_event);
2143static int perf_event_set_filter(struct perf_event *event, void __user *arg); 2242static int perf_event_set_filter(struct perf_event *event, void __user *arg);
2144 2243
2145static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 2244static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -2166,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
2166 return perf_event_period(event, (u64 __user *)arg); 2265 return perf_event_period(event, (u64 __user *)arg);
2167 2266
2168 case PERF_EVENT_IOC_SET_OUTPUT: 2267 case PERF_EVENT_IOC_SET_OUTPUT:
2169 return perf_event_set_output(event, arg); 2268 {
2269 struct perf_event *output_event = NULL;
2270 int fput_needed = 0;
2271 int ret;
2272
2273 if (arg != -1) {
2274 output_event = perf_fget_light(arg, &fput_needed);
2275 if (IS_ERR(output_event))
2276 return PTR_ERR(output_event);
2277 }
2278
2279 ret = perf_event_set_output(event, output_event);
2280 if (output_event)
2281 fput_light(output_event->filp, fput_needed);
2282
2283 return ret;
2284 }
2170 2285
2171 case PERF_EVENT_IOC_SET_FILTER: 2286 case PERF_EVENT_IOC_SET_FILTER:
2172 return perf_event_set_filter(event, (void __user *)arg); 2287 return perf_event_set_filter(event, (void __user *)arg);
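Seen from userspace nothing about the redirection ioctl changes, except that the target fd is now validated with perf_fget_light() instead of the removed event->output plumbing. A hedged sketch of the call; the fds come from perf_event_open():

	#include <sys/ioctl.h>
	#include <linux/perf_event.h>
	#include <stdio.h>

	/* Route fd_b's records into fd_a's mmap'ed buffer; pass -1 to undo it. */
	static int redirect_output(int fd_b, int fd_a)
	{
		if (ioctl(fd_b, PERF_EVENT_IOC_SET_OUTPUT, fd_a) < 0) {
			perror("PERF_EVENT_IOC_SET_OUTPUT");
			return -1;
		}
		return 0;
	}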
@@ -2261,11 +2376,6 @@ unlock:
2261 rcu_read_unlock(); 2376 rcu_read_unlock();
2262} 2377}
2263 2378
2264static unsigned long perf_data_size(struct perf_mmap_data *data)
2265{
2266 return data->nr_pages << (PAGE_SHIFT + data->data_order);
2267}
2268
2269#ifndef CONFIG_PERF_USE_VMALLOC 2379#ifndef CONFIG_PERF_USE_VMALLOC
2270 2380
2271/* 2381/*
@@ -2284,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2284 return virt_to_page(data->data_pages[pgoff - 1]); 2394 return virt_to_page(data->data_pages[pgoff - 1]);
2285} 2395}
2286 2396
2397static void *perf_mmap_alloc_page(int cpu)
2398{
2399 struct page *page;
2400 int node;
2401
2402 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
2403 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
2404 if (!page)
2405 return NULL;
2406
2407 return page_address(page);
2408}
2409
2287static struct perf_mmap_data * 2410static struct perf_mmap_data *
2288perf_mmap_data_alloc(struct perf_event *event, int nr_pages) 2411perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2289{ 2412{
@@ -2291,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2291 unsigned long size; 2414 unsigned long size;
2292 int i; 2415 int i;
2293 2416
2294 WARN_ON(atomic_read(&event->mmap_count));
2295
2296 size = sizeof(struct perf_mmap_data); 2417 size = sizeof(struct perf_mmap_data);
2297 size += nr_pages * sizeof(void *); 2418 size += nr_pages * sizeof(void *);
2298 2419
@@ -2300,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2300 if (!data) 2421 if (!data)
2301 goto fail; 2422 goto fail;
2302 2423
2303 data->user_page = (void *)get_zeroed_page(GFP_KERNEL); 2424 data->user_page = perf_mmap_alloc_page(event->cpu);
2304 if (!data->user_page) 2425 if (!data->user_page)
2305 goto fail_user_page; 2426 goto fail_user_page;
2306 2427
2307 for (i = 0; i < nr_pages; i++) { 2428 for (i = 0; i < nr_pages; i++) {
2308 data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); 2429 data->data_pages[i] = perf_mmap_alloc_page(event->cpu);
2309 if (!data->data_pages[i]) 2430 if (!data->data_pages[i])
2310 goto fail_data_pages; 2431 goto fail_data_pages;
2311 } 2432 }
2312 2433
2313 data->data_order = 0;
2314 data->nr_pages = nr_pages; 2434 data->nr_pages = nr_pages;
2315 2435
2316 return data; 2436 return data;
@@ -2346,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2346 kfree(data); 2466 kfree(data);
2347} 2467}
2348 2468
2469static inline int page_order(struct perf_mmap_data *data)
2470{
2471 return 0;
2472}
2473
2349#else 2474#else
2350 2475
2351/* 2476/*
@@ -2354,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data)
2354 * Required for architectures that have d-cache aliasing issues. 2479 * Required for architectures that have d-cache aliasing issues.
2355 */ 2480 */
2356 2481
2482static inline int page_order(struct perf_mmap_data *data)
2483{
2484 return data->page_order;
2485}
2486
2357static struct page * 2487static struct page *
2358perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) 2488perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
2359{ 2489{
2360 if (pgoff > (1UL << data->data_order)) 2490 if (pgoff > (1UL << page_order(data)))
2361 return NULL; 2491 return NULL;
2362 2492
2363 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); 2493 return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
@@ -2377,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work)
2377 int i, nr; 2507 int i, nr;
2378 2508
2379 data = container_of(work, struct perf_mmap_data, work); 2509 data = container_of(work, struct perf_mmap_data, work);
2380 nr = 1 << data->data_order; 2510 nr = 1 << page_order(data);
2381 2511
2382 base = data->user_page; 2512 base = data->user_page;
2383 for (i = 0; i < nr + 1; i++) 2513 for (i = 0; i < nr + 1; i++)
@@ -2399,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2399 unsigned long size; 2529 unsigned long size;
2400 void *all_buf; 2530 void *all_buf;
2401 2531
2402 WARN_ON(atomic_read(&event->mmap_count));
2403
2404 size = sizeof(struct perf_mmap_data); 2532 size = sizeof(struct perf_mmap_data);
2405 size += sizeof(void *); 2533 size += sizeof(void *);
2406 2534
@@ -2416,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
2416 2544
2417 data->user_page = all_buf; 2545 data->user_page = all_buf;
2418 data->data_pages[0] = all_buf + PAGE_SIZE; 2546 data->data_pages[0] = all_buf + PAGE_SIZE;
2419 data->data_order = ilog2(nr_pages); 2547 data->page_order = ilog2(nr_pages);
2420 data->nr_pages = 1; 2548 data->nr_pages = 1;
2421 2549
2422 return data; 2550 return data;
@@ -2430,6 +2558,11 @@ fail:
2430 2558
2431#endif 2559#endif
2432 2560
2561static unsigned long perf_data_size(struct perf_mmap_data *data)
2562{
2563 return data->nr_pages << (PAGE_SHIFT + page_order(data));
2564}
2565
2433static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) 2566static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
2434{ 2567{
2435 struct perf_event *event = vma->vm_file->private_data; 2568 struct perf_event *event = vma->vm_file->private_data;
@@ -2470,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2470{ 2603{
2471 long max_size = perf_data_size(data); 2604 long max_size = perf_data_size(data);
2472 2605
2473 atomic_set(&data->lock, -1);
2474
2475 if (event->attr.watermark) { 2606 if (event->attr.watermark) {
2476 data->watermark = min_t(long, max_size, 2607 data->watermark = min_t(long, max_size,
2477 event->attr.wakeup_watermark); 2608 event->attr.wakeup_watermark);
@@ -2480,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
2480 if (!data->watermark) 2611 if (!data->watermark)
2481 data->watermark = max_size / 2; 2612 data->watermark = max_size / 2;
2482 2613
2483 2614 atomic_set(&data->refcount, 1);
2484 rcu_assign_pointer(event->data, data); 2615 rcu_assign_pointer(event->data, data);
2485} 2616}
2486 2617
@@ -2492,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
2492 perf_mmap_data_free(data); 2623 perf_mmap_data_free(data);
2493} 2624}
2494 2625
2495static void perf_mmap_data_release(struct perf_event *event) 2626static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event)
2496{ 2627{
2497 struct perf_mmap_data *data = event->data; 2628 struct perf_mmap_data *data;
2498 2629
2499 WARN_ON(atomic_read(&event->mmap_count)); 2630 rcu_read_lock();
2631 data = rcu_dereference(event->data);
2632 if (data) {
2633 if (!atomic_inc_not_zero(&data->refcount))
2634 data = NULL;
2635 }
2636 rcu_read_unlock();
2637
2638 return data;
2639}
2640
2641static void perf_mmap_data_put(struct perf_mmap_data *data)
2642{
2643 if (!atomic_dec_and_test(&data->refcount))
2644 return;
2500 2645
2501 rcu_assign_pointer(event->data, NULL);
2502 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); 2646 call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
2503} 2647}
2504 2648
@@ -2513,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma)
2513{ 2657{
2514 struct perf_event *event = vma->vm_file->private_data; 2658 struct perf_event *event = vma->vm_file->private_data;
2515 2659
2516 WARN_ON_ONCE(event->ctx->parent_ctx);
2517 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { 2660 if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
2518 unsigned long size = perf_data_size(event->data); 2661 unsigned long size = perf_data_size(event->data);
2519 struct user_struct *user = current_user(); 2662 struct user_struct *user = event->mmap_user;
2663 struct perf_mmap_data *data = event->data;
2520 2664
2521 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 2665 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
2522 vma->vm_mm->locked_vm -= event->data->nr_locked; 2666 vma->vm_mm->locked_vm -= event->mmap_locked;
2523 perf_mmap_data_release(event); 2667 rcu_assign_pointer(event->data, NULL);
2524 mutex_unlock(&event->mmap_mutex); 2668 mutex_unlock(&event->mmap_mutex);
2669
2670 perf_mmap_data_put(data);
2671 free_uid(user);
2525 } 2672 }
2526} 2673}
2527 2674
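With the get/put pair above, buffer lifetime is a plain reference count rather than the old single-owner perf_mmap_data_release(): each successful mmap() holds a reference (a second mmap() of the same size just bumps it, as the next hunk shows), the event itself holds one through event->data that free_event() now drops, and perf_mmap_close() uncharges locked_vm against the user recorded in event->mmap_user before putting the mapping's reference. Events redirected with PERF_EVENT_IOC_SET_OUTPUT presumably take their own reference in perf_event_set_output(); that hunk is not shown here.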
@@ -2544,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2544 long user_extra, extra; 2691 long user_extra, extra;
2545 int ret = 0; 2692 int ret = 0;
2546 2693
2694 /*
2695 * Don't allow mmap() of inherited per-task counters. This would
2696 * create a performance issue due to all children writing to the
2697 * same buffer.
2698 */
2699 if (event->cpu == -1 && event->attr.inherit)
2700 return -EINVAL;
2701
2547 if (!(vma->vm_flags & VM_SHARED)) 2702 if (!(vma->vm_flags & VM_SHARED))
2548 return -EINVAL; 2703 return -EINVAL;
2549 2704
@@ -2565,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2565 2720
2566 WARN_ON_ONCE(event->ctx->parent_ctx); 2721 WARN_ON_ONCE(event->ctx->parent_ctx);
2567 mutex_lock(&event->mmap_mutex); 2722 mutex_lock(&event->mmap_mutex);
2568 if (event->output) { 2723 if (event->data) {
2569 ret = -EINVAL; 2724 if (event->data->nr_pages == nr_pages)
2570 goto unlock; 2725 atomic_inc(&event->data->refcount);
2571 } 2726 else
2572
2573 if (atomic_inc_not_zero(&event->mmap_count)) {
2574 if (nr_pages != event->data->nr_pages)
2575 ret = -EINVAL; 2727 ret = -EINVAL;
2576 goto unlock; 2728 goto unlock;
2577 } 2729 }
@@ -2603,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
2603 WARN_ON(event->data); 2755 WARN_ON(event->data);
2604 2756
2605 data = perf_mmap_data_alloc(event, nr_pages); 2757 data = perf_mmap_data_alloc(event, nr_pages);
2606 ret = -ENOMEM; 2758 if (!data) {
2607 if (!data) 2759 ret = -ENOMEM;
2608 goto unlock; 2760 goto unlock;
2761 }
2609 2762
2610 ret = 0;
2611 perf_mmap_data_init(event, data); 2763 perf_mmap_data_init(event, data);
2612
2613 atomic_set(&event->mmap_count, 1);
2614 atomic_long_add(user_extra, &user->locked_vm);
2615 vma->vm_mm->locked_vm += extra;
2616 event->data->nr_locked = extra;
2617 if (vma->vm_flags & VM_WRITE) 2764 if (vma->vm_flags & VM_WRITE)
2618 event->data->writable = 1; 2765 event->data->writable = 1;
2619 2766
2767 atomic_long_add(user_extra, &user->locked_vm);
2768 event->mmap_locked = extra;
2769 event->mmap_user = get_current_user();
2770 vma->vm_mm->locked_vm += event->mmap_locked;
2771
2620unlock: 2772unlock:
2773 if (!ret)
2774 atomic_inc(&event->mmap_count);
2621 mutex_unlock(&event->mmap_mutex); 2775 mutex_unlock(&event->mmap_mutex);
2622 2776
2623 vma->vm_flags |= VM_RESERVED; 2777 vma->vm_flags |= VM_RESERVED;
@@ -2643,6 +2797,7 @@ static int perf_fasync(int fd, struct file *filp, int on)
2643} 2797}
2644 2798
2645static const struct file_operations perf_fops = { 2799static const struct file_operations perf_fops = {
2800 .llseek = no_llseek,
2646 .release = perf_release, 2801 .release = perf_release,
2647 .read = perf_read, 2802 .read = perf_read,
2648 .poll = perf_poll, 2803 .poll = perf_poll,
@@ -2786,12 +2941,32 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2786 return NULL; 2941 return NULL;
2787} 2942}
2788 2943
2789#ifdef CONFIG_EVENT_TRACING
2790__weak 2944__weak
2791void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) 2945void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2792{ 2946{
2793} 2947}
2794#endif 2948
2949
2950/*
2951 * We assume there is only KVM supporting the callbacks.
2952 * Later on, we might change it to a list if there is
2953 * another virtualization implementation supporting the callbacks.
2954 */
2955struct perf_guest_info_callbacks *perf_guest_cbs;
2956
2957int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2958{
2959 perf_guest_cbs = cbs;
2960 return 0;
2961}
2962EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
2963
2964int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
2965{
2966 perf_guest_cbs = NULL;
2967 return 0;
2968}
2969EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
2795 2970
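As the comment says, KVM is the intended (and currently only) registrant. A hedged sketch of a hypervisor-side registration; struct perf_guest_info_callbacks lives in include/linux/perf_event.h and its member names here are quoted from memory:

	#include <linux/init.h>
	#include <linux/perf_event.h>

	static int my_is_in_guest(void)
	{
		return 0;	/* real code: is a vCPU running on this CPU right now? */
	}

	static int my_is_user_mode(void)
	{
		return 0;	/* privilege level of the interrupted guest context */
	}

	static unsigned long my_get_guest_ip(void)
	{
		return 0;	/* guest instruction pointer for in-guest samples */
	}

	static struct perf_guest_info_callbacks my_guest_cbs = {
		.is_in_guest	= my_is_in_guest,
		.is_user_mode	= my_is_user_mode,
		.get_guest_ip	= my_get_guest_ip,
	};

	static int __init my_hv_init(void)
	{
		return perf_register_guest_info_callbacks(&my_guest_cbs);
	}

	static void __exit my_hv_exit(void)
	{
		perf_unregister_guest_info_callbacks(&my_guest_cbs);
	}

The PMU interrupt path consults perf_guest_cbs to tag samples taken while a guest was running as guest kernel/user instead of misattributing them to the host.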
2796/* 2971/*
2797 * Output 2972 * Output
@@ -2828,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
2828} 3003}
2829 3004
2830/* 3005/*
2831 * Curious locking construct.
2832 *
2833 * We need to ensure a later event_id doesn't publish a head when a former 3006 * We need to ensure a later event_id doesn't publish a head when a former
2834 * event_id isn't done writing. However since we need to deal with NMIs we 3007 * event isn't done writing. However since we need to deal with NMIs we
2835 * cannot fully serialize things. 3008 * cannot fully serialize things.
2836 * 3009 *
2837 * What we do is serialize between CPUs so we only have to deal with NMI
2838 * nesting on a single CPU.
2839 *
2840 * We only publish the head (and generate a wakeup) when the outer-most 3010 * We only publish the head (and generate a wakeup) when the outer-most
2841 * event_id completes. 3011 * event completes.
2842 */ 3012 */
2843static void perf_output_lock(struct perf_output_handle *handle) 3013static void perf_output_get_handle(struct perf_output_handle *handle)
2844{ 3014{
2845 struct perf_mmap_data *data = handle->data; 3015 struct perf_mmap_data *data = handle->data;
2846 int cur, cpu = get_cpu();
2847 3016
2848 handle->locked = 0; 3017 preempt_disable();
2849 3018 local_inc(&data->nest);
2850 for (;;) { 3019 handle->wakeup = local_read(&data->wakeup);
2851 cur = atomic_cmpxchg(&data->lock, -1, cpu);
2852 if (cur == -1) {
2853 handle->locked = 1;
2854 break;
2855 }
2856 if (cur == cpu)
2857 break;
2858
2859 cpu_relax();
2860 }
2861} 3020}
2862 3021
2863static void perf_output_unlock(struct perf_output_handle *handle) 3022static void perf_output_put_handle(struct perf_output_handle *handle)
2864{ 3023{
2865 struct perf_mmap_data *data = handle->data; 3024 struct perf_mmap_data *data = handle->data;
2866 unsigned long head; 3025 unsigned long head;
2867 int cpu;
2868
2869 data->done_head = data->head;
2870
2871 if (!handle->locked)
2872 goto out;
2873 3026
2874again: 3027again:
2875 /* 3028 head = local_read(&data->head);
2876 * The xchg implies a full barrier that ensures all writes are done
2877 * before we publish the new head, matched by a rmb() in userspace when
2878 * reading this position.
2879 */
2880 while ((head = atomic_long_xchg(&data->done_head, 0)))
2881 data->user_page->data_head = head;
2882 3029
2883 /* 3030 /*
2884 * NMI can happen here, which means we can miss a done_head update. 3031 * IRQ/NMI can happen here, which means we can miss a head update.
2885 */ 3032 */
2886 3033
2887 cpu = atomic_xchg(&data->lock, -1); 3034 if (!local_dec_and_test(&data->nest))
2888 WARN_ON_ONCE(cpu != smp_processor_id()); 3035 goto out;
2889 3036
2890 /* 3037 /*
2891 * Therefore we have to validate we did not indeed do so. 3038 * Publish the known good head. Rely on the full barrier implied
3039 * by local_dec_and_test() to order the data->head read and this
3040 * write.
2892 */ 3041 */
2893 if (unlikely(atomic_long_read(&data->done_head))) { 3042 data->user_page->data_head = head;
2894 /*
2895 * Since we had it locked, we can lock it again.
2896 */
2897 while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
2898 cpu_relax();
2899 3043
3044 /*
3045 * Now check if we missed an update, rely on the (compiler)
3046 * barrier in local_dec_and_test() to re-read data->head.
3047 */
3048 if (unlikely(head != local_read(&data->head))) {
3049 local_inc(&data->nest);
2900 goto again; 3050 goto again;
2901 } 3051 }
2902 3052
2903 if (atomic_xchg(&data->wakeup, 0)) 3053 if (handle->wakeup != local_read(&data->wakeup))
2904 perf_output_wakeup(handle); 3054 perf_output_wakeup(handle);
2905out: 3055
2906 put_cpu(); 3056 out:
3057 preempt_enable();
2907} 3058}
2908 3059
2909void perf_output_copy(struct perf_output_handle *handle, 3060__always_inline void perf_output_copy(struct perf_output_handle *handle,
2910 const void *buf, unsigned int len) 3061 const void *buf, unsigned int len)
2911{ 3062{
2912 unsigned int pages_mask;
2913 unsigned long offset;
2914 unsigned int size;
2915 void **pages;
2916
2917 offset = handle->offset;
2918 pages_mask = handle->data->nr_pages - 1;
2919 pages = handle->data->data_pages;
2920
2921 do { 3063 do {
2922 unsigned long page_offset; 3064 unsigned long size = min_t(unsigned long, handle->size, len);
2923 unsigned long page_size;
2924 int nr;
2925 3065
2926 nr = (offset >> PAGE_SHIFT) & pages_mask; 3066 memcpy(handle->addr, buf, size);
2927 page_size = 1UL << (handle->data->data_order + PAGE_SHIFT);
2928 page_offset = offset & (page_size - 1);
2929 size = min_t(unsigned int, page_size - page_offset, len);
2930 3067
2931 memcpy(pages[nr] + page_offset, buf, size); 3068 len -= size;
3069 handle->addr += size;
3070 buf += size;
3071 handle->size -= size;
3072 if (!handle->size) {
3073 struct perf_mmap_data *data = handle->data;
2932 3074
2933 len -= size; 3075 handle->page++;
2934 buf += size; 3076 handle->page &= data->nr_pages - 1;
2935 offset += size; 3077 handle->addr = data->data_pages[handle->page];
3078 handle->size = PAGE_SIZE << page_order(data);
3079 }
2936 } while (len); 3080 } while (len);
2937
2938 handle->offset = offset;
2939
2940 /*
2941 * Check we didn't copy past our reservation window, taking the
2942 * possible unsigned int wrap into account.
2943 */
2944 WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
2945} 3081}
2946 3082
2947int perf_output_begin(struct perf_output_handle *handle, 3083int perf_output_begin(struct perf_output_handle *handle,
2948 struct perf_event *event, unsigned int size, 3084 struct perf_event *event, unsigned int size,
2949 int nmi, int sample) 3085 int nmi, int sample)
2950{ 3086{
2951 struct perf_event *output_event;
2952 struct perf_mmap_data *data; 3087 struct perf_mmap_data *data;
2953 unsigned long tail, offset, head; 3088 unsigned long tail, offset, head;
2954 int have_lost; 3089 int have_lost;
@@ -2965,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle,
2965 if (event->parent) 3100 if (event->parent)
2966 event = event->parent; 3101 event = event->parent;
2967 3102
2968 output_event = rcu_dereference(event->output);
2969 if (output_event)
2970 event = output_event;
2971
2972 data = rcu_dereference(event->data); 3103 data = rcu_dereference(event->data);
2973 if (!data) 3104 if (!data)
2974 goto out; 3105 goto out;
@@ -2979,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle,
2979 handle->sample = sample; 3110 handle->sample = sample;
2980 3111
2981 if (!data->nr_pages) 3112 if (!data->nr_pages)
2982 goto fail; 3113 goto out;
2983 3114
2984 have_lost = atomic_read(&data->lost); 3115 have_lost = local_read(&data->lost);
2985 if (have_lost) 3116 if (have_lost)
2986 size += sizeof(lost_event); 3117 size += sizeof(lost_event);
2987 3118
2988 perf_output_lock(handle); 3119 perf_output_get_handle(handle);
2989 3120
2990 do { 3121 do {
2991 /* 3122 /*
@@ -2995,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle,
2995 */ 3126 */
2996 tail = ACCESS_ONCE(data->user_page->data_tail); 3127 tail = ACCESS_ONCE(data->user_page->data_tail);
2997 smp_rmb(); 3128 smp_rmb();
2998 offset = head = atomic_long_read(&data->head); 3129 offset = head = local_read(&data->head);
2999 head += size; 3130 head += size;
3000 if (unlikely(!perf_output_space(data, tail, offset, head))) 3131 if (unlikely(!perf_output_space(data, tail, offset, head)))
3001 goto fail; 3132 goto fail;
3002 } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); 3133 } while (local_cmpxchg(&data->head, offset, head) != offset);
3003 3134
3004 handle->offset = offset; 3135 if (head - local_read(&data->wakeup) > data->watermark)
3005 handle->head = head; 3136 local_add(data->watermark, &data->wakeup);
3006 3137
3007 if (head - tail > data->watermark) 3138 handle->page = offset >> (PAGE_SHIFT + page_order(data));
3008 atomic_set(&data->wakeup, 1); 3139 handle->page &= data->nr_pages - 1;
3140 handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1);
3141 handle->addr = data->data_pages[handle->page];
3142 handle->addr += handle->size;
3143 handle->size = (PAGE_SIZE << page_order(data)) - handle->size;
3009 3144
3010 if (have_lost) { 3145 if (have_lost) {
3011 lost_event.header.type = PERF_RECORD_LOST; 3146 lost_event.header.type = PERF_RECORD_LOST;
3012 lost_event.header.misc = 0; 3147 lost_event.header.misc = 0;
3013 lost_event.header.size = sizeof(lost_event); 3148 lost_event.header.size = sizeof(lost_event);
3014 lost_event.id = event->id; 3149 lost_event.id = event->id;
3015 lost_event.lost = atomic_xchg(&data->lost, 0); 3150 lost_event.lost = local_xchg(&data->lost, 0);
3016 3151
3017 perf_output_put(handle, lost_event); 3152 perf_output_put(handle, lost_event);
3018 } 3153 }
@@ -3020,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle,
3020 return 0; 3155 return 0;
3021 3156
3022fail: 3157fail:
3023 atomic_inc(&data->lost); 3158 local_inc(&data->lost);
3024 perf_output_unlock(handle); 3159 perf_output_put_handle(handle);
3025out: 3160out:
3026 rcu_read_unlock(); 3161 rcu_read_unlock();
3027 3162
@@ -3036,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle)
3036 int wakeup_events = event->attr.wakeup_events; 3171 int wakeup_events = event->attr.wakeup_events;
3037 3172
3038 if (handle->sample && wakeup_events) { 3173 if (handle->sample && wakeup_events) {
3039 int events = atomic_inc_return(&data->events); 3174 int events = local_inc_return(&data->events);
3040 if (events >= wakeup_events) { 3175 if (events >= wakeup_events) {
3041 atomic_sub(wakeup_events, &data->events); 3176 local_sub(wakeup_events, &data->events);
3042 atomic_set(&data->wakeup, 1); 3177 local_inc(&data->wakeup);
3043 } 3178 }
3044 } 3179 }
3045 3180
3046 perf_output_unlock(handle); 3181 perf_output_put_handle(handle);
3047 rcu_read_unlock(); 3182 rcu_read_unlock();
3048} 3183}
3049 3184
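The rewritten handle is still driven by the same three-call pattern every record emitter in this file uses; perf_output_put() remains the sizeof()-based wrapper around perf_output_copy(). A sketch of a writer (record type chosen arbitrarily):

	static void emit_header_only_record(struct perf_event *event, int nmi)
	{
		struct perf_output_handle handle;
		struct perf_event_header header = {
			.type = PERF_RECORD_COMM,
			.misc = 0,
			.size = sizeof(header),
		};

		/* reserves 'size' bytes; fails if there is no buffer or no room */
		if (perf_output_begin(&handle, event, header.size, nmi, 0))
			return;

		perf_output_put(&handle, header);	/* copies through handle->addr/size */
		perf_output_end(&handle);		/* may publish data_head and wake readers */
	}

Between begin and end the writer owns its reserved [offset, offset+size) range exclusively, so the only global ordering left is the data_head publication handled by perf_output_put_handle() above.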
@@ -3378,9 +3513,8 @@ static void perf_event_task_output(struct perf_event *event,
3378 struct perf_task_event *task_event) 3513 struct perf_task_event *task_event)
3379{ 3514{
3380 struct perf_output_handle handle; 3515 struct perf_output_handle handle;
3381 int size;
3382 struct task_struct *task = task_event->task; 3516 struct task_struct *task = task_event->task;
3383 int ret; 3517 int size, ret;
3384 3518
3385 size = task_event->event_id.header.size; 3519 size = task_event->event_id.header.size;
3386 ret = perf_output_begin(&handle, event, size, 0, 0); 3520 ret = perf_output_begin(&handle, event, size, 0, 0);
@@ -3736,7 +3870,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3736 .event_id = { 3870 .event_id = {
3737 .header = { 3871 .header = {
3738 .type = PERF_RECORD_MMAP, 3872 .type = PERF_RECORD_MMAP,
3739 .misc = 0, 3873 .misc = PERF_RECORD_MISC_USER,
3740 /* .size */ 3874 /* .size */
3741 }, 3875 },
3742 /* .pid */ 3876 /* .pid */
@@ -3924,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
3924 } 4058 }
3925} 4059}
3926 4060
3927static void perf_swevent_unthrottle(struct perf_event *event)
3928{
3929 /*
3930 * Nothing to do, we already reset hwc->interrupts.
3931 */
3932}
3933
3934static void perf_swevent_add(struct perf_event *event, u64 nr, 4061static void perf_swevent_add(struct perf_event *event, u64 nr,
3935 int nmi, struct perf_sample_data *data, 4062 int nmi, struct perf_sample_data *data,
3936 struct pt_regs *regs) 4063 struct pt_regs *regs)
@@ -3954,39 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr,
3954 perf_swevent_overflow(event, 0, nmi, data, regs); 4081 perf_swevent_overflow(event, 0, nmi, data, regs);
3955} 4082}
3956 4083
3957static int perf_swevent_is_counting(struct perf_event *event)
3958{
3959 /*
3960 * The event is active, we're good!
3961 */
3962 if (event->state == PERF_EVENT_STATE_ACTIVE)
3963 return 1;
3964
3965 /*
3966 * The event is off/error, not counting.
3967 */
3968 if (event->state != PERF_EVENT_STATE_INACTIVE)
3969 return 0;
3970
3971 /*
3972 * The event is inactive, if the context is active
3973 * we're part of a group that didn't make it on the 'pmu',
3974 * not counting.
3975 */
3976 if (event->ctx->is_active)
3977 return 0;
3978
3979 /*
3980 * We're inactive and the context is too, this means the
3981 * task is scheduled out, we're counting events that happen
3982 * to us, like migration events.
3983 */
3984 return 1;
3985}
3986
3987static int perf_tp_event_match(struct perf_event *event,
3988 struct perf_sample_data *data);
3989
3990static int perf_exclude_event(struct perf_event *event, 4084static int perf_exclude_event(struct perf_event *event,
3991 struct pt_regs *regs) 4085 struct pt_regs *regs)
3992{ 4086{
@@ -4007,12 +4101,6 @@ static int perf_swevent_match(struct perf_event *event,
4007 struct perf_sample_data *data, 4101 struct perf_sample_data *data,
4008 struct pt_regs *regs) 4102 struct pt_regs *regs)
4009{ 4103{
4010 if (event->cpu != -1 && event->cpu != smp_processor_id())
4011 return 0;
4012
4013 if (!perf_swevent_is_counting(event))
4014 return 0;
4015
4016 if (event->attr.type != type) 4104 if (event->attr.type != type)
4017 return 0; 4105 return 0;
4018 4106
@@ -4022,30 +4110,88 @@ static int perf_swevent_match(struct perf_event *event,
4022 if (perf_exclude_event(event, regs)) 4110 if (perf_exclude_event(event, regs))
4023 return 0; 4111 return 0;
4024 4112
4025 if (event->attr.type == PERF_TYPE_TRACEPOINT &&
4026 !perf_tp_event_match(event, data))
4027 return 0;
4028
4029 return 1; 4113 return 1;
4030} 4114}
4031 4115
4032static void perf_swevent_ctx_event(struct perf_event_context *ctx, 4116static inline u64 swevent_hash(u64 type, u32 event_id)
4033 enum perf_type_id type, 4117{
4034 u32 event_id, u64 nr, int nmi, 4118 u64 val = event_id | (type << 32);
4035 struct perf_sample_data *data, 4119
4036 struct pt_regs *regs) 4120 return hash_64(val, SWEVENT_HLIST_BITS);
4121}
4122
4123static inline struct hlist_head *
4124__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
4125{
4126 u64 hash = swevent_hash(type, event_id);
4127
4128 return &hlist->heads[hash];
4129}
4130
4131/* For the read side: events when they trigger */
4132static inline struct hlist_head *
4133find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id)
4134{
4135 struct swevent_hlist *hlist;
4136
4137 hlist = rcu_dereference(ctx->swevent_hlist);
4138 if (!hlist)
4139 return NULL;
4140
4141 return __find_swevent_head(hlist, type, event_id);
4142}
4143
4144/* For the event head insertion and removal in the hlist */
4145static inline struct hlist_head *
4146find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event)
4147{
4148 struct swevent_hlist *hlist;
4149 u32 event_id = event->attr.config;
4150 u64 type = event->attr.type;
4151
4152 /*
4153 * Event scheduling is always serialized against hlist allocation
4154 * and release. Which makes the protected version suitable here.
4155 * The context lock guarantees that.
4156 */
4157 hlist = rcu_dereference_protected(ctx->swevent_hlist,
4158 lockdep_is_held(&event->ctx->lock));
4159 if (!hlist)
4160 return NULL;
4161
4162 return __find_swevent_head(hlist, type, event_id);
4163}
4164
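The hash above indexes a per-CPU table whose shape is defined by the same patch; roughly (sizes quoted from memory, treat as illustrative):

	#define SWEVENT_HLIST_BITS	8
	#define SWEVENT_HLIST_SIZE	(1 << SWEVENT_HLIST_BITS)

	struct swevent_hlist {
		struct hlist_head	heads[SWEVENT_HLIST_SIZE];
		struct rcu_head		rcu_head;	/* table is freed via RCU */
	};

Each perf_cpu_context carries one such table in cpuctx->swevent_hlist, so firing a software event hashes (type, config) and walks a single bucket under rcu_read_lock(), instead of scanning every event on the context lists the way the deleted perf_swevent_ctx_event() did.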
4165static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4166 u64 nr, int nmi,
4167 struct perf_sample_data *data,
4168 struct pt_regs *regs)
4037{ 4169{
4170 struct perf_cpu_context *cpuctx;
4038 struct perf_event *event; 4171 struct perf_event *event;
4172 struct hlist_node *node;
4173 struct hlist_head *head;
4039 4174
4040 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 4175 cpuctx = &__get_cpu_var(perf_cpu_context);
4176
4177 rcu_read_lock();
4178
4179 head = find_swevent_head_rcu(cpuctx, type, event_id);
4180
4181 if (!head)
4182 goto end;
4183
4184 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4041 if (perf_swevent_match(event, type, event_id, data, regs)) 4185 if (perf_swevent_match(event, type, event_id, data, regs))
4042 perf_swevent_add(event, nr, nmi, data, regs); 4186 perf_swevent_add(event, nr, nmi, data, regs);
4043 } 4187 }
4188end:
4189 rcu_read_unlock();
4044} 4190}
4045 4191
4046int perf_swevent_get_recursion_context(void) 4192int perf_swevent_get_recursion_context(void)
4047{ 4193{
4048 struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); 4194 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4049 int rctx; 4195 int rctx;
4050 4196
4051 if (in_nmi()) 4197 if (in_nmi())
@@ -4057,10 +4203,8 @@ int perf_swevent_get_recursion_context(void)
4057 else 4203 else
4058 rctx = 0; 4204 rctx = 0;
4059 4205
4060 if (cpuctx->recursion[rctx]) { 4206 if (cpuctx->recursion[rctx])
4061 put_cpu_var(perf_cpu_context);
4062 return -1; 4207 return -1;
4063 }
4064 4208
4065 cpuctx->recursion[rctx]++; 4209 cpuctx->recursion[rctx]++;
4066 barrier(); 4210 barrier();
@@ -4074,31 +4218,9 @@ void perf_swevent_put_recursion_context(int rctx)
4074 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 4218 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
4075 barrier(); 4219 barrier();
4076 cpuctx->recursion[rctx]--; 4220 cpuctx->recursion[rctx]--;
4077 put_cpu_var(perf_cpu_context);
4078} 4221}
4079EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); 4222EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context);
4080 4223
4081static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
4082 u64 nr, int nmi,
4083 struct perf_sample_data *data,
4084 struct pt_regs *regs)
4085{
4086 struct perf_cpu_context *cpuctx;
4087 struct perf_event_context *ctx;
4088
4089 cpuctx = &__get_cpu_var(perf_cpu_context);
4090 rcu_read_lock();
4091 perf_swevent_ctx_event(&cpuctx->ctx, type, event_id,
4092 nr, nmi, data, regs);
4093 /*
4094 * doesn't really matter which of the child contexts the
4095 * events ends up in.
4096 */
4097 ctx = rcu_dereference(current->perf_event_ctxp);
4098 if (ctx)
4099 perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs);
4100 rcu_read_unlock();
4101}
4102 4224
4103void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4225void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4104 struct pt_regs *regs, u64 addr) 4226 struct pt_regs *regs, u64 addr)
@@ -4106,6 +4228,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4106 struct perf_sample_data data; 4228 struct perf_sample_data data;
4107 int rctx; 4229 int rctx;
4108 4230
4231 preempt_disable_notrace();
4109 rctx = perf_swevent_get_recursion_context(); 4232 rctx = perf_swevent_get_recursion_context();
4110 if (rctx < 0) 4233 if (rctx < 0)
4111 return; 4234 return;
@@ -4115,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
4115 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4238 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
4116 4239
4117 perf_swevent_put_recursion_context(rctx); 4240 perf_swevent_put_recursion_context(rctx);
4241 preempt_enable_notrace();
4118} 4242}
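
With get_cpu_var()/put_cpu_var() dropped from the recursion helpers, the caller now brackets the whole sequence with preempt_disable_notrace()/preempt_enable_notrace(), and the per-CPU recursion counters simply reject a software event raised from inside the handling of another one in the same context (task, softirq, hardirq or NMI, per the in_nmi() test above). A minimal single-threaded sketch of that guard, ignoring per-CPU placement (all names hypothetical):

#include <stdio.h>

enum { CTX_TASK, CTX_SOFTIRQ, CTX_HARDIRQ, CTX_NMI, NR_CTX };

static int recursion[NR_CTX];            /* a per-CPU array in the kernel */

/* Returns the context index on success, -1 if this context is already inside. */
static int get_recursion_context(int ctx)
{
    if (recursion[ctx])
        return -1;                       /* same path re-entered: drop the event */
    recursion[ctx]++;
    return ctx;
}

static void put_recursion_context(int rctx)
{
    recursion[rctx]--;
}

static void emit_sw_event(int ctx)
{
    /* In the kernel, preemption is disabled around this whole block so the
     * per-CPU counters cannot change CPU between get and put. */
    int rctx = get_recursion_context(ctx);
    if (rctx < 0) {
        printf("ctx %d: recursive event dropped\n", ctx);
        return;
    }
    printf("ctx %d: event counted\n", ctx);
    /* A nested event from the same context is rejected: */
    if (get_recursion_context(ctx) < 0)
        printf("ctx %d: nested attempt rejected\n", ctx);
    put_recursion_context(rctx);
}

int main(void)
{
    emit_sw_event(CTX_TASK);
    emit_sw_event(CTX_NMI);
    return 0;
}
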
4119 4243
4120static void perf_swevent_read(struct perf_event *event) 4244static void perf_swevent_read(struct perf_event *event)
@@ -4124,23 +4248,46 @@ static void perf_swevent_read(struct perf_event *event)
4124static int perf_swevent_enable(struct perf_event *event) 4248static int perf_swevent_enable(struct perf_event *event)
4125{ 4249{
4126 struct hw_perf_event *hwc = &event->hw; 4250 struct hw_perf_event *hwc = &event->hw;
4251 struct perf_cpu_context *cpuctx;
4252 struct hlist_head *head;
4253
4254 cpuctx = &__get_cpu_var(perf_cpu_context);
4127 4255
4128 if (hwc->sample_period) { 4256 if (hwc->sample_period) {
4129 hwc->last_period = hwc->sample_period; 4257 hwc->last_period = hwc->sample_period;
4130 perf_swevent_set_period(event); 4258 perf_swevent_set_period(event);
4131 } 4259 }
4260
4261 head = find_swevent_head(cpuctx, event);
4262 if (WARN_ON_ONCE(!head))
4263 return -EINVAL;
4264
4265 hlist_add_head_rcu(&event->hlist_entry, head);
4266
4132 return 0; 4267 return 0;
4133} 4268}
4134 4269
4135static void perf_swevent_disable(struct perf_event *event) 4270static void perf_swevent_disable(struct perf_event *event)
4136{ 4271{
4272 hlist_del_rcu(&event->hlist_entry);
4273}
4274
4275static void perf_swevent_void(struct perf_event *event)
4276{
4277}
4278
4279static int perf_swevent_int(struct perf_event *event)
4280{
4281 return 0;
4137} 4282}
4138 4283
4139static const struct pmu perf_ops_generic = { 4284static const struct pmu perf_ops_generic = {
4140 .enable = perf_swevent_enable, 4285 .enable = perf_swevent_enable,
4141 .disable = perf_swevent_disable, 4286 .disable = perf_swevent_disable,
4287 .start = perf_swevent_int,
4288 .stop = perf_swevent_void,
4142 .read = perf_swevent_read, 4289 .read = perf_swevent_read,
4143 .unthrottle = perf_swevent_unthrottle, 4290 .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */
4144}; 4291};
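
perf_swevent_void() and perf_swevent_int() exist only to fill the new .start/.stop/.unthrottle slots with harmless defaults, so callers of the pmu ops never test the pointers. A small sketch of the same pattern, an operations table whose optional hooks point at explicit no-ops rather than NULL (hypothetical names throughout):

#include <stdio.h>

struct counter;                              /* opaque to the ops user */

struct counter_ops {
    int  (*enable)(struct counter *);
    void (*disable)(struct counter *);
    int  (*start)(struct counter *);         /* optional hooks ... */
    void (*stop)(struct counter *);          /* ... filled with no-ops */
};

struct counter { const struct counter_ops *ops; long value; };

static int  real_enable(struct counter *c)  { c->value = 0; return 0; }
static void real_disable(struct counter *c) { (void)c; }

/* Explicit no-ops: the dispatcher can always call ops->start()/ops->stop(). */
static int  noop_int(struct counter *c)  { (void)c; return 0; }
static void noop_void(struct counter *c) { (void)c; }

static const struct counter_ops simple_ops = {
    .enable  = real_enable,
    .disable = real_disable,
    .start   = noop_int,
    .stop    = noop_void,
};

int main(void)
{
    struct counter c = { .ops = &simple_ops };
    c.ops->enable(&c);
    c.ops->start(&c);        /* no NULL check needed anywhere */
    c.ops->stop(&c);
    c.ops->disable(&c);
    printf("value = %ld\n", c.value);
    return 0;
}
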
4145 4292
4146/* 4293/*
@@ -4161,15 +4308,8 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
4161 perf_sample_data_init(&data, 0); 4308 perf_sample_data_init(&data, 0);
4162 data.period = event->hw.last_period; 4309 data.period = event->hw.last_period;
4163 regs = get_irq_regs(); 4310 regs = get_irq_regs();
4164 /*
4165 * In case we exclude kernel IPs or are somehow not in interrupt
4166 * context, provide the next best thing, the user IP.
4167 */
4168 if ((event->attr.exclude_kernel || !regs) &&
4169 !event->attr.exclude_user)
4170 regs = task_pt_regs(current);
4171 4311
4172 if (regs) { 4312 if (regs && !perf_exclude_event(event, regs)) {
4173 if (!(event->attr.exclude_idle && current->pid == 0)) 4313 if (!(event->attr.exclude_idle && current->pid == 0))
4174 if (perf_event_overflow(event, 0, &data, regs)) 4314 if (perf_event_overflow(event, 0, &data, regs))
4175 ret = HRTIMER_NORESTART; 4315 ret = HRTIMER_NORESTART;
@@ -4317,27 +4457,124 @@ static const struct pmu perf_ops_task_clock = {
4317 .read = task_clock_perf_event_read, 4457 .read = task_clock_perf_event_read,
4318}; 4458};
4319 4459
4320#ifdef CONFIG_EVENT_TRACING 4460/* Deref the hlist from the update side */
4461static inline struct swevent_hlist *
4462swevent_hlist_deref(struct perf_cpu_context *cpuctx)
4463{
4464 return rcu_dereference_protected(cpuctx->swevent_hlist,
4465 lockdep_is_held(&cpuctx->hlist_mutex));
4466}
4321 4467
4322void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4468static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
4323 int entry_size, struct pt_regs *regs)
4324{ 4469{
4325 struct perf_sample_data data; 4470 struct swevent_hlist *hlist;
4326 struct perf_raw_record raw = {
4327 .size = entry_size,
4328 .data = record,
4329 };
4330 4471
4331 perf_sample_data_init(&data, addr); 4472 hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
4332 data.raw = &raw; 4473 kfree(hlist);
4474}
4475
4476static void swevent_hlist_release(struct perf_cpu_context *cpuctx)
4477{
4478 struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx);
4333 4479
4334 /* Trace events already protected against recursion */ 4480 if (!hlist)
4335 do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, 4481 return;
4336 &data, regs); 4482
4483 rcu_assign_pointer(cpuctx->swevent_hlist, NULL);
4484 call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
4337} 4485}
4338EXPORT_SYMBOL_GPL(perf_tp_event);
4339 4486
4340static int perf_tp_event_match(struct perf_event *event, 4487static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
4488{
4489 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4490
4491 mutex_lock(&cpuctx->hlist_mutex);
4492
4493 if (!--cpuctx->hlist_refcount)
4494 swevent_hlist_release(cpuctx);
4495
4496 mutex_unlock(&cpuctx->hlist_mutex);
4497}
4498
4499static void swevent_hlist_put(struct perf_event *event)
4500{
4501 int cpu;
4502
4503 if (event->cpu != -1) {
4504 swevent_hlist_put_cpu(event, event->cpu);
4505 return;
4506 }
4507
4508 for_each_possible_cpu(cpu)
4509 swevent_hlist_put_cpu(event, cpu);
4510}
4511
4512static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
4513{
4514 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
4515 int err = 0;
4516
4517 mutex_lock(&cpuctx->hlist_mutex);
4518
4519 if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) {
4520 struct swevent_hlist *hlist;
4521
4522 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
4523 if (!hlist) {
4524 err = -ENOMEM;
4525 goto exit;
4526 }
4527 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
4528 }
4529 cpuctx->hlist_refcount++;
4530 exit:
4531 mutex_unlock(&cpuctx->hlist_mutex);
4532
4533 return err;
4534}
4535
4536static int swevent_hlist_get(struct perf_event *event)
4537{
4538 int err;
4539 int cpu, failed_cpu;
4540
4541 if (event->cpu != -1)
4542 return swevent_hlist_get_cpu(event, event->cpu);
4543
4544 get_online_cpus();
4545 for_each_possible_cpu(cpu) {
4546 err = swevent_hlist_get_cpu(event, cpu);
4547 if (err) {
4548 failed_cpu = cpu;
4549 goto fail;
4550 }
4551 }
4552 put_online_cpus();
4553
4554 return 0;
4555 fail:
4556 for_each_possible_cpu(cpu) {
4557 if (cpu == failed_cpu)
4558 break;
4559 swevent_hlist_put_cpu(event, cpu);
4560 }
4561
4562 put_online_cpus();
4563 return err;
4564}
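
swevent_hlist_get() takes a reference on every possible CPU, lazily allocating the hlist on first use, and on the first failure walks back over the CPUs it already touched and drops those references. A compact sketch of that get-with-rollback pattern, modelled with a fixed array instead of real per-CPU data (all names hypothetical):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define NCPUS 4

struct percpu_slot {
    void *table;         /* lazily allocated, like cpuctx->swevent_hlist */
    int   refcount;
};

static struct percpu_slot slots[NCPUS];

static int slot_get(int cpu)
{
    struct percpu_slot *s = &slots[cpu];
    if (!s->table) {
        s->table = calloc(1, 64);
        if (!s->table)
            return -ENOMEM;
    }
    s->refcount++;
    return 0;
}

static void slot_put(int cpu)
{
    struct percpu_slot *s = &slots[cpu];
    if (!--s->refcount) {               /* last user: release the table */
        free(s->table);
        s->table = NULL;
    }
}

/* Take a reference on every CPU; undo the ones already taken if any step fails. */
static int slots_get_all(void)
{
    int cpu, failed_cpu = 0, err = 0;

    for (cpu = 0; cpu < NCPUS; cpu++) {
        err = slot_get(cpu);
        if (err) {
            failed_cpu = cpu;
            goto rollback;
        }
    }
    return 0;

rollback:
    for (cpu = 0; cpu < failed_cpu; cpu++)
        slot_put(cpu);
    return err;
}

int main(void)
{
    if (slots_get_all() == 0)
        printf("all %d slots referenced\n", NCPUS);
    for (int cpu = 0; cpu < NCPUS; cpu++)
        slot_put(cpu);
    return 0;
}
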
4565
4566#ifdef CONFIG_EVENT_TRACING
4567
4568static const struct pmu perf_ops_tracepoint = {
4569 .enable = perf_trace_enable,
4570 .disable = perf_trace_disable,
4571 .start = perf_swevent_int,
4572 .stop = perf_swevent_void,
4573 .read = perf_swevent_read,
4574 .unthrottle = perf_swevent_void,
4575};
4576
4577static int perf_tp_filter_match(struct perf_event *event,
4341 struct perf_sample_data *data) 4578 struct perf_sample_data *data)
4342{ 4579{
4343 void *record = data->raw->data; 4580 void *record = data->raw->data;
@@ -4347,13 +4584,55 @@ static int perf_tp_event_match(struct perf_event *event,
4347 return 0; 4584 return 0;
4348} 4585}
4349 4586
4587static int perf_tp_event_match(struct perf_event *event,
4588 struct perf_sample_data *data,
4589 struct pt_regs *regs)
4590{
4591 /*
4592 * All tracepoints are from kernel-space.
4593 */
4594 if (event->attr.exclude_kernel)
4595 return 0;
4596
4597 if (!perf_tp_filter_match(event, data))
4598 return 0;
4599
4600 return 1;
4601}
4602
4603void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
4604 struct pt_regs *regs, struct hlist_head *head)
4605{
4606 struct perf_sample_data data;
4607 struct perf_event *event;
4608 struct hlist_node *node;
4609
4610 struct perf_raw_record raw = {
4611 .size = entry_size,
4612 .data = record,
4613 };
4614
4615 perf_sample_data_init(&data, addr);
4616 data.raw = &raw;
4617
4618 rcu_read_lock();
4619 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
4620 if (perf_tp_event_match(event, &data, regs))
4621 perf_swevent_add(event, count, 1, &data, regs);
4622 }
4623 rcu_read_unlock();
4624}
4625EXPORT_SYMBOL_GPL(perf_tp_event);
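
perf_tp_event() now receives the hlist head from the tracepoint side and applies two cheap predicates per event: tracepoints always originate in the kernel, so exclude_kernel rejects them, and the optional event filter must accept the raw record. A trivial sketch of that match step (the filter callback here is a stand-in, not the kernel's filter engine):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

struct tp_event {
    bool exclude_kernel;                       /* attr.exclude_kernel analogue */
    bool (*filter)(const void *rec, int len);  /* optional record filter */
};

static bool tp_event_match(const struct tp_event *ev, const void *rec, int len)
{
    if (ev->exclude_kernel)        /* all tracepoints are kernel-space */
        return false;
    if (ev->filter && !ev->filter(rec, len))
        return false;
    return true;
}

static bool only_foo(const void *rec, int len)
{
    return len >= 3 && memcmp(rec, "foo", 3) == 0;
}

int main(void)
{
    struct tp_event plain  = { .exclude_kernel = false };
    struct tp_event strict = { .exclude_kernel = false, .filter = only_foo };
    struct tp_event user   = { .exclude_kernel = true };

    printf("%d %d %d\n",
           tp_event_match(&plain,  "bar", 3),   /* 1: no filter */
           tp_event_match(&strict, "bar", 3),   /* 0: filter rejects */
           tp_event_match(&user,   "foo", 3));  /* 0: kernel excluded */
    return 0;
}
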
4626
4350static void tp_perf_event_destroy(struct perf_event *event) 4627static void tp_perf_event_destroy(struct perf_event *event)
4351{ 4628{
4352 perf_trace_disable(event->attr.config); 4629 perf_trace_destroy(event);
4353} 4630}
4354 4631
4355static const struct pmu *tp_perf_event_init(struct perf_event *event) 4632static const struct pmu *tp_perf_event_init(struct perf_event *event)
4356{ 4633{
4634 int err;
4635
4357 /* 4636 /*
4358 * Raw tracepoint data is a severe data leak, only allow root to 4637 * Raw tracepoint data is a severe data leak, only allow root to
4359 * have these. 4638 * have these.
@@ -4363,12 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
4363 !capable(CAP_SYS_ADMIN)) 4642 !capable(CAP_SYS_ADMIN))
4364 return ERR_PTR(-EPERM); 4643 return ERR_PTR(-EPERM);
4365 4644
4366 if (perf_trace_enable(event->attr.config)) 4645 err = perf_trace_init(event);
4646 if (err)
4367 return NULL; 4647 return NULL;
4368 4648
4369 event->destroy = tp_perf_event_destroy; 4649 event->destroy = tp_perf_event_destroy;
4370 4650
4371 return &perf_ops_generic; 4651 return &perf_ops_tracepoint;
4372} 4652}
4373 4653
4374static int perf_event_set_filter(struct perf_event *event, void __user *arg) 4654static int perf_event_set_filter(struct perf_event *event, void __user *arg)
@@ -4396,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event)
4396 4676
4397#else 4677#else
4398 4678
4399static int perf_tp_event_match(struct perf_event *event,
4400 struct perf_sample_data *data)
4401{
4402 return 1;
4403}
4404
4405static const struct pmu *tp_perf_event_init(struct perf_event *event) 4679static const struct pmu *tp_perf_event_init(struct perf_event *event)
4406{ 4680{
4407 return NULL; 4681 return NULL;
@@ -4467,6 +4741,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
4467 WARN_ON(event->parent); 4741 WARN_ON(event->parent);
4468 4742
4469 atomic_dec(&perf_swevent_enabled[event_id]); 4743 atomic_dec(&perf_swevent_enabled[event_id]);
4744 swevent_hlist_put(event);
4470} 4745}
4471 4746
4472static const struct pmu *sw_perf_event_init(struct perf_event *event) 4747static const struct pmu *sw_perf_event_init(struct perf_event *event)
@@ -4505,6 +4780,12 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
4505 case PERF_COUNT_SW_ALIGNMENT_FAULTS: 4780 case PERF_COUNT_SW_ALIGNMENT_FAULTS:
4506 case PERF_COUNT_SW_EMULATION_FAULTS: 4781 case PERF_COUNT_SW_EMULATION_FAULTS:
4507 if (!event->parent) { 4782 if (!event->parent) {
4783 int err;
4784
4785 err = swevent_hlist_get(event);
4786 if (err)
4787 return ERR_PTR(err);
4788
4508 atomic_inc(&perf_swevent_enabled[event_id]); 4789 atomic_inc(&perf_swevent_enabled[event_id]);
4509 event->destroy = sw_perf_event_destroy; 4790 event->destroy = sw_perf_event_destroy;
4510 } 4791 }
@@ -4723,54 +5004,53 @@ err_size:
4723 goto out; 5004 goto out;
4724} 5005}
4725 5006
4726static int perf_event_set_output(struct perf_event *event, int output_fd) 5007static int
5008perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
4727{ 5009{
4728 struct perf_event *output_event = NULL; 5010 struct perf_mmap_data *data = NULL, *old_data = NULL;
4729 struct file *output_file = NULL;
4730 struct perf_event *old_output;
4731 int fput_needed = 0;
4732 int ret = -EINVAL; 5011 int ret = -EINVAL;
4733 5012
4734 if (!output_fd) 5013 if (!output_event)
4735 goto set; 5014 goto set;
4736 5015
4737 output_file = fget_light(output_fd, &fput_needed); 5016 /* don't allow circular references */
4738 if (!output_file) 5017 if (event == output_event)
4739 return -EBADF;
4740
4741 if (output_file->f_op != &perf_fops)
4742 goto out; 5018 goto out;
4743 5019
4744 output_event = output_file->private_data; 5020 /*
4745 5021 * Don't allow cross-cpu buffers
4746 /* Don't chain output fds */ 5022 */
4747 if (output_event->output) 5023 if (output_event->cpu != event->cpu)
4748 goto out; 5024 goto out;
4749 5025
4750 /* Don't set an output fd when we already have an output channel */ 5026 /*
4751 if (event->data) 5027 * If its not a per-cpu buffer, it must be the same task.
5028 */
5029 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
4752 goto out; 5030 goto out;
4753 5031
4754 atomic_long_inc(&output_file->f_count);
4755
4756set: 5032set:
4757 mutex_lock(&event->mmap_mutex); 5033 mutex_lock(&event->mmap_mutex);
4758 old_output = event->output; 5034 /* Can't redirect output if we've got an active mmap() */
4759 rcu_assign_pointer(event->output, output_event); 5035 if (atomic_read(&event->mmap_count))
4760 mutex_unlock(&event->mmap_mutex); 5036 goto unlock;
4761 5037
4762 if (old_output) { 5038 if (output_event) {
4763 /* 5039 /* get the buffer we want to redirect to */
4764 * we need to make sure no existing perf_output_*() 5040 data = perf_mmap_data_get(output_event);
4765 * is still referencing this event. 5041 if (!data)
4766 */ 5042 goto unlock;
4767 synchronize_rcu();
4768 fput(old_output->filp);
4769 } 5043 }
4770 5044
5045 old_data = event->data;
5046 rcu_assign_pointer(event->data, data);
4771 ret = 0; 5047 ret = 0;
5048unlock:
5049 mutex_unlock(&event->mmap_mutex);
5050
5051 if (old_data)
5052 perf_mmap_data_put(old_data);
4772out: 5053out:
4773 fput_light(output_file, fput_needed);
4774 return ret; 5054 return ret;
4775} 5055}
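
The rewritten perf_event_set_output() drops the fd plumbing and validates the redirection directly on perf_event pointers: no self-reference, same CPU, same context for per-task buffers, and no active mmap() on the event being redirected, before swapping the buffer reference. A stripped-down sketch of that validation order under hypothetical types:

#include <stdio.h>
#include <errno.h>

struct ring { int dummy; };              /* stands in for struct perf_mmap_data */
struct ctx  { int dummy; };              /* stands in for struct perf_event_context */

struct event {
    int          cpu;                    /* -1 means "per task, any CPU" */
    struct ctx  *ctx;
    struct ring *buffer;
    int          mmap_count;             /* active mmap() users of the buffer */
};

static int set_output(struct event *ev, struct event *out)
{
    if (out) {
        if (ev == out)                   /* no circular redirection */
            return -EINVAL;
        if (out->cpu != ev->cpu)         /* buffers are per CPU */
            return -EINVAL;
        if (out->cpu == -1 && out->ctx != ev->ctx)
            return -EINVAL;              /* per-task buffer: same task only */
    }
    if (ev->mmap_count)                  /* can't swap under an active mmap() */
        return -EINVAL;

    ev->buffer = out ? out->buffer : NULL;   /* the kernel also moves a refcount */
    return 0;
}

int main(void)
{
    struct ctx c1, c2;
    struct ring r = { 0 };
    struct event a = { .cpu = -1, .ctx = &c1, .buffer = &r };
    struct event b = { .cpu = -1, .ctx = &c2 };

    printf("self: %d, cross-task: %d\n", set_output(&a, &a), set_output(&b, &a));
    b.ctx = &c1;
    printf("same task: %d\n", set_output(&b, &a));   /* 0 */
    return 0;
}
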
4776 5056
@@ -4786,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open,
4786 struct perf_event_attr __user *, attr_uptr, 5066 struct perf_event_attr __user *, attr_uptr,
4787 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) 5067 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
4788{ 5068{
4789 struct perf_event *event, *group_leader; 5069 struct perf_event *event, *group_leader = NULL, *output_event = NULL;
4790 struct perf_event_attr attr; 5070 struct perf_event_attr attr;
4791 struct perf_event_context *ctx; 5071 struct perf_event_context *ctx;
4792 struct file *event_file = NULL; 5072 struct file *event_file = NULL;
4793 struct file *group_file = NULL; 5073 struct file *group_file = NULL;
5074 int event_fd;
4794 int fput_needed = 0; 5075 int fput_needed = 0;
4795 int fput_needed2 = 0;
4796 int err; 5076 int err;
4797 5077
4798 /* for future expandability... */ 5078 /* for future expandability... */
@@ -4813,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open,
4813 return -EINVAL; 5093 return -EINVAL;
4814 } 5094 }
4815 5095
5096 event_fd = get_unused_fd_flags(O_RDWR);
5097 if (event_fd < 0)
5098 return event_fd;
5099
4816 /* 5100 /*
4817 * Get the target context (task or percpu): 5101 * Get the target context (task or percpu):
4818 */ 5102 */
4819 ctx = find_get_context(pid, cpu); 5103 ctx = find_get_context(pid, cpu);
4820 if (IS_ERR(ctx)) 5104 if (IS_ERR(ctx)) {
4821 return PTR_ERR(ctx); 5105 err = PTR_ERR(ctx);
5106 goto err_fd;
5107 }
5108
5109 if (group_fd != -1) {
5110 group_leader = perf_fget_light(group_fd, &fput_needed);
5111 if (IS_ERR(group_leader)) {
5112 err = PTR_ERR(group_leader);
5113 goto err_put_context;
5114 }
5115 group_file = group_leader->filp;
5116 if (flags & PERF_FLAG_FD_OUTPUT)
5117 output_event = group_leader;
5118 if (flags & PERF_FLAG_FD_NO_GROUP)
5119 group_leader = NULL;
5120 }
4822 5121
4823 /* 5122 /*
4824 * Look up the group leader (we will attach this event to it): 5123 * Look up the group leader (we will attach this event to it):
4825 */ 5124 */
4826 group_leader = NULL; 5125 if (group_leader) {
4827 if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) {
4828 err = -EINVAL; 5126 err = -EINVAL;
4829 group_file = fget_light(group_fd, &fput_needed);
4830 if (!group_file)
4831 goto err_put_context;
4832 if (group_file->f_op != &perf_fops)
4833 goto err_put_context;
4834 5127
4835 group_leader = group_file->private_data;
4836 /* 5128 /*
4837 * Do not allow a recursive hierarchy (this new sibling 5129 * Do not allow a recursive hierarchy (this new sibling
4838 * becoming part of another group-sibling): 5130 * becoming part of another group-sibling):
@@ -4854,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open,
4854 5146
4855 event = perf_event_alloc(&attr, cpu, ctx, group_leader, 5147 event = perf_event_alloc(&attr, cpu, ctx, group_leader,
4856 NULL, NULL, GFP_KERNEL); 5148 NULL, NULL, GFP_KERNEL);
4857 err = PTR_ERR(event); 5149 if (IS_ERR(event)) {
4858 if (IS_ERR(event)) 5150 err = PTR_ERR(event);
4859 goto err_put_context; 5151 goto err_put_context;
5152 }
4860 5153
4861 err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); 5154 if (output_event) {
4862 if (err < 0) 5155 err = perf_event_set_output(event, output_event);
4863 goto err_free_put_context; 5156 if (err)
5157 goto err_free_put_context;
5158 }
4864 5159
4865 event_file = fget_light(err, &fput_needed2); 5160 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
4866 if (!event_file) 5161 if (IS_ERR(event_file)) {
5162 err = PTR_ERR(event_file);
4867 goto err_free_put_context; 5163 goto err_free_put_context;
4868
4869 if (flags & PERF_FLAG_FD_OUTPUT) {
4870 err = perf_event_set_output(event, group_fd);
4871 if (err)
4872 goto err_fput_free_put_context;
4873 } 5164 }
4874 5165
4875 event->filp = event_file; 5166 event->filp = event_file;
@@ -4885,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open,
4885 list_add_tail(&event->owner_entry, &current->perf_event_list); 5176 list_add_tail(&event->owner_entry, &current->perf_event_list);
4886 mutex_unlock(&current->perf_event_mutex); 5177 mutex_unlock(&current->perf_event_mutex);
4887 5178
4888err_fput_free_put_context: 5179 /*
4889 fput_light(event_file, fput_needed2); 5180 * Drop the reference on the group_event after placing the
5181 * new event on the sibling_list. This ensures destruction
5182 * of the group leader will find the pointer to itself in
5183 * perf_group_detach().
5184 */
5185 fput_light(group_file, fput_needed);
5186 fd_install(event_fd, event_file);
5187 return event_fd;
4890 5188
4891err_free_put_context: 5189err_free_put_context:
4892 if (err < 0) 5190 free_event(event);
4893 kfree(event);
4894
4895err_put_context: 5191err_put_context:
4896 if (err < 0)
4897 put_ctx(ctx);
4898
4899 fput_light(group_file, fput_needed); 5192 fput_light(group_file, fput_needed);
4900 5193 put_ctx(ctx);
5194err_fd:
5195 put_unused_fd(event_fd);
4901 return err; 5196 return err;
4902} 5197}
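
The reworked syscall reserves a descriptor number with get_unused_fd_flags() first, builds the file object with anon_inode_getfile(), and only publishes the descriptor with fd_install() once nothing can fail any more; each error label unwinds exactly the resources taken before it. A user-space sketch of that reserve/build/publish-with-unwind shape (the resources and failure points are invented for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

struct resource { const char *name; };

static struct resource *take(const char *name, int fail)
{
    if (fail)
        return NULL;
    struct resource *r = malloc(sizeof(*r));
    if (r)
        r->name = name;
    return r;
}

static void drop(struct resource *r)
{
    if (r) {
        printf("released %s\n", r->name);
        free(r);
    }
}

/* Acquire in order, publish only after the last fallible step, and unwind in
 * reverse order from the point of failure. */
static int open_thing(int fail_at)
{
    struct resource *fd_slot = NULL, *context = NULL, *event = NULL;
    int err = -ENOMEM;

    fd_slot = take("fd slot", fail_at == 1);
    if (!fd_slot)
        return err;
    context = take("context", fail_at == 2);
    if (!context)
        goto err_fd;
    event = take("event", fail_at == 3);
    if (!event)
        goto err_context;

    printf("published: %s + %s + %s\n", fd_slot->name, context->name, event->name);
    drop(event); drop(context); drop(fd_slot);    /* normal teardown for the demo */
    return 0;

err_context:
    drop(context);
err_fd:
    drop(fd_slot);
    return err;
}

int main(void)
{
    open_thing(3);     /* fails at the last step: unwinds context, then fd slot */
    open_thing(0);     /* succeeds */
    return 0;
}
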
4903 5198
@@ -5169,7 +5464,7 @@ void perf_event_exit_task(struct task_struct *child)
5169 * 5464 *
5170 * But since its the parent context it won't be the same instance. 5465 * But since its the parent context it won't be the same instance.
5171 */ 5466 */
5172 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5467 mutex_lock(&child_ctx->mutex);
5173 5468
5174again: 5469again:
5175 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups, 5470 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
@@ -5208,6 +5503,7 @@ static void perf_free_event(struct perf_event *event,
5208 5503
5209 fput(parent->filp); 5504 fput(parent->filp);
5210 5505
5506 perf_group_detach(event);
5211 list_del_event(event, ctx); 5507 list_del_event(event, ctx);
5212 free_event(event); 5508 free_event(event);
5213} 5509}
@@ -5377,6 +5673,7 @@ static void __init perf_event_init_all_cpus(void)
5377 5673
5378 for_each_possible_cpu(cpu) { 5674 for_each_possible_cpu(cpu) {
5379 cpuctx = &per_cpu(perf_cpu_context, cpu); 5675 cpuctx = &per_cpu(perf_cpu_context, cpu);
5676 mutex_init(&cpuctx->hlist_mutex);
5380 __perf_event_init_context(&cpuctx->ctx, NULL); 5677 __perf_event_init_context(&cpuctx->ctx, NULL);
5381 } 5678 }
5382} 5679}
@@ -5390,6 +5687,16 @@ static void __cpuinit perf_event_init_cpu(int cpu)
5390 spin_lock(&perf_resource_lock); 5687 spin_lock(&perf_resource_lock);
5391 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; 5688 cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
5392 spin_unlock(&perf_resource_lock); 5689 spin_unlock(&perf_resource_lock);
5690
5691 mutex_lock(&cpuctx->hlist_mutex);
5692 if (cpuctx->hlist_refcount > 0) {
5693 struct swevent_hlist *hlist;
5694
5695 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5696 WARN_ON_ONCE(!hlist);
5697 rcu_assign_pointer(cpuctx->swevent_hlist, hlist);
5698 }
5699 mutex_unlock(&cpuctx->hlist_mutex);
5393} 5700}
5394 5701
5395#ifdef CONFIG_HOTPLUG_CPU 5702#ifdef CONFIG_HOTPLUG_CPU
@@ -5409,6 +5716,10 @@ static void perf_event_exit_cpu(int cpu)
5409 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 5716 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
5410 struct perf_event_context *ctx = &cpuctx->ctx; 5717 struct perf_event_context *ctx = &cpuctx->ctx;
5411 5718
5719 mutex_lock(&cpuctx->hlist_mutex);
5720 swevent_hlist_release(cpuctx);
5721 mutex_unlock(&cpuctx->hlist_mutex);
5722
5412 mutex_lock(&ctx->mutex); 5723 mutex_lock(&ctx->mutex);
5413 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); 5724 smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1);
5414 mutex_unlock(&ctx->mutex); 5725 mutex_unlock(&ctx->mutex);
diff --git a/kernel/pid.c b/kernel/pid.c
index aebb30d9c233..e9fd8c132d26 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -513,6 +513,13 @@ void __init pidhash_init(void)
513 513
514void __init pidmap_init(void) 514void __init pidmap_init(void)
515{ 515{
516 /* bump default and minimum pid_max based on number of cpus */
517 pid_max = min(pid_max_max, max_t(int, pid_max,
518 PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
519 pid_max_min = max_t(int, pid_max_min,
520 PIDS_PER_CPU_MIN * num_possible_cpus());
521 pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
522
516 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); 523 init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
517 /* Reserve PID 0. We never call free_pidmap(0) */ 524 /* Reserve PID 0. We never call free_pidmap(0) */
518 set_bit(0, init_pid_ns.pidmap[0].page); 525 set_bit(0, init_pid_ns.pidmap[0].page);
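
The pid.c hunk scales both pid_max and pid_max_min with the possible-CPU count so large machines do not recycle PIDs too quickly, while still clamping to pid_max_max. A tiny sketch of that arithmetic, assuming illustrative values for the constants (1024 default PIDs per CPU, 8 minimum, a 4M upper clamp) and typical boot defaults:

#include <stdio.h>

#define PIDS_PER_CPU_DEFAULT 1024   /* assumed values, for illustration only */
#define PIDS_PER_CPU_MIN        8
#define PID_MAX_LIMIT     (4 * 1024 * 1024)

static int max_int(int a, int b) { return a > b ? a : b; }
static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
    int pid_max = 32768, pid_max_min = 301;      /* typical boot defaults */

    for (int cpus = 1; cpus <= 4096; cpus *= 8) {
        int new_max = min_int(PID_MAX_LIMIT,
                              max_int(pid_max, PIDS_PER_CPU_DEFAULT * cpus));
        int new_min = max_int(pid_max_min, PIDS_PER_CPU_MIN * cpus);
        printf("%4d cpus -> pid_max %7d, minimum %5d\n", cpus, new_max, new_min);
    }
    return 0;
}
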
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 79aac93acf99..a5aff94e1f0b 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -13,6 +13,7 @@
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h> 15#include <linux/acct.h>
16#include <linux/slab.h>
16 17
17#define BITS_PER_PAGE (PAGE_SIZE*8) 18#define BITS_PER_PAGE (PAGE_SIZE*8)
18 19
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 3db49b9ca374..996a4dec5f96 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -2,7 +2,7 @@
2 * This module exposes the interface to kernel space for specifying 2 * This module exposes the interface to kernel space for specifying
3 * QoS dependencies. It provides infrastructure for registration of: 3 * QoS dependencies. It provides infrastructure for registration of:
4 * 4 *
5 * Dependents on a QoS value : register requirements 5 * Dependents on a QoS value : register requests
6 * Watchers of QoS value : get notified when target QoS value changes 6 * Watchers of QoS value : get notified when target QoS value changes
7 * 7 *
8 * This QoS design is best effort based. Dependents register their QoS needs. 8 * This QoS design is best effort based. Dependents register their QoS needs.
@@ -14,19 +14,21 @@
14 * timeout: usec <-- currently not used. 14 * timeout: usec <-- currently not used.
15 * throughput: kbs (kilo byte / sec) 15 * throughput: kbs (kilo byte / sec)
16 * 16 *
17 * There are lists of pm_qos_objects each one wrapping requirements, notifiers 17 * There are lists of pm_qos_objects each one wrapping requests, notifiers
18 * 18 *
19 * User mode requirements on a QOS parameter register themselves to the 19 * User mode requests on a QOS parameter register themselves to the
20 * subsystem by opening the device node /dev/... and writing there request to 20 * subsystem by opening the device node /dev/... and writing there request to
21 * the node. As long as the process holds a file handle open to the node the 21 * the node. As long as the process holds a file handle open to the node the
22 * client continues to be accounted for. Upon file release the usermode 22 * client continues to be accounted for. Upon file release the usermode
23 * requirement is removed and a new qos target is computed. This way when the 23 * request is removed and a new qos target is computed. This way when the
24 * requirement that the application has is cleaned up when closes the file 24 * request that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * Mark Gross <mgross@linux.intel.com> 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30/*#define DEBUG*/
31
30#include <linux/pm_qos_params.h> 32#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 33#include <linux/sched.h>
32#include <linux/spinlock.h> 34#include <linux/spinlock.h>
@@ -42,64 +44,53 @@
42#include <linux/uaccess.h> 44#include <linux/uaccess.h>
43 45
44/* 46/*
45 * locking rule: all changes to requirements or notifiers lists 47 * locking rule: all changes to requests or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 48 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 49 * held, taken with _irqsave. One lock to rule them all
48 */ 50 */
49struct requirement_list { 51enum pm_qos_type {
50 struct list_head list; 52 PM_QOS_MAX, /* return the largest value */
51 union { 53 PM_QOS_MIN /* return the smallest value */
52 s32 value;
53 s32 usec;
54 s32 kbps;
55 };
56 char *name;
57}; 54};
58 55
59static s32 max_compare(s32 v1, s32 v2);
60static s32 min_compare(s32 v1, s32 v2);
61
62struct pm_qos_object { 56struct pm_qos_object {
63 struct requirement_list requirements; 57 struct plist_head requests;
64 struct blocking_notifier_head *notifiers; 58 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 59 struct miscdevice pm_qos_power_miscdev;
66 char *name; 60 char *name;
67 s32 default_value; 61 s32 default_value;
68 atomic_t target_value; 62 enum pm_qos_type type;
69 s32 (*comparitor)(s32, s32);
70}; 63};
71 64
65static DEFINE_SPINLOCK(pm_qos_lock);
66
72static struct pm_qos_object null_pm_qos; 67static struct pm_qos_object null_pm_qos;
73static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 68static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
74static struct pm_qos_object cpu_dma_pm_qos = { 69static struct pm_qos_object cpu_dma_pm_qos = {
75 .requirements = {LIST_HEAD_INIT(cpu_dma_pm_qos.requirements.list)}, 70 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock),
76 .notifiers = &cpu_dma_lat_notifier, 71 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 72 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 73 .default_value = 2000 * USEC_PER_SEC,
79 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 74 .type = PM_QOS_MIN,
80 .comparitor = min_compare
81}; 75};
82 76
83static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 77static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
84static struct pm_qos_object network_lat_pm_qos = { 78static struct pm_qos_object network_lat_pm_qos = {
85 .requirements = {LIST_HEAD_INIT(network_lat_pm_qos.requirements.list)}, 79 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock),
86 .notifiers = &network_lat_notifier, 80 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 81 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 82 .default_value = 2000 * USEC_PER_SEC,
89 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), 83 .type = PM_QOS_MIN
90 .comparitor = min_compare
91}; 84};
92 85
93 86
94static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 87static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
95static struct pm_qos_object network_throughput_pm_qos = { 88static struct pm_qos_object network_throughput_pm_qos = {
96 .requirements = 89 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock),
97 {LIST_HEAD_INIT(network_throughput_pm_qos.requirements.list)},
98 .notifiers = &network_throughput_notifier, 90 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 91 .name = "network_throughput",
100 .default_value = 0, 92 .default_value = 0,
101 .target_value = ATOMIC_INIT(0), 93 .type = PM_QOS_MAX,
102 .comparitor = max_compare
103}; 94};
104 95
105 96
@@ -110,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = {
110 &network_throughput_pm_qos 101 &network_throughput_pm_qos
111}; 102};
112 103
113static DEFINE_SPINLOCK(pm_qos_lock);
114
115static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 104static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
116 size_t count, loff_t *f_pos); 105 size_t count, loff_t *f_pos);
117static int pm_qos_power_open(struct inode *inode, struct file *filp); 106static int pm_qos_power_open(struct inode *inode, struct file *filp);
@@ -123,43 +112,55 @@ static const struct file_operations pm_qos_power_fops = {
123 .release = pm_qos_power_release, 112 .release = pm_qos_power_release,
124}; 113};
125 114
126/* static helper functions */ 115/* unlocked internal variant */
127static s32 max_compare(s32 v1, s32 v2) 116static inline int pm_qos_get_value(struct pm_qos_object *o)
128{ 117{
129 return max(v1, v2); 118 if (plist_head_empty(&o->requests))
130} 119 return o->default_value;
131 120
132static s32 min_compare(s32 v1, s32 v2) 121 switch (o->type) {
133{ 122 case PM_QOS_MIN:
134 return min(v1, v2); 123 return plist_last(&o->requests)->prio;
135}
136 124
125 case PM_QOS_MAX:
126 return plist_first(&o->requests)->prio;
137 127
138static void update_target(int target) 128 default:
129 /* runtime check for not using enum */
130 BUG();
131 }
132}
133
134static void update_target(struct pm_qos_object *o, struct plist_node *node,
135 int del, int value)
139{ 136{
140 s32 extreme_value;
141 struct requirement_list *node;
142 unsigned long flags; 137 unsigned long flags;
143 int call_notifier = 0; 138 int prev_value, curr_value;
144 139
145 spin_lock_irqsave(&pm_qos_lock, flags); 140 spin_lock_irqsave(&pm_qos_lock, flags);
146 extreme_value = pm_qos_array[target]->default_value; 141 prev_value = pm_qos_get_value(o);
147 list_for_each_entry(node, 142 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */
148 &pm_qos_array[target]->requirements.list, list) { 143 if (value != PM_QOS_DEFAULT_VALUE) {
149 extreme_value = pm_qos_array[target]->comparitor( 144 /*
150 extreme_value, node->value); 145 * to change the list, we atomically remove, reinit
151 } 146 * with new value and add, then see if the extremal
152 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) { 147 * changed
153 call_notifier = 1; 148 */
154 atomic_set(&pm_qos_array[target]->target_value, extreme_value); 149 plist_del(node, &o->requests);
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 150 plist_node_init(node, value);
156 atomic_read(&pm_qos_array[target]->target_value)); 151 plist_add(node, &o->requests);
152 } else if (del) {
153 plist_del(node, &o->requests);
154 } else {
155 plist_add(node, &o->requests);
157 } 156 }
157 curr_value = pm_qos_get_value(o);
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 158 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 159
160 if (call_notifier) 160 if (prev_value != curr_value)
161 blocking_notifier_call_chain(pm_qos_array[target]->notifiers, 161 blocking_notifier_call_chain(o->notifiers,
162 (unsigned long) extreme_value, NULL); 162 (unsigned long)curr_value,
163 NULL);
163} 164}
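
With requests kept on a priority-sorted list, the aggregate QoS value is just one end of that list: the smallest request for PM_QOS_MIN classes, the largest for PM_QOS_MAX, and the class default when the list is empty. update_target() samples that extreme before and after the edit and only calls the notifier chain when it moved. A small sketch of the same aggregation over a sorted array (no plist, no locking; names are not the kernel's):

#include <stdio.h>

enum qos_type { QOS_MIN, QOS_MAX };

struct qos_class {
    enum qos_type type;
    int default_value;
    int values[16];
    int count;                       /* values[0..count) kept sorted ascending */
};

static int qos_get_value(const struct qos_class *c)
{
    if (c->count == 0)
        return c->default_value;
    return c->type == QOS_MIN ? c->values[0] : c->values[c->count - 1];
}

static void qos_add(struct qos_class *c, int v)       /* sorted insert */
{
    int i = c->count++;
    while (i > 0 && c->values[i - 1] > v) {
        c->values[i] = c->values[i - 1];
        i--;
    }
    c->values[i] = v;
}

int main(void)
{
    /* latency-style class: lowest request wins; default mirrors "no constraint" */
    struct qos_class lat = { .type = QOS_MIN, .default_value = 2000000000 };
    int before, after;

    before = qos_get_value(&lat);
    qos_add(&lat, 100);                        /* one requester asks for 100 usec */
    qos_add(&lat, 50);                         /* a tighter request wins */
    after = qos_get_value(&lat);

    if (before != after)                       /* update_target() notifies here */
        printf("latency target changed: %d -> %d\n", before, after);
    return 0;
}
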
164 165
165static int register_pm_qos_misc(struct pm_qos_object *qos) 166static int register_pm_qos_misc(struct pm_qos_object *qos)
@@ -185,125 +186,123 @@ static int find_pm_qos_object_by_minor(int minor)
185} 186}
186 187
187/** 188/**
188 * pm_qos_requirement - returns current system wide qos expectation 189 * pm_qos_request - returns current system wide qos expectation
189 * @pm_qos_class: identification of which qos value is requested 190 * @pm_qos_class: identification of which qos value is requested
190 * 191 *
191 * This function returns the current target value in an atomic manner. 192 * This function returns the current target value in an atomic manner.
192 */ 193 */
193int pm_qos_requirement(int pm_qos_class) 194int pm_qos_request(int pm_qos_class)
194{ 195{
195 return atomic_read(&pm_qos_array[pm_qos_class]->target_value); 196 unsigned long flags;
197 int value;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 value = pm_qos_get_value(pm_qos_array[pm_qos_class]);
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return value;
196} 204}
197EXPORT_SYMBOL_GPL(pm_qos_requirement); 205EXPORT_SYMBOL_GPL(pm_qos_request);
206
207int pm_qos_request_active(struct pm_qos_request_list *req)
208{
209 return req->pm_qos_class != 0;
210}
211EXPORT_SYMBOL_GPL(pm_qos_request_active);
198 212
199/** 213/**
200 * pm_qos_add_requirement - inserts new qos request into the list 214 * pm_qos_add_request - inserts new qos request into the list
201 * @pm_qos_class: identifies which list of qos request to us 215 * @pm_qos_class: identifies which list of qos request to us
202 * @name: identifies the request
203 * @value: defines the qos request 216 * @value: defines the qos request
204 * 217 *
205 * This function inserts a new entry in the pm_qos_class list of requested qos 218 * This function inserts a new entry in the pm_qos_class list of requested qos
206 * performance characteristics. It recomputes the aggregate QoS expectations 219 * performance characteristics. It recomputes the aggregate QoS expectations
207 * for the pm_qos_class of parameters. 220 * for the pm_qos_class of parameters, and returns the pm_qos_request list
 221 * element as a handle for use in updating and removal. Caller needs to save
222 * this handle for later use.
208 */ 223 */
209int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 224void pm_qos_add_request(struct pm_qos_request_list *dep,
225 int pm_qos_class, s32 value)
210{ 226{
211 struct requirement_list *dep; 227 struct pm_qos_object *o = pm_qos_array[pm_qos_class];
212 unsigned long flags; 228 int new_value;
213 229
214 dep = kzalloc(sizeof(struct requirement_list), GFP_KERNEL); 230 if (pm_qos_request_active(dep)) {
215 if (dep) { 231 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
216 if (value == PM_QOS_DEFAULT_VALUE) 232 return;
217 dep->value = pm_qos_array[pm_qos_class]->default_value;
218 else
219 dep->value = value;
220 dep->name = kstrdup(name, GFP_KERNEL);
221 if (!dep->name)
222 goto cleanup;
223
224 spin_lock_irqsave(&pm_qos_lock, flags);
225 list_add(&dep->list,
226 &pm_qos_array[pm_qos_class]->requirements.list);
227 spin_unlock_irqrestore(&pm_qos_lock, flags);
228 update_target(pm_qos_class);
229
230 return 0;
231 } 233 }
232 234 if (value == PM_QOS_DEFAULT_VALUE)
233cleanup: 235 new_value = o->default_value;
234 kfree(dep); 236 else
235 return -ENOMEM; 237 new_value = value;
238 plist_node_init(&dep->list, new_value);
239 dep->pm_qos_class = pm_qos_class;
240 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
236} 241}
237EXPORT_SYMBOL_GPL(pm_qos_add_requirement); 242EXPORT_SYMBOL_GPL(pm_qos_add_request);
238 243
239/** 244/**
240 * pm_qos_update_requirement - modifies an existing qos request 245 * pm_qos_update_request - modifies an existing qos request
241 * @pm_qos_class: identifies which list of qos request to us 246 * @pm_qos_req : handle to list element holding a pm_qos request to use
242 * @name: identifies the request
243 * @value: defines the qos request 247 * @value: defines the qos request
244 * 248 *
245 * Updates an existing qos requirement for the pm_qos_class of parameters along 249 * Updates an existing qos request for the pm_qos_class of parameters along
246 * with updating the target pm_qos_class value. 250 * with updating the target pm_qos_class value.
247 * 251 *
248 * If the named request isn't in the list then no change is made. 252 * Attempts are made to make this code callable on hot code paths.
249 */ 253 */
250int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 254void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req,
255 s32 new_value)
251{ 256{
252 unsigned long flags; 257 s32 temp;
253 struct requirement_list *node; 258 struct pm_qos_object *o;
254 int pending_update = 0;
255 259
256 spin_lock_irqsave(&pm_qos_lock, flags); 260 if (!pm_qos_req) /*guard against callers passing in null */
257 list_for_each_entry(node, 261 return;
258 &pm_qos_array[pm_qos_class]->requirements.list, list) { 262
259 if (strcmp(node->name, name) == 0) { 263 if (!pm_qos_request_active(pm_qos_req)) {
260 if (new_value == PM_QOS_DEFAULT_VALUE) 264 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
261 node->value = 265 return;
262 pm_qos_array[pm_qos_class]->default_value;
263 else
264 node->value = new_value;
265 pending_update = 1;
266 break;
267 }
268 } 266 }
269 spin_unlock_irqrestore(&pm_qos_lock, flags);
270 if (pending_update)
271 update_target(pm_qos_class);
272 267
273 return 0; 268 o = pm_qos_array[pm_qos_req->pm_qos_class];
269
270 if (new_value == PM_QOS_DEFAULT_VALUE)
271 temp = o->default_value;
272 else
273 temp = new_value;
274
275 if (temp != pm_qos_req->list.prio)
276 update_target(o, &pm_qos_req->list, 0, temp);
274} 277}
275EXPORT_SYMBOL_GPL(pm_qos_update_requirement); 278EXPORT_SYMBOL_GPL(pm_qos_update_request);
276 279
277/** 280/**
278 * pm_qos_remove_requirement - modifies an existing qos request 281 * pm_qos_remove_request - modifies an existing qos request
279 * @pm_qos_class: identifies which list of qos request to us 282 * @pm_qos_req: handle to request list element
280 * @name: identifies the request
281 * 283 *
282 * Will remove named qos request from pm_qos_class list of parameters and 284 * Will remove pm qos request from the list of requests and
283 * recompute the current target value for the pm_qos_class. 285 * recompute the current target value for the pm_qos_class. Call this
286 * on slow code paths.
284 */ 287 */
285void pm_qos_remove_requirement(int pm_qos_class, char *name) 288void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req)
286{ 289{
287 unsigned long flags; 290 struct pm_qos_object *o;
288 struct requirement_list *node;
289 int pending_update = 0;
290 291
291 spin_lock_irqsave(&pm_qos_lock, flags); 292 if (pm_qos_req == NULL)
292 list_for_each_entry(node, 293 return;
293 &pm_qos_array[pm_qos_class]->requirements.list, list) { 294 /* silent return to keep pcm code cleaner */
294 if (strcmp(node->name, name) == 0) { 295
295 kfree(node->name); 296 if (!pm_qos_request_active(pm_qos_req)) {
296 list_del(&node->list); 297 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
297 kfree(node); 298 return;
298 pending_update = 1;
299 break;
300 }
301 } 299 }
302 spin_unlock_irqrestore(&pm_qos_lock, flags); 300
303 if (pending_update) 301 o = pm_qos_array[pm_qos_req->pm_qos_class];
304 update_target(pm_qos_class); 302 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE);
303 memset(pm_qos_req, 0, sizeof(*pm_qos_req));
305} 304}
306EXPORT_SYMBOL_GPL(pm_qos_remove_requirement); 305EXPORT_SYMBOL_GPL(pm_qos_remove_request);
307 306
308/** 307/**
309 * pm_qos_add_notifier - sets notification entry for changes to target value 308 * pm_qos_add_notifier - sets notification entry for changes to target value
@@ -313,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
313 * will register the notifier into a notification chain that gets called 312 * will register the notifier into a notification chain that gets called
314 * upon changes to the pm_qos_class target value. 313 * upon changes to the pm_qos_class target value.
315 */ 314 */
316 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 315int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
317{ 316{
318 int retval; 317 int retval;
319 318
@@ -343,21 +342,20 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343} 342}
344EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 343EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
345 344
346#define PID_NAME_LEN 32
347
348static int pm_qos_power_open(struct inode *inode, struct file *filp) 345static int pm_qos_power_open(struct inode *inode, struct file *filp)
349{ 346{
350 int ret;
351 long pm_qos_class; 347 long pm_qos_class;
352 char name[PID_NAME_LEN];
353 348
354 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 349 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
355 if (pm_qos_class >= 0) { 350 if (pm_qos_class >= 0) {
 356 filp->private_data = (void *)pm_qos_class; 351 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL);
357 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 352 if (!req)
358 ret = pm_qos_add_requirement(pm_qos_class, name, 353 return -ENOMEM;
359 PM_QOS_DEFAULT_VALUE); 354
360 if (ret >= 0) 355 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
356 filp->private_data = req;
357
358 if (filp->private_data)
361 return 0; 359 return 0;
362 } 360 }
363 return -EPERM; 361 return -EPERM;
@@ -365,32 +363,41 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
365 363
366static int pm_qos_power_release(struct inode *inode, struct file *filp) 364static int pm_qos_power_release(struct inode *inode, struct file *filp)
367{ 365{
368 int pm_qos_class; 366 struct pm_qos_request_list *req;
369 char name[PID_NAME_LEN];
370 367
371 pm_qos_class = (long)filp->private_data; 368 req = filp->private_data;
372 snprintf(name, PID_NAME_LEN, "process_%d", current->pid); 369 pm_qos_remove_request(req);
373 pm_qos_remove_requirement(pm_qos_class, name); 370 kfree(req);
374 371
375 return 0; 372 return 0;
376} 373}
377 374
375
378static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, 376static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
379 size_t count, loff_t *f_pos) 377 size_t count, loff_t *f_pos)
380{ 378{
381 s32 value; 379 s32 value;
382 int pm_qos_class; 380 int x;
383 char name[PID_NAME_LEN]; 381 char ascii_value[11];
384 382 struct pm_qos_request_list *pm_qos_req;
385 pm_qos_class = (long)filp->private_data; 383
386 if (count != sizeof(s32)) 384 if (count == sizeof(s32)) {
385 if (copy_from_user(&value, buf, sizeof(s32)))
386 return -EFAULT;
387 } else if (count == 11) { /* len('0x12345678/0') */
388 if (copy_from_user(ascii_value, buf, 11))
389 return -EFAULT;
390 x = sscanf(ascii_value, "%x", &value);
391 if (x != 1)
392 return -EINVAL;
393 pr_debug(KERN_ERR "%s, %d, 0x%x\n", ascii_value, x, value);
394 } else
387 return -EINVAL; 395 return -EINVAL;
388 if (copy_from_user(&value, buf, sizeof(s32)))
389 return -EFAULT;
390 snprintf(name, PID_NAME_LEN, "process_%d", current->pid);
391 pm_qos_update_requirement(pm_qos_class, name, value);
392 396
393 return sizeof(s32); 397 pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
398 pm_qos_update_request(pm_qos_req, value);
399
400 return count;
394} 401}
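
The new pm_qos_power_write() accepts either a raw s32 (count == sizeof(s32)) or an 11-character hex string. A small user-space sketch of that dual parse, assuming the same two accepted shapes (not the kernel's copy_from_user path):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* Returns 0 and stores the value if the buffer is one of the accepted shapes:
 * exactly 4 bytes of binary s32, or an 11-byte "0x12345678\n"-style hex string. */
static int parse_qos_write(const char *buf, size_t count, int32_t *value)
{
    if (count == sizeof(int32_t)) {
        memcpy(value, buf, sizeof(int32_t));    /* binary form */
        return 0;
    }
    if (count == 11) {                          /* ascii hex form */
        char tmp[12];
        unsigned int v;
        memcpy(tmp, buf, 11);
        tmp[11] = '\0';
        if (sscanf(tmp, "%x", &v) != 1)
            return -1;
        *value = (int32_t)v;
        return 0;
    }
    return -1;
}

int main(void)
{
    int32_t v;
    int32_t raw = 1234;

    if (parse_qos_write((const char *)&raw, sizeof(raw), &v) == 0)
        printf("binary: %d\n", v);
    if (parse_qos_write("0x000001f4\n", 11, &v) == 0)
        printf("hex:    %d\n", v);              /* 500 */
    return 0;
}
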
395 402
396 403
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 1a22dfd42df9..9829646d399c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -11,19 +11,18 @@
11#include <trace/events/timer.h> 11#include <trace/events/timer.h>
12 12
13/* 13/*
14 * Called after updating RLIMIT_CPU to set timer expiration if necessary. 14 * Called after updating RLIMIT_CPU to run cpu timer and update
15 * tsk->signal->cputime_expires expiration cache if necessary. Needs
16 * siglock protection since other code may update expiration cache as
17 * well.
15 */ 18 */
16void update_rlimit_cpu(unsigned long rlim_new) 19void update_rlimit_cpu(unsigned long rlim_new)
17{ 20{
18 cputime_t cputime = secs_to_cputime(rlim_new); 21 cputime_t cputime = secs_to_cputime(rlim_new);
19 struct signal_struct *const sig = current->signal;
20 22
21 if (cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) || 23 spin_lock_irq(&current->sighand->siglock);
22 cputime_gt(sig->it[CPUCLOCK_PROF].expires, cputime)) { 24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
23 spin_lock_irq(&current->sighand->siglock); 25 spin_unlock_irq(&current->sighand->siglock);
24 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
25 spin_unlock_irq(&current->sighand->siglock);
26 }
27} 26}
28 27
29static int check_clock(const clockid_t which_clock) 28static int check_clock(const clockid_t which_clock)
@@ -364,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
364 } 363 }
365 } else { 364 } else {
366 read_lock(&tasklist_lock); 365 read_lock(&tasklist_lock);
367 if (thread_group_leader(p) && p->signal) { 366 if (thread_group_leader(p) && p->sighand) {
368 error = 367 error =
369 cpu_clock_sample_group(which_clock, 368 cpu_clock_sample_group(which_clock,
370 p, &rtn); 369 p, &rtn);
@@ -440,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer)
440 439
441 if (likely(p != NULL)) { 440 if (likely(p != NULL)) {
442 read_lock(&tasklist_lock); 441 read_lock(&tasklist_lock);
443 if (unlikely(p->signal == NULL)) { 442 if (unlikely(p->sighand == NULL)) {
444 /* 443 /*
445 * We raced with the reaping of the task. 444 * We raced with the reaping of the task.
446 * The deletion should have cleared us off the list. 445 * The deletion should have cleared us off the list.
@@ -548,111 +547,62 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp)
548 cputime_gt(expires, new_exp); 547 cputime_gt(expires, new_exp);
549} 548}
550 549
551static inline int expires_le(cputime_t expires, cputime_t new_exp)
552{
553 return !cputime_eq(expires, cputime_zero) &&
554 cputime_le(expires, new_exp);
555}
556/* 550/*
557 * Insert the timer on the appropriate list before any timers that 551 * Insert the timer on the appropriate list before any timers that
558 * expire later. This must be called with the tasklist_lock held 552 * expire later. This must be called with the tasklist_lock held
559 * for reading, and interrupts disabled. 553 * for reading, interrupts disabled and p->sighand->siglock taken.
560 */ 554 */
561static void arm_timer(struct k_itimer *timer, union cpu_time_count now) 555static void arm_timer(struct k_itimer *timer)
562{ 556{
563 struct task_struct *p = timer->it.cpu.task; 557 struct task_struct *p = timer->it.cpu.task;
564 struct list_head *head, *listpos; 558 struct list_head *head, *listpos;
559 struct task_cputime *cputime_expires;
565 struct cpu_timer_list *const nt = &timer->it.cpu; 560 struct cpu_timer_list *const nt = &timer->it.cpu;
566 struct cpu_timer_list *next; 561 struct cpu_timer_list *next;
567 unsigned long i;
568 562
569 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 563 if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
570 p->cpu_timers : p->signal->cpu_timers); 564 head = p->cpu_timers;
565 cputime_expires = &p->cputime_expires;
566 } else {
567 head = p->signal->cpu_timers;
568 cputime_expires = &p->signal->cputime_expires;
569 }
571 head += CPUCLOCK_WHICH(timer->it_clock); 570 head += CPUCLOCK_WHICH(timer->it_clock);
572 571
573 BUG_ON(!irqs_disabled());
574 spin_lock(&p->sighand->siglock);
575
576 listpos = head; 572 listpos = head;
577 if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) { 573 list_for_each_entry(next, head, entry) {
578 list_for_each_entry(next, head, entry) { 574 if (cpu_time_before(timer->it_clock, nt->expires, next->expires))
579 if (next->expires.sched > nt->expires.sched) 575 break;
580 break; 576 listpos = &next->entry;
581 listpos = &next->entry;
582 }
583 } else {
584 list_for_each_entry(next, head, entry) {
585 if (cputime_gt(next->expires.cpu, nt->expires.cpu))
586 break;
587 listpos = &next->entry;
588 }
589 } 577 }
590 list_add(&nt->entry, listpos); 578 list_add(&nt->entry, listpos);
591 579
592 if (listpos == head) { 580 if (listpos == head) {
581 union cpu_time_count *exp = &nt->expires;
582
593 /* 583 /*
594 * We are the new earliest-expiring timer. 584 * We are the new earliest-expiring POSIX 1.b timer, hence
595 * If we are a thread timer, there can always 585 * need to update expiration cache. Take into account that
596 * be a process timer telling us to stop earlier. 586 * for process timers we share expiration cache with itimers
587 * and RLIMIT_CPU and for thread timers with RLIMIT_RTTIME.
597 */ 588 */
598 589
599 if (CPUCLOCK_PERTHREAD(timer->it_clock)) { 590 switch (CPUCLOCK_WHICH(timer->it_clock)) {
600 union cpu_time_count *exp = &nt->expires; 591 case CPUCLOCK_PROF:
601 592 if (expires_gt(cputime_expires->prof_exp, exp->cpu))
602 switch (CPUCLOCK_WHICH(timer->it_clock)) { 593 cputime_expires->prof_exp = exp->cpu;
603 default: 594 break;
604 BUG(); 595 case CPUCLOCK_VIRT:
605 case CPUCLOCK_PROF: 596 if (expires_gt(cputime_expires->virt_exp, exp->cpu))
606 if (expires_gt(p->cputime_expires.prof_exp, 597 cputime_expires->virt_exp = exp->cpu;
607 exp->cpu)) 598 break;
608 p->cputime_expires.prof_exp = exp->cpu; 599 case CPUCLOCK_SCHED:
609 break; 600 if (cputime_expires->sched_exp == 0 ||
610 case CPUCLOCK_VIRT: 601 cputime_expires->sched_exp > exp->sched)
611 if (expires_gt(p->cputime_expires.virt_exp, 602 cputime_expires->sched_exp = exp->sched;
612 exp->cpu)) 603 break;
613 p->cputime_expires.virt_exp = exp->cpu;
614 break;
615 case CPUCLOCK_SCHED:
616 if (p->cputime_expires.sched_exp == 0 ||
617 p->cputime_expires.sched_exp > exp->sched)
618 p->cputime_expires.sched_exp =
619 exp->sched;
620 break;
621 }
622 } else {
623 struct signal_struct *const sig = p->signal;
624 union cpu_time_count *exp = &timer->it.cpu.expires;
625
626 /*
627 * For a process timer, set the cached expiration time.
628 */
629 switch (CPUCLOCK_WHICH(timer->it_clock)) {
630 default:
631 BUG();
632 case CPUCLOCK_VIRT:
633 if (expires_le(sig->it[CPUCLOCK_VIRT].expires,
634 exp->cpu))
635 break;
636 sig->cputime_expires.virt_exp = exp->cpu;
637 break;
638 case CPUCLOCK_PROF:
639 if (expires_le(sig->it[CPUCLOCK_PROF].expires,
640 exp->cpu))
641 break;
642 i = sig->rlim[RLIMIT_CPU].rlim_cur;
643 if (i != RLIM_INFINITY &&
644 i <= cputime_to_secs(exp->cpu))
645 break;
646 sig->cputime_expires.prof_exp = exp->cpu;
647 break;
648 case CPUCLOCK_SCHED:
649 sig->cputime_expires.sched_exp = exp->sched;
650 break;
651 }
652 } 604 }
653 } 605 }
654
655 spin_unlock(&p->sighand->siglock);
656} 606}
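
arm_timer() now does one ordered insert and, only when the new timer lands at the head of its list, refreshes the cached earliest-expiration value that the tick path compares against. A compact sketch of that insert-and-maybe-update-cache shape, using plain integers for expiry times and a singly linked list instead of the kernel's structures:

#include <stdio.h>

struct cpu_timer {
    long long expires;
    struct cpu_timer *next;
};

struct timer_list_head {
    struct cpu_timer *first;     /* kept sorted by ->expires, soonest first */
    long long earliest_cache;    /* 0 means "no armed timer" */
};

/* Insert in expiry order; refresh the cache only when we became the head. */
static void arm(struct timer_list_head *h, struct cpu_timer *nt)
{
    struct cpu_timer **pos = &h->first;

    while (*pos && (*pos)->expires <= nt->expires)
        pos = &(*pos)->next;
    nt->next = *pos;
    *pos = nt;

    if (h->first == nt &&
        (h->earliest_cache == 0 || h->earliest_cache > nt->expires))
        h->earliest_cache = nt->expires;   /* the fast path reads only this */
}

int main(void)
{
    struct timer_list_head head = { 0 };
    struct cpu_timer a = { .expires = 300 }, b = { .expires = 100 }, c = { .expires = 200 };

    arm(&head, &a);
    arm(&head, &b);
    arm(&head, &c);
    printf("earliest cache: %lld\n", head.earliest_cache);   /* 100 */
    return 0;
}
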
657 607
658/* 608/*
@@ -660,7 +610,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
660 */ 610 */
661static void cpu_timer_fire(struct k_itimer *timer) 611static void cpu_timer_fire(struct k_itimer *timer)
662{ 612{
663 if (unlikely(timer->sigq == NULL)) { 613 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
614 /*
615 * User don't want any signal.
616 */
617 timer->it.cpu.expires.sched = 0;
618 } else if (unlikely(timer->sigq == NULL)) {
664 /* 619 /*
665 * This a special case for clock_nanosleep, 620 * This a special case for clock_nanosleep,
666 * not a normal timer from sys_timer_create. 621 * not a normal timer from sys_timer_create.
@@ -721,7 +676,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
721 struct itimerspec *new, struct itimerspec *old) 676 struct itimerspec *new, struct itimerspec *old)
722{ 677{
723 struct task_struct *p = timer->it.cpu.task; 678 struct task_struct *p = timer->it.cpu.task;
724 union cpu_time_count old_expires, new_expires, val; 679 union cpu_time_count old_expires, new_expires, old_incr, val;
725 int ret; 680 int ret;
726 681
727 if (unlikely(p == NULL)) { 682 if (unlikely(p == NULL)) {
@@ -736,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
736 read_lock(&tasklist_lock); 691 read_lock(&tasklist_lock);
737 /* 692 /*
738 * We need the tasklist_lock to protect against reaping that 693 * We need the tasklist_lock to protect against reaping that
739 * clears p->signal. If p has just been reaped, we can no 694 * clears p->sighand. If p has just been reaped, we can no
740 * longer get any information about it at all. 695 * longer get any information about it at all.
741 */ 696 */
742 if (unlikely(p->signal == NULL)) { 697 if (unlikely(p->sighand == NULL)) {
743 read_unlock(&tasklist_lock); 698 read_unlock(&tasklist_lock);
744 put_task_struct(p); 699 put_task_struct(p);
745 timer->it.cpu.task = NULL; 700 timer->it.cpu.task = NULL;
@@ -752,6 +707,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
752 BUG_ON(!irqs_disabled()); 707 BUG_ON(!irqs_disabled());
753 708
754 ret = 0; 709 ret = 0;
710 old_incr = timer->it.cpu.incr;
755 spin_lock(&p->sighand->siglock); 711 spin_lock(&p->sighand->siglock);
756 old_expires = timer->it.cpu.expires; 712 old_expires = timer->it.cpu.expires;
757 if (unlikely(timer->it.cpu.firing)) { 713 if (unlikely(timer->it.cpu.firing)) {
@@ -759,7 +715,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
759 ret = TIMER_RETRY; 715 ret = TIMER_RETRY;
760 } else 716 } else
761 list_del_init(&timer->it.cpu.entry); 717 list_del_init(&timer->it.cpu.entry);
762 spin_unlock(&p->sighand->siglock);
763 718
764 /* 719 /*
765 * We need to sample the current value to convert the new 720 * We need to sample the current value to convert the new
@@ -813,6 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
813 * disable this firing since we are already reporting 768 * disable this firing since we are already reporting
814 * it as an overrun (thanks to bump_cpu_timer above). 769 * it as an overrun (thanks to bump_cpu_timer above).
815 */ 770 */
771 spin_unlock(&p->sighand->siglock);
816 read_unlock(&tasklist_lock); 772 read_unlock(&tasklist_lock);
817 goto out; 773 goto out;
818 } 774 }
@@ -828,11 +784,11 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
828 */ 784 */
829 timer->it.cpu.expires = new_expires; 785 timer->it.cpu.expires = new_expires;
830 if (new_expires.sched != 0 && 786 if (new_expires.sched != 0 &&
831 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
832 cpu_time_before(timer->it_clock, val, new_expires)) { 787 cpu_time_before(timer->it_clock, val, new_expires)) {
833 arm_timer(timer, val); 788 arm_timer(timer);
834 } 789 }
835 790
791 spin_unlock(&p->sighand->siglock);
836 read_unlock(&tasklist_lock); 792 read_unlock(&tasklist_lock);
837 793
838 /* 794 /*
@@ -853,7 +809,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
853 timer->it_overrun = -1; 809 timer->it_overrun = -1;
854 810
855 if (new_expires.sched != 0 && 811 if (new_expires.sched != 0 &&
856 (timer->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE &&
857 !cpu_time_before(timer->it_clock, val, new_expires)) { 812 !cpu_time_before(timer->it_clock, val, new_expires)) {
858 /* 813 /*
859 * The designated time already passed, so we notify 814 * The designated time already passed, so we notify
@@ -867,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
867 out: 822 out:
868 if (old) { 823 if (old) {
869 sample_to_timespec(timer->it_clock, 824 sample_to_timespec(timer->it_clock,
870 timer->it.cpu.incr, &old->it_interval); 825 old_incr, &old->it_interval);
871 } 826 }
872 return ret; 827 return ret;
873} 828}
@@ -908,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
908 clear_dead = p->exit_state; 863 clear_dead = p->exit_state;
909 } else { 864 } else {
910 read_lock(&tasklist_lock); 865 read_lock(&tasklist_lock);
911 if (unlikely(p->signal == NULL)) { 866 if (unlikely(p->sighand == NULL)) {
912 /* 867 /*
913 * The process has been reaped. 868 * The process has been reaped.
914 * We can't even collect a sample any more. 869 * We can't even collect a sample any more.
@@ -927,25 +882,6 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
927 read_unlock(&tasklist_lock); 882 read_unlock(&tasklist_lock);
928 } 883 }
929 884
930 if ((timer->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
931 if (timer->it.cpu.incr.sched == 0 &&
932 cpu_time_before(timer->it_clock,
933 timer->it.cpu.expires, now)) {
934 /*
935 * Do-nothing timer expired and has no reload,
936 * so it's as if it was never set.
937 */
938 timer->it.cpu.expires.sched = 0;
939 itp->it_value.tv_sec = itp->it_value.tv_nsec = 0;
940 return;
941 }
942 /*
943 * Account for any expirations and reloads that should
944 * have happened.
945 */
946 bump_cpu_timer(timer, now);
947 }
948
949 if (unlikely(clear_dead)) { 885 if (unlikely(clear_dead)) {
950 /* 886 /*
951 * We've noticed that the thread is dead, but 887 * We've noticed that the thread is dead, but
@@ -1061,14 +997,11 @@ static void check_thread_timers(struct task_struct *tsk,
1061 } 997 }
1062} 998}
1063 999
1064static void stop_process_timers(struct task_struct *tsk) 1000static void stop_process_timers(struct signal_struct *sig)
1065{ 1001{
1066 struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; 1002 struct thread_group_cputimer *cputimer = &sig->cputimer;
1067 unsigned long flags; 1003 unsigned long flags;
1068 1004
1069 if (!cputimer->running)
1070 return;
1071
1072 spin_lock_irqsave(&cputimer->lock, flags); 1005 spin_lock_irqsave(&cputimer->lock, flags);
1073 cputimer->running = 0; 1006 cputimer->running = 0;
1074 spin_unlock_irqrestore(&cputimer->lock, flags); 1007 spin_unlock_irqrestore(&cputimer->lock, flags);
@@ -1108,6 +1041,23 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1108 } 1041 }
1109} 1042}
1110 1043
1044/**
1045 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1046 *
1047 * @cputime: The struct to compare.
1048 *
1049 * Checks @cputime to see if all fields are zero. Returns true if all fields
1050 * are zero, false if any field is nonzero.
1051 */
1052static inline int task_cputime_zero(const struct task_cputime *cputime)
1053{
1054 if (cputime_eq(cputime->utime, cputime_zero) &&
1055 cputime_eq(cputime->stime, cputime_zero) &&
1056 cputime->sum_exec_runtime == 0)
1057 return 1;
1058 return 0;
1059}
1060
1111/* 1061/*
1112 * Check for any per-thread CPU timers that have fired and move them 1062 * Check for any per-thread CPU timers that have fired and move them
1113 * off the tsk->*_timers list onto the firing list. Per-thread timers 1063 * off the tsk->*_timers list onto the firing list. Per-thread timers
@@ -1125,19 +1075,6 @@ static void check_process_timers(struct task_struct *tsk,
1125 unsigned long soft; 1075 unsigned long soft;
1126 1076
1127 /* 1077 /*
1128 * Don't sample the current process CPU clocks if there are no timers.
1129 */
1130 if (list_empty(&timers[CPUCLOCK_PROF]) &&
1131 cputime_eq(sig->it[CPUCLOCK_PROF].expires, cputime_zero) &&
1132 sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
1133 list_empty(&timers[CPUCLOCK_VIRT]) &&
1134 cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) &&
1135 list_empty(&timers[CPUCLOCK_SCHED])) {
1136 stop_process_timers(tsk);
1137 return;
1138 }
1139
1140 /*
1141 * Collect the current process totals. 1078 * Collect the current process totals.
1142 */ 1079 */
1143 thread_group_cputimer(tsk, &cputime); 1080 thread_group_cputimer(tsk, &cputime);
@@ -1226,18 +1163,11 @@ static void check_process_timers(struct task_struct *tsk,
1226 } 1163 }
1227 } 1164 }
1228 1165
1229 if (!cputime_eq(prof_expires, cputime_zero) && 1166 sig->cputime_expires.prof_exp = prof_expires;
1230 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) || 1167 sig->cputime_expires.virt_exp = virt_expires;
1231 cputime_gt(sig->cputime_expires.prof_exp, prof_expires))) 1168 sig->cputime_expires.sched_exp = sched_expires;
1232 sig->cputime_expires.prof_exp = prof_expires; 1169 if (task_cputime_zero(&sig->cputime_expires))
1233 if (!cputime_eq(virt_expires, cputime_zero) && 1170 stop_process_timers(sig);
1234 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1235 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1236 sig->cputime_expires.virt_exp = virt_expires;
1237 if (sched_expires != 0 &&
1238 (sig->cputime_expires.sched_exp == 0 ||
1239 sig->cputime_expires.sched_exp > sched_expires))
1240 sig->cputime_expires.sched_exp = sched_expires;
1241} 1171}
1242 1172
1243/* 1173/*
@@ -1266,9 +1196,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1266 goto out; 1196 goto out;
1267 } 1197 }
1268 read_lock(&tasklist_lock); /* arm_timer needs it. */ 1198 read_lock(&tasklist_lock); /* arm_timer needs it. */
1199 spin_lock(&p->sighand->siglock);
1269 } else { 1200 } else {
1270 read_lock(&tasklist_lock); 1201 read_lock(&tasklist_lock);
1271 if (unlikely(p->signal == NULL)) { 1202 if (unlikely(p->sighand == NULL)) {
1272 /* 1203 /*
1273 * The process has been reaped. 1204 * The process has been reaped.
1274 * We can't even collect a sample any more. 1205 * We can't even collect a sample any more.
@@ -1286,6 +1217,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1286 clear_dead_task(timer, now); 1217 clear_dead_task(timer, now);
1287 goto out_unlock; 1218 goto out_unlock;
1288 } 1219 }
1220 spin_lock(&p->sighand->siglock);
1289 cpu_timer_sample_group(timer->it_clock, p, &now); 1221 cpu_timer_sample_group(timer->it_clock, p, &now);
1290 bump_cpu_timer(timer, now); 1222 bump_cpu_timer(timer, now);
1291 /* Leave the tasklist_lock locked for the call below. */ 1223 /* Leave the tasklist_lock locked for the call below. */
@@ -1294,7 +1226,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
1294 /* 1226 /*
1295 * Now re-arm for the new expiry time. 1227 * Now re-arm for the new expiry time.
1296 */ 1228 */
1297 arm_timer(timer, now); 1229 BUG_ON(!irqs_disabled());
1230 arm_timer(timer);
1231 spin_unlock(&p->sighand->siglock);
1298 1232
1299out_unlock: 1233out_unlock:
1300 read_unlock(&tasklist_lock); 1234 read_unlock(&tasklist_lock);
@@ -1306,23 +1240,6 @@ out:
1306} 1240}
1307 1241
1308/** 1242/**
1309 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1310 *
1311 * @cputime: The struct to compare.
1312 *
1313 * Checks @cputime to see if all fields are zero. Returns true if all fields
1314 * are zero, false if any field is nonzero.
1315 */
1316static inline int task_cputime_zero(const struct task_cputime *cputime)
1317{
1318 if (cputime_eq(cputime->utime, cputime_zero) &&
1319 cputime_eq(cputime->stime, cputime_zero) &&
1320 cputime->sum_exec_runtime == 0)
1321 return 1;
1322 return 0;
1323}
1324
1325/**
1326 * task_cputime_expired - Compare two task_cputime entities. 1243 * task_cputime_expired - Compare two task_cputime entities.
1327 * 1244 *
1328 * @sample: The task_cputime structure to be checked for expiration. 1245 * @sample: The task_cputime structure to be checked for expiration.
@@ -1378,7 +1295,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1378 } 1295 }
1379 1296
1380 sig = tsk->signal; 1297 sig = tsk->signal;
1381 if (!task_cputime_zero(&sig->cputime_expires)) { 1298 if (sig->cputimer.running) {
1382 struct task_cputime group_sample; 1299 struct task_cputime group_sample;
1383 1300
1384 thread_group_cputimer(tsk, &group_sample); 1301 thread_group_cputimer(tsk, &group_sample);
@@ -1386,7 +1303,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1386 return 1; 1303 return 1;
1387 } 1304 }
1388 1305
1389 return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY; 1306 return 0;
1390} 1307}
1391 1308
1392/* 1309/*
@@ -1415,7 +1332,12 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1415 * put them on the firing list. 1332 * put them on the firing list.
1416 */ 1333 */
1417 check_thread_timers(tsk, &firing); 1334 check_thread_timers(tsk, &firing);
1418 check_process_timers(tsk, &firing); 1335 /*
1336 * If there are any active process wide timers (POSIX 1.b, itimers,
1337 * RLIMIT_CPU) cputimer must be running.
1338 */
1339 if (tsk->signal->cputimer.running)
1340 check_process_timers(tsk, &firing);
1419 1341
1420 /* 1342 /*
1421 * We must release these locks before taking any timer's lock. 1343 * We must release these locks before taking any timer's lock.
@@ -1452,21 +1374,23 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1452} 1374}
1453 1375
1454/* 1376/*
1455 * Set one of the process-wide special case CPU timers. 1377 * Set one of the process-wide special case CPU timers or RLIMIT_CPU.
1456 * The tsk->sighand->siglock must be held by the caller. 1378 * The tsk->sighand->siglock must be held by the caller.
1457 * The *newval argument is relative and we update it to be absolute, *oldval
1458 * is absolute and we update it to be relative.
1459 */ 1379 */
1460void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1380void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1461 cputime_t *newval, cputime_t *oldval) 1381 cputime_t *newval, cputime_t *oldval)
1462{ 1382{
1463 union cpu_time_count now; 1383 union cpu_time_count now;
1464 struct list_head *head;
1465 1384
1466 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1385 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1467 cpu_timer_sample_group(clock_idx, tsk, &now); 1386 cpu_timer_sample_group(clock_idx, tsk, &now);
1468 1387
1469 if (oldval) { 1388 if (oldval) {
1389 /*
 1390 * We are setting an itimer. The *oldval is absolute and we update
 1391 * it to be relative; the *newval argument is relative and we update
1392 * it to be absolute.
1393 */
1470 if (!cputime_eq(*oldval, cputime_zero)) { 1394 if (!cputime_eq(*oldval, cputime_zero)) {
1471 if (cputime_le(*oldval, now.cpu)) { 1395 if (cputime_le(*oldval, now.cpu)) {
1472 /* Just about to fire. */ 1396 /* Just about to fire. */
@@ -1479,33 +1403,21 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1479 if (cputime_eq(*newval, cputime_zero)) 1403 if (cputime_eq(*newval, cputime_zero))
1480 return; 1404 return;
1481 *newval = cputime_add(*newval, now.cpu); 1405 *newval = cputime_add(*newval, now.cpu);
1482
1483 /*
1484 * If the RLIMIT_CPU timer will expire before the
1485 * ITIMER_PROF timer, we have nothing else to do.
1486 */
1487 if (tsk->signal->rlim[RLIMIT_CPU].rlim_cur
1488 < cputime_to_secs(*newval))
1489 return;
1490 } 1406 }
1491 1407
1492 /* 1408 /*
 1493 * Check whether there are any process timers already set to fire 1409 * Update the expiration cache if we are the earliest timer, or if the
 1494 * before this one. If so, we don't have anything more to do. 1410 * RLIMIT_CPU limit expires earlier than the prof_exp CPU timer.
1495 */ 1411 */
1496 head = &tsk->signal->cpu_timers[clock_idx]; 1412 switch (clock_idx) {
1497 if (list_empty(head) || 1413 case CPUCLOCK_PROF:
1498 cputime_ge(list_first_entry(head, 1414 if (expires_gt(tsk->signal->cputime_expires.prof_exp, *newval))
1499 struct cpu_timer_list, entry)->expires.cpu,
1500 *newval)) {
1501 switch (clock_idx) {
1502 case CPUCLOCK_PROF:
1503 tsk->signal->cputime_expires.prof_exp = *newval; 1415 tsk->signal->cputime_expires.prof_exp = *newval;
1504 break; 1416 break;
1505 case CPUCLOCK_VIRT: 1417 case CPUCLOCK_VIRT:
1418 if (expires_gt(tsk->signal->cputime_expires.virt_exp, *newval))
1506 tsk->signal->cputime_expires.virt_exp = *newval; 1419 tsk->signal->cputime_expires.virt_exp = *newval;
1507 break; 1420 break;
1508 }
1509 } 1421 }
1510} 1422}
1511 1423
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 00d1fda58ab6..ad723420acc3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
559 new_timer->it_id = (timer_t) new_timer_id; 559 new_timer->it_id = (timer_t) new_timer_id;
560 new_timer->it_clock = which_clock; 560 new_timer->it_clock = which_clock;
561 new_timer->it_overrun = -1; 561 new_timer->it_overrun = -1;
562 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
563 if (error)
564 goto out;
565 562
566 /*
567 * return the timer_id now. The next step is hard to
568 * back out if there is an error.
569 */
570 if (copy_to_user(created_timer_id, 563 if (copy_to_user(created_timer_id,
571 &new_timer_id, sizeof (new_timer_id))) { 564 &new_timer_id, sizeof (new_timer_id))) {
572 error = -EFAULT; 565 error = -EFAULT;
@@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
597 new_timer->sigq->info.si_tid = new_timer->it_id; 590 new_timer->sigq->info.si_tid = new_timer->it_id;
598 new_timer->sigq->info.si_code = SI_TIMER; 591 new_timer->sigq->info.si_code = SI_TIMER;
599 592
593 error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
594 if (error)
595 goto out;
596
600 spin_lock_irq(&current->sighand->siglock); 597 spin_lock_irq(&current->sighand->siglock);
601 new_timer->it_signal = current->signal; 598 new_timer->it_signal = current->signal;
602 list_add(&new_timer->list, &current->signal->posix_timers); 599 list_add(&new_timer->list, &current->signal->posix_timers);
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 5c36ea9d55d2..ca6066a6952e 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG
99 depends on PM_ADVANCED_DEBUG 99 depends on PM_ADVANCED_DEBUG
100 default n 100 default n
101 101
102config SUSPEND_NVS
103 bool
104
102config SUSPEND 105config SUSPEND
103 bool "Suspend to RAM and standby" 106 bool "Suspend to RAM and standby"
104 depends on PM && ARCH_SUSPEND_POSSIBLE 107 depends on PM && ARCH_SUSPEND_POSSIBLE
108 select SUSPEND_NVS if HAS_IOMEM
105 default y 109 default y
106 ---help--- 110 ---help---
107 Allow the system to enter sleep states in which main memory is 111 Allow the system to enter sleep states in which main memory is
@@ -130,13 +134,10 @@ config SUSPEND_FREEZER
130 134
131 Turning OFF this setting is NOT recommended! If in doubt, say Y. 135 Turning OFF this setting is NOT recommended! If in doubt, say Y.
132 136
133config HIBERNATION_NVS
134 bool
135
136config HIBERNATION 137config HIBERNATION
137 bool "Hibernation (aka 'suspend to disk')" 138 bool "Hibernation (aka 'suspend to disk')"
138 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE 139 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
139 select HIBERNATION_NVS if HAS_IOMEM 140 select SUSPEND_NVS if HAS_IOMEM
140 ---help--- 141 ---help---
141 Enable the suspend to disk (STD) functionality, which is usually 142 Enable the suspend to disk (STD) functionality, which is usually
142 called "hibernation" in user interfaces. STD checkpoints the 143 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 43191815f874..f9063c6b185d 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -8,7 +8,8 @@ obj-$(CONFIG_PM_SLEEP) += console.o
8obj-$(CONFIG_FREEZER) += process.o 8obj-$(CONFIG_FREEZER) += process.o
9obj-$(CONFIG_SUSPEND) += suspend.o 9obj-$(CONFIG_SUSPEND) += suspend.o
10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 10obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o 11obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
12obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o 12 block_io.o
13obj-$(CONFIG_SUSPEND_NVS) += nvs.o
13 14
14obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 15obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
new file mode 100644
index 000000000000..97024fd40cd5
--- /dev/null
+++ b/kernel/power/block_io.c
@@ -0,0 +1,103 @@
1/*
2 * This file provides functions for block I/O operations on swap/file.
3 *
4 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
5 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#include <linux/bio.h>
11#include <linux/kernel.h>
12#include <linux/pagemap.h>
13#include <linux/swap.h>
14
15#include "power.h"
16
17/**
18 * submit - submit BIO request.
19 * @rw: READ or WRITE.
 20 * @sector: physical sector on @bdev where @page is read from or written to.
 21 * @page: page we're reading or writing.
 22 * @bio_chain: list of pending bios (for async reading)
23 *
24 * Straight from the textbook - allocate and initialize the bio.
25 * If we're reading, make sure the page is marked as dirty.
26 * Then submit it and, if @bio_chain == NULL, wait.
27 */
28static int submit(int rw, struct block_device *bdev, sector_t sector,
29 struct page *page, struct bio **bio_chain)
30{
31 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
32 struct bio *bio;
33
34 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
35 bio->bi_sector = sector;
36 bio->bi_bdev = bdev;
37 bio->bi_end_io = end_swap_bio_read;
38
39 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
40 printk(KERN_ERR "PM: Adding page to bio failed at %llu\n",
41 (unsigned long long)sector);
42 bio_put(bio);
43 return -EFAULT;
44 }
45
46 lock_page(page);
47 bio_get(bio);
48
49 if (bio_chain == NULL) {
50 submit_bio(bio_rw, bio);
51 wait_on_page_locked(page);
52 if (rw == READ)
53 bio_set_pages_dirty(bio);
54 bio_put(bio);
55 } else {
56 if (rw == READ)
57 get_page(page); /* These pages are freed later */
58 bio->bi_private = *bio_chain;
59 *bio_chain = bio;
60 submit_bio(bio_rw, bio);
61 }
62 return 0;
63}
64
65int hib_bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
66{
67 return submit(READ, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
68 virt_to_page(addr), bio_chain);
69}
70
71int hib_bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
72{
73 return submit(WRITE, hib_resume_bdev, page_off * (PAGE_SIZE >> 9),
74 virt_to_page(addr), bio_chain);
75}
76
77int hib_wait_on_bio_chain(struct bio **bio_chain)
78{
79 struct bio *bio;
80 struct bio *next_bio;
81 int ret = 0;
82
83 if (bio_chain == NULL)
84 return 0;
85
86 bio = *bio_chain;
87 if (bio == NULL)
88 return 0;
89 while (bio) {
90 struct page *page;
91
92 next_bio = bio->bi_private;
93 page = bio->bi_io_vec[0].bv_page;
94 wait_on_page_locked(page);
95 if (!PageUptodate(page) || PageError(page))
96 ret = -EIO;
97 put_page(page);
98 bio_put(bio);
99 bio = next_bio;
100 }
101 *bio_chain = NULL;
102 return ret;
103}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index da5288ec2392..8dc31e02ae12 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003 Patrick Mochel 4 * Copyright (c) 2003 Patrick Mochel
5 * Copyright (c) 2003 Open Source Development Lab 5 * Copyright (c) 2003 Open Source Development Lab
6 * Copyright (c) 2004 Pavel Machek <pavel@suse.cz> 6 * Copyright (c) 2004 Pavel Machek <pavel@ucw.cz>
7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc. 7 * Copyright (c) 2009 Rafael J. Wysocki, Novell Inc.
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/gfp.h>
25#include <scsi/scsi_scan.h> 26#include <scsi/scsi_scan.h>
26#include <asm/suspend.h> 27#include <asm/suspend.h>
27 28
@@ -276,7 +277,7 @@ static int create_image(int platform_mode)
276 goto Enable_irqs; 277 goto Enable_irqs;
277 } 278 }
278 279
279 if (hibernation_test(TEST_CORE)) 280 if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events())
280 goto Power_up; 281 goto Power_up;
281 282
282 in_suspend = 1; 283 in_suspend = 1;
@@ -287,8 +288,10 @@ static int create_image(int platform_mode)
287 error); 288 error);
288 /* Restore control flow magically appears here */ 289 /* Restore control flow magically appears here */
289 restore_processor_state(); 290 restore_processor_state();
290 if (!in_suspend) 291 if (!in_suspend) {
292 events_check_enabled = false;
291 platform_leave(platform_mode); 293 platform_leave(platform_mode);
294 }
292 295
293 Power_up: 296 Power_up:
294 sysdev_resume(); 297 sysdev_resume();
@@ -327,7 +330,7 @@ int hibernation_snapshot(int platform_mode)
327 330
328 error = platform_begin(platform_mode); 331 error = platform_begin(platform_mode);
329 if (error) 332 if (error)
330 return error; 333 goto Close;
331 334
332 /* Preallocate image memory before shutting down devices. */ 335 /* Preallocate image memory before shutting down devices. */
333 error = hibernate_preallocate_memory(); 336 error = hibernate_preallocate_memory();
@@ -510,18 +513,24 @@ int hibernation_platform_enter(void)
510 513
511 local_irq_disable(); 514 local_irq_disable();
512 sysdev_suspend(PMSG_HIBERNATE); 515 sysdev_suspend(PMSG_HIBERNATE);
516 if (!pm_check_wakeup_events()) {
517 error = -EAGAIN;
518 goto Power_up;
519 }
520
513 hibernation_ops->enter(); 521 hibernation_ops->enter();
514 /* We should never get here */ 522 /* We should never get here */
515 while (1); 523 while (1);
516 524
517 /* 525 Power_up:
518 * We don't need to reenable the nonboot CPUs or resume consoles, since 526 sysdev_resume();
519 * the system is going to be halted anyway. 527 local_irq_enable();
520 */ 528 enable_nonboot_cpus();
529
521 Platform_finish: 530 Platform_finish:
522 hibernation_ops->finish(); 531 hibernation_ops->finish();
523 532
524 dpm_suspend_noirq(PMSG_RESTORE); 533 dpm_resume_noirq(PMSG_RESTORE);
525 534
526 Resume_devices: 535 Resume_devices:
527 entering_platform_hibernation = false; 536 entering_platform_hibernation = false;
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b58800b21fc0..62b0bc6e4983 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
204 204
205power_attr(state); 205power_attr(state);
206 206
207#ifdef CONFIG_PM_SLEEP
208/*
209 * The 'wakeup_count' attribute, along with the functions defined in
210 * drivers/base/power/wakeup.c, provides a means by which wakeup events can be
211 * handled in a non-racy way.
212 *
213 * If a wakeup event occurs when the system is in a sleep state, it simply is
214 * woken up. In turn, if an event that would wake the system up from a sleep
215 * state occurs when it is undergoing a transition to that sleep state, the
216 * transition should be aborted. Moreover, if such an event occurs when the
217 * system is in the working state, an attempt to start a transition to the
 218 * given sleep state should fail for a certain period after the detection of
219 * the event. Using the 'state' attribute alone is not sufficient to satisfy
220 * these requirements, because a wakeup event may occur exactly when 'state'
221 * is being written to and may be delivered to user space right before it is
222 * frozen, so the event will remain only partially processed until the system is
223 * woken up by another event. In particular, it won't cause the transition to
224 * a sleep state to be aborted.
225 *
226 * This difficulty may be overcome if user space uses 'wakeup_count' before
 227 * writing to 'state'. It should first read from 'wakeup_count' and store
228 * the read value. Then, after carrying out its own preparations for the system
229 * transition to a sleep state, it should write the stored value to
 230 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
231 * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
232 * is allowed to write to 'state', but the transition will be aborted if there
233 * are any wakeup events detected after 'wakeup_count' was written to.
234 */
235
236static ssize_t wakeup_count_show(struct kobject *kobj,
237 struct kobj_attribute *attr,
238 char *buf)
239{
240 unsigned long val;
241
242 return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR;
243}
244
245static ssize_t wakeup_count_store(struct kobject *kobj,
246 struct kobj_attribute *attr,
247 const char *buf, size_t n)
248{
249 unsigned long val;
250
251 if (sscanf(buf, "%lu", &val) == 1) {
252 if (pm_save_wakeup_count(val))
253 return n;
254 }
255 return -EINVAL;
256}
257
258power_attr(wakeup_count);
259#endif /* CONFIG_PM_SLEEP */
260
207#ifdef CONFIG_PM_TRACE 261#ifdef CONFIG_PM_TRACE
208int pm_trace_enabled; 262int pm_trace_enabled;
209 263
@@ -236,6 +290,7 @@ static struct attribute * g[] = {
236#endif 290#endif
237#ifdef CONFIG_PM_SLEEP 291#ifdef CONFIG_PM_SLEEP
238 &pm_async_attr.attr, 292 &pm_async_attr.attr,
293 &wakeup_count_attr.attr,
239#ifdef CONFIG_PM_DEBUG 294#ifdef CONFIG_PM_DEBUG
240 &pm_test_attr.attr, 295 &pm_test_attr.attr,
241#endif 296#endif
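
The 'wakeup_count' comment added to kernel/power/main.c above describes a user-space protocol: read /sys/power/wakeup_count, finish the pre-suspend preparations, write the stored value back (the write fails if a wakeup event was reported in the meantime), and only then write to /sys/power/state. A minimal user-space sketch of that sequence follows; it is illustrative only and not part of this patch, and the try_suspend() helper name is made up here.

/*
 * Illustrative user-space sketch of the wakeup_count protocol described
 * above; not part of this patch.  The sysfs paths are the standard ones,
 * the helper name is hypothetical.
 */
#include <stdio.h>

static int try_suspend(void)
{
	unsigned long count;
	FILE *f;

	/* 1. Read and remember the current wakeup event count. */
	f = fopen("/sys/power/wakeup_count", "r");
	if (!f)
		return -1;
	if (fscanf(f, "%lu", &count) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);

	/* ... user space performs its own suspend preparations here ... */

	/*
	 * 2. Write the stored value back.  The kernel rejects the write
	 * when pm_save_wakeup_count() fails, i.e. a wakeup event occurred
	 * since step 1; in that case the suspend attempt must be skipped.
	 * With stdio the error surfaces at flush/close time.
	 */
	f = fopen("/sys/power/wakeup_count", "w");
	if (!f)
		return -1;
	if (fprintf(f, "%lu", count) < 0 || fclose(f) != 0)
		return -1;

	/*
	 * 3. Only now initiate the transition; the kernel still aborts it
	 * if further wakeup events are detected after step 2.
	 */
	f = fopen("/sys/power/state", "w");
	if (!f)
		return -1;
	fprintf(f, "mem");
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	return try_suspend() ? 1 : 0;
}

On the kernel side, a failed write corresponds to wakeup_count_store() returning -EINVAL because pm_save_wakeup_count() reported that events were seen since the counter was read; that is exactly the condition user space must treat as "do not suspend now".
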
diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c
index 39ac698ef836..1836db60bbb6 100644
--- a/kernel/power/hibernate_nvs.c
+++ b/kernel/power/nvs.c
@@ -10,11 +10,12 @@
10#include <linux/kernel.h> 10#include <linux/kernel.h>
11#include <linux/list.h> 11#include <linux/list.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/slab.h>
13#include <linux/suspend.h> 14#include <linux/suspend.h>
14 15
15/* 16/*
16 * Platforms, like ACPI, may want us to save some memory used by them during 17 * Platforms, like ACPI, may want us to save some memory used by them during
17 * hibernation and to restore the contents of this memory during the subsequent 18 * suspend and to restore the contents of this memory during the subsequent
18 * resume. The code below implements a mechanism allowing us to do that. 19 * resume. The code below implements a mechanism allowing us to do that.
19 */ 20 */
20 21
@@ -29,7 +30,7 @@ struct nvs_page {
29static LIST_HEAD(nvs_list); 30static LIST_HEAD(nvs_list);
30 31
31/** 32/**
32 * hibernate_nvs_register - register platform NVS memory region to save 33 * suspend_nvs_register - register platform NVS memory region to save
33 * @start - physical address of the region 34 * @start - physical address of the region
34 * @size - size of the region 35 * @size - size of the region
35 * 36 *
@@ -37,7 +38,7 @@ static LIST_HEAD(nvs_list);
37 * things so that the data from page-aligned addresses in this region will 38 * things so that the data from page-aligned addresses in this region will
38 * be copied into separate RAM pages. 39 * be copied into separate RAM pages.
39 */ 40 */
40int hibernate_nvs_register(unsigned long start, unsigned long size) 41int suspend_nvs_register(unsigned long start, unsigned long size)
41{ 42{
42 struct nvs_page *entry, *next; 43 struct nvs_page *entry, *next;
43 44
@@ -67,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size)
67} 68}
68 69
69/** 70/**
70 * hibernate_nvs_free - free data pages allocated for saving NVS regions 71 * suspend_nvs_free - free data pages allocated for saving NVS regions
71 */ 72 */
72void hibernate_nvs_free(void) 73void suspend_nvs_free(void)
73{ 74{
74 struct nvs_page *entry; 75 struct nvs_page *entry;
75 76
@@ -85,16 +86,16 @@ void hibernate_nvs_free(void)
85} 86}
86 87
87/** 88/**
88 * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions 89 * suspend_nvs_alloc - allocate memory necessary for saving NVS regions
89 */ 90 */
90int hibernate_nvs_alloc(void) 91int suspend_nvs_alloc(void)
91{ 92{
92 struct nvs_page *entry; 93 struct nvs_page *entry;
93 94
94 list_for_each_entry(entry, &nvs_list, node) { 95 list_for_each_entry(entry, &nvs_list, node) {
95 entry->data = (void *)__get_free_page(GFP_KERNEL); 96 entry->data = (void *)__get_free_page(GFP_KERNEL);
96 if (!entry->data) { 97 if (!entry->data) {
97 hibernate_nvs_free(); 98 suspend_nvs_free();
98 return -ENOMEM; 99 return -ENOMEM;
99 } 100 }
100 } 101 }
@@ -102,9 +103,9 @@ int hibernate_nvs_alloc(void)
102} 103}
103 104
104/** 105/**
105 * hibernate_nvs_save - save NVS memory regions 106 * suspend_nvs_save - save NVS memory regions
106 */ 107 */
107void hibernate_nvs_save(void) 108void suspend_nvs_save(void)
108{ 109{
109 struct nvs_page *entry; 110 struct nvs_page *entry;
110 111
@@ -118,12 +119,12 @@ void hibernate_nvs_save(void)
118} 119}
119 120
120/** 121/**
121 * hibernate_nvs_restore - restore NVS memory regions 122 * suspend_nvs_restore - restore NVS memory regions
122 * 123 *
123 * This function is going to be called with interrupts disabled, so it 124 * This function is going to be called with interrupts disabled, so it
124 * cannot iounmap the virtual addresses used to access the NVS region. 125 * cannot iounmap the virtual addresses used to access the NVS region.
125 */ 126 */
126void hibernate_nvs_restore(void) 127void suspend_nvs_restore(void)
127{ 128{
128 struct nvs_page *entry; 129 struct nvs_page *entry;
129 130
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46c5a26630a3..006270fe382d 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -97,24 +97,12 @@ extern int hibernate_preallocate_memory(void);
97 */ 97 */
98 98
99struct snapshot_handle { 99struct snapshot_handle {
100 loff_t offset; /* number of the last byte ready for reading
101 * or writing in the sequence
102 */
103 unsigned int cur; /* number of the block of PAGE_SIZE bytes the 100 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
104 * next operation will refer to (ie. current) 101 * next operation will refer to (ie. current)
105 */ 102 */
106 unsigned int cur_offset; /* offset with respect to the current
107 * block (for the next operation)
108 */
109 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
110 * was the current one previously
111 */
112 void *buffer; /* address of the block to read from 103 void *buffer; /* address of the block to read from
113 * or write to 104 * or write to
114 */ 105 */
115 unsigned int buf_offset; /* location to read from or write to,
116 * given as a displacement from 'buffer'
117 */
118 int sync_read; /* Set to one to notify the caller of 106 int sync_read; /* Set to one to notify the caller of
119 * snapshot_write_next() that it may 107 * snapshot_write_next() that it may
120 * need to call wait_on_bio_chain() 108 * need to call wait_on_bio_chain()
@@ -125,12 +113,12 @@ struct snapshot_handle {
125 * snapshot_read_next()/snapshot_write_next() is allowed to 113 * snapshot_read_next()/snapshot_write_next() is allowed to
126 * read/write data after the function returns 114 * read/write data after the function returns
127 */ 115 */
128#define data_of(handle) ((handle).buffer + (handle).buf_offset) 116#define data_of(handle) ((handle).buffer)
129 117
130extern unsigned int snapshot_additional_pages(struct zone *zone); 118extern unsigned int snapshot_additional_pages(struct zone *zone);
131extern unsigned long snapshot_get_image_size(void); 119extern unsigned long snapshot_get_image_size(void);
132extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 120extern int snapshot_read_next(struct snapshot_handle *handle);
133extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 121extern int snapshot_write_next(struct snapshot_handle *handle);
134extern void snapshot_write_finalize(struct snapshot_handle *handle); 122extern void snapshot_write_finalize(struct snapshot_handle *handle);
135extern int snapshot_image_loaded(struct snapshot_handle *handle); 123extern int snapshot_image_loaded(struct snapshot_handle *handle);
136 124
@@ -154,6 +142,15 @@ extern int swsusp_read(unsigned int *flags_p);
154extern int swsusp_write(unsigned int flags); 142extern int swsusp_write(unsigned int flags);
155extern void swsusp_close(fmode_t); 143extern void swsusp_close(fmode_t);
156 144
145/* kernel/power/block_io.c */
146extern struct block_device *hib_resume_bdev;
147
148extern int hib_bio_read_page(pgoff_t page_off, void *addr,
149 struct bio **bio_chain);
150extern int hib_bio_write_page(pgoff_t page_off, void *addr,
151 struct bio **bio_chain);
152extern int hib_wait_on_bio_chain(struct bio **bio_chain);
153
157struct timeval; 154struct timeval;
158/* kernel/power/swsusp.c */ 155/* kernel/power/swsusp.c */
159extern void swsusp_show_speed(struct timeval *, struct timeval *, 156extern void swsusp_show_speed(struct timeval *, struct timeval *,
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5ade1bdcf366..71ae29052ab6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -88,12 +88,11 @@ static int try_to_freeze_tasks(bool sig_only)
88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds " 88 printk(KERN_ERR "Freezing of tasks failed after %d.%02d seconds "
89 "(%d tasks refusing to freeze):\n", 89 "(%d tasks refusing to freeze):\n",
90 elapsed_csecs / 100, elapsed_csecs % 100, todo); 90 elapsed_csecs / 100, elapsed_csecs % 100, todo);
91 show_state();
92 read_lock(&tasklist_lock); 91 read_lock(&tasklist_lock);
93 do_each_thread(g, p) { 92 do_each_thread(g, p) {
94 task_lock(p); 93 task_lock(p);
95 if (freezing(p) && !freezer_should_skip(p)) 94 if (freezing(p) && !freezer_should_skip(p))
96 printk(KERN_ERR " %s\n", p->comm); 95 sched_show_task(p);
97 cancel_freezing(p); 96 cancel_freezing(p);
98 task_unlock(p); 97 task_unlock(p);
99 } while_each_thread(g, p); 98 } while_each_thread(g, p);
@@ -145,7 +144,7 @@ static void thaw_tasks(bool nosig_only)
145 if (nosig_only && should_send_signal(p)) 144 if (nosig_only && should_send_signal(p))
146 continue; 145 continue;
147 146
148 if (cgroup_frozen(p)) 147 if (cgroup_freezing_or_frozen(p))
149 continue; 148 continue;
150 149
151 thaw_process(p); 150 thaw_process(p);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 830cadecbdfc..f6cd6faf84fd 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * This file provides system snapshot/restore functionality for swsusp. 4 * This file provides system snapshot/restore functionality for swsusp.
5 * 5 *
6 * Copyright (C) 1998-2005 Pavel Machek <pavel@suse.cz> 6 * Copyright (C) 1998-2005 Pavel Machek <pavel@ucw.cz>
7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 7 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
8 * 8 *
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
@@ -26,6 +26,7 @@
26#include <linux/console.h> 26#include <linux/console.h>
27#include <linux/highmem.h> 27#include <linux/highmem.h>
28#include <linux/list.h> 28#include <linux/list.h>
29#include <linux/slab.h>
29 30
30#include <asm/uaccess.h> 31#include <asm/uaccess.h>
31#include <asm/mmu_context.h> 32#include <asm/mmu_context.h>
@@ -1603,14 +1604,9 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1603 * snapshot_handle structure. The structure gets updated and a pointer 1604 * snapshot_handle structure. The structure gets updated and a pointer
1604 * to it should be passed to this function every next time. 1605 * to it should be passed to this function every next time.
1605 * 1606 *
1606 * The @count parameter should contain the number of bytes the caller
1607 * wants to read from the snapshot. It must not be zero.
1608 *
1609 * On success the function returns a positive number. Then, the caller 1607 * On success the function returns a positive number. Then, the caller
1610 * is allowed to read up to the returned number of bytes from the memory 1608 * is allowed to read up to the returned number of bytes from the memory
1611 * location computed by the data_of() macro. The number returned 1609 * location computed by the data_of() macro.
1612 * may be smaller than @count, but this only happens if the read would
1613 * cross a page boundary otherwise.
1614 * 1610 *
1615 * The function returns 0 to indicate the end of data stream condition, 1611 * The function returns 0 to indicate the end of data stream condition,
1616 * and a negative number is returned on error. In such cases the 1612 * and a negative number is returned on error. In such cases the
@@ -1618,7 +1614,7 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1618 * any more. 1614 * any more.
1619 */ 1615 */
1620 1616
1621int snapshot_read_next(struct snapshot_handle *handle, size_t count) 1617int snapshot_read_next(struct snapshot_handle *handle)
1622{ 1618{
1623 if (handle->cur > nr_meta_pages + nr_copy_pages) 1619 if (handle->cur > nr_meta_pages + nr_copy_pages)
1624 return 0; 1620 return 0;
@@ -1629,7 +1625,7 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1629 if (!buffer) 1625 if (!buffer)
1630 return -ENOMEM; 1626 return -ENOMEM;
1631 } 1627 }
1632 if (!handle->offset) { 1628 if (!handle->cur) {
1633 int error; 1629 int error;
1634 1630
1635 error = init_header((struct swsusp_info *)buffer); 1631 error = init_header((struct swsusp_info *)buffer);
@@ -1638,42 +1634,30 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
1638 handle->buffer = buffer; 1634 handle->buffer = buffer;
1639 memory_bm_position_reset(&orig_bm); 1635 memory_bm_position_reset(&orig_bm);
1640 memory_bm_position_reset(&copy_bm); 1636 memory_bm_position_reset(&copy_bm);
1641 } 1637 } else if (handle->cur <= nr_meta_pages) {
1642 if (handle->prev < handle->cur) { 1638 memset(buffer, 0, PAGE_SIZE);
1643 if (handle->cur <= nr_meta_pages) { 1639 pack_pfns(buffer, &orig_bm);
1644 memset(buffer, 0, PAGE_SIZE); 1640 } else {
1645 pack_pfns(buffer, &orig_bm); 1641 struct page *page;
1646 } else {
1647 struct page *page;
1648 1642
1649 page = pfn_to_page(memory_bm_next_pfn(&copy_bm)); 1643 page = pfn_to_page(memory_bm_next_pfn(&copy_bm));
1650 if (PageHighMem(page)) { 1644 if (PageHighMem(page)) {
1651 /* Highmem pages are copied to the buffer, 1645 /* Highmem pages are copied to the buffer,
1652 * because we can't return with a kmapped 1646 * because we can't return with a kmapped
1653 * highmem page (we may not be called again). 1647 * highmem page (we may not be called again).
1654 */ 1648 */
1655 void *kaddr; 1649 void *kaddr;
1656 1650
1657 kaddr = kmap_atomic(page, KM_USER0); 1651 kaddr = kmap_atomic(page, KM_USER0);
1658 memcpy(buffer, kaddr, PAGE_SIZE); 1652 memcpy(buffer, kaddr, PAGE_SIZE);
1659 kunmap_atomic(kaddr, KM_USER0); 1653 kunmap_atomic(kaddr, KM_USER0);
1660 handle->buffer = buffer; 1654 handle->buffer = buffer;
1661 } else { 1655 } else {
1662 handle->buffer = page_address(page); 1656 handle->buffer = page_address(page);
1663 }
1664 } 1657 }
1665 handle->prev = handle->cur;
1666 }
1667 handle->buf_offset = handle->cur_offset;
1668 if (handle->cur_offset + count >= PAGE_SIZE) {
1669 count = PAGE_SIZE - handle->cur_offset;
1670 handle->cur_offset = 0;
1671 handle->cur++;
1672 } else {
1673 handle->cur_offset += count;
1674 } 1658 }
1675 handle->offset += count; 1659 handle->cur++;
1676 return count; 1660 return PAGE_SIZE;
1677} 1661}
1678 1662
1679/** 1663/**
@@ -2132,14 +2116,9 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2132 * snapshot_handle structure. The structure gets updated and a pointer 2116 * snapshot_handle structure. The structure gets updated and a pointer
2133 * to it should be passed to this function every next time. 2117 * to it should be passed to this function every next time.
2134 * 2118 *
2135 * The @count parameter should contain the number of bytes the caller
2136 * wants to write to the image. It must not be zero.
2137 *
2138 * On success the function returns a positive number. Then, the caller 2119 * On success the function returns a positive number. Then, the caller
2139 * is allowed to write up to the returned number of bytes to the memory 2120 * is allowed to write up to the returned number of bytes to the memory
2140 * location computed by the data_of() macro. The number returned 2121 * location computed by the data_of() macro.
2141 * may be smaller than @count, but this only happens if the write would
2142 * cross a page boundary otherwise.
2143 * 2122 *
2144 * The function returns 0 to indicate the "end of file" condition, 2123 * The function returns 0 to indicate the "end of file" condition,
2145 * and a negative number is returned on error. In such cases the 2124 * and a negative number is returned on error. In such cases the
@@ -2147,16 +2126,18 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
2147 * any more. 2126 * any more.
2148 */ 2127 */
2149 2128
2150int snapshot_write_next(struct snapshot_handle *handle, size_t count) 2129int snapshot_write_next(struct snapshot_handle *handle)
2151{ 2130{
2152 static struct chain_allocator ca; 2131 static struct chain_allocator ca;
2153 int error = 0; 2132 int error = 0;
2154 2133
2155 /* Check if we have already loaded the entire image */ 2134 /* Check if we have already loaded the entire image */
2156 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) 2135 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages)
2157 return 0; 2136 return 0;
2158 2137
2159 if (handle->offset == 0) { 2138 handle->sync_read = 1;
2139
2140 if (!handle->cur) {
2160 if (!buffer) 2141 if (!buffer)
2161 /* This makes the buffer be freed by swsusp_free() */ 2142 /* This makes the buffer be freed by swsusp_free() */
2162 buffer = get_image_page(GFP_ATOMIC, PG_ANY); 2143 buffer = get_image_page(GFP_ATOMIC, PG_ANY);
@@ -2165,56 +2146,43 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
2165 return -ENOMEM; 2146 return -ENOMEM;
2166 2147
2167 handle->buffer = buffer; 2148 handle->buffer = buffer;
2168 } 2149 } else if (handle->cur == 1) {
2169 handle->sync_read = 1; 2150 error = load_header(buffer);
2170 if (handle->prev < handle->cur) { 2151 if (error)
2171 if (handle->prev == 0) { 2152 return error;
2172 error = load_header(buffer);
2173 if (error)
2174 return error;
2175 2153
2176 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY); 2154 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
2177 if (error) 2155 if (error)
2178 return error; 2156 return error;
2157
2158 } else if (handle->cur <= nr_meta_pages + 1) {
2159 error = unpack_orig_pfns(buffer, &copy_bm);
2160 if (error)
2161 return error;
2179 2162
2180 } else if (handle->prev <= nr_meta_pages) { 2163 if (handle->cur == nr_meta_pages + 1) {
2181 error = unpack_orig_pfns(buffer, &copy_bm); 2164 error = prepare_image(&orig_bm, &copy_bm);
2182 if (error) 2165 if (error)
2183 return error; 2166 return error;
2184 2167
2185 if (handle->prev == nr_meta_pages) { 2168 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2186 error = prepare_image(&orig_bm, &copy_bm); 2169 memory_bm_position_reset(&orig_bm);
2187 if (error) 2170 restore_pblist = NULL;
2188 return error;
2189
2190 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
2191 memory_bm_position_reset(&orig_bm);
2192 restore_pblist = NULL;
2193 handle->buffer = get_buffer(&orig_bm, &ca);
2194 handle->sync_read = 0;
2195 if (IS_ERR(handle->buffer))
2196 return PTR_ERR(handle->buffer);
2197 }
2198 } else {
2199 copy_last_highmem_page();
2200 handle->buffer = get_buffer(&orig_bm, &ca); 2171 handle->buffer = get_buffer(&orig_bm, &ca);
2172 handle->sync_read = 0;
2201 if (IS_ERR(handle->buffer)) 2173 if (IS_ERR(handle->buffer))
2202 return PTR_ERR(handle->buffer); 2174 return PTR_ERR(handle->buffer);
2203 if (handle->buffer != buffer)
2204 handle->sync_read = 0;
2205 } 2175 }
2206 handle->prev = handle->cur;
2207 }
2208 handle->buf_offset = handle->cur_offset;
2209 if (handle->cur_offset + count >= PAGE_SIZE) {
2210 count = PAGE_SIZE - handle->cur_offset;
2211 handle->cur_offset = 0;
2212 handle->cur++;
2213 } else { 2176 } else {
2214 handle->cur_offset += count; 2177 copy_last_highmem_page();
2178 handle->buffer = get_buffer(&orig_bm, &ca);
2179 if (IS_ERR(handle->buffer))
2180 return PTR_ERR(handle->buffer);
2181 if (handle->buffer != buffer)
2182 handle->sync_read = 0;
2215 } 2183 }
2216 handle->offset += count; 2184 handle->cur++;
2217 return count; 2185 return PAGE_SIZE;
2218} 2186}
2219 2187
2220/** 2188/**
@@ -2229,7 +2197,7 @@ void snapshot_write_finalize(struct snapshot_handle *handle)
2229{ 2197{
2230 copy_last_highmem_page(); 2198 copy_last_highmem_page();
2231 /* Free only if we have loaded the image entirely */ 2199 /* Free only if we have loaded the image entirely */
2232 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages) { 2200 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2233 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2201 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
2234 free_highmem_data(); 2202 free_highmem_data();
2235 } 2203 }
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 44cce10b582d..7335952ee473 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -15,6 +15,13 @@
15#include <linux/console.h> 15#include <linux/console.h>
16#include <linux/cpu.h> 16#include <linux/cpu.h>
17#include <linux/syscalls.h> 17#include <linux/syscalls.h>
18#include <linux/gfp.h>
19#include <linux/io.h>
20#include <linux/kernel.h>
21#include <linux/list.h>
22#include <linux/mm.h>
23#include <linux/slab.h>
24#include <linux/suspend.h>
18 25
19#include "power.h" 26#include "power.h"
20 27
@@ -129,19 +136,19 @@ static int suspend_enter(suspend_state_t state)
129 if (suspend_ops->prepare) { 136 if (suspend_ops->prepare) {
130 error = suspend_ops->prepare(); 137 error = suspend_ops->prepare();
131 if (error) 138 if (error)
132 return error; 139 goto Platform_finish;
133 } 140 }
134 141
135 error = dpm_suspend_noirq(PMSG_SUSPEND); 142 error = dpm_suspend_noirq(PMSG_SUSPEND);
136 if (error) { 143 if (error) {
137 printk(KERN_ERR "PM: Some devices failed to power down\n"); 144 printk(KERN_ERR "PM: Some devices failed to power down\n");
138 goto Platfrom_finish; 145 goto Platform_finish;
139 } 146 }
140 147
141 if (suspend_ops->prepare_late) { 148 if (suspend_ops->prepare_late) {
142 error = suspend_ops->prepare_late(); 149 error = suspend_ops->prepare_late();
143 if (error) 150 if (error)
144 goto Power_up_devices; 151 goto Platform_wake;
145 } 152 }
146 153
147 if (suspend_test(TEST_PLATFORM)) 154 if (suspend_test(TEST_PLATFORM))
@@ -156,8 +163,10 @@ static int suspend_enter(suspend_state_t state)
156 163
157 error = sysdev_suspend(PMSG_SUSPEND); 164 error = sysdev_suspend(PMSG_SUSPEND);
158 if (!error) { 165 if (!error) {
159 if (!suspend_test(TEST_CORE)) 166 if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) {
160 error = suspend_ops->enter(state); 167 error = suspend_ops->enter(state);
168 events_check_enabled = false;
169 }
161 sysdev_resume(); 170 sysdev_resume();
162 } 171 }
163 172
@@ -171,10 +180,9 @@ static int suspend_enter(suspend_state_t state)
171 if (suspend_ops->wake) 180 if (suspend_ops->wake)
172 suspend_ops->wake(); 181 suspend_ops->wake();
173 182
174 Power_up_devices:
175 dpm_resume_noirq(PMSG_RESUME); 183 dpm_resume_noirq(PMSG_RESUME);
176 184
177 Platfrom_finish: 185 Platform_finish:
178 if (suspend_ops->finish) 186 if (suspend_ops->finish)
179 suspend_ops->finish(); 187 suspend_ops->finish();
180 188
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 1d575733d4e1..e6a5bdf61a37 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -4,7 +4,7 @@
4 * This file provides functions for reading the suspend image from 4 * This file provides functions for reading the suspend image from
5 * and writing it to a swap partition. 5 * and writing it to a swap partition.
6 * 6 *
7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> 7 * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> 8 * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
9 * 9 *
10 * This file is released under the GPLv2. 10 * This file is released under the GPLv2.
@@ -23,11 +23,46 @@
23#include <linux/swap.h> 23#include <linux/swap.h>
24#include <linux/swapops.h> 24#include <linux/swapops.h>
25#include <linux/pm.h> 25#include <linux/pm.h>
26#include <linux/slab.h>
26 27
27#include "power.h" 28#include "power.h"
28 29
29#define SWSUSP_SIG "S1SUSPEND" 30#define SWSUSP_SIG "S1SUSPEND"
30 31
32/*
33 * The swap map is a data structure used for keeping track of each page
34 * written to a swap partition. It consists of many swap_map_page
 35 * structures that each contain an array of MAP_PAGE_ENTRIES swap entries.
36 * These structures are stored on the swap and linked together with the
37 * help of the .next_swap member.
38 *
39 * The swap map is created during suspend. The swap map pages are
40 * allocated and populated one at a time, so we only need one memory
41 * page to set up the entire structure.
42 *
43 * During resume we also only need to use one swap_map_page structure
44 * at a time.
45 */
46
47#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
48
49struct swap_map_page {
50 sector_t entries[MAP_PAGE_ENTRIES];
51 sector_t next_swap;
52};
53
54/**
55 * The swap_map_handle structure is used for handling swap in
 56 * a file-like way
57 */
58
59struct swap_map_handle {
60 struct swap_map_page *cur;
61 sector_t cur_swap;
62 sector_t first_sector;
63 unsigned int k;
64};
65
31struct swsusp_header { 66struct swsusp_header {
32 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 67 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)];
33 sector_t image; 68 sector_t image;
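
The swap map comment and structures moved to the top of kernel/power/swap.c above describe an on-disk singly linked list: each swap_map_page holds up to MAP_PAGE_ENTRIES sectors of image data plus the sector of the next map page. A small stand-alone sketch of how such a chain would be walked at resume time is shown below; it is not taken from this patch, and read_sector() is a hypothetical stand-in for hib_bio_read_page().

/*
 * Stand-alone sketch of walking a swap map chain as described above;
 * illustrative only, not part of this patch.  read_sector() is a
 * hypothetical helper that reads one PAGE_SIZE block from the resume
 * device into 'buf'.
 */
#include <stddef.h>
#include <stdint.h>

typedef uint64_t sector_t;

#define PAGE_SIZE		4096
#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(sector_t) - 1)

struct swap_map_page {
	sector_t entries[MAP_PAGE_ENTRIES];
	sector_t next_swap;
};

extern int read_sector(sector_t sector, void *buf);

/*
 * Visit every image-page sector recorded in the swap map, in the order
 * the pages were written at suspend time.  A zero entry or a zero
 * next_swap field terminates the walk.
 */
int walk_swap_map(sector_t first_sector, int (*visit)(sector_t))
{
	struct swap_map_page map;	/* exactly one page in size */
	sector_t cur = first_sector;
	size_t k;

	while (cur) {
		if (read_sector(cur, &map))
			return -1;
		for (k = 0; k < MAP_PAGE_ENTRIES && map.entries[k]; k++) {
			if (visit(map.entries[k]))
				return -1;
		}
		cur = map.next_swap;	/* next map page in the chain */
	}
	return 0;
}

During suspend, get_swap_writer() below records the sector of the first map page in handle->first_sector, which mark_swapfiles() then writes into the swap header, so resume knows where the chain begins.
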
@@ -113,7 +148,7 @@ sector_t alloc_swapdev_block(int swap)
113 148
114/** 149/**
115 * free_all_swap_pages - free swap pages allocated for saving image data. 150 * free_all_swap_pages - free swap pages allocated for saving image data.
116 * It also frees the extents used to register which swap entres had been 151 * It also frees the extents used to register which swap entries had been
117 * allocated. 152 * allocated.
118 */ 153 */
119 154
@@ -144,110 +179,24 @@ int swsusp_swap_in_use(void)
144 */ 179 */
145 180
146static unsigned short root_swap = 0xffff; 181static unsigned short root_swap = 0xffff;
147static struct block_device *resume_bdev; 182struct block_device *hib_resume_bdev;
148
149/**
150 * submit - submit BIO request.
151 * @rw: READ or WRITE.
152 * @off physical offset of page.
153 * @page: page we're reading or writing.
154 * @bio_chain: list of pending biod (for async reading)
155 *
156 * Straight from the textbook - allocate and initialize the bio.
157 * If we're reading, make sure the page is marked as dirty.
158 * Then submit it and, if @bio_chain == NULL, wait.
159 */
160static int submit(int rw, pgoff_t page_off, struct page *page,
161 struct bio **bio_chain)
162{
163 const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
164 struct bio *bio;
165
166 bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
167 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
168 bio->bi_bdev = resume_bdev;
169 bio->bi_end_io = end_swap_bio_read;
170
171 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
172 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
173 page_off);
174 bio_put(bio);
175 return -EFAULT;
176 }
177
178 lock_page(page);
179 bio_get(bio);
180
181 if (bio_chain == NULL) {
182 submit_bio(bio_rw, bio);
183 wait_on_page_locked(page);
184 if (rw == READ)
185 bio_set_pages_dirty(bio);
186 bio_put(bio);
187 } else {
188 if (rw == READ)
189 get_page(page); /* These pages are freed later */
190 bio->bi_private = *bio_chain;
191 *bio_chain = bio;
192 submit_bio(bio_rw, bio);
193 }
194 return 0;
195}
196
197static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
198{
199 return submit(READ, page_off, virt_to_page(addr), bio_chain);
200}
201
202static int bio_write_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
203{
204 return submit(WRITE, page_off, virt_to_page(addr), bio_chain);
205}
206
207static int wait_on_bio_chain(struct bio **bio_chain)
208{
209 struct bio *bio;
210 struct bio *next_bio;
211 int ret = 0;
212
213 if (bio_chain == NULL)
214 return 0;
215
216 bio = *bio_chain;
217 if (bio == NULL)
218 return 0;
219 while (bio) {
220 struct page *page;
221
222 next_bio = bio->bi_private;
223 page = bio->bi_io_vec[0].bv_page;
224 wait_on_page_locked(page);
225 if (!PageUptodate(page) || PageError(page))
226 ret = -EIO;
227 put_page(page);
228 bio_put(bio);
229 bio = next_bio;
230 }
231 *bio_chain = NULL;
232 return ret;
233}
234 183
235/* 184/*
236 * Saving part 185 * Saving part
237 */ 186 */
238 187
239static int mark_swapfiles(sector_t start, unsigned int flags) 188static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
240{ 189{
241 int error; 190 int error;
242 191
243 bio_read_page(swsusp_resume_block, swsusp_header, NULL); 192 hib_bio_read_page(swsusp_resume_block, swsusp_header, NULL);
244 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) || 193 if (!memcmp("SWAP-SPACE",swsusp_header->sig, 10) ||
245 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) { 194 !memcmp("SWAPSPACE2",swsusp_header->sig, 10)) {
246 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10); 195 memcpy(swsusp_header->orig_sig,swsusp_header->sig, 10);
247 memcpy(swsusp_header->sig,SWSUSP_SIG, 10); 196 memcpy(swsusp_header->sig,SWSUSP_SIG, 10);
248 swsusp_header->image = start; 197 swsusp_header->image = handle->first_sector;
249 swsusp_header->flags = flags; 198 swsusp_header->flags = flags;
250 error = bio_write_page(swsusp_resume_block, 199 error = hib_bio_write_page(swsusp_resume_block,
251 swsusp_header, NULL); 200 swsusp_header, NULL);
252 } else { 201 } else {
253 printk(KERN_ERR "PM: Swap header not found!\n"); 202 printk(KERN_ERR "PM: Swap header not found!\n");
@@ -259,25 +208,26 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
259/** 208/**
260 * swsusp_swap_check - check if the resume device is a swap device 209 * swsusp_swap_check - check if the resume device is a swap device
261 * and get its index (if so) 210 * and get its index (if so)
211 *
 212 * This is called before saving the image
262 */ 213 */
263 214static int swsusp_swap_check(void)
264static int swsusp_swap_check(void) /* This is called before saving image */
265{ 215{
266 int res; 216 int res;
267 217
268 res = swap_type_of(swsusp_resume_device, swsusp_resume_block, 218 res = swap_type_of(swsusp_resume_device, swsusp_resume_block,
269 &resume_bdev); 219 &hib_resume_bdev);
270 if (res < 0) 220 if (res < 0)
271 return res; 221 return res;
272 222
273 root_swap = res; 223 root_swap = res;
274 res = blkdev_get(resume_bdev, FMODE_WRITE); 224 res = blkdev_get(hib_resume_bdev, FMODE_WRITE);
275 if (res) 225 if (res)
276 return res; 226 return res;
277 227
278 res = set_blocksize(resume_bdev, PAGE_SIZE); 228 res = set_blocksize(hib_resume_bdev, PAGE_SIZE);
279 if (res < 0) 229 if (res < 0)
280 blkdev_put(resume_bdev, FMODE_WRITE); 230 blkdev_put(hib_resume_bdev, FMODE_WRITE);
281 231
282 return res; 232 return res;
283} 233}
@@ -308,42 +258,9 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
308 } else { 258 } else {
309 src = buf; 259 src = buf;
310 } 260 }
311 return bio_write_page(offset, src, bio_chain); 261 return hib_bio_write_page(offset, src, bio_chain);
312} 262}
313 263
314/*
315 * The swap map is a data structure used for keeping track of each page
316 * written to a swap partition. It consists of many swap_map_page
317 * structures that contain each an array of MAP_PAGE_SIZE swap entries.
318 * These structures are stored on the swap and linked together with the
319 * help of the .next_swap member.
320 *
321 * The swap map is created during suspend. The swap map pages are
322 * allocated and populated one at a time, so we only need one memory
323 * page to set up the entire structure.
324 *
325 * During resume we also only need to use one swap_map_page structure
326 * at a time.
327 */
328
329#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
330
331struct swap_map_page {
332 sector_t entries[MAP_PAGE_ENTRIES];
333 sector_t next_swap;
334};
335
336/**
337 * The swap_map_handle structure is used for handling swap in
338 * a file-alike way
339 */
340
341struct swap_map_handle {
342 struct swap_map_page *cur;
343 sector_t cur_swap;
344 unsigned int k;
345};
346
347static void release_swap_writer(struct swap_map_handle *handle) 264static void release_swap_writer(struct swap_map_handle *handle)
348{ 265{
349 if (handle->cur) 266 if (handle->cur)
@@ -353,16 +270,33 @@ static void release_swap_writer(struct swap_map_handle *handle)
353 270
354static int get_swap_writer(struct swap_map_handle *handle) 271static int get_swap_writer(struct swap_map_handle *handle)
355{ 272{
273 int ret;
274
275 ret = swsusp_swap_check();
276 if (ret) {
277 if (ret != -ENOSPC)
278 printk(KERN_ERR "PM: Cannot find swap device, try "
279 "swapon -a.\n");
280 return ret;
281 }
356 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 282 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
357 if (!handle->cur) 283 if (!handle->cur) {
358 return -ENOMEM; 284 ret = -ENOMEM;
285 goto err_close;
286 }
359 handle->cur_swap = alloc_swapdev_block(root_swap); 287 handle->cur_swap = alloc_swapdev_block(root_swap);
360 if (!handle->cur_swap) { 288 if (!handle->cur_swap) {
361 release_swap_writer(handle); 289 ret = -ENOSPC;
362 return -ENOSPC; 290 goto err_rel;
363 } 291 }
364 handle->k = 0; 292 handle->k = 0;
293 handle->first_sector = handle->cur_swap;
365 return 0; 294 return 0;
295err_rel:
296 release_swap_writer(handle);
297err_close:
298 swsusp_close(FMODE_WRITE);
299 return ret;
366} 300}
367 301
368static int swap_write_page(struct swap_map_handle *handle, void *buf, 302static int swap_write_page(struct swap_map_handle *handle, void *buf,
@@ -379,7 +313,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
379 return error; 313 return error;
380 handle->cur->entries[handle->k++] = offset; 314 handle->cur->entries[handle->k++] = offset;
381 if (handle->k >= MAP_PAGE_ENTRIES) { 315 if (handle->k >= MAP_PAGE_ENTRIES) {
382 error = wait_on_bio_chain(bio_chain); 316 error = hib_wait_on_bio_chain(bio_chain);
383 if (error) 317 if (error)
384 goto out; 318 goto out;
385 offset = alloc_swapdev_block(root_swap); 319 offset = alloc_swapdev_block(root_swap);
@@ -405,6 +339,24 @@ static int flush_swap_writer(struct swap_map_handle *handle)
405 return -EINVAL; 339 return -EINVAL;
406} 340}
407 341
342static int swap_writer_finish(struct swap_map_handle *handle,
343 unsigned int flags, int error)
344{
345 if (!error) {
346 flush_swap_writer(handle);
347 printk(KERN_INFO "PM: S");
348 error = mark_swapfiles(handle, flags);
349 printk("|\n");
350 }
351
352 if (error)
353 free_all_swap_pages(root_swap);
354 release_swap_writer(handle);
355 swsusp_close(FMODE_WRITE);
356
357 return error;
358}
359
408/** 360/**
409 * save_image - save the suspend image data 361 * save_image - save the suspend image data
410 */ 362 */
@@ -430,7 +382,7 @@ static int save_image(struct swap_map_handle *handle,
430 bio = NULL; 382 bio = NULL;
431 do_gettimeofday(&start); 383 do_gettimeofday(&start);
432 while (1) { 384 while (1) {
433 ret = snapshot_read_next(snapshot, PAGE_SIZE); 385 ret = snapshot_read_next(snapshot);
434 if (ret <= 0) 386 if (ret <= 0)
435 break; 387 break;
436 ret = swap_write_page(handle, data_of(*snapshot), &bio); 388 ret = swap_write_page(handle, data_of(*snapshot), &bio);
@@ -440,7 +392,7 @@ static int save_image(struct swap_map_handle *handle,
440 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 392 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m);
441 nr_pages++; 393 nr_pages++;
442 } 394 }
443 err2 = wait_on_bio_chain(&bio); 395 err2 = hib_wait_on_bio_chain(&bio);
444 do_gettimeofday(&stop); 396 do_gettimeofday(&stop);
445 if (!ret) 397 if (!ret)
446 ret = err2; 398 ret = err2;
@@ -482,50 +434,34 @@ int swsusp_write(unsigned int flags)
482 struct swap_map_handle handle; 434 struct swap_map_handle handle;
483 struct snapshot_handle snapshot; 435 struct snapshot_handle snapshot;
484 struct swsusp_info *header; 436 struct swsusp_info *header;
437 unsigned long pages;
485 int error; 438 int error;
486 439
487 error = swsusp_swap_check(); 440 pages = snapshot_get_image_size();
441 error = get_swap_writer(&handle);
488 if (error) { 442 if (error) {
489 printk(KERN_ERR "PM: Cannot find swap device, try " 443 printk(KERN_ERR "PM: Cannot get swap writer\n");
490 "swapon -a.\n");
491 return error; 444 return error;
492 } 445 }
446 if (!enough_swap(pages)) {
447 printk(KERN_ERR "PM: Not enough free swap\n");
448 error = -ENOSPC;
449 goto out_finish;
450 }
493 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 451 memset(&snapshot, 0, sizeof(struct snapshot_handle));
494 error = snapshot_read_next(&snapshot, PAGE_SIZE); 452 error = snapshot_read_next(&snapshot);
495 if (error < PAGE_SIZE) { 453 if (error < PAGE_SIZE) {
496 if (error >= 0) 454 if (error >= 0)
497 error = -EFAULT; 455 error = -EFAULT;
498 456
499 goto out; 457 goto out_finish;
500 } 458 }
501 header = (struct swsusp_info *)data_of(snapshot); 459 header = (struct swsusp_info *)data_of(snapshot);
502 if (!enough_swap(header->pages)) { 460 error = swap_write_page(&handle, header, NULL);
503 printk(KERN_ERR "PM: Not enough free swap\n"); 461 if (!error)
504 error = -ENOSPC; 462 error = save_image(&handle, &snapshot, pages - 1);
505 goto out; 463out_finish:
506 } 464 error = swap_writer_finish(&handle, flags, error);
507 error = get_swap_writer(&handle);
508 if (!error) {
509 sector_t start = handle.cur_swap;
510
511 error = swap_write_page(&handle, header, NULL);
512 if (!error)
513 error = save_image(&handle, &snapshot,
514 header->pages - 1);
515
516 if (!error) {
517 flush_swap_writer(&handle);
518 printk(KERN_INFO "PM: S");
519 error = mark_swapfiles(start, flags);
520 printk("|\n");
521 }
522 }
523 if (error)
524 free_all_swap_pages(root_swap);
525
526 release_swap_writer(&handle);
527 out:
528 swsusp_close(FMODE_WRITE);
529 return error; 465 return error;
530} 466}
531 467
@@ -541,18 +477,21 @@ static void release_swap_reader(struct swap_map_handle *handle)
541 handle->cur = NULL; 477 handle->cur = NULL;
542} 478}
543 479
544static int get_swap_reader(struct swap_map_handle *handle, sector_t start) 480static int get_swap_reader(struct swap_map_handle *handle,
481 unsigned int *flags_p)
545{ 482{
546 int error; 483 int error;
547 484
548 if (!start) 485 *flags_p = swsusp_header->flags;
486
487 if (!swsusp_header->image) /* how can this happen? */
549 return -EINVAL; 488 return -EINVAL;
550 489
551 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 490 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH);
552 if (!handle->cur) 491 if (!handle->cur)
553 return -ENOMEM; 492 return -ENOMEM;
554 493
555 error = bio_read_page(start, handle->cur, NULL); 494 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL);
556 if (error) { 495 if (error) {
557 release_swap_reader(handle); 496 release_swap_reader(handle);
558 return error; 497 return error;
@@ -572,21 +511,28 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
572 offset = handle->cur->entries[handle->k]; 511 offset = handle->cur->entries[handle->k];
573 if (!offset) 512 if (!offset)
574 return -EFAULT; 513 return -EFAULT;
575 error = bio_read_page(offset, buf, bio_chain); 514 error = hib_bio_read_page(offset, buf, bio_chain);
576 if (error) 515 if (error)
577 return error; 516 return error;
578 if (++handle->k >= MAP_PAGE_ENTRIES) { 517 if (++handle->k >= MAP_PAGE_ENTRIES) {
579 error = wait_on_bio_chain(bio_chain); 518 error = hib_wait_on_bio_chain(bio_chain);
580 handle->k = 0; 519 handle->k = 0;
581 offset = handle->cur->next_swap; 520 offset = handle->cur->next_swap;
582 if (!offset) 521 if (!offset)
583 release_swap_reader(handle); 522 release_swap_reader(handle);
584 else if (!error) 523 else if (!error)
585 error = bio_read_page(offset, handle->cur, NULL); 524 error = hib_bio_read_page(offset, handle->cur, NULL);
586 } 525 }
587 return error; 526 return error;
588} 527}
589 528
529static int swap_reader_finish(struct swap_map_handle *handle)
530{
531 release_swap_reader(handle);
532
533 return 0;
534}
535
590/** 536/**
591 * load_image - load the image using the swap map handle 537 * load_image - load the image using the swap map handle
592 * @handle and the snapshot handle @snapshot 538 * @handle and the snapshot handle @snapshot
@@ -614,21 +560,21 @@ static int load_image(struct swap_map_handle *handle,
614 bio = NULL; 560 bio = NULL;
615 do_gettimeofday(&start); 561 do_gettimeofday(&start);
616 for ( ; ; ) { 562 for ( ; ; ) {
617 error = snapshot_write_next(snapshot, PAGE_SIZE); 563 error = snapshot_write_next(snapshot);
618 if (error <= 0) 564 if (error <= 0)
619 break; 565 break;
620 error = swap_read_page(handle, data_of(*snapshot), &bio); 566 error = swap_read_page(handle, data_of(*snapshot), &bio);
621 if (error) 567 if (error)
622 break; 568 break;
623 if (snapshot->sync_read) 569 if (snapshot->sync_read)
624 error = wait_on_bio_chain(&bio); 570 error = hib_wait_on_bio_chain(&bio);
625 if (error) 571 if (error)
626 break; 572 break;
627 if (!(nr_pages % m)) 573 if (!(nr_pages % m))
628 printk("\b\b\b\b%3d%%", nr_pages / m); 574 printk("\b\b\b\b%3d%%", nr_pages / m);
629 nr_pages++; 575 nr_pages++;
630 } 576 }
631 err2 = wait_on_bio_chain(&bio); 577 err2 = hib_wait_on_bio_chain(&bio);
632 do_gettimeofday(&stop); 578 do_gettimeofday(&stop);
633 if (!error) 579 if (!error)
634 error = err2; 580 error = err2;
@@ -656,20 +602,20 @@ int swsusp_read(unsigned int *flags_p)
656 struct snapshot_handle snapshot; 602 struct snapshot_handle snapshot;
657 struct swsusp_info *header; 603 struct swsusp_info *header;
658 604
659 *flags_p = swsusp_header->flags;
660
661 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 605 memset(&snapshot, 0, sizeof(struct snapshot_handle));
662 error = snapshot_write_next(&snapshot, PAGE_SIZE); 606 error = snapshot_write_next(&snapshot);
663 if (error < PAGE_SIZE) 607 if (error < PAGE_SIZE)
664 return error < 0 ? error : -EFAULT; 608 return error < 0 ? error : -EFAULT;
665 header = (struct swsusp_info *)data_of(snapshot); 609 header = (struct swsusp_info *)data_of(snapshot);
666 error = get_swap_reader(&handle, swsusp_header->image); 610 error = get_swap_reader(&handle, flags_p);
611 if (error)
612 goto end;
667 if (!error) 613 if (!error)
668 error = swap_read_page(&handle, header, NULL); 614 error = swap_read_page(&handle, header, NULL);
669 if (!error) 615 if (!error)
670 error = load_image(&handle, &snapshot, header->pages - 1); 616 error = load_image(&handle, &snapshot, header->pages - 1);
671 release_swap_reader(&handle); 617 swap_reader_finish(&handle);
672 618end:
673 if (!error) 619 if (!error)
674 pr_debug("PM: Image successfully loaded\n"); 620 pr_debug("PM: Image successfully loaded\n");
675 else 621 else
@@ -685,11 +631,11 @@ int swsusp_check(void)
685{ 631{
686 int error; 632 int error;
687 633
688 resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ); 634 hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
689 if (!IS_ERR(resume_bdev)) { 635 if (!IS_ERR(hib_resume_bdev)) {
690 set_blocksize(resume_bdev, PAGE_SIZE); 636 set_blocksize(hib_resume_bdev, PAGE_SIZE);
691 memset(swsusp_header, 0, PAGE_SIZE); 637 memset(swsusp_header, 0, PAGE_SIZE);
692 error = bio_read_page(swsusp_resume_block, 638 error = hib_bio_read_page(swsusp_resume_block,
693 swsusp_header, NULL); 639 swsusp_header, NULL);
694 if (error) 640 if (error)
695 goto put; 641 goto put;
@@ -697,7 +643,7 @@ int swsusp_check(void)
697 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) { 643 if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
698 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10); 644 memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
699 /* Reset swap signature now */ 645 /* Reset swap signature now */
700 error = bio_write_page(swsusp_resume_block, 646 error = hib_bio_write_page(swsusp_resume_block,
701 swsusp_header, NULL); 647 swsusp_header, NULL);
702 } else { 648 } else {
703 error = -EINVAL; 649 error = -EINVAL;
@@ -705,11 +651,11 @@ int swsusp_check(void)
705 651
706put: 652put:
707 if (error) 653 if (error)
708 blkdev_put(resume_bdev, FMODE_READ); 654 blkdev_put(hib_resume_bdev, FMODE_READ);
709 else 655 else
710 pr_debug("PM: Signature found, resuming\n"); 656 pr_debug("PM: Signature found, resuming\n");
711 } else { 657 } else {
712 error = PTR_ERR(resume_bdev); 658 error = PTR_ERR(hib_resume_bdev);
713 } 659 }
714 660
715 if (error) 661 if (error)
@@ -724,12 +670,12 @@ put:
724 670
725void swsusp_close(fmode_t mode) 671void swsusp_close(fmode_t mode)
726{ 672{
727 if (IS_ERR(resume_bdev)) { 673 if (IS_ERR(hib_resume_bdev)) {
728 pr_debug("PM: Image device not initialised\n"); 674 pr_debug("PM: Image device not initialised\n");
729 return; 675 return;
730 } 676 }
731 677
732 blkdev_put(resume_bdev, mode); 678 blkdev_put(hib_resume_bdev, mode);
733} 679}
734 680
735static int swsusp_header_init(void) 681static int swsusp_header_init(void)
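
The kernel/power/swap.c hunks above move the swap-map declarations out of this file and split the writer into get_swap_writer() / swap_write_page() / swap_writer_finish(). The layout they rely on is unchanged: each map page holds MAP_PAGE_ENTRIES data sectors plus a link to the next map page. A freestanding userspace sketch of that chaining (the sizes and the block allocator are stand-ins, not kernel code):

#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096UL		/* stand-in for PAGE_SIZE */
typedef uint64_t sector_t;

/* One map page: PAGE_SIZE/sizeof(sector_t) - 1 data sectors plus a link,
 * mirroring struct swap_map_page. */
#define MAP_PAGE_ENTRIES (SKETCH_PAGE_SIZE / sizeof(sector_t) - 1)

struct swap_map_page {
	sector_t entries[MAP_PAGE_ENTRIES];
	sector_t next_swap;
};

struct swap_map_handle {
	struct swap_map_page *cur;	/* map page currently being filled     */
	sector_t cur_swap;		/* sector this map page will land on   */
	sector_t first_sector;		/* head of the chain, kept for the header */
	unsigned int k;			/* next free slot in cur->entries      */
};

/* Stand-in for alloc_swapdev_block(): hand out increasing fake sectors. */
static sector_t next_free_sector = 1;
static sector_t sketch_alloc_block(void) { return next_free_sector++; }

/* Record one data page; when the map page fills up, link a fresh one and
 * start over (the real code writes the full map page out at this point). */
static void sketch_map_add(struct swap_map_handle *h, sector_t data_sector)
{
	h->cur->entries[h->k++] = data_sector;
	if (h->k < MAP_PAGE_ENTRIES)
		return;
	h->cur->next_swap = sketch_alloc_block();
	h->cur_swap = h->cur->next_swap;	/* the next map page goes here */
	memset(h->cur, 0, sizeof(*h->cur));	/* reuse the single in-memory page */
	h->k = 0;
}
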
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 4d2289626a84..e819e17877ca 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -151,6 +151,7 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
151{ 151{
152 struct snapshot_data *data; 152 struct snapshot_data *data;
153 ssize_t res; 153 ssize_t res;
154 loff_t pg_offp = *offp & ~PAGE_MASK;
154 155
155 mutex_lock(&pm_mutex); 156 mutex_lock(&pm_mutex);
156 157
@@ -159,14 +160,19 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
159 res = -ENODATA; 160 res = -ENODATA;
160 goto Unlock; 161 goto Unlock;
161 } 162 }
162 res = snapshot_read_next(&data->handle, count); 163 if (!pg_offp) { /* on page boundary? */
163 if (res > 0) { 164 res = snapshot_read_next(&data->handle);
164 if (copy_to_user(buf, data_of(data->handle), res)) 165 if (res <= 0)
165 res = -EFAULT; 166 goto Unlock;
166 else 167 } else {
167 *offp = data->handle.offset; 168 res = PAGE_SIZE - pg_offp;
168 } 169 }
169 170
171 res = simple_read_from_buffer(buf, count, &pg_offp,
172 data_of(data->handle), res);
173 if (res > 0)
174 *offp += res;
175
170 Unlock: 176 Unlock:
171 mutex_unlock(&pm_mutex); 177 mutex_unlock(&pm_mutex);
172 178
@@ -178,18 +184,25 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
178{ 184{
179 struct snapshot_data *data; 185 struct snapshot_data *data;
180 ssize_t res; 186 ssize_t res;
187 loff_t pg_offp = *offp & ~PAGE_MASK;
181 188
182 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
183 190
184 data = filp->private_data; 191 data = filp->private_data;
185 res = snapshot_write_next(&data->handle, count); 192
186 if (res > 0) { 193 if (!pg_offp) {
187 if (copy_from_user(data_of(data->handle), buf, res)) 194 res = snapshot_write_next(&data->handle);
188 res = -EFAULT; 195 if (res <= 0)
189 else 196 goto unlock;
190 *offp = data->handle.offset; 197 } else {
198 res = PAGE_SIZE - pg_offp;
191 } 199 }
192 200
201 res = simple_write_to_buffer(data_of(data->handle), res, &pg_offp,
202 buf, count);
203 if (res > 0)
204 *offp += res;
205unlock:
193 mutex_unlock(&pm_mutex); 206 mutex_unlock(&pm_mutex);
194 207
195 return res; 208 return res;
@@ -420,7 +433,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
420 * User space encodes device types as two-byte values, 433 * User space encodes device types as two-byte values,
421 * so we need to recode them 434 * so we need to recode them
422 */ 435 */
423 swdev = old_decode_dev(swap_area.dev); 436 swdev = new_decode_dev(swap_area.dev);
424 if (swdev) { 437 if (swdev) {
425 offset = swap_area.offset; 438 offset = swap_area.offset;
426 data->swap = swap_type_of(swdev, offset, NULL); 439 data->swap = swap_type_of(swdev, offset, NULL);
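
The kernel/power/user.c change above stops passing a byte count into snapshot_read_next()/snapshot_write_next(): the handle now advances strictly one page at a time, and partial reads or writes inside a page are served by simple_read_from_buffer()/simple_write_to_buffer() against the current page. A freestanding model of the offset arithmetic (page size and names here are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PAGE_SIZE 4096ULL
#define SKETCH_PAGE_MASK (~(SKETCH_PAGE_SIZE - 1))

/* Serve up to 'count' bytes from the page buffer backing file offset
 * *offp, never crossing the page boundary; the caller only fetches a new
 * page from the snapshot when *offp is page-aligned again. */
static size_t read_within_page(char *dst, size_t count, uint64_t *offp,
			       const char *page_buf)
{
	uint64_t pg_offp = *offp & ~SKETCH_PAGE_MASK;	/* offset inside the page  */
	size_t avail = SKETCH_PAGE_SIZE - pg_offp;	/* bytes left in this page */
	size_t n = count < avail ? count : avail;

	memcpy(dst, page_buf + pg_offp, n);	/* what simple_read_from_buffer() does here */
	*offp += n;				/* a short read is fine; the caller retries */
	return n;
}
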
diff --git a/kernel/printk.c b/kernel/printk.c
index 75077ad0b537..444b770c9595 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
35#include <linux/kexec.h> 35#include <linux/kexec.h>
36#include <linux/kdb.h>
36#include <linux/ratelimit.h> 37#include <linux/ratelimit.h>
37#include <linux/kmsg_dump.h> 38#include <linux/kmsg_dump.h>
38#include <linux/syslog.h> 39#include <linux/syslog.h>
@@ -413,6 +414,22 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len)
413 return do_syslog(type, buf, len, SYSLOG_FROM_CALL); 414 return do_syslog(type, buf, len, SYSLOG_FROM_CALL);
414} 415}
415 416
417#ifdef CONFIG_KGDB_KDB
418/* kdb dmesg command needs access to the syslog buffer. do_syslog()
419 * uses locks so it cannot be used during debugging. Just tell kdb
420 * where the start and end of the physical and logical logs are. This
421 * is equivalent to do_syslog(3).
422 */
423void kdb_syslog_data(char *syslog_data[4])
424{
425 syslog_data[0] = log_buf;
426 syslog_data[1] = log_buf + log_buf_len;
427 syslog_data[2] = log_buf + log_end -
428 (logged_chars < log_buf_len ? logged_chars : log_buf_len);
429 syslog_data[3] = log_buf + log_end;
430}
431#endif /* CONFIG_KGDB_KDB */
432
416/* 433/*
417 * Call the console drivers on a range of log_buf 434 * Call the console drivers on a range of log_buf
418 */ 435 */
@@ -586,6 +603,14 @@ asmlinkage int printk(const char *fmt, ...)
586 va_list args; 603 va_list args;
587 int r; 604 int r;
588 605
606#ifdef CONFIG_KGDB_KDB
607 if (unlikely(kdb_trap_printk)) {
608 va_start(args, fmt);
609 r = vkdb_printf(fmt, args);
610 va_end(args);
611 return r;
612 }
613#endif
589 va_start(args, fmt); 614 va_start(args, fmt);
590 r = vprintk(fmt, args); 615 r = vprintk(fmt, args);
591 va_end(args); 616 va_end(args);
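
kdb_syslog_data() above hands kdb the raw log_buf window instead of going through do_syslog(), which takes locks the debugger cannot use. The arithmetic it exports is simply "the last min(logged_chars, log_buf_len) characters ending at log_end"; a freestanding model of that window, using indices rather than the kernel's pointers:

#include <stddef.h>

struct log_window {
	size_t start;	/* logical index of the oldest byte still present */
	size_t end;	/* logical index one past the newest byte         */
};

/* log_end counts every character ever logged, so the window is the last
 * 'filled' bytes; a physical position is (index & (log_buf_len - 1)) when
 * log_buf_len is a power of two, as the kernel keeps it. */
static struct log_window syslog_window(size_t log_end, size_t logged_chars,
				       size_t log_buf_len)
{
	size_t filled = logged_chars < log_buf_len ? logged_chars : log_buf_len;
	struct log_window w = { .start = log_end - filled, .end = log_end };

	return w;
}
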
diff --git a/kernel/profile.c b/kernel/profile.c
index a55d3a367ae8..b22a899934cc 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -127,8 +127,10 @@ int __ref profile_init(void)
127 return 0; 127 return 0;
128 128
129 prof_buffer = vmalloc(buffer_bytes); 129 prof_buffer = vmalloc(buffer_bytes);
130 if (prof_buffer) 130 if (prof_buffer) {
131 memset(prof_buffer, 0, buffer_bytes);
131 return 0; 132 return 0;
133 }
132 134
133 free_cpumask_var(prof_cpu_mask); 135 free_cpumask_var(prof_cpu_mask);
134 return -ENOMEM; 136 return -ENOMEM;
@@ -363,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info,
363 switch (action) { 365 switch (action) {
364 case CPU_UP_PREPARE: 366 case CPU_UP_PREPARE:
365 case CPU_UP_PREPARE_FROZEN: 367 case CPU_UP_PREPARE_FROZEN:
366 node = cpu_to_node(cpu); 368 node = cpu_to_mem(cpu);
367 per_cpu(cpu_profile_flip, cpu) = 0; 369 per_cpu(cpu_profile_flip, cpu) = 0;
368 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 370 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
369 page = alloc_pages_exact_node(node, 371 page = alloc_pages_exact_node(node,
370 GFP_KERNEL | __GFP_ZERO, 372 GFP_KERNEL | __GFP_ZERO,
371 0); 373 0);
372 if (!page) 374 if (!page)
373 return NOTIFY_BAD; 375 return notifier_from_errno(-ENOMEM);
374 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 376 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
375 } 377 }
376 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 378 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
@@ -386,7 +388,7 @@ out_free:
386 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 388 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
387 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 389 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
388 __free_page(page); 390 __free_page(page);
389 return NOTIFY_BAD; 391 return notifier_from_errno(-ENOMEM);
390 case CPU_ONLINE: 392 case CPU_ONLINE:
391 case CPU_ONLINE_FROZEN: 393 case CPU_ONLINE_FROZEN:
392 if (prof_cpu_mask != NULL) 394 if (prof_cpu_mask != NULL)
@@ -565,7 +567,7 @@ static int create_hash_tables(void)
565 int cpu; 567 int cpu;
566 568
567 for_each_online_cpu(cpu) { 569 for_each_online_cpu(cpu) {
568 int node = cpu_to_node(cpu); 570 int node = cpu_to_mem(cpu);
569 struct page *page; 571 struct page *page;
570 572
571 page = alloc_pages_exact_node(node, 573 page = alloc_pages_exact_node(node,
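
The kernel/profile.c hunks switch the hotplug callback's failure returns from NOTIFY_BAD to notifier_from_errno(-ENOMEM), so callers that use notifier_to_errno() see the real error, and they allocate from cpu_to_mem() (the nearest node that actually has memory). A minimal sketch of the same return-value pattern in a hypothetical notifier (all names below are made up):

#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/slab.h>

/* Hypothetical per-CPU buffer managed from a hotplug notifier; the point
 * is the errno-carrying return value on allocation failure. */
static void *example_buf[NR_CPUS];

static int __cpuinit example_cpu_callback(struct notifier_block *nb,
					  unsigned long action, void *hcpu)
{
	int cpu = (long)hcpu;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		example_buf[cpu] = kzalloc(PAGE_SIZE, GFP_KERNEL);
		if (!example_buf[cpu])
			return notifier_from_errno(-ENOMEM);	/* not NOTIFY_BAD */
		break;
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		kfree(example_buf[cpu]);
		example_buf[cpu] = NULL;
		break;
	}
	return NOTIFY_OK;
}
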
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 42ad8ae729a0..74a3d693c196 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -14,7 +14,6 @@
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/highmem.h> 15#include <linux/highmem.h>
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/smp_lock.h>
18#include <linux/ptrace.h> 17#include <linux/ptrace.h>
19#include <linux/security.h> 18#include <linux/security.h>
20#include <linux/signal.h> 19#include <linux/signal.h>
@@ -76,7 +75,6 @@ void __ptrace_unlink(struct task_struct *child)
76 child->parent = child->real_parent; 75 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_entry); 76 list_del_init(&child->ptrace_entry);
78 77
79 arch_ptrace_untrace(child);
80 if (task_is_traced(child)) 78 if (task_is_traced(child))
81 ptrace_untrace(child); 79 ptrace_untrace(child);
82} 80}
@@ -596,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request,
596 ret = ptrace_detach(child, data); 594 ret = ptrace_detach(child, data);
597 break; 595 break;
598 596
597#ifdef CONFIG_BINFMT_ELF_FDPIC
598 case PTRACE_GETFDPIC: {
599 struct mm_struct *mm = get_task_mm(child);
600 unsigned long tmp = 0;
601
602 ret = -ESRCH;
603 if (!mm)
604 break;
605
606 switch (addr) {
607 case PTRACE_GETFDPIC_EXEC:
608 tmp = mm->context.exec_fdpic_loadmap;
609 break;
610 case PTRACE_GETFDPIC_INTERP:
611 tmp = mm->context.interp_fdpic_loadmap;
612 break;
613 default:
614 break;
615 }
616 mmput(mm);
617
618 ret = put_user(tmp, (unsigned long __user *) data);
619 break;
620 }
621#endif
622
599#ifdef PTRACE_SINGLESTEP 623#ifdef PTRACE_SINGLESTEP
600 case PTRACE_SINGLESTEP: 624 case PTRACE_SINGLESTEP:
601#endif 625#endif
@@ -666,10 +690,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
666 struct task_struct *child; 690 struct task_struct *child;
667 long ret; 691 long ret;
668 692
669 /*
670 * This lock_kernel fixes a subtle race with suid exec
671 */
672 lock_kernel();
673 if (request == PTRACE_TRACEME) { 693 if (request == PTRACE_TRACEME) {
674 ret = ptrace_traceme(); 694 ret = ptrace_traceme();
675 if (!ret) 695 if (!ret)
@@ -703,7 +723,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
703 out_put_task_struct: 723 out_put_task_struct:
704 put_task_struct(child); 724 put_task_struct(child);
705 out: 725 out:
706 unlock_kernel();
707 return ret; 726 return ret;
708} 727}
709 728
@@ -813,10 +832,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
813 struct task_struct *child; 832 struct task_struct *child;
814 long ret; 833 long ret;
815 834
816 /*
817 * This lock_kernel fixes a subtle race with suid exec
818 */
819 lock_kernel();
820 if (request == PTRACE_TRACEME) { 835 if (request == PTRACE_TRACEME) {
821 ret = ptrace_traceme(); 836 ret = ptrace_traceme();
822 goto out; 837 goto out;
@@ -846,7 +861,6 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
846 out_put_task_struct: 861 out_put_task_struct:
847 put_task_struct(child); 862 put_task_struct(child);
848 out: 863 out:
849 unlock_kernel();
850 return ret; 864 return ret;
851} 865}
852#endif /* CONFIG_COMPAT */ 866#endif /* CONFIG_COMPAT */
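
ptrace.c drops the BKL around the syscall and gains a generic PTRACE_GETFDPIC request for FDPIC targets: addr selects which loadmap (executable or interpreter) and the address is written through data with put_user(). A hedged userspace sketch of how a debugger would call it; the numeric request values are the ones the FDPIC arch headers use, so on a real system take them from <asm/ptrace.h>:

#include <errno.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>

#ifndef PTRACE_GETFDPIC		/* as defined by the FDPIC arch headers */
#define PTRACE_GETFDPIC		31
#define PTRACE_GETFDPIC_EXEC	0
#define PTRACE_GETFDPIC_INTERP	1
#endif

/* Fetch the executable's and interpreter's loadmap addresses from an
 * already-attached, stopped tracee. */
static int dump_fdpic_loadmaps(pid_t pid)
{
	unsigned long exec_map, interp_map;

	if (ptrace(PTRACE_GETFDPIC, pid,
		   (void *)PTRACE_GETFDPIC_EXEC, &exec_map) < 0)
		return -errno;
	if (ptrace(PTRACE_GETFDPIC, pid,
		   (void *)PTRACE_GETFDPIC_INTERP, &interp_map) < 0)
		return -errno;

	printf("exec loadmap %#lx, interp loadmap %#lx\n", exec_map, interp_map);
	return 0;
}
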
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f1125c1a6321..72a8dc9567f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,7 +44,7 @@
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/kernel_stat.h> 47#include <linux/hardirq.h>
48 48
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 49#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 50static struct lock_class_key rcu_lock_key;
@@ -63,23 +63,34 @@ struct lockdep_map rcu_sched_lock_map =
63EXPORT_SYMBOL_GPL(rcu_sched_lock_map); 63EXPORT_SYMBOL_GPL(rcu_sched_lock_map);
64#endif 64#endif
65 65
66int rcu_scheduler_active __read_mostly; 66#ifdef CONFIG_DEBUG_LOCK_ALLOC
67EXPORT_SYMBOL_GPL(rcu_scheduler_active);
68 67
69/* 68int debug_lockdep_rcu_enabled(void)
70 * This function is invoked towards the end of the scheduler's initialization 69{
71 * process. Before this is called, the idle task might contain 70 return rcu_scheduler_active && debug_locks &&
72 * RCU read-side critical sections (during which time, this idle 71 current->lockdep_recursion == 0;
73 * task is booting the system). After this function is called, the 72}
74 * idle tasks are prohibited from containing RCU read-side critical 73EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled);
75 * sections. 74
75/**
76 * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section?
77 *
78 * Check for bottom half being disabled, which covers both the
79 * CONFIG_PROVE_RCU and not cases. Note that if someone uses
80 * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled)
81 * will show the situation.
82 *
83 * Check debug_lockdep_rcu_enabled() to prevent false positives during boot.
76 */ 84 */
77void rcu_scheduler_starting(void) 85int rcu_read_lock_bh_held(void)
78{ 86{
79 WARN_ON(num_online_cpus() != 1); 87 if (!debug_lockdep_rcu_enabled())
80 WARN_ON(nr_context_switches() > 0); 88 return 1;
81 rcu_scheduler_active = 1; 89 return in_softirq();
82} 90}
91EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
92
93#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
83 94
84/* 95/*
85 * Awaken the corresponding synchronize_rcu() instance now that a 96 * Awaken the corresponding synchronize_rcu() instance now that a
@@ -92,3 +103,14 @@ void wakeme_after_rcu(struct rcu_head *head)
92 rcu = container_of(head, struct rcu_synchronize, head); 103 rcu = container_of(head, struct rcu_synchronize, head);
93 complete(&rcu->completion); 104 complete(&rcu->completion);
94} 105}
106
107#ifdef CONFIG_PROVE_RCU
108/*
109 * wrapper function to avoid #include problems.
110 */
111int rcu_my_thread_group_empty(void)
112{
113 return thread_group_empty(current);
114}
115EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty);
116#endif /* #ifdef CONFIG_PROVE_RCU */
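
rcupdate.c now provides debug_lockdep_rcu_enabled() and rcu_read_lock_bh_held() under CONFIG_DEBUG_LOCK_ALLOC, so PROVE_RCU checks can accept "bottom halves disabled" as a legitimate read-side state without false positives during early boot. A hedged sketch of the kind of caller this enables (the list and lock names are invented):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct dev_entry {
	int id;
	struct dev_entry *next;
};

static struct dev_entry *dev_map;		/* RCU-protected, BH readers */
static DEFINE_SPINLOCK(dev_map_lock);		/* taken by updaters        */

/* Readers run under rcu_read_lock_bh(); updaters hold dev_map_lock.
 * Either condition satisfies the lockdep expression, so PROVE_RCU stays
 * quiet for both paths. */
static struct dev_entry *first_dev_entry(void)
{
	return rcu_dereference_check(dev_map,
				     rcu_read_lock_bh_held() ||
				     lockdep_is_held(&dev_map_lock));
}
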
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 9f6d9ff2572c..38729d3cd236 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -44,9 +44,9 @@ struct rcu_ctrlblk {
44}; 44};
45 45
46/* Definition for rcupdate control block. */ 46/* Definition for rcupdate control block. */
47static struct rcu_ctrlblk rcu_ctrlblk = { 47static struct rcu_ctrlblk rcu_sched_ctrlblk = {
48 .donetail = &rcu_ctrlblk.rcucblist, 48 .donetail = &rcu_sched_ctrlblk.rcucblist,
49 .curtail = &rcu_ctrlblk.rcucblist, 49 .curtail = &rcu_sched_ctrlblk.rcucblist,
50}; 50};
51 51
52static struct rcu_ctrlblk rcu_bh_ctrlblk = { 52static struct rcu_ctrlblk rcu_bh_ctrlblk = {
@@ -54,6 +54,11 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
54 .curtail = &rcu_bh_ctrlblk.rcucblist, 54 .curtail = &rcu_bh_ctrlblk.rcucblist,
55}; 55};
56 56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58int rcu_scheduler_active __read_mostly;
59EXPORT_SYMBOL_GPL(rcu_scheduler_active);
60#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
61
57#ifdef CONFIG_NO_HZ 62#ifdef CONFIG_NO_HZ
58 63
59static long rcu_dynticks_nesting = 1; 64static long rcu_dynticks_nesting = 1;
@@ -108,7 +113,8 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
108 */ 113 */
109void rcu_sched_qs(int cpu) 114void rcu_sched_qs(int cpu)
110{ 115{
111 if (rcu_qsctr_help(&rcu_ctrlblk) + rcu_qsctr_help(&rcu_bh_ctrlblk)) 116 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
117 rcu_qsctr_help(&rcu_bh_ctrlblk))
112 raise_softirq(RCU_SOFTIRQ); 118 raise_softirq(RCU_SOFTIRQ);
113} 119}
114 120
@@ -173,7 +179,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
173 */ 179 */
174static void rcu_process_callbacks(struct softirq_action *unused) 180static void rcu_process_callbacks(struct softirq_action *unused)
175{ 181{
176 __rcu_process_callbacks(&rcu_ctrlblk); 182 __rcu_process_callbacks(&rcu_sched_ctrlblk);
177 __rcu_process_callbacks(&rcu_bh_ctrlblk); 183 __rcu_process_callbacks(&rcu_bh_ctrlblk);
178} 184}
179 185
@@ -187,7 +193,8 @@ static void rcu_process_callbacks(struct softirq_action *unused)
187 * 193 *
188 * Cool, huh? (Due to Josh Triplett.) 194 * Cool, huh? (Due to Josh Triplett.)
189 * 195 *
190 * But we want to make this a static inline later. 196 * But we want to make this a static inline later. The cond_resched()
197 * currently makes this problematic.
191 */ 198 */
192void synchronize_sched(void) 199void synchronize_sched(void)
193{ 200{
@@ -195,12 +202,6 @@ void synchronize_sched(void)
195} 202}
196EXPORT_SYMBOL_GPL(synchronize_sched); 203EXPORT_SYMBOL_GPL(synchronize_sched);
197 204
198void synchronize_rcu_bh(void)
199{
200 synchronize_sched();
201}
202EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
203
204/* 205/*
205 * Helper function for call_rcu() and call_rcu_bh(). 206 * Helper function for call_rcu() and call_rcu_bh().
206 */ 207 */
@@ -226,7 +227,7 @@ static void __call_rcu(struct rcu_head *head,
226 */ 227 */
227void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) 228void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
228{ 229{
229 __call_rcu(head, func, &rcu_ctrlblk); 230 __call_rcu(head, func, &rcu_sched_ctrlblk);
230} 231}
231EXPORT_SYMBOL_GPL(call_rcu); 232EXPORT_SYMBOL_GPL(call_rcu);
232 233
@@ -244,11 +245,13 @@ void rcu_barrier(void)
244{ 245{
245 struct rcu_synchronize rcu; 246 struct rcu_synchronize rcu;
246 247
248 init_rcu_head_on_stack(&rcu.head);
247 init_completion(&rcu.completion); 249 init_completion(&rcu.completion);
248 /* Will wake me after RCU finished. */ 250 /* Will wake me after RCU finished. */
249 call_rcu(&rcu.head, wakeme_after_rcu); 251 call_rcu(&rcu.head, wakeme_after_rcu);
250 /* Wait for it. */ 252 /* Wait for it. */
251 wait_for_completion(&rcu.completion); 253 wait_for_completion(&rcu.completion);
254 destroy_rcu_head_on_stack(&rcu.head);
252} 255}
253EXPORT_SYMBOL_GPL(rcu_barrier); 256EXPORT_SYMBOL_GPL(rcu_barrier);
254 257
@@ -256,11 +259,13 @@ void rcu_barrier_bh(void)
256{ 259{
257 struct rcu_synchronize rcu; 260 struct rcu_synchronize rcu;
258 261
262 init_rcu_head_on_stack(&rcu.head);
259 init_completion(&rcu.completion); 263 init_completion(&rcu.completion);
260 /* Will wake me after RCU finished. */ 264 /* Will wake me after RCU finished. */
261 call_rcu_bh(&rcu.head, wakeme_after_rcu); 265 call_rcu_bh(&rcu.head, wakeme_after_rcu);
262 /* Wait for it. */ 266 /* Wait for it. */
263 wait_for_completion(&rcu.completion); 267 wait_for_completion(&rcu.completion);
268 destroy_rcu_head_on_stack(&rcu.head);
264} 269}
265EXPORT_SYMBOL_GPL(rcu_barrier_bh); 270EXPORT_SYMBOL_GPL(rcu_barrier_bh);
266 271
@@ -268,11 +273,13 @@ void rcu_barrier_sched(void)
268{ 273{
269 struct rcu_synchronize rcu; 274 struct rcu_synchronize rcu;
270 275
276 init_rcu_head_on_stack(&rcu.head);
271 init_completion(&rcu.completion); 277 init_completion(&rcu.completion);
272 /* Will wake me after RCU finished. */ 278 /* Will wake me after RCU finished. */
273 call_rcu_sched(&rcu.head, wakeme_after_rcu); 279 call_rcu_sched(&rcu.head, wakeme_after_rcu);
274 /* Wait for it. */ 280 /* Wait for it. */
275 wait_for_completion(&rcu.completion); 281 wait_for_completion(&rcu.completion);
282 destroy_rcu_head_on_stack(&rcu.head);
276} 283}
277EXPORT_SYMBOL_GPL(rcu_barrier_sched); 284EXPORT_SYMBOL_GPL(rcu_barrier_sched);
278 285
@@ -280,3 +287,5 @@ void __init rcu_init(void)
280{ 287{
281 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 288 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
282} 289}
290
291#include "rcutiny_plugin.h"
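
The rcutiny.c hunks (and, below, the matching ones in rcutorture.c, rcutree.c and rcutree_plugin.h) bracket every on-stack rcu_head with init_rcu_head_on_stack()/destroy_rcu_head_on_stack() so CONFIG_DEBUG_OBJECTS_RCU_HEAD does not flag stack objects as uninitialized. The pattern in isolation, as a hedged sketch:

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct stack_waiter {
	struct rcu_head head;
	struct completion done;
};

static void stack_waiter_cb(struct rcu_head *head)
{
	struct stack_waiter *w = container_of(head, struct stack_waiter, head);

	complete(&w->done);
}

/* Wait for one grace period using an rcu_head that lives on the stack:
 * announce it to the object-debug code before use and retire it before
 * the stack frame goes away. */
static void wait_one_grace_period(void)
{
	struct stack_waiter w;

	init_rcu_head_on_stack(&w.head);
	init_completion(&w.done);
	call_rcu(&w.head, stack_waiter_cb);
	wait_for_completion(&w.done);
	destroy_rcu_head_on_stack(&w.head);
}
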
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
new file mode 100644
index 000000000000..d223a92bc742
--- /dev/null
+++ b/kernel/rcutiny_plugin.h
@@ -0,0 +1,39 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
3 * Internal non-public definitions that provide either classic
4 * or preemptable semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright IBM Corporation, 2009
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#ifdef CONFIG_DEBUG_LOCK_ALLOC
26
27#include <linux/kernel_stat.h>
28
29/*
30 * During boot, we forgive RCU lockdep issues. After this function is
31 * invoked, we start taking RCU lockdep issues seriously.
32 */
33void rcu_scheduler_starting(void)
34{
35 WARN_ON(nr_context_switches() > 0);
36 rcu_scheduler_active = 1;
37}
38
39#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 58df55bf83ed..6535ac8bc6a5 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -464,9 +464,11 @@ static void rcu_bh_torture_synchronize(void)
464{ 464{
465 struct rcu_bh_torture_synchronize rcu; 465 struct rcu_bh_torture_synchronize rcu;
466 466
467 init_rcu_head_on_stack(&rcu.head);
467 init_completion(&rcu.completion); 468 init_completion(&rcu.completion);
468 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb); 469 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
469 wait_for_completion(&rcu.completion); 470 wait_for_completion(&rcu.completion);
471 destroy_rcu_head_on_stack(&rcu.head);
470} 472}
471 473
472static struct rcu_torture_ops rcu_bh_ops = { 474static struct rcu_torture_ops rcu_bh_ops = {
@@ -669,7 +671,7 @@ static struct rcu_torture_ops sched_expedited_ops = {
669 .sync = synchronize_sched_expedited, 671 .sync = synchronize_sched_expedited,
670 .cb_barrier = NULL, 672 .cb_barrier = NULL,
671 .fqs = rcu_sched_force_quiescent_state, 673 .fqs = rcu_sched_force_quiescent_state,
672 .stats = rcu_expedited_torture_stats, 674 .stats = NULL,
673 .irq_capable = 1, 675 .irq_capable = 1,
674 .name = "sched_expedited" 676 .name = "sched_expedited"
675}; 677};
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3ec8160fc75f..d4437345706f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -46,6 +46,7 @@
46#include <linux/cpu.h> 46#include <linux/cpu.h>
47#include <linux/mutex.h> 47#include <linux/mutex.h>
48#include <linux/time.h> 48#include <linux/time.h>
49#include <linux/kernel_stat.h>
49 50
50#include "rcutree.h" 51#include "rcutree.h"
51 52
@@ -53,8 +54,8 @@
53 54
54static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 55static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
55 56
56#define RCU_STATE_INITIALIZER(name) { \ 57#define RCU_STATE_INITIALIZER(structname) { \
57 .level = { &name.node[0] }, \ 58 .level = { &structname.node[0] }, \
58 .levelcnt = { \ 59 .levelcnt = { \
59 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 60 NUM_RCU_LVL_0, /* root of hierarchy. */ \
60 NUM_RCU_LVL_1, \ 61 NUM_RCU_LVL_1, \
@@ -65,13 +66,14 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
65 .signaled = RCU_GP_IDLE, \ 66 .signaled = RCU_GP_IDLE, \
66 .gpnum = -300, \ 67 .gpnum = -300, \
67 .completed = -300, \ 68 .completed = -300, \
68 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ 69 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \
69 .orphan_cbs_list = NULL, \ 70 .orphan_cbs_list = NULL, \
70 .orphan_cbs_tail = &name.orphan_cbs_list, \ 71 .orphan_cbs_tail = &structname.orphan_cbs_list, \
71 .orphan_qlen = 0, \ 72 .orphan_qlen = 0, \
72 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ 73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \
73 .n_force_qs = 0, \ 74 .n_force_qs = 0, \
74 .n_force_qs_ngp = 0, \ 75 .n_force_qs_ngp = 0, \
76 .name = #structname, \
75} 77}
76 78
77struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state);
@@ -80,6 +82,9 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
80struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
81DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
82 84
85int rcu_scheduler_active __read_mostly;
86EXPORT_SYMBOL_GPL(rcu_scheduler_active);
87
83/* 88/*
84 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s 89 * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
85 * permit this function to be invoked without holding the root rcu_node 90 * permit this function to be invoked without holding the root rcu_node
@@ -97,25 +102,32 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
97 */ 102 */
98void rcu_sched_qs(int cpu) 103void rcu_sched_qs(int cpu)
99{ 104{
100 struct rcu_data *rdp; 105 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
101 106
102 rdp = &per_cpu(rcu_sched_data, cpu);
103 rdp->passed_quiesc_completed = rdp->gpnum - 1; 107 rdp->passed_quiesc_completed = rdp->gpnum - 1;
104 barrier(); 108 barrier();
105 rdp->passed_quiesc = 1; 109 rdp->passed_quiesc = 1;
106 rcu_preempt_note_context_switch(cpu);
107} 110}
108 111
109void rcu_bh_qs(int cpu) 112void rcu_bh_qs(int cpu)
110{ 113{
111 struct rcu_data *rdp; 114 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
112 115
113 rdp = &per_cpu(rcu_bh_data, cpu);
114 rdp->passed_quiesc_completed = rdp->gpnum - 1; 116 rdp->passed_quiesc_completed = rdp->gpnum - 1;
115 barrier(); 117 barrier();
116 rdp->passed_quiesc = 1; 118 rdp->passed_quiesc = 1;
117} 119}
118 120
121/*
122 * Note a context switch. This is a quiescent state for RCU-sched,
123 * and requires special handling for preemptible RCU.
124 */
125void rcu_note_context_switch(int cpu)
126{
127 rcu_sched_qs(cpu);
128 rcu_preempt_note_context_switch(cpu);
129}
130
119#ifdef CONFIG_NO_HZ 131#ifdef CONFIG_NO_HZ
120DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 132DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
121 .dynticks_nesting = 1, 133 .dynticks_nesting = 1,
@@ -438,6 +450,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
438 450
439#ifdef CONFIG_RCU_CPU_STALL_DETECTOR 451#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
440 452
453int rcu_cpu_stall_panicking __read_mostly;
454
441static void record_gp_stall_check_time(struct rcu_state *rsp) 455static void record_gp_stall_check_time(struct rcu_state *rsp)
442{ 456{
443 rsp->gp_start = jiffies; 457 rsp->gp_start = jiffies;
@@ -470,7 +484,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
470 484
471 /* OK, time to rat on our buddy... */ 485 /* OK, time to rat on our buddy... */
472 486
473 printk(KERN_ERR "INFO: RCU detected CPU stalls:"); 487 printk(KERN_ERR "INFO: %s detected stalls on CPUs/tasks: {",
488 rsp->name);
474 rcu_for_each_leaf_node(rsp, rnp) { 489 rcu_for_each_leaf_node(rsp, rnp) {
475 raw_spin_lock_irqsave(&rnp->lock, flags); 490 raw_spin_lock_irqsave(&rnp->lock, flags);
476 rcu_print_task_stall(rnp); 491 rcu_print_task_stall(rnp);
@@ -481,7 +496,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
481 if (rnp->qsmask & (1UL << cpu)) 496 if (rnp->qsmask & (1UL << cpu))
482 printk(" %d", rnp->grplo + cpu); 497 printk(" %d", rnp->grplo + cpu);
483 } 498 }
484 printk(" (detected by %d, t=%ld jiffies)\n", 499 printk("} (detected by %d, t=%ld jiffies)\n",
485 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 500 smp_processor_id(), (long)(jiffies - rsp->gp_start));
486 trigger_all_cpu_backtrace(); 501 trigger_all_cpu_backtrace();
487 502
@@ -497,8 +512,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
497 unsigned long flags; 512 unsigned long flags;
498 struct rcu_node *rnp = rcu_get_root(rsp); 513 struct rcu_node *rnp = rcu_get_root(rsp);
499 514
500 printk(KERN_ERR "INFO: RCU detected CPU %d stall (t=%lu jiffies)\n", 515 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
501 smp_processor_id(), jiffies - rsp->gp_start); 516 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
502 trigger_all_cpu_backtrace(); 517 trigger_all_cpu_backtrace();
503 518
504 raw_spin_lock_irqsave(&rnp->lock, flags); 519 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -515,6 +530,8 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
515 long delta; 530 long delta;
516 struct rcu_node *rnp; 531 struct rcu_node *rnp;
517 532
533 if (rcu_cpu_stall_panicking)
534 return;
518 delta = jiffies - rsp->jiffies_stall; 535 delta = jiffies - rsp->jiffies_stall;
519 rnp = rdp->mynode; 536 rnp = rdp->mynode;
520 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) { 537 if ((rnp->qsmask & rdp->grpmask) && delta >= 0) {
@@ -529,6 +546,21 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
529 } 546 }
530} 547}
531 548
549static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
550{
551 rcu_cpu_stall_panicking = 1;
552 return NOTIFY_DONE;
553}
554
555static struct notifier_block rcu_panic_block = {
556 .notifier_call = rcu_panic,
557};
558
559static void __init check_cpu_stall_init(void)
560{
561 atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
562}
563
532#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 564#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
533 565
534static void record_gp_stall_check_time(struct rcu_state *rsp) 566static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -539,6 +571,10 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
539{ 571{
540} 572}
541 573
574static void __init check_cpu_stall_init(void)
575{
576}
577
542#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 578#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
543 579
544/* 580/*
@@ -1125,8 +1161,6 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1125 */ 1161 */
1126void rcu_check_callbacks(int cpu, int user) 1162void rcu_check_callbacks(int cpu, int user)
1127{ 1163{
1128 if (!rcu_pending(cpu))
1129 return; /* if nothing for RCU to do. */
1130 if (user || 1164 if (user ||
1131 (idle_cpu(cpu) && rcu_scheduler_active && 1165 (idle_cpu(cpu) && rcu_scheduler_active &&
1132 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1166 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1158,7 +1192,8 @@ void rcu_check_callbacks(int cpu, int user)
1158 rcu_bh_qs(cpu); 1192 rcu_bh_qs(cpu);
1159 } 1193 }
1160 rcu_preempt_check_callbacks(cpu); 1194 rcu_preempt_check_callbacks(cpu);
1161 raise_softirq(RCU_SOFTIRQ); 1195 if (rcu_pending(cpu))
1196 raise_softirq(RCU_SOFTIRQ);
1162} 1197}
1163 1198
1164#ifdef CONFIG_SMP 1199#ifdef CONFIG_SMP
@@ -1236,11 +1271,11 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1236 break; /* grace period idle or initializing, ignore. */ 1271 break; /* grace period idle or initializing, ignore. */
1237 1272
1238 case RCU_SAVE_DYNTICK: 1273 case RCU_SAVE_DYNTICK:
1239
1240 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1241 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) 1274 if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
1242 break; /* So gcc recognizes the dead code. */ 1275 break; /* So gcc recognizes the dead code. */
1243 1276
1277 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1278
1244 /* Record dyntick-idle state. */ 1279 /* Record dyntick-idle state. */
1245 force_qs_rnp(rsp, dyntick_save_progress_counter); 1280 force_qs_rnp(rsp, dyntick_save_progress_counter);
1246 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1281 raw_spin_lock(&rnp->lock); /* irqs already disabled */
@@ -1449,11 +1484,13 @@ void synchronize_sched(void)
1449 if (rcu_blocking_is_gp()) 1484 if (rcu_blocking_is_gp())
1450 return; 1485 return;
1451 1486
1487 init_rcu_head_on_stack(&rcu.head);
1452 init_completion(&rcu.completion); 1488 init_completion(&rcu.completion);
1453 /* Will wake me after RCU finished. */ 1489 /* Will wake me after RCU finished. */
1454 call_rcu_sched(&rcu.head, wakeme_after_rcu); 1490 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1455 /* Wait for it. */ 1491 /* Wait for it. */
1456 wait_for_completion(&rcu.completion); 1492 wait_for_completion(&rcu.completion);
1493 destroy_rcu_head_on_stack(&rcu.head);
1457} 1494}
1458EXPORT_SYMBOL_GPL(synchronize_sched); 1495EXPORT_SYMBOL_GPL(synchronize_sched);
1459 1496
@@ -1473,11 +1510,13 @@ void synchronize_rcu_bh(void)
1473 if (rcu_blocking_is_gp()) 1510 if (rcu_blocking_is_gp())
1474 return; 1511 return;
1475 1512
1513 init_rcu_head_on_stack(&rcu.head);
1476 init_completion(&rcu.completion); 1514 init_completion(&rcu.completion);
1477 /* Will wake me after RCU finished. */ 1515 /* Will wake me after RCU finished. */
1478 call_rcu_bh(&rcu.head, wakeme_after_rcu); 1516 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1479 /* Wait for it. */ 1517 /* Wait for it. */
1480 wait_for_completion(&rcu.completion); 1518 wait_for_completion(&rcu.completion);
1519 destroy_rcu_head_on_stack(&rcu.head);
1481} 1520}
1482EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1521EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1483 1522
@@ -1498,8 +1537,20 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1498 check_cpu_stall(rsp, rdp); 1537 check_cpu_stall(rsp, rdp);
1499 1538
1500 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1539 /* Is the RCU core waiting for a quiescent state from this CPU? */
1501 if (rdp->qs_pending) { 1540 if (rdp->qs_pending && !rdp->passed_quiesc) {
1541
1542 /*
1543 * If force_quiescent_state() coming soon and this CPU
1544 * needs a quiescent state, and this is either RCU-sched
1545 * or RCU-bh, force a local reschedule.
1546 */
1502 rdp->n_rp_qs_pending++; 1547 rdp->n_rp_qs_pending++;
1548 if (!rdp->preemptable &&
1549 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1550 jiffies))
1551 set_need_resched();
1552 } else if (rdp->qs_pending && rdp->passed_quiesc) {
1553 rdp->n_rp_report_qs++;
1503 return 1; 1554 return 1;
1504 } 1555 }
1505 1556
@@ -1767,6 +1818,21 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1767} 1818}
1768 1819
1769/* 1820/*
1821 * This function is invoked towards the end of the scheduler's initialization
1822 * process. Before this is called, the idle task might contain
1823 * RCU read-side critical sections (during which time, this idle
1824 * task is booting the system). After this function is called, the
1825 * idle tasks are prohibited from containing RCU read-side critical
1826 * sections. This function also enables RCU lockdep checking.
1827 */
1828void rcu_scheduler_starting(void)
1829{
1830 WARN_ON(num_online_cpus() != 1);
1831 WARN_ON(nr_context_switches() > 0);
1832 rcu_scheduler_active = 1;
1833}
1834
1835/*
1770 * Compute the per-level fanout, either using the exact fanout specified 1836 * Compute the per-level fanout, either using the exact fanout specified
1771 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. 1837 * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
1772 */ 1838 */
@@ -1849,6 +1915,14 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1849 INIT_LIST_HEAD(&rnp->blocked_tasks[3]); 1915 INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
1850 } 1916 }
1851 } 1917 }
1918
1919 rnp = rsp->level[NUM_RCU_LVLS - 1];
1920 for_each_possible_cpu(i) {
1921 while (i > rnp->grphi)
1922 rnp++;
1923 rsp->rda[i]->mynode = rnp;
1924 rcu_boot_init_percpu_data(i, rsp);
1925 }
1852} 1926}
1853 1927
1854/* 1928/*
@@ -1859,19 +1933,11 @@ static void __init rcu_init_one(struct rcu_state *rsp)
1859#define RCU_INIT_FLAVOR(rsp, rcu_data) \ 1933#define RCU_INIT_FLAVOR(rsp, rcu_data) \
1860do { \ 1934do { \
1861 int i; \ 1935 int i; \
1862 int j; \
1863 struct rcu_node *rnp; \
1864 \ 1936 \
1865 rcu_init_one(rsp); \
1866 rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
1867 j = 0; \
1868 for_each_possible_cpu(i) { \ 1937 for_each_possible_cpu(i) { \
1869 if (i > rnp[j].grphi) \
1870 j++; \
1871 per_cpu(rcu_data, i).mynode = &rnp[j]; \
1872 (rsp)->rda[i] = &per_cpu(rcu_data, i); \ 1938 (rsp)->rda[i] = &per_cpu(rcu_data, i); \
1873 rcu_boot_init_percpu_data(i, rsp); \
1874 } \ 1939 } \
1940 rcu_init_one(rsp); \
1875} while (0) 1941} while (0)
1876 1942
1877void __init rcu_init(void) 1943void __init rcu_init(void)
@@ -1879,12 +1945,6 @@ void __init rcu_init(void)
1879 int cpu; 1945 int cpu;
1880 1946
1881 rcu_bootup_announce(); 1947 rcu_bootup_announce();
1882#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
1883 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
1884#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
1885#if NUM_RCU_LVL_4 != 0
1886 printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n");
1887#endif /* #if NUM_RCU_LVL_4 != 0 */
1888 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data); 1948 RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);
1889 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data); 1949 RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);
1890 __rcu_init_preempt(); 1950 __rcu_init_preempt();
@@ -1898,6 +1958,7 @@ void __init rcu_init(void)
1898 cpu_notifier(rcu_cpu_notify, 0); 1958 cpu_notifier(rcu_cpu_notify, 0);
1899 for_each_online_cpu(cpu) 1959 for_each_online_cpu(cpu)
1900 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); 1960 rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
1961 check_cpu_stall_init();
1901} 1962}
1902 1963
1903#include "rcutree_plugin.h" 1964#include "rcutree_plugin.h"
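
RCU_STATE_INITIALIZER() above now receives the variable name and uses it twice: as an identifier for the self-references and, via the preprocessor # operator, as the printable .name that the reworked stall messages print ("INFO: %s detected stalls ..."). A freestanding illustration of that macro trick:

#include <stdio.h>

struct flavor {
	struct flavor *self;	/* self-reference, like .level/.onofflock above */
	const char *name;	/* filled in by stringizing the macro argument  */
};

#define FLAVOR_INITIALIZER(structname) {	\
	.self = &structname,			\
	.name = #structname,			\
}

static struct flavor rcu_sched_demo = FLAVOR_INITIALIZER(rcu_sched_demo);

int main(void)
{
	printf("flavor: %s\n", rcu_sched_demo.name);	/* prints "rcu_sched_demo" */
	return 0;
}
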
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 4a525a30e08e..14c040b18ed0 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -223,6 +223,7 @@ struct rcu_data {
223 /* 5) __rcu_pending() statistics. */ 223 /* 5) __rcu_pending() statistics. */
224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ 224 unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */
225 unsigned long n_rp_qs_pending; 225 unsigned long n_rp_qs_pending;
226 unsigned long n_rp_report_qs;
226 unsigned long n_rp_cb_ready; 227 unsigned long n_rp_cb_ready;
227 unsigned long n_rp_cpu_needs_gp; 228 unsigned long n_rp_cpu_needs_gp;
228 unsigned long n_rp_gp_completed; 229 unsigned long n_rp_gp_completed;
@@ -326,6 +327,7 @@ struct rcu_state {
326 unsigned long jiffies_stall; /* Time at which to check */ 327 unsigned long jiffies_stall; /* Time at which to check */
327 /* for CPU stalls. */ 328 /* for CPU stalls. */
328#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ 329#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
330 char *name; /* Name of structure. */
329}; 331};
330 332
331/* Return values for rcu_preempt_offline_tasks(). */ 333/* Return values for rcu_preempt_offline_tasks(). */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 79b53bda8943..0e4f420245d9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -26,6 +26,45 @@
26 26
27#include <linux/delay.h> 27#include <linux/delay.h>
28 28
29/*
30 * Check the RCU kernel configuration parameters and print informative
31 * messages about anything out of the ordinary. If you like #ifdef, you
32 * will love this function.
33 */
34static void __init rcu_bootup_announce_oddness(void)
35{
36#ifdef CONFIG_RCU_TRACE
37 printk(KERN_INFO "\tRCU debugfs-based tracing is enabled.\n");
38#endif
39#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
40 printk(KERN_INFO "\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
41 CONFIG_RCU_FANOUT);
42#endif
43#ifdef CONFIG_RCU_FANOUT_EXACT
44 printk(KERN_INFO "\tHierarchical RCU autobalancing is disabled.\n");
45#endif
46#ifdef CONFIG_RCU_FAST_NO_HZ
47 printk(KERN_INFO
48 "\tRCU dyntick-idle grace-period acceleration is enabled.\n");
49#endif
50#ifdef CONFIG_PROVE_RCU
51 printk(KERN_INFO "\tRCU lockdep checking is enabled.\n");
52#endif
53#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
54 printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
55#endif
56#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
57 printk(KERN_INFO
58 "\tRCU-based detection of stalled CPUs is disabled.\n");
59#endif
60#ifndef CONFIG_RCU_CPU_STALL_VERBOSE
61 printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
62#endif
63#if NUM_RCU_LVL_4 != 0
64 printk(KERN_INFO "\tExperimental four-level hierarchy is enabled.\n");
65#endif
66}
67
29#ifdef CONFIG_TREE_PREEMPT_RCU 68#ifdef CONFIG_TREE_PREEMPT_RCU
30 69
31struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 70struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
@@ -38,8 +77,8 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
38 */ 77 */
39static void __init rcu_bootup_announce(void) 78static void __init rcu_bootup_announce(void)
40{ 79{
41 printk(KERN_INFO 80 printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
42 "Experimental preemptable hierarchical RCU implementation.\n"); 81 rcu_bootup_announce_oddness();
43} 82}
44 83
45/* 84/*
@@ -75,13 +114,19 @@ EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
75 * that this just means that the task currently running on the CPU is 114 * that this just means that the task currently running on the CPU is
76 * not in a quiescent state. There might be any number of tasks blocked 115 * not in a quiescent state. There might be any number of tasks blocked
77 * while in an RCU read-side critical section. 116 * while in an RCU read-side critical section.
117 *
118 * Unlike the other rcu_*_qs() functions, callers to this function
119 * must disable irqs in order to protect the assignment to
120 * ->rcu_read_unlock_special.
78 */ 121 */
79static void rcu_preempt_qs(int cpu) 122static void rcu_preempt_qs(int cpu)
80{ 123{
81 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 124 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
125
82 rdp->passed_quiesc_completed = rdp->gpnum - 1; 126 rdp->passed_quiesc_completed = rdp->gpnum - 1;
83 barrier(); 127 barrier();
84 rdp->passed_quiesc = 1; 128 rdp->passed_quiesc = 1;
129 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
85} 130}
86 131
87/* 132/*
@@ -144,9 +189,8 @@ static void rcu_preempt_note_context_switch(int cpu)
144 * grace period, then the fact that the task has been enqueued 189 * grace period, then the fact that the task has been enqueued
145 * means that we continue to block the current grace period. 190 * means that we continue to block the current grace period.
146 */ 191 */
147 rcu_preempt_qs(cpu);
148 local_irq_save(flags); 192 local_irq_save(flags);
149 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 193 rcu_preempt_qs(cpu);
150 local_irq_restore(flags); 194 local_irq_restore(flags);
151} 195}
152 196
@@ -236,7 +280,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
236 */ 280 */
237 special = t->rcu_read_unlock_special; 281 special = t->rcu_read_unlock_special;
238 if (special & RCU_READ_UNLOCK_NEED_QS) { 282 if (special & RCU_READ_UNLOCK_NEED_QS) {
239 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
240 rcu_preempt_qs(smp_processor_id()); 283 rcu_preempt_qs(smp_processor_id());
241 } 284 }
242 285
@@ -473,7 +516,6 @@ static void rcu_preempt_check_callbacks(int cpu)
473 struct task_struct *t = current; 516 struct task_struct *t = current;
474 517
475 if (t->rcu_read_lock_nesting == 0) { 518 if (t->rcu_read_lock_nesting == 0) {
476 t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
477 rcu_preempt_qs(cpu); 519 rcu_preempt_qs(cpu);
478 return; 520 return;
479 } 521 }
@@ -515,11 +557,13 @@ void synchronize_rcu(void)
515 if (!rcu_scheduler_active) 557 if (!rcu_scheduler_active)
516 return; 558 return;
517 559
560 init_rcu_head_on_stack(&rcu.head);
518 init_completion(&rcu.completion); 561 init_completion(&rcu.completion);
519 /* Will wake me after RCU finished. */ 562 /* Will wake me after RCU finished. */
520 call_rcu(&rcu.head, wakeme_after_rcu); 563 call_rcu(&rcu.head, wakeme_after_rcu);
521 /* Wait for it. */ 564 /* Wait for it. */
522 wait_for_completion(&rcu.completion); 565 wait_for_completion(&rcu.completion);
566 destroy_rcu_head_on_stack(&rcu.head);
523} 567}
524EXPORT_SYMBOL_GPL(synchronize_rcu); 568EXPORT_SYMBOL_GPL(synchronize_rcu);
525 569
@@ -754,6 +798,7 @@ void exit_rcu(void)
754static void __init rcu_bootup_announce(void) 798static void __init rcu_bootup_announce(void)
755{ 799{
756 printk(KERN_INFO "Hierarchical RCU implementation.\n"); 800 printk(KERN_INFO "Hierarchical RCU implementation.\n");
801 rcu_bootup_announce_oddness();
757} 802}
758 803
759/* 804/*
@@ -1008,6 +1053,8 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
1008int rcu_needs_cpu(int cpu) 1053int rcu_needs_cpu(int cpu)
1009{ 1054{
1010 int c = 0; 1055 int c = 0;
1056 int snap;
1057 int snap_nmi;
1011 int thatcpu; 1058 int thatcpu;
1012 1059
1013 /* Check for being in the holdoff period. */ 1060 /* Check for being in the holdoff period. */
@@ -1015,12 +1062,18 @@ int rcu_needs_cpu(int cpu)
1015 return rcu_needs_cpu_quick_check(cpu); 1062 return rcu_needs_cpu_quick_check(cpu);
1016 1063
1017 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 1064 /* Don't bother unless we are the last non-dyntick-idle CPU. */
1018 for_each_cpu_not(thatcpu, nohz_cpu_mask) 1065 for_each_online_cpu(thatcpu) {
1019 if (thatcpu != cpu) { 1066 if (thatcpu == cpu)
1067 continue;
1068 snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
1069 snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
1070 smp_mb(); /* Order sampling of snap with end of grace period. */
1071 if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
1020 per_cpu(rcu_dyntick_drain, cpu) = 0; 1072 per_cpu(rcu_dyntick_drain, cpu) = 0;
1021 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 1073 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1022 return rcu_needs_cpu_quick_check(cpu); 1074 return rcu_needs_cpu_quick_check(cpu);
1023 } 1075 }
1076 }
1024 1077
1025 /* Check and update the rcu_dyntick_drain sequencing. */ 1078 /* Check and update the rcu_dyntick_drain sequencing. */
1026 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 1079 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
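
rcu_needs_cpu() above stops consulting nohz_cpu_mask and instead samples each other online CPU's dynticks counters directly: the counters are incremented on every idle/NMI transition, so an odd snapshot means that CPU is currently outside dyntick-idle and will report its own quiescent state. A trivial freestanding model of the scan:

/* If any other CPU's sampled counter is odd, this CPU is not the last
 * non-dyntick-idle CPU and skips the accelerated callback draining. */
static int another_cpu_is_awake(const int *snap, const int *snap_nmi,
				int ncpus, int self)
{
	int cpu;

	for (cpu = 0; cpu < ncpus; cpu++) {
		if (cpu == self)
			continue;
		/* odd value == currently non-idle or handling an NMI */
		if ((snap[cpu] & 0x1) || (snap_nmi[cpu] & 0x1))
			return 1;
	}
	return 0;
}
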
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d45db2e35d27..36c95b45738e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -241,11 +241,13 @@ static const struct file_operations rcugp_fops = {
241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) 241static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
242{ 242{
243 seq_printf(m, "%3d%cnp=%ld " 243 seq_printf(m, "%3d%cnp=%ld "
244 "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", 244 "qsp=%ld rpq=%ld cbr=%ld cng=%ld "
245 "gpc=%ld gps=%ld nf=%ld nn=%ld\n",
245 rdp->cpu, 246 rdp->cpu,
246 cpu_is_offline(rdp->cpu) ? '!' : ' ', 247 cpu_is_offline(rdp->cpu) ? '!' : ' ',
247 rdp->n_rcu_pending, 248 rdp->n_rcu_pending,
248 rdp->n_rp_qs_pending, 249 rdp->n_rp_qs_pending,
250 rdp->n_rp_report_qs,
249 rdp->n_rp_cb_ready, 251 rdp->n_rp_cb_ready,
250 rdp->n_rp_cpu_needs_gp, 252 rdp->n_rp_cpu_needs_gp,
251 rdp->n_rp_gp_completed, 253 rdp->n_rp_gp_completed,
diff --git a/kernel/relay.c b/kernel/relay.c
index 3d97f2821611..c7cf397fb929 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
539 "relay_hotcpu_callback: cpu %d buffer " 539 "relay_hotcpu_callback: cpu %d buffer "
540 "creation failed\n", hotcpu); 540 "creation failed\n", hotcpu);
541 mutex_unlock(&relay_channels_mutex); 541 mutex_unlock(&relay_channels_mutex);
542 return NOTIFY_BAD; 542 return notifier_from_errno(-ENOMEM);
543 } 543 }
544 } 544 }
545 mutex_unlock(&relay_channels_mutex); 545 mutex_unlock(&relay_channels_mutex);
@@ -1231,8 +1231,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1231 size_t read_subbuf = read_start / subbuf_size; 1231 size_t read_subbuf = read_start / subbuf_size;
1232 size_t padding = rbuf->padding[read_subbuf]; 1232 size_t padding = rbuf->padding[read_subbuf];
1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding; 1233 size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
1234 struct page *pages[PIPE_BUFFERS]; 1234 struct page *pages[PIPE_DEF_BUFFERS];
1235 struct partial_page partial[PIPE_BUFFERS]; 1235 struct partial_page partial[PIPE_DEF_BUFFERS];
1236 struct splice_pipe_desc spd = { 1236 struct splice_pipe_desc spd = {
1237 .pages = pages, 1237 .pages = pages,
1238 .nr_pages = 0, 1238 .nr_pages = 0,
@@ -1245,6 +1245,8 @@ static ssize_t subbuf_splice_actor(struct file *in,
1245 1245
1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed) 1246 if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
1247 return 0; 1247 return 0;
1248 if (splice_grow_spd(pipe, &spd))
1249 return -ENOMEM;
1248 1250
1249 /* 1251 /*
1250 * Adjust read len, if longer than what is available 1252 * Adjust read len, if longer than what is available
@@ -1255,7 +1257,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
1255 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT; 1257 subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
1256 pidx = (read_start / PAGE_SIZE) % subbuf_pages; 1258 pidx = (read_start / PAGE_SIZE) % subbuf_pages;
1257 poff = read_start & ~PAGE_MASK; 1259 poff = read_start & ~PAGE_MASK;
1258 nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS); 1260 nr_pages = min_t(unsigned int, subbuf_pages, pipe->buffers);
1259 1261
1260 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) { 1262 for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
1261 unsigned int this_len, this_end, private; 1263 unsigned int this_len, this_end, private;
@@ -1289,16 +1291,19 @@ static ssize_t subbuf_splice_actor(struct file *in,
1289 } 1291 }
1290 } 1292 }
1291 1293
1294 ret = 0;
1292 if (!spd.nr_pages) 1295 if (!spd.nr_pages)
1293 return 0; 1296 goto out;
1294 1297
1295 ret = *nonpad_ret = splice_to_pipe(pipe, &spd); 1298 ret = *nonpad_ret = splice_to_pipe(pipe, &spd);
1296 if (ret < 0 || ret < total_len) 1299 if (ret < 0 || ret < total_len)
1297 return ret; 1300 goto out;
1298 1301
1299 if (read_start + ret == nonpad_end) 1302 if (read_start + ret == nonpad_end)
1300 ret += padding; 1303 ret += padding;
1301 1304
1305out:
1306 splice_shrink_spd(pipe, &spd);
1302 return ret; 1307 return ret;
1303} 1308}
1304 1309
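With the fixed-size PIPE_BUFFERS arrays gone, the splice path above sizes its descriptor for PIPE_DEF_BUFFERS, grows it with splice_grow_spd() before filling pages, clamps the page count to pipe->buffers, and shrinks it again on every exit path. A condensed kernel-context sketch of that bracket (not standalone-buildable; the page-filling loop itself is elided):

    /* Kernel-context sketch (assumes <linux/splice.h>, <linux/pipe_fs_i.h>). */
    static ssize_t splice_actor_sketch(struct pipe_inode_info *pipe,
    				   struct splice_pipe_desc *spd,
    				   unsigned int want_pages)
    {
    	ssize_t ret;

    	/* Grow the page/partial arrays beyond PIPE_DEF_BUFFERS if needed. */
    	if (splice_grow_spd(pipe, spd))
    		return -ENOMEM;

    	/* Never queue more pages than the pipe can currently hold. */
    	want_pages = min_t(unsigned int, want_pages, pipe->buffers);

    	/* ... fill spd->pages[] / spd->partial[] and set spd->nr_pages,
    	 *     never exceeding 'want_pages' entries ... */

    	ret = 0;
    	if (spd->nr_pages)
    		ret = splice_to_pipe(pipe, spd);

    	/* Always undo the grow, on success and on every error path. */
    	splice_shrink_spd(pipe, spd);
    	return ret;
    }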
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..c7eaa37a768b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -10,7 +10,6 @@
10#include <linux/types.h> 10#include <linux/types.h>
11#include <linux/parser.h> 11#include <linux/parser.h>
12#include <linux/fs.h> 12#include <linux/fs.h>
13#include <linux/slab.h>
14#include <linux/res_counter.h> 13#include <linux/res_counter.h>
15#include <linux/uaccess.h> 14#include <linux/uaccess.h>
16#include <linux/mm.h> 15#include <linux/mm.h>
diff --git a/kernel/resource.c b/kernel/resource.c
index 2d5be5d9bf5f..7b36976e5dea 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -15,6 +15,7 @@
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/sched.h>
18#include <linux/seq_file.h> 19#include <linux/seq_file.h>
19#include <linux/device.h> 20#include <linux/device.h>
20#include <linux/pfn.h> 21#include <linux/pfn.h>
@@ -219,19 +220,34 @@ void release_child_resources(struct resource *r)
219} 220}
220 221
221/** 222/**
222 * request_resource - request and reserve an I/O or memory resource 223 * request_resource_conflict - request and reserve an I/O or memory resource
223 * @root: root resource descriptor 224 * @root: root resource descriptor
224 * @new: resource descriptor desired by caller 225 * @new: resource descriptor desired by caller
225 * 226 *
226 * Returns 0 for success, negative error code on error. 227 * Returns 0 for success, conflict resource on error.
227 */ 228 */
228int request_resource(struct resource *root, struct resource *new) 229struct resource *request_resource_conflict(struct resource *root, struct resource *new)
229{ 230{
230 struct resource *conflict; 231 struct resource *conflict;
231 232
232 write_lock(&resource_lock); 233 write_lock(&resource_lock);
233 conflict = __request_resource(root, new); 234 conflict = __request_resource(root, new);
234 write_unlock(&resource_lock); 235 write_unlock(&resource_lock);
236 return conflict;
237}
238
239/**
240 * request_resource - request and reserve an I/O or memory resource
241 * @root: root resource descriptor
242 * @new: resource descriptor desired by caller
243 *
244 * Returns 0 for success, negative error code on error.
245 */
246int request_resource(struct resource *root, struct resource *new)
247{
248 struct resource *conflict;
249
250 conflict = request_resource_conflict(root, new);
235 return conflict ? -EBUSY : 0; 251 return conflict ? -EBUSY : 0;
236} 252}
237 253
@@ -474,25 +490,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
474} 490}
475 491
476/** 492/**
477 * insert_resource - Inserts a resource in the resource tree 493 * insert_resource_conflict - Inserts resource in the resource tree
478 * @parent: parent of the new resource 494 * @parent: parent of the new resource
479 * @new: new resource to insert 495 * @new: new resource to insert
480 * 496 *
481 * Returns 0 on success, -EBUSY if the resource can't be inserted. 497 * Returns 0 on success, conflict resource if the resource can't be inserted.
482 * 498 *
483 * This function is equivalent to request_resource when no conflict 499 * This function is equivalent to request_resource_conflict when no conflict
484 * happens. If a conflict happens, and the conflicting resources 500 * happens. If a conflict happens, and the conflicting resources
485 * entirely fit within the range of the new resource, then the new 501 * entirely fit within the range of the new resource, then the new
486 * resource is inserted and the conflicting resources become children of 502 * resource is inserted and the conflicting resources become children of
487 * the new resource. 503 * the new resource.
488 */ 504 */
489int insert_resource(struct resource *parent, struct resource *new) 505struct resource *insert_resource_conflict(struct resource *parent, struct resource *new)
490{ 506{
491 struct resource *conflict; 507 struct resource *conflict;
492 508
493 write_lock(&resource_lock); 509 write_lock(&resource_lock);
494 conflict = __insert_resource(parent, new); 510 conflict = __insert_resource(parent, new);
495 write_unlock(&resource_lock); 511 write_unlock(&resource_lock);
512 return conflict;
513}
514
515/**
516 * insert_resource - Inserts a resource in the resource tree
517 * @parent: parent of the new resource
518 * @new: new resource to insert
519 *
520 * Returns 0 on success, -EBUSY if the resource can't be inserted.
521 */
522int insert_resource(struct resource *parent, struct resource *new)
523{
524 struct resource *conflict;
525
526 conflict = insert_resource_conflict(parent, new);
496 return conflict ? -EBUSY : 0; 527 return conflict ? -EBUSY : 0;
497} 528}
498 529
@@ -651,6 +682,8 @@ resource_size_t resource_alignment(struct resource *res)
651 * release_region releases a matching busy region. 682 * release_region releases a matching busy region.
652 */ 683 */
653 684
685static DECLARE_WAIT_QUEUE_HEAD(muxed_resource_wait);
686
654/** 687/**
655 * __request_region - create a new busy resource region 688 * __request_region - create a new busy resource region
656 * @parent: parent resource descriptor 689 * @parent: parent resource descriptor
@@ -663,6 +696,7 @@ struct resource * __request_region(struct resource *parent,
663 resource_size_t start, resource_size_t n, 696 resource_size_t start, resource_size_t n,
664 const char *name, int flags) 697 const char *name, int flags)
665{ 698{
699 DECLARE_WAITQUEUE(wait, current);
666 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 700 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
667 701
668 if (!res) 702 if (!res)
@@ -687,7 +721,15 @@ struct resource * __request_region(struct resource *parent,
687 if (!(conflict->flags & IORESOURCE_BUSY)) 721 if (!(conflict->flags & IORESOURCE_BUSY))
688 continue; 722 continue;
689 } 723 }
690 724 if (conflict->flags & flags & IORESOURCE_MUXED) {
725 add_wait_queue(&muxed_resource_wait, &wait);
726 write_unlock(&resource_lock);
727 set_current_state(TASK_UNINTERRUPTIBLE);
728 schedule();
729 remove_wait_queue(&muxed_resource_wait, &wait);
730 write_lock(&resource_lock);
731 continue;
732 }
691 /* Uhhuh, that didn't work out.. */ 733 /* Uhhuh, that didn't work out.. */
692 kfree(res); 734 kfree(res);
693 res = NULL; 735 res = NULL;
@@ -761,6 +803,8 @@ void __release_region(struct resource *parent, resource_size_t start,
761 break; 803 break;
762 *p = res->sibling; 804 *p = res->sibling;
763 write_unlock(&resource_lock); 805 write_unlock(&resource_lock);
806 if (res->flags & IORESOURCE_MUXED)
807 wake_up(&muxed_resource_wait);
764 kfree(res); 808 kfree(res);
765 return; 809 return;
766 } 810 }
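With muxed_resource_wait in place, a request whose conflict is another IORESOURCE_MUXED holder now sleeps uninterruptibly in __request_region() until __release_region() wakes it, instead of failing. A hedged sketch of how a driver for a shared Super-I/O port pair might use this; request_muxed_region() is assumed to be the ioport.h wrapper around __request_region(..., IORESOURCE_MUXED) introduced alongside this change, and the port constants are made up:

    /* Kernel-context sketch (assumes <linux/ioport.h>, <linux/io.h>).
     * SIO_BASE/SIO_LEN are illustrative, not from the patch. */
    #define SIO_BASE	0x2e
    #define SIO_LEN	2

    static int sio_read_reg_sketch(u8 reg, u8 *val)
    {
    	/* May sleep until any other IORESOURCE_MUXED holder releases it. */
    	if (!request_muxed_region(SIO_BASE, SIO_LEN, "sio-sketch"))
    		return -EBUSY;

    	outb(reg, SIO_BASE);
    	*val = inb(SIO_BASE + 1);

    	/* Wakes anyone sleeping in __request_region() on this range. */
    	release_region(SIO_BASE, SIO_LEN);
    	return 0;
    }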
diff --git a/kernel/sched.c b/kernel/sched.c
index 9ab3cd7858d3..f52a8801b7a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,9 +55,9 @@
55#include <linux/cpu.h> 55#include <linux/cpu.h>
56#include <linux/cpuset.h> 56#include <linux/cpuset.h>
57#include <linux/percpu.h> 57#include <linux/percpu.h>
58#include <linux/kthread.h>
59#include <linux/proc_fs.h> 58#include <linux/proc_fs.h>
60#include <linux/seq_file.h> 59#include <linux/seq_file.h>
60#include <linux/stop_machine.h>
61#include <linux/sysctl.h> 61#include <linux/sysctl.h>
62#include <linux/syscalls.h> 62#include <linux/syscalls.h>
63#include <linux/times.h> 63#include <linux/times.h>
@@ -71,6 +71,7 @@
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h> 73#include <linux/ftrace.h>
74#include <linux/slab.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -305,42 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD;
305 */ 306 */
306struct task_group init_task_group; 307struct task_group init_task_group;
307 308
308/* return group to which a task belongs */
309static inline struct task_group *task_group(struct task_struct *p)
310{
311 struct task_group *tg;
312
313#ifdef CONFIG_CGROUP_SCHED
314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
315 struct task_group, css);
316#else
317 tg = &init_task_group;
318#endif
319 return tg;
320}
321
322/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
323static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
324{
325#ifdef CONFIG_FAIR_GROUP_SCHED
326 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
327 p->se.parent = task_group(p)->se[cpu];
328#endif
329
330#ifdef CONFIG_RT_GROUP_SCHED
331 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
332 p->rt.parent = task_group(p)->rt_se[cpu];
333#endif
334}
335
336#else
337
338static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
339static inline struct task_group *task_group(struct task_struct *p)
340{
341 return NULL;
342}
343
344#endif /* CONFIG_CGROUP_SCHED */ 309#endif /* CONFIG_CGROUP_SCHED */
345 310
346/* CFS-related fields in a runqueue */ 311/* CFS-related fields in a runqueue */
@@ -492,8 +457,11 @@ struct rq {
492 #define CPU_LOAD_IDX_MAX 5 457 #define CPU_LOAD_IDX_MAX 5
493 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 458 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
494#ifdef CONFIG_NO_HZ 459#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp;
495 unsigned char in_nohz_recently; 461 unsigned char in_nohz_recently;
496#endif 462#endif
463 unsigned int skip_clock_update;
464
497 /* capture load from *all* tasks on this cpu: */ 465 /* capture load from *all* tasks on this cpu: */
498 struct load_weight load; 466 struct load_weight load;
499 unsigned long nr_load_updates; 467 unsigned long nr_load_updates;
@@ -530,20 +498,20 @@ struct rq {
530 struct root_domain *rd; 498 struct root_domain *rd;
531 struct sched_domain *sd; 499 struct sched_domain *sd;
532 500
501 unsigned long cpu_power;
502
533 unsigned char idle_at_tick; 503 unsigned char idle_at_tick;
534 /* For active balancing */ 504 /* For active balancing */
535 int post_schedule; 505 int post_schedule;
536 int active_balance; 506 int active_balance;
537 int push_cpu; 507 int push_cpu;
508 struct cpu_stop_work active_balance_work;
538 /* cpu of this runqueue: */ 509 /* cpu of this runqueue: */
539 int cpu; 510 int cpu;
540 int online; 511 int online;
541 512
542 unsigned long avg_load_per_task; 513 unsigned long avg_load_per_task;
543 514
544 struct task_struct *migration_thread;
545 struct list_head migration_queue;
546
547 u64 rt_avg; 515 u64 rt_avg;
548 u64 age_stamp; 516 u64 age_stamp;
549 u64 idle_stamp; 517 u64 idle_stamp;
@@ -591,6 +559,13 @@ static inline
591void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 559void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
592{ 560{
593 rq->curr->sched_class->check_preempt_curr(rq, p, flags); 561 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
562
563 /*
564 * A queue event has occurred, and we're going to schedule. In
565 * this case, we can save a useless back to back clock update.
566 */
567 if (test_tsk_need_resched(p))
568 rq->skip_clock_update = 1;
594} 569}
595 570
596static inline int cpu_of(struct rq *rq) 571static inline int cpu_of(struct rq *rq)
@@ -623,9 +598,53 @@ static inline int cpu_of(struct rq *rq)
623#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 598#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
624#define raw_rq() (&__raw_get_cpu_var(runqueues)) 599#define raw_rq() (&__raw_get_cpu_var(runqueues))
625 600
601#ifdef CONFIG_CGROUP_SCHED
602
603/*
 604 * Return the group to which this task belongs.
605 *
606 * We use task_subsys_state_check() and extend the RCU verification
607 * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
608 * holds that lock for each task it moves into the cgroup. Therefore
609 * by holding that lock, we pin the task to the current cgroup.
610 */
611static inline struct task_group *task_group(struct task_struct *p)
612{
613 struct cgroup_subsys_state *css;
614
615 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
616 lockdep_is_held(&task_rq(p)->lock));
617 return container_of(css, struct task_group, css);
618}
619
620/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
621static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
622{
623#ifdef CONFIG_FAIR_GROUP_SCHED
624 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
625 p->se.parent = task_group(p)->se[cpu];
626#endif
627
628#ifdef CONFIG_RT_GROUP_SCHED
629 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
630 p->rt.parent = task_group(p)->rt_se[cpu];
631#endif
632}
633
634#else /* CONFIG_CGROUP_SCHED */
635
636static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
637static inline struct task_group *task_group(struct task_struct *p)
638{
639 return NULL;
640}
641
642#endif /* CONFIG_CGROUP_SCHED */
643
626inline void update_rq_clock(struct rq *rq) 644inline void update_rq_clock(struct rq *rq)
627{ 645{
628 rq->clock = sched_clock_cpu(cpu_of(rq)); 646 if (!rq->skip_clock_update)
647 rq->clock = sched_clock_cpu(cpu_of(rq));
629} 648}
630 649
631/* 650/*
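Moving task_group() below the runqueue accessors lets it pass lockdep_is_held(&task_rq(p)->lock) to task_subsys_state_check(), so PROVE_RCU accepts the dereference both inside rcu_read_lock() and with the runqueue lock held. The same "RCU read side or my lock" pattern applies to any pointer with that dual protection; a hedged sketch with made-up names:

    /* Kernel-context sketch; my_data, my_lock and my_ptr are illustrative. */
    struct my_data {
    	int val;
    };

    static DEFINE_SPINLOCK(my_lock);
    static struct my_data *my_ptr;	/* published via rcu_assign_pointer() */

    static struct my_data *get_my_data(void)
    {
    	/* Legal under rcu_read_lock() OR while holding my_lock. */
    	return rcu_dereference_check(my_ptr,
    				     rcu_read_lock_held() ||
    				     lockdep_is_held(&my_lock));
    }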
@@ -903,16 +922,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 922#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
904 923
905/* 924/*
906 * Check whether the task is waking, we use this to synchronize against 925 * Check whether the task is waking, we use this to synchronize ->cpus_allowed
907 * ttwu() so that task_cpu() reports a stable number. 926 * against ttwu().
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */ 927 */
913static inline int task_is_waking(struct task_struct *p) 928static inline int task_is_waking(struct task_struct *p)
914{ 929{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING)); 930 return unlikely(p->state == TASK_WAKING);
916} 931}
917 932
918/* 933/*
@@ -925,11 +940,9 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
925 struct rq *rq; 940 struct rq *rq;
926 941
927 for (;;) { 942 for (;;) {
928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p); 943 rq = task_rq(p);
931 raw_spin_lock(&rq->lock); 944 raw_spin_lock(&rq->lock);
932 if (likely(rq == task_rq(p) && !task_is_waking(p))) 945 if (likely(rq == task_rq(p)))
933 return rq; 946 return rq;
934 raw_spin_unlock(&rq->lock); 947 raw_spin_unlock(&rq->lock);
935 } 948 }
@@ -946,25 +959,15 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
946 struct rq *rq; 959 struct rq *rq;
947 960
948 for (;;) { 961 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
951 local_irq_save(*flags); 962 local_irq_save(*flags);
952 rq = task_rq(p); 963 rq = task_rq(p);
953 raw_spin_lock(&rq->lock); 964 raw_spin_lock(&rq->lock);
954 if (likely(rq == task_rq(p) && !task_is_waking(p))) 965 if (likely(rq == task_rq(p)))
955 return rq; 966 return rq;
956 raw_spin_unlock_irqrestore(&rq->lock, *flags); 967 raw_spin_unlock_irqrestore(&rq->lock, *flags);
957 } 968 }
958} 969}
959 970
960void task_rq_unlock_wait(struct task_struct *p)
961{
962 struct rq *rq = task_rq(p);
963
964 smp_mb(); /* spin-unlock-wait is not a full memory barrier */
965 raw_spin_unlock_wait(&rq->lock);
966}
967
968static void __task_rq_unlock(struct rq *rq) 971static void __task_rq_unlock(struct rq *rq)
969 __releases(rq->lock) 972 __releases(rq->lock)
970{ 973{
@@ -1228,6 +1231,17 @@ void wake_up_idle_cpu(int cpu)
1228 if (!tsk_is_polling(rq->idle)) 1231 if (!tsk_is_polling(rq->idle))
1229 smp_send_reschedule(cpu); 1232 smp_send_reschedule(cpu);
1230} 1233}
1234
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1231#endif /* CONFIG_NO_HZ */ 1245#endif /* CONFIG_NO_HZ */
1232 1246
1233static u64 sched_avg_period(void) 1247static u64 sched_avg_period(void)
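nohz_ratelimit() refuses a nohz entry when the previous attempt on this runqueue was less than half a tick ago, sparing a CPU that bounces rapidly in and out of idle the full tick-stop/restart cost; the intended caller is the tick-stop path in kernel/time/tick-sched.c. The arithmetic can be modeled in user space (CLOCK_MONOTONIC stands in for rq->clock and DEMO_HZ is an assumed tick rate):

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>

    #define NSEC_PER_SEC	1000000000ULL
    #define DEMO_HZ		1000ULL		/* assumed tick rate for the demo */

    static uint64_t now_ns(void)
    {
    	struct timespec ts;

    	clock_gettime(CLOCK_MONOTONIC, &ts);
    	return (uint64_t)ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
    }

    /* Same shape as nohz_ratelimit(): true means "too soon, stay periodic". */
    static int demo_nohz_ratelimit(uint64_t *stamp)
    {
    	uint64_t clock = now_ns();
    	uint64_t diff = clock - *stamp;

    	*stamp = clock;
    	return diff < ((NSEC_PER_SEC / DEMO_HZ) >> 1);	/* half a tick */
    }

    int main(void)
    {
    	uint64_t stamp = now_ns();
    	struct timespec pause = { 0, 100000 };		/* 0.1 ms */

    	nanosleep(&pause, NULL);
    	printf("after 0.1ms: ratelimited=%d\n", demo_nohz_ratelimit(&stamp));
    	pause.tv_nsec = 2000000;			/* 2 ms */
    	nanosleep(&pause, NULL);
    	printf("after 2ms:   ratelimited=%d\n", demo_nohz_ratelimit(&stamp));
    	return 0;
    }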
@@ -1240,6 +1254,12 @@ static void sched_avg_update(struct rq *rq)
1240 s64 period = sched_avg_period(); 1254 s64 period = sched_avg_period();
1241 1255
1242 while ((s64)(rq->clock - rq->age_stamp) > period) { 1256 while ((s64)(rq->clock - rq->age_stamp) > period) {
1257 /*
1258 * Inline assembly required to prevent the compiler
1259 * optimising this loop into a divmod call.
1260 * See __iter_div_u64_rem() for another example of this.
1261 */
1262 asm("" : "+rm" (rq->age_stamp));
1243 rq->age_stamp += period; 1263 rq->age_stamp += period;
1244 rq->rt_avg /= 2; 1264 rq->rt_avg /= 2;
1245 } 1265 }
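The empty asm with a "+rm" constraint makes rq->age_stamp opaque to the optimizer, so GCC keeps the cheap subtract-per-iteration loop (which normally runs only a few times) instead of strength-reducing it into a 64-bit divide/modulo, the same trick __iter_div_u64_rem() uses. A user-space GCC sketch of the idea (names and values are made up):

    #include <stdio.h>
    #include <stdint.h>

    /* Decay "avg" by half for every full "period" contained in (clock - stamp),
     * advancing stamp as we go, the same shape as sched_avg_update(). */
    static void demo_avg_update(uint64_t clock, uint64_t *stamp,
    			    uint64_t *avg, uint64_t period)
    {
    	while ((int64_t)(clock - *stamp) > (int64_t)period) {
    		/*
    		 * Empty asm that claims to read and write *stamp: GCC must
    		 * assume the value changes, so it cannot rewrite the loop
    		 * as a divide/modulo on (clock - *stamp).
    		 */
    		asm("" : "+rm" (*stamp));
    		*stamp += period;
    		*avg /= 2;
    	}
    }

    int main(void)
    {
    	uint64_t stamp = 0, avg = 1024;

    	demo_avg_update(3001, &stamp, &avg, 1000);
    	printf("stamp=%llu avg=%llu\n",
    	       (unsigned long long)stamp, (unsigned long long)avg);
    	return 0;
    }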
@@ -1484,24 +1504,9 @@ static unsigned long target_load(int cpu, int type)
1484 return max(rq->cpu_load[type-1], total); 1504 return max(rq->cpu_load[type-1], total);
1485} 1505}
1486 1506
1487static struct sched_group *group_of(int cpu)
1488{
1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1490
1491 if (!sd)
1492 return NULL;
1493
1494 return sd->groups;
1495}
1496
1497static unsigned long power_of(int cpu) 1507static unsigned long power_of(int cpu)
1498{ 1508{
1499 struct sched_group *group = group_of(cpu); 1509 return cpu_rq(cpu)->cpu_power;
1500
1501 if (!group)
1502 return SCHED_LOAD_SCALE;
1503
1504 return group->cpu_power;
1505} 1510}
1506 1511
1507static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1512static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
@@ -1658,9 +1663,6 @@ static void update_shares(struct sched_domain *sd)
1658 1663
1659static void update_h_load(long cpu) 1664static void update_h_load(long cpu)
1660{ 1665{
1661 if (root_task_group_empty())
1662 return;
1663
1664 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1666 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1665} 1667}
1666 1668
@@ -1770,8 +1772,6 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); 1772 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 } 1773 }
1772 } 1774 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775} 1775}
1776 1776
1777/* 1777/*
@@ -1802,7 +1802,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1802} 1802}
1803#endif 1803#endif
1804 1804
1805static void calc_load_account_active(struct rq *this_rq); 1805static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1806static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1807static int get_update_sysctl_factor(void);
1808 1808
@@ -1841,8 +1841,8 @@ static void dec_nr_running(struct rq *rq)
1841static void set_load_weight(struct task_struct *p) 1841static void set_load_weight(struct task_struct *p)
1842{ 1842{
1843 if (task_has_rt_policy(p)) { 1843 if (task_has_rt_policy(p)) {
1844 p->se.load.weight = prio_to_weight[0] * 2; 1844 p->se.load.weight = 0;
1845 p->se.load.inv_weight = prio_to_wmult[0] >> 1; 1845 p->se.load.inv_weight = WMULT_CONST;
1846 return; 1846 return;
1847 } 1847 }
1848 1848
@@ -1859,62 +1859,43 @@ static void set_load_weight(struct task_struct *p)
1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1859 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1860} 1860}
1861 1861
1862static void update_avg(u64 *avg, u64 sample) 1862static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1863{
1864 s64 diff = sample - *avg;
1865 *avg += diff >> 3;
1866}
1867
1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1870{ 1863{
1871 if (wakeup) 1864 update_rq_clock(rq);
1872 p->se.start_runtime = p->se.sum_exec_runtime;
1873
1874 sched_info_queued(p); 1865 sched_info_queued(p);
1875 p->sched_class->enqueue_task(rq, p, wakeup, head); 1866 p->sched_class->enqueue_task(rq, p, flags);
1876 p->se.on_rq = 1; 1867 p->se.on_rq = 1;
1877} 1868}
1878 1869
1879static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1870static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1880{ 1871{
1881 if (sleep) { 1872 update_rq_clock(rq);
1882 if (p->se.last_wakeup) {
1883 update_avg(&p->se.avg_overlap,
1884 p->se.sum_exec_runtime - p->se.last_wakeup);
1885 p->se.last_wakeup = 0;
1886 } else {
1887 update_avg(&p->se.avg_wakeup,
1888 sysctl_sched_wakeup_granularity);
1889 }
1890 }
1891
1892 sched_info_dequeued(p); 1873 sched_info_dequeued(p);
1893 p->sched_class->dequeue_task(rq, p, sleep); 1874 p->sched_class->dequeue_task(rq, p, flags);
1894 p->se.on_rq = 0; 1875 p->se.on_rq = 0;
1895} 1876}
1896 1877
1897/* 1878/*
1898 * activate_task - move a task to the runqueue. 1879 * activate_task - move a task to the runqueue.
1899 */ 1880 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1881static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1901{ 1882{
1902 if (task_contributes_to_load(p)) 1883 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--; 1884 rq->nr_uninterruptible--;
1904 1885
1905 enqueue_task(rq, p, wakeup, false); 1886 enqueue_task(rq, p, flags);
1906 inc_nr_running(rq); 1887 inc_nr_running(rq);
1907} 1888}
1908 1889
1909/* 1890/*
1910 * deactivate_task - remove a task from the runqueue. 1891 * deactivate_task - remove a task from the runqueue.
1911 */ 1892 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1893static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1913{ 1894{
1914 if (task_contributes_to_load(p)) 1895 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++; 1896 rq->nr_uninterruptible++;
1916 1897
1917 dequeue_task(rq, p, sleep); 1898 dequeue_task(rq, p, flags);
1918 dec_nr_running(rq); 1899 dec_nr_running(rq);
1919} 1900}
1920 1901
@@ -2043,21 +2024,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2043 __set_task_cpu(p, new_cpu); 2024 __set_task_cpu(p, new_cpu);
2044} 2025}
2045 2026
2046struct migration_req { 2027struct migration_arg {
2047 struct list_head list;
2048
2049 struct task_struct *task; 2028 struct task_struct *task;
2050 int dest_cpu; 2029 int dest_cpu;
2051
2052 struct completion done;
2053}; 2030};
2054 2031
2032static int migration_cpu_stop(void *data);
2033
2055/* 2034/*
2056 * The task's runqueue lock must be held. 2035 * The task's runqueue lock must be held.
2057 * Returns true if you have to wait for migration thread. 2036 * Returns true if you have to wait for migration thread.
2058 */ 2037 */
2059static int 2038static bool migrate_task(struct task_struct *p, int dest_cpu)
2060migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2061{ 2039{
2062 struct rq *rq = task_rq(p); 2040 struct rq *rq = task_rq(p);
2063 2041
@@ -2065,58 +2043,7 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2065 * If the task is not on a runqueue (and not running), then 2043 * If the task is not on a runqueue (and not running), then
2066 * the next wake-up will properly place the task. 2044 * the next wake-up will properly place the task.
2067 */ 2045 */
2068 if (!p->se.on_rq && !task_running(rq, p)) 2046 return p->se.on_rq || task_running(rq, p);
2069 return 0;
2070
2071 init_completion(&req->done);
2072 req->task = p;
2073 req->dest_cpu = dest_cpu;
2074 list_add(&req->list, &rq->migration_queue);
2075
2076 return 1;
2077}
2078
2079/*
2080 * wait_task_context_switch - wait for a thread to complete at least one
2081 * context switch.
2082 *
2083 * @p must not be current.
2084 */
2085void wait_task_context_switch(struct task_struct *p)
2086{
2087 unsigned long nvcsw, nivcsw, flags;
2088 int running;
2089 struct rq *rq;
2090
2091 nvcsw = p->nvcsw;
2092 nivcsw = p->nivcsw;
2093 for (;;) {
2094 /*
2095 * The runqueue is assigned before the actual context
2096 * switch. We need to take the runqueue lock.
2097 *
2098 * We could check initially without the lock but it is
2099 * very likely that we need to take the lock in every
2100 * iteration.
2101 */
2102 rq = task_rq_lock(p, &flags);
2103 running = task_running(rq, p);
2104 task_rq_unlock(rq, &flags);
2105
2106 if (likely(!running))
2107 break;
2108 /*
2109 * The switch count is incremented before the actual
2110 * context switch. We thus wait for two switches to be
2111 * sure at least one completed.
2112 */
2113 if ((p->nvcsw - nvcsw) > 1)
2114 break;
2115 if ((p->nivcsw - nivcsw) > 1)
2116 break;
2117
2118 cpu_relax();
2119 }
2120} 2047}
2121 2048
2122/* 2049/*
@@ -2174,7 +2101,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2174 * just go back and repeat. 2101 * just go back and repeat.
2175 */ 2102 */
2176 rq = task_rq_lock(p, &flags); 2103 rq = task_rq_lock(p, &flags);
2177 trace_sched_wait_task(rq, p); 2104 trace_sched_wait_task(p);
2178 running = task_running(rq, p); 2105 running = task_running(rq, p);
2179 on_rq = p->se.on_rq; 2106 on_rq = p->se.on_rq;
2180 ncsw = 0; 2107 ncsw = 0;
@@ -2272,6 +2199,9 @@ void task_oncpu_function_call(struct task_struct *p,
2272} 2199}
2273 2200
2274#ifdef CONFIG_SMP 2201#ifdef CONFIG_SMP
2202/*
2203 * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
2204 */
2275static int select_fallback_rq(int cpu, struct task_struct *p) 2205static int select_fallback_rq(int cpu, struct task_struct *p)
2276{ 2206{
2277 int dest_cpu; 2207 int dest_cpu;
@@ -2288,12 +2218,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2288 return dest_cpu; 2218 return dest_cpu;
2289 2219
2290 /* No more Mr. Nice Guy. */ 2220 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) { 2221 if (unlikely(dest_cpu >= nr_cpu_ids)) {
2292 rcu_read_lock(); 2222 dest_cpu = cpuset_cpus_allowed_fallback(p);
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296
2297 /* 2223 /*
2298 * Don't tell them about moving exiting tasks or 2224 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never 2225 * kernel threads (both mm NULL), since they never
@@ -2310,17 +2236,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2310} 2236}
2311 2237
2312/* 2238/*
2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without 2239 * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2315 * by:
2316 *
2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */ 2240 */
2320static inline 2241static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2242int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
2322{ 2243{
2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2244 int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
2324 2245
2325 /* 2246 /*
2326 * In order not to call set_task_cpu() on a blocking task we need 2247 * In order not to call set_task_cpu() on a blocking task we need
@@ -2338,6 +2259,12 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2338 2259
2339 return cpu; 2260 return cpu;
2340} 2261}
2262
2263static void update_avg(u64 *avg, u64 sample)
2264{
2265 s64 diff = sample - *avg;
2266 *avg += diff >> 3;
2267}
2341#endif 2268#endif
2342 2269
2343/*** 2270/***
@@ -2359,16 +2286,13 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359{ 2286{
2360 int cpu, orig_cpu, this_cpu, success = 0; 2287 int cpu, orig_cpu, this_cpu, success = 0;
2361 unsigned long flags; 2288 unsigned long flags;
2289 unsigned long en_flags = ENQUEUE_WAKEUP;
2362 struct rq *rq; 2290 struct rq *rq;
2363 2291
2364 if (!sched_feat(SYNC_WAKEUPS))
2365 wake_flags &= ~WF_SYNC;
2366
2367 this_cpu = get_cpu(); 2292 this_cpu = get_cpu();
2368 2293
2369 smp_wmb(); 2294 smp_wmb();
2370 rq = task_rq_lock(p, &flags); 2295 rq = task_rq_lock(p, &flags);
2371 update_rq_clock(rq);
2372 if (!(p->state & state)) 2296 if (!(p->state & state))
2373 goto out; 2297 goto out;
2374 2298
@@ -2388,28 +2312,26 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2388 * 2312 *
2389 * First fix up the nr_uninterruptible count: 2313 * First fix up the nr_uninterruptible count:
2390 */ 2314 */
2391 if (task_contributes_to_load(p)) 2315 if (task_contributes_to_load(p)) {
2392 rq->nr_uninterruptible--; 2316 if (likely(cpu_online(orig_cpu)))
2317 rq->nr_uninterruptible--;
2318 else
2319 this_rq()->nr_uninterruptible--;
2320 }
2393 p->state = TASK_WAKING; 2321 p->state = TASK_WAKING;
2394 2322
2395 if (p->sched_class->task_waking) 2323 if (p->sched_class->task_waking) {
2396 p->sched_class->task_waking(rq, p); 2324 p->sched_class->task_waking(rq, p);
2325 en_flags |= ENQUEUE_WAKING;
2326 }
2397 2327
2398 __task_rq_unlock(rq); 2328 cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
2399 2329 if (cpu != orig_cpu)
2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2407 set_task_cpu(p, cpu); 2330 set_task_cpu(p, cpu);
2408 } 2331 __task_rq_unlock(rq);
2409 2332
2410 rq = cpu_rq(cpu); 2333 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock); 2334 raw_spin_lock(&rq->lock);
2412 update_rq_clock(rq);
2413 2335
2414 /* 2336 /*
2415 * We migrated the task without holding either rq->lock, however 2337 * We migrated the task without holding either rq->lock, however
@@ -2437,36 +2359,20 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2437 2359
2438out_activate: 2360out_activate:
2439#endif /* CONFIG_SMP */ 2361#endif /* CONFIG_SMP */
2440 schedstat_inc(p, se.nr_wakeups); 2362 schedstat_inc(p, se.statistics.nr_wakeups);
2441 if (wake_flags & WF_SYNC) 2363 if (wake_flags & WF_SYNC)
2442 schedstat_inc(p, se.nr_wakeups_sync); 2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2443 if (orig_cpu != cpu) 2365 if (orig_cpu != cpu)
2444 schedstat_inc(p, se.nr_wakeups_migrate); 2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2445 if (cpu == this_cpu) 2367 if (cpu == this_cpu)
2446 schedstat_inc(p, se.nr_wakeups_local); 2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2447 else 2369 else
2448 schedstat_inc(p, se.nr_wakeups_remote); 2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2449 activate_task(rq, p, 1); 2371 activate_task(rq, p, en_flags);
2450 success = 1; 2372 success = 1;
2451 2373
2452 /*
2453 * Only attribute actual wakeups done by this task.
2454 */
2455 if (!in_interrupt()) {
2456 struct sched_entity *se = &current->se;
2457 u64 sample = se->sum_exec_runtime;
2458
2459 if (se->last_wakeup)
2460 sample -= se->last_wakeup;
2461 else
2462 sample -= se->start_runtime;
2463 update_avg(&se->avg_wakeup, sample);
2464
2465 se->last_wakeup = se->sum_exec_runtime;
2466 }
2467
2468out_running: 2374out_running:
2469 trace_sched_wakeup(rq, p, success); 2375 trace_sched_wakeup(p, success);
2470 check_preempt_curr(rq, p, wake_flags); 2376 check_preempt_curr(rq, p, wake_flags);
2471 2377
2472 p->state = TASK_RUNNING; 2378 p->state = TASK_RUNNING;
@@ -2526,42 +2432,9 @@ static void __sched_fork(struct task_struct *p)
2526 p->se.sum_exec_runtime = 0; 2432 p->se.sum_exec_runtime = 0;
2527 p->se.prev_sum_exec_runtime = 0; 2433 p->se.prev_sum_exec_runtime = 0;
2528 p->se.nr_migrations = 0; 2434 p->se.nr_migrations = 0;
2529 p->se.last_wakeup = 0;
2530 p->se.avg_overlap = 0;
2531 p->se.start_runtime = 0;
2532 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2533 2435
2534#ifdef CONFIG_SCHEDSTATS 2436#ifdef CONFIG_SCHEDSTATS
2535 p->se.wait_start = 0; 2437 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2536 p->se.wait_max = 0;
2537 p->se.wait_count = 0;
2538 p->se.wait_sum = 0;
2539
2540 p->se.sleep_start = 0;
2541 p->se.sleep_max = 0;
2542 p->se.sum_sleep_runtime = 0;
2543
2544 p->se.block_start = 0;
2545 p->se.block_max = 0;
2546 p->se.exec_max = 0;
2547 p->se.slice_max = 0;
2548
2549 p->se.nr_migrations_cold = 0;
2550 p->se.nr_failed_migrations_affine = 0;
2551 p->se.nr_failed_migrations_running = 0;
2552 p->se.nr_failed_migrations_hot = 0;
2553 p->se.nr_forced_migrations = 0;
2554
2555 p->se.nr_wakeups = 0;
2556 p->se.nr_wakeups_sync = 0;
2557 p->se.nr_wakeups_migrate = 0;
2558 p->se.nr_wakeups_local = 0;
2559 p->se.nr_wakeups_remote = 0;
2560 p->se.nr_wakeups_affine = 0;
2561 p->se.nr_wakeups_affine_attempts = 0;
2562 p->se.nr_wakeups_passive = 0;
2563 p->se.nr_wakeups_idle = 0;
2564
2565#endif 2438#endif
2566 2439
2567 INIT_LIST_HEAD(&p->rt.run_list); 2440 INIT_LIST_HEAD(&p->rt.run_list);
@@ -2582,11 +2455,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
2582 2455
2583 __sched_fork(p); 2456 __sched_fork(p);
2584 /* 2457 /*
2585 * We mark the process as waking here. This guarantees that 2458 * We mark the process as running here. This guarantees that
2586 * nobody will actually run it, and a signal or other external 2459 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either. 2460 * event cannot wake it up and insert it on the runqueue either.
2588 */ 2461 */
2589 p->state = TASK_WAKING; 2462 p->state = TASK_RUNNING;
2590 2463
2591 /* 2464 /*
2592 * Revert to default priority/policy on fork if requested. 2465 * Revert to default priority/policy on fork if requested.
@@ -2621,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
2621 if (p->sched_class->task_fork) 2494 if (p->sched_class->task_fork)
2622 p->sched_class->task_fork(p); 2495 p->sched_class->task_fork(p);
2623 2496
2497 /*
2498 * The child is not yet in the pid-hash so no cgroup attach races,
 2499 * and the cgroup is pinned to this child because cgroup_fork()
 2500 * runs before sched_fork().
2501 *
2502 * Silence PROVE_RCU.
2503 */
2504 rcu_read_lock();
2624 set_task_cpu(p, cpu); 2505 set_task_cpu(p, cpu);
2506 rcu_read_unlock();
2625 2507
2626#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2508#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
2627 if (likely(sched_info_on())) 2509 if (likely(sched_info_on()))
@@ -2650,34 +2532,30 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2650{ 2532{
2651 unsigned long flags; 2533 unsigned long flags;
2652 struct rq *rq; 2534 struct rq *rq;
2653 int cpu = get_cpu(); 2535 int cpu __maybe_unused = get_cpu();
2654 2536
2655#ifdef CONFIG_SMP 2537#ifdef CONFIG_SMP
2538 rq = task_rq_lock(p, &flags);
2539 p->state = TASK_WAKING;
2540
2656 /* 2541 /*
2657 * Fork balancing, do it here and not earlier because: 2542 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path 2543 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug 2544 * - any previously selected cpu might disappear through hotplug
2660 * 2545 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning 2546 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning 2547 * without people poking at ->cpus_allowed.
2663 * cpu_online_mask is stable.
2664 */ 2548 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0); 2549 cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu); 2550 set_task_cpu(p, cpu);
2667#endif
2668 2551
2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2676 BUG_ON(p->state != TASK_WAKING);
2677 p->state = TASK_RUNNING; 2552 p->state = TASK_RUNNING;
2678 update_rq_clock(rq); 2553 task_rq_unlock(rq, &flags);
2554#endif
2555
2556 rq = task_rq_lock(p, &flags);
2679 activate_task(rq, p, 0); 2557 activate_task(rq, p, 0);
2680 trace_sched_wakeup_new(rq, p, 1); 2558 trace_sched_wakeup_new(p, 1);
2681 check_preempt_curr(rq, p, WF_FORK); 2559 check_preempt_curr(rq, p, WF_FORK);
2682#ifdef CONFIG_SMP 2560#ifdef CONFIG_SMP
2683 if (p->sched_class->task_woken) 2561 if (p->sched_class->task_woken)
@@ -2897,7 +2775,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2897 struct mm_struct *mm, *oldmm; 2775 struct mm_struct *mm, *oldmm;
2898 2776
2899 prepare_task_switch(rq, prev, next); 2777 prepare_task_switch(rq, prev, next);
2900 trace_sched_switch(rq, prev, next); 2778 trace_sched_switch(prev, next);
2901 mm = next->mm; 2779 mm = next->mm;
2902 oldmm = prev->active_mm; 2780 oldmm = prev->active_mm;
2903 /* 2781 /*
@@ -2995,9 +2873,9 @@ unsigned long nr_iowait(void)
2995 return sum; 2873 return sum;
2996} 2874}
2997 2875
2998unsigned long nr_iowait_cpu(void) 2876unsigned long nr_iowait_cpu(int cpu)
2999{ 2877{
3000 struct rq *this = this_rq(); 2878 struct rq *this = cpu_rq(cpu);
3001 return atomic_read(&this->nr_iowait); 2879 return atomic_read(&this->nr_iowait);
3002} 2880}
3003 2881
@@ -3014,6 +2892,61 @@ static unsigned long calc_load_update;
3014unsigned long avenrun[3]; 2892unsigned long avenrun[3];
3015EXPORT_SYMBOL(avenrun); 2893EXPORT_SYMBOL(avenrun);
3016 2894
2895static long calc_load_fold_active(struct rq *this_rq)
2896{
2897 long nr_active, delta = 0;
2898
2899 nr_active = this_rq->nr_running;
2900 nr_active += (long) this_rq->nr_uninterruptible;
2901
2902 if (nr_active != this_rq->calc_load_active) {
2903 delta = nr_active - this_rq->calc_load_active;
2904 this_rq->calc_load_active = nr_active;
2905 }
2906
2907 return delta;
2908}
2909
2910#ifdef CONFIG_NO_HZ
2911/*
2912 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
2913 *
2914 * When making the ILB scale, we should try to pull this in as well.
2915 */
2916static atomic_long_t calc_load_tasks_idle;
2917
2918static void calc_load_account_idle(struct rq *this_rq)
2919{
2920 long delta;
2921
2922 delta = calc_load_fold_active(this_rq);
2923 if (delta)
2924 atomic_long_add(delta, &calc_load_tasks_idle);
2925}
2926
2927static long calc_load_fold_idle(void)
2928{
2929 long delta = 0;
2930
2931 /*
 2932 * It's got a race; we don't care...
2933 */
2934 if (atomic_long_read(&calc_load_tasks_idle))
2935 delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
2936
2937 return delta;
2938}
2939#else
2940static void calc_load_account_idle(struct rq *this_rq)
2941{
2942}
2943
2944static inline long calc_load_fold_idle(void)
2945{
2946 return 0;
2947}
2948#endif
2949
3017/** 2950/**
3018 * get_avenrun - get the load average array 2951 * get_avenrun - get the load average array
3019 * @loads: pointer to dest load array 2952 * @loads: pointer to dest load array
@@ -3060,20 +2993,22 @@ void calc_global_load(void)
3060} 2993}
3061 2994
3062/* 2995/*
3063 * Either called from update_cpu_load() or from a cpu going idle 2996 * Called from update_cpu_load() to periodically update this CPU's
2997 * active count.
3064 */ 2998 */
3065static void calc_load_account_active(struct rq *this_rq) 2999static void calc_load_account_active(struct rq *this_rq)
3066{ 3000{
3067 long nr_active, delta; 3001 long delta;
3068 3002
3069 nr_active = this_rq->nr_running; 3003 if (time_before(jiffies, this_rq->calc_load_update))
3070 nr_active += (long) this_rq->nr_uninterruptible; 3004 return;
3071 3005
3072 if (nr_active != this_rq->calc_load_active) { 3006 delta = calc_load_fold_active(this_rq);
3073 delta = nr_active - this_rq->calc_load_active; 3007 delta += calc_load_fold_idle();
3074 this_rq->calc_load_active = nr_active; 3008 if (delta)
3075 atomic_long_add(delta, &calc_load_tasks); 3009 atomic_long_add(delta, &calc_load_tasks);
3076 } 3010
3011 this_rq->calc_load_update += LOAD_FREQ;
3077} 3012}
3078 3013
3079/* 3014/*
@@ -3105,10 +3040,7 @@ static void update_cpu_load(struct rq *this_rq)
3105 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
3106 } 3041 }
3107 3042
3108 if (time_after_eq(jiffies, this_rq->calc_load_update)) { 3043 calc_load_account_active(this_rq);
3109 this_rq->calc_load_update += LOAD_FREQ;
3110 calc_load_account_active(this_rq);
3111 }
3112} 3044}
3113 3045
3114#ifdef CONFIG_SMP 3046#ifdef CONFIG_SMP
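After this change a CPU entering nohz idle folds its nr_running + nr_uninterruptible delta into calc_load_tasks_idle right away, while busy CPUs fold their own delta plus that idle backlog into calc_load_tasks once per LOAD_FREQ from update_cpu_load(). A small user-space model of the fold/exchange dance using C11 atomics (the per-CPU state is simulated, and the benign race the kernel comment admits to is kept):

    #include <stdio.h>
    #include <stdatomic.h>

    static atomic_long calc_load_tasks;
    static atomic_long calc_load_tasks_idle;

    struct demo_rq {
    	long nr_active;		/* nr_running + nr_uninterruptible */
    	long calc_load_active;	/* last value folded out */
    };

    static long fold_active(struct demo_rq *rq)
    {
    	long delta = rq->nr_active - rq->calc_load_active;

    	rq->calc_load_active = rq->nr_active;
    	return delta;
    }

    /* Going idle: park the delta where a busy CPU will pick it up later. */
    static void account_idle(struct demo_rq *rq)
    {
    	long delta = fold_active(rq);

    	if (delta)
    		atomic_fetch_add(&calc_load_tasks_idle, delta);
    }

    /* Periodic update on a busy CPU: fold own delta plus the idle backlog. */
    static void account_active(struct demo_rq *rq)
    {
    	long delta = fold_active(rq);

    	delta += atomic_exchange(&calc_load_tasks_idle, 0);
    	if (delta)
    		atomic_fetch_add(&calc_load_tasks, delta);
    }

    int main(void)
    {
    	struct demo_rq idle_cpu = { .nr_active = 0, .calc_load_active = 2 };
    	struct demo_rq busy_cpu = { .nr_active = 7, .calc_load_active = 4 };

    	account_idle(&idle_cpu);	/* -2 parked in the idle counter */
    	account_active(&busy_cpu);	/* +3 own delta, -2 picked up */
    	printf("calc_load_tasks=%ld\n", atomic_load(&calc_load_tasks));
    	return 0;
    }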
@@ -3120,44 +3052,27 @@ static void update_cpu_load(struct rq *this_rq)
3120void sched_exec(void) 3052void sched_exec(void)
3121{ 3053{
3122 struct task_struct *p = current; 3054 struct task_struct *p = current;
3123 struct migration_req req;
3124 int dest_cpu, this_cpu;
3125 unsigned long flags; 3055 unsigned long flags;
3126 struct rq *rq; 3056 struct rq *rq;
3127 3057 int dest_cpu;
3128again:
3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) {
3132 put_cpu();
3133 return;
3134 }
3135 3058
3136 rq = task_rq_lock(p, &flags); 3059 rq = task_rq_lock(p, &flags);
3137 put_cpu(); 3060 dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
3061 if (dest_cpu == smp_processor_id())
3062 goto unlock;
3138 3063
3139 /* 3064 /*
3140 * select_task_rq() can race against ->cpus_allowed 3065 * select_task_rq() can race against ->cpus_allowed
3141 */ 3066 */
3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3067 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3143 || unlikely(!cpu_active(dest_cpu))) { 3068 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) {
3144 task_rq_unlock(rq, &flags); 3069 struct migration_arg arg = { p, dest_cpu };
3145 goto again;
3146 }
3147
3148 /* force the process onto the specified CPU */
3149 if (migrate_task(p, dest_cpu, &req)) {
3150 /* Need to wait for migration thread (might exit: take ref). */
3151 struct task_struct *mt = rq->migration_thread;
3152 3070
3153 get_task_struct(mt);
3154 task_rq_unlock(rq, &flags); 3071 task_rq_unlock(rq, &flags);
3155 wake_up_process(mt); 3072 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
3156 put_task_struct(mt);
3157 wait_for_completion(&req.done);
3158
3159 return; 3073 return;
3160 } 3074 }
3075unlock:
3161 task_rq_unlock(rq, &flags); 3076 task_rq_unlock(rq, &flags);
3162} 3077}
3163 3078
@@ -3629,23 +3544,9 @@ static inline void schedule_debug(struct task_struct *prev)
3629 3544
3630static void put_prev_task(struct rq *rq, struct task_struct *prev) 3545static void put_prev_task(struct rq *rq, struct task_struct *prev)
3631{ 3546{
3632 if (prev->state == TASK_RUNNING) { 3547 if (prev->se.on_rq)
3633 u64 runtime = prev->se.sum_exec_runtime; 3548 update_rq_clock(rq);
3634 3549 rq->skip_clock_update = 0;
3635 runtime -= prev->se.prev_sum_exec_runtime;
3636 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
3637
3638 /*
3639 * In order to avoid avg_overlap growing stale when we are
3640 * indeed overlapping and hence not getting put to sleep, grow
3641 * the avg_overlap on preemption.
3642 *
3643 * We use the average preemption runtime because that
3644 * correlates to the amount of cache footprint a task can
3645 * build up.
3646 */
3647 update_avg(&prev->se.avg_overlap, runtime);
3648 }
3649 prev->sched_class->put_prev_task(rq, prev); 3550 prev->sched_class->put_prev_task(rq, prev);
3650} 3551}
3651 3552
@@ -3695,7 +3596,7 @@ need_resched:
3695 preempt_disable(); 3596 preempt_disable();
3696 cpu = smp_processor_id(); 3597 cpu = smp_processor_id();
3697 rq = cpu_rq(cpu); 3598 rq = cpu_rq(cpu);
3698 rcu_sched_qs(cpu); 3599 rcu_note_context_switch(cpu);
3699 prev = rq->curr; 3600 prev = rq->curr;
3700 switch_count = &prev->nivcsw; 3601 switch_count = &prev->nivcsw;
3701 3602
@@ -3708,14 +3609,13 @@ need_resched_nonpreemptible:
3708 hrtick_clear(rq); 3609 hrtick_clear(rq);
3709 3610
3710 raw_spin_lock_irq(&rq->lock); 3611 raw_spin_lock_irq(&rq->lock);
3711 update_rq_clock(rq);
3712 clear_tsk_need_resched(prev); 3612 clear_tsk_need_resched(prev);
3713 3613
3714 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3715 if (unlikely(signal_pending_state(prev->state, prev))) 3615 if (unlikely(signal_pending_state(prev->state, prev)))
3716 prev->state = TASK_RUNNING; 3616 prev->state = TASK_RUNNING;
3717 else 3617 else
3718 deactivate_task(rq, prev, 1); 3618 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3719 switch_count = &prev->nvcsw; 3619 switch_count = &prev->nvcsw;
3720 } 3620 }
3721 3621
@@ -3779,7 +3679,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3779 * the mutex owner just released it and exited. 3679 * the mutex owner just released it and exited.
3780 */ 3680 */
3781 if (probe_kernel_address(&owner->cpu, cpu)) 3681 if (probe_kernel_address(&owner->cpu, cpu))
3782 goto out; 3682 return 0;
3783#else 3683#else
3784 cpu = owner->cpu; 3684 cpu = owner->cpu;
3785#endif 3685#endif
@@ -3789,14 +3689,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3789 * the cpu field may no longer be valid. 3689 * the cpu field may no longer be valid.
3790 */ 3690 */
3791 if (cpu >= nr_cpumask_bits) 3691 if (cpu >= nr_cpumask_bits)
3792 goto out; 3692 return 0;
3793 3693
3794 /* 3694 /*
3795 * We need to validate that we can do a 3695 * We need to validate that we can do a
3796 * get_cpu() and that we have the percpu area. 3696 * get_cpu() and that we have the percpu area.
3797 */ 3697 */
3798 if (!cpu_online(cpu)) 3698 if (!cpu_online(cpu))
3799 goto out; 3699 return 0;
3800 3700
3801 rq = cpu_rq(cpu); 3701 rq = cpu_rq(cpu);
3802 3702
@@ -3815,7 +3715,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3815 3715
3816 cpu_relax(); 3716 cpu_relax();
3817 } 3717 }
3818out: 3718
3819 return 1; 3719 return 1;
3820} 3720}
3821#endif 3721#endif
@@ -3939,6 +3839,7 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
3939{ 3839{
3940 __wake_up_common(q, mode, 1, 0, NULL); 3840 __wake_up_common(q, mode, 1, 0, NULL);
3941} 3841}
3842EXPORT_SYMBOL_GPL(__wake_up_locked);
3942 3843
3943void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) 3844void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
3944{ 3845{
@@ -4038,8 +3939,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4038 if (!x->done) { 3939 if (!x->done) {
4039 DECLARE_WAITQUEUE(wait, current); 3940 DECLARE_WAITQUEUE(wait, current);
4040 3941
4041 wait.flags |= WQ_FLAG_EXCLUSIVE; 3942 __add_wait_queue_tail_exclusive(&x->wait, &wait);
4042 __add_wait_queue_tail(&x->wait, &wait);
4043 do { 3943 do {
4044 if (signal_pending_state(state, current)) { 3944 if (signal_pending_state(state, current)) {
4045 timeout = -ERESTARTSYS; 3945 timeout = -ERESTARTSYS;
@@ -4150,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x)
4150EXPORT_SYMBOL(wait_for_completion_killable); 4050EXPORT_SYMBOL(wait_for_completion_killable);
4151 4051
4152/** 4052/**
 4053 * wait_for_completion_killable_timeout - waits for completion of a task (w/(to,killable))
4054 * @x: holds the state of this particular completion
4055 * @timeout: timeout value in jiffies
4056 *
4057 * This waits for either a completion of a specific task to be
4058 * signaled or for a specified timeout to expire. It can be
4059 * interrupted by a kill signal. The timeout is in jiffies.
4060 */
4061unsigned long __sched
4062wait_for_completion_killable_timeout(struct completion *x,
4063 unsigned long timeout)
4064{
4065 return wait_for_common(x, timeout, TASK_KILLABLE);
4066}
4067EXPORT_SYMBOL(wait_for_completion_killable_timeout);
4068
4069/**
4153 * try_wait_for_completion - try to decrement a completion without blocking 4070 * try_wait_for_completion - try to decrement a completion without blocking
4154 * @x: completion structure 4071 * @x: completion structure
4155 * 4072 *
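wait_for_completion_killable_timeout() combines the killable and timeout variants: it returns 0 on timeout, a negative value (-ERESTARTSYS) if a fatal signal arrived, and otherwise the jiffies left, so callers typically keep the result in a signed long even though the helper is declared unsigned long here. A hedged kernel-context sketch of a caller (the completion, the error mapping, and the 100 ms budget are illustrative):

    /* Kernel-context sketch (assumes <linux/completion.h>, <linux/jiffies.h>). */
    static int wait_for_probe_sketch(struct completion *probe_done)
    {
    	long ret;

    	ret = wait_for_completion_killable_timeout(probe_done,
    						   msecs_to_jiffies(100));
    	if (ret == 0)
    		return -ETIMEDOUT;	/* nothing answered in time */
    	if (ret < 0)
    		return ret;		/* killed: -ERESTARTSYS */
    	return 0;			/* completed with 'ret' jiffies to spare */
    }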
@@ -4265,7 +4182,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4265 BUG_ON(prio < 0 || prio > MAX_PRIO); 4182 BUG_ON(prio < 0 || prio > MAX_PRIO);
4266 4183
4267 rq = task_rq_lock(p, &flags); 4184 rq = task_rq_lock(p, &flags);
4268 update_rq_clock(rq);
4269 4185
4270 oldprio = p->prio; 4186 oldprio = p->prio;
4271 prev_class = p->sched_class; 4187 prev_class = p->sched_class;
@@ -4286,7 +4202,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4286 if (running) 4202 if (running)
4287 p->sched_class->set_curr_task(rq); 4203 p->sched_class->set_curr_task(rq);
4288 if (on_rq) { 4204 if (on_rq) {
4289 enqueue_task(rq, p, 0, oldprio < prio); 4205 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4290 4206
4291 check_class_changed(rq, p, prev_class, oldprio, running); 4207 check_class_changed(rq, p, prev_class, oldprio, running);
4292 } 4208 }
@@ -4308,7 +4224,6 @@ void set_user_nice(struct task_struct *p, long nice)
4308 * the task might be in the middle of scheduling on another CPU. 4224 * the task might be in the middle of scheduling on another CPU.
4309 */ 4225 */
4310 rq = task_rq_lock(p, &flags); 4226 rq = task_rq_lock(p, &flags);
4311 update_rq_clock(rq);
4312 /* 4227 /*
4313 * The RT priorities are set via sched_setscheduler(), but we still 4228 * The RT priorities are set via sched_setscheduler(), but we still
4314 * allow the 'normal' nice value to be set - but as expected 4229 * allow the 'normal' nice value to be set - but as expected
@@ -4330,7 +4245,7 @@ void set_user_nice(struct task_struct *p, long nice)
4330 delta = p->prio - old_prio; 4245 delta = p->prio - old_prio;
4331 4246
4332 if (on_rq) { 4247 if (on_rq) {
4333 enqueue_task(rq, p, 0, false); 4248 enqueue_task(rq, p, 0);
4334 /* 4249 /*
4335 * If the task increased its priority or is running and 4250 * If the task increased its priority or is running and
4336 * lowered its priority, then reschedule its CPU: 4251 * lowered its priority, then reschedule its CPU:
@@ -4559,16 +4474,6 @@ recheck:
4559 } 4474 }
4560 4475
4561 if (user) { 4476 if (user) {
4562#ifdef CONFIG_RT_GROUP_SCHED
4563 /*
4564 * Do not allow realtime tasks into groups that have no runtime
4565 * assigned.
4566 */
4567 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4568 task_group(p)->rt_bandwidth.rt_runtime == 0)
4569 return -EPERM;
4570#endif
4571
4572 retval = security_task_setscheduler(p, policy, param); 4477 retval = security_task_setscheduler(p, policy, param);
4573 if (retval) 4478 if (retval)
4574 return retval; 4479 return retval;
@@ -4584,6 +4489,22 @@ recheck:
4584 * runqueue lock must be held. 4489 * runqueue lock must be held.
4585 */ 4490 */
4586 rq = __task_rq_lock(p); 4491 rq = __task_rq_lock(p);
4492
4493#ifdef CONFIG_RT_GROUP_SCHED
4494 if (user) {
4495 /*
4496 * Do not allow realtime tasks into groups that have no runtime
4497 * assigned.
4498 */
4499 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4500 task_group(p)->rt_bandwidth.rt_runtime == 0) {
4501 __task_rq_unlock(rq);
4502 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4503 return -EPERM;
4504 }
4505 }
4506#endif
4507
4587 /* recheck policy now with rq lock held */ 4508 /* recheck policy now with rq lock held */
4588 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4509 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4589 policy = oldpolicy = -1; 4510 policy = oldpolicy = -1;
@@ -4591,7 +4512,6 @@ recheck:
4591 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4512 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4592 goto recheck; 4513 goto recheck;
4593 } 4514 }
4594 update_rq_clock(rq);
4595 on_rq = p->se.on_rq; 4515 on_rq = p->se.on_rq;
4596 running = task_current(rq, p); 4516 running = task_current(rq, p);
4597 if (on_rq) 4517 if (on_rq)
@@ -4902,7 +4822,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4902 int ret; 4822 int ret;
4903 cpumask_var_t mask; 4823 cpumask_var_t mask;
4904 4824
4905 if (len < cpumask_size()) 4825 if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4826 return -EINVAL;
4827 if (len & (sizeof(unsigned long)-1))
4906 return -EINVAL; 4828 return -EINVAL;
4907 4829
4908 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) 4830 if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4910,10 +4832,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4910 4832
4911 ret = sched_getaffinity(pid, mask); 4833 ret = sched_getaffinity(pid, mask);
4912 if (ret == 0) { 4834 if (ret == 0) {
4913 if (copy_to_user(user_mask_ptr, mask, cpumask_size())) 4835 size_t retlen = min_t(size_t, len, cpumask_size());
4836
4837 if (copy_to_user(user_mask_ptr, mask, retlen))
4914 ret = -EFAULT; 4838 ret = -EFAULT;
4915 else 4839 else
4916 ret = cpumask_size(); 4840 ret = retlen;
4917 } 4841 }
4918 free_cpumask_var(mask); 4842 free_cpumask_var(mask);
4919 4843
@@ -5324,17 +5248,15 @@ static inline void sched_init_granularity(void)
5324/* 5248/*
5325 * This is how migration works: 5249 * This is how migration works:
5326 * 5250 *
5327 * 1) we queue a struct migration_req structure in the source CPU's 5251 * 1) we invoke migration_cpu_stop() on the target CPU using
5328 * runqueue and wake up that CPU's migration thread. 5252 * stop_one_cpu().
5329 * 2) we down() the locked semaphore => thread blocks. 5253 * 2) stopper starts to run (implicitly forcing the migrated thread
5330 * 3) migration thread wakes up (implicitly it forces the migrated 5254 * off the CPU)
5331 * thread off the CPU) 5255 * 3) it checks whether the migrated task is still in the wrong runqueue.
5332 *    task is still in the wrong runqueue.				 5256 * 4) if it's in the wrong runqueue then the stopper removes
5333 * task is still in the wrong runqueue.
5334 * 5) if it's in the wrong runqueue then the migration thread removes
5335 * it and puts it into the right queue. 5257 * it and puts it into the right queue.
5336 * 6) migration thread up()s the semaphore. 5258 * 5) stopper completes and stop_one_cpu() returns and the migration
5337 * 7) we wake up and the migration is done. 5259 * is done.
5338 */ 5260 */
5339 5261
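Since set_cpus_allowed_ptr() is what sched_setaffinity(2) ends up calling, the sequence above can be exercised from user space simply by moving the calling thread between CPUs. A minimal sketch, assuming at least two online CPUs that the task is allowed to use:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static void pin_to(int cpu)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(cpu, &set);
        /* If we are currently running outside the new mask, this is the
         * call that can end up kicking the stopper on our old CPU. */
        if (sched_setaffinity(0, sizeof(set), &set))
                perror("sched_setaffinity");
}

int main(void)
{
        pin_to(0);
        printf("running on CPU %d\n", sched_getcpu());
        pin_to(1);
        printf("running on CPU %d\n", sched_getcpu());
        return 0;
}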
5340/* 5262/*
@@ -5348,12 +5270,23 @@ static inline void sched_init_granularity(void)
5348 */ 5270 */
5349int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) 5271int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5350{ 5272{
5351 struct migration_req req;
5352 unsigned long flags; 5273 unsigned long flags;
5353 struct rq *rq; 5274 struct rq *rq;
5275 unsigned int dest_cpu;
5354 int ret = 0; 5276 int ret = 0;
5355 5277
5278 /*
 5279	 * Serialize against TASK_WAKING so that ttwu() and wake_up_new_task() can
5280 * drop the rq->lock and still rely on ->cpus_allowed.
5281 */
5282again:
5283 while (task_is_waking(p))
5284 cpu_relax();
5356 rq = task_rq_lock(p, &flags); 5285 rq = task_rq_lock(p, &flags);
5286 if (task_is_waking(p)) {
5287 task_rq_unlock(rq, &flags);
5288 goto again;
5289 }
5357 5290
5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5291 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5359 ret = -EINVAL; 5292 ret = -EINVAL;
@@ -5377,15 +5310,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5377 if (cpumask_test_cpu(task_cpu(p), new_mask)) 5310 if (cpumask_test_cpu(task_cpu(p), new_mask))
5378 goto out; 5311 goto out;
5379 5312
5380 if (migrate_task(p, cpumask_any_and(cpu_active_mask, new_mask), &req)) { 5313 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5314 if (migrate_task(p, dest_cpu)) {
5315 struct migration_arg arg = { p, dest_cpu };
5381 /* Need help from migration thread: drop lock and wait. */ 5316 /* Need help from migration thread: drop lock and wait. */
5382 struct task_struct *mt = rq->migration_thread;
5383
5384 get_task_struct(mt);
5385 task_rq_unlock(rq, &flags); 5317 task_rq_unlock(rq, &flags);
5386 wake_up_process(rq->migration_thread); 5318 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5387 put_task_struct(mt);
5388 wait_for_completion(&req.done);
5389 tlb_migrate_finish(p->mm); 5319 tlb_migrate_finish(p->mm);
5390 return 0; 5320 return 0;
5391 } 5321 }
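The shape used here, wait for the flag outside the lock, take the lock, recheck, and retry if the flag reappeared, is a generic check/recheck pattern. A stand-alone model of just that pattern with C11 atomics and a pthread mutex (purely illustrative, no kernel interfaces):

#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool waking;              /* models p->state == TASK_WAKING */
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

static void change_affinity(void)
{
again:
        /* Wait outside the lock first ... */
        while (atomic_load(&waking))
                sched_yield();          /* stands in for cpu_relax() */

        pthread_mutex_lock(&rq_lock);
        /* ... then recheck under the lock; the flag may have been set
         * again between the busy-wait and the lock acquisition. */
        if (atomic_load(&waking)) {
                pthread_mutex_unlock(&rq_lock);
                goto again;
        }
        /* safe to inspect and modify the protected state here */
        pthread_mutex_unlock(&rq_lock);
}

int main(void)
{
        atomic_store(&waking, false);
        change_affinity();
        return 0;
}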
@@ -5443,98 +5373,49 @@ fail:
5443 return ret; 5373 return ret;
5444} 5374}
5445 5375
5446#define RCU_MIGRATION_IDLE 0
5447#define RCU_MIGRATION_NEED_QS 1
5448#define RCU_MIGRATION_GOT_QS 2
5449#define RCU_MIGRATION_MUST_SYNC 3
5450
5451/* 5376/*
5452 * migration_thread - this is a highprio system thread that performs 5377 * migration_cpu_stop - this will be executed by a highprio stopper thread
5453 * thread migration by bumping thread off CPU then 'pushing' onto 5378 * and performs thread migration by bumping thread off CPU then
5454 * another runqueue. 5379 * 'pushing' onto another runqueue.
5455 */ 5380 */
5456static int migration_thread(void *data) 5381static int migration_cpu_stop(void *data)
5457{ 5382{
5458 int badcpu; 5383 struct migration_arg *arg = data;
5459 int cpu = (long)data;
5460 struct rq *rq;
5461
5462 rq = cpu_rq(cpu);
5463 BUG_ON(rq->migration_thread != current);
5464
5465 set_current_state(TASK_INTERRUPTIBLE);
5466 while (!kthread_should_stop()) {
5467 struct migration_req *req;
5468 struct list_head *head;
5469
5470 raw_spin_lock_irq(&rq->lock);
5471
5472 if (cpu_is_offline(cpu)) {
5473 raw_spin_unlock_irq(&rq->lock);
5474 break;
5475 }
5476
5477 if (rq->active_balance) {
5478 active_load_balance(rq, cpu);
5479 rq->active_balance = 0;
5480 }
5481
5482 head = &rq->migration_queue;
5483
5484 if (list_empty(head)) {
5485 raw_spin_unlock_irq(&rq->lock);
5486 schedule();
5487 set_current_state(TASK_INTERRUPTIBLE);
5488 continue;
5489 }
5490 req = list_entry(head->next, struct migration_req, list);
5491 list_del_init(head->next);
5492
5493 if (req->task != NULL) {
5494 raw_spin_unlock(&rq->lock);
5495 __migrate_task(req->task, cpu, req->dest_cpu);
5496 } else if (likely(cpu == (badcpu = smp_processor_id()))) {
5497 req->dest_cpu = RCU_MIGRATION_GOT_QS;
5498 raw_spin_unlock(&rq->lock);
5499 } else {
5500 req->dest_cpu = RCU_MIGRATION_MUST_SYNC;
5501 raw_spin_unlock(&rq->lock);
5502 WARN_ONCE(1, "migration_thread() on CPU %d, expected %d\n", badcpu, cpu);
5503 }
5504 local_irq_enable();
5505
5506 complete(&req->done);
5507 }
5508 __set_current_state(TASK_RUNNING);
5509
5510 return 0;
5511}
5512
5513#ifdef CONFIG_HOTPLUG_CPU
5514
5515static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
5516{
5517 int ret;
5518 5384
5385 /*
5386 * The original target cpu might have gone down and we might
5387 * be on another cpu but it doesn't matter.
5388 */
5519 local_irq_disable(); 5389 local_irq_disable();
5520 ret = __migrate_task(p, src_cpu, dest_cpu); 5390 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
5521 local_irq_enable(); 5391 local_irq_enable();
5522 return ret; 5392 return 0;
5523} 5393}
5524 5394
5395#ifdef CONFIG_HOTPLUG_CPU
5525/* 5396/*
5526 * Figure out where task on dead CPU should go, use force if necessary. 5397 * Figure out where task on dead CPU should go, use force if necessary.
5527 */ 5398 */
5528static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5399void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5529{ 5400{
5530 int dest_cpu; 5401 struct rq *rq = cpu_rq(dead_cpu);
5402 int needs_cpu, uninitialized_var(dest_cpu);
5403 unsigned long flags;
5531 5404
5532again: 5405 local_irq_save(flags);
5533 dest_cpu = select_fallback_rq(dead_cpu, p);
5534 5406
5535 /* It can have affinity changed while we were choosing. */ 5407 raw_spin_lock(&rq->lock);
5536 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5408 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING);
5537 goto again; 5409 if (needs_cpu)
5410 dest_cpu = select_fallback_rq(dead_cpu, p);
5411 raw_spin_unlock(&rq->lock);
5412 /*
5413 * It can only fail if we race with set_cpus_allowed(),
 5414	 * in which case the racer should migrate the task anyway.
5415 */
5416 if (needs_cpu)
5417 __migrate_task(p, dead_cpu, dest_cpu);
5418 local_irq_restore(flags);
5538} 5419}
5539 5420
5540/* 5421/*
@@ -5598,7 +5479,6 @@ void sched_idle_next(void)
5598 5479
5599 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5480 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5600 5481
5601 update_rq_clock(rq);
5602 activate_task(rq, p, 0); 5482 activate_task(rq, p, 0);
5603 5483
5604 raw_spin_unlock_irqrestore(&rq->lock, flags); 5484 raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -5653,7 +5533,6 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5653 for ( ; ; ) { 5533 for ( ; ; ) {
5654 if (!rq->nr_running) 5534 if (!rq->nr_running)
5655 break; 5535 break;
5656 update_rq_clock(rq);
5657 next = pick_next_task(rq); 5536 next = pick_next_task(rq);
5658 if (!next) 5537 if (!next)
5659 break; 5538 break;
@@ -5876,35 +5755,20 @@ static void set_rq_offline(struct rq *rq)
5876static int __cpuinit 5755static int __cpuinit
5877migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) 5756migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5878{ 5757{
5879 struct task_struct *p;
5880 int cpu = (long)hcpu; 5758 int cpu = (long)hcpu;
5881 unsigned long flags; 5759 unsigned long flags;
5882 struct rq *rq; 5760 struct rq *rq = cpu_rq(cpu);
5883 5761
5884 switch (action) { 5762 switch (action) {
5885 5763
5886 case CPU_UP_PREPARE: 5764 case CPU_UP_PREPARE:
5887 case CPU_UP_PREPARE_FROZEN: 5765 case CPU_UP_PREPARE_FROZEN:
5888 p = kthread_create(migration_thread, hcpu, "migration/%d", cpu);
5889 if (IS_ERR(p))
5890 return NOTIFY_BAD;
5891 kthread_bind(p, cpu);
5892 /* Must be high prio: stop_machine expects to yield to it. */
5893 rq = task_rq_lock(p, &flags);
5894 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5895 task_rq_unlock(rq, &flags);
5896 get_task_struct(p);
5897 cpu_rq(cpu)->migration_thread = p;
5898 rq->calc_load_update = calc_load_update; 5766 rq->calc_load_update = calc_load_update;
5899 break; 5767 break;
5900 5768
5901 case CPU_ONLINE: 5769 case CPU_ONLINE:
5902 case CPU_ONLINE_FROZEN: 5770 case CPU_ONLINE_FROZEN:
5903 /* Strictly unnecessary, as first user will wake it. */
5904 wake_up_process(cpu_rq(cpu)->migration_thread);
5905
5906 /* Update our root-domain */ 5771 /* Update our root-domain */
5907 rq = cpu_rq(cpu);
5908 raw_spin_lock_irqsave(&rq->lock, flags); 5772 raw_spin_lock_irqsave(&rq->lock, flags);
5909 if (rq->rd) { 5773 if (rq->rd) {
5910 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5774 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -5915,61 +5779,24 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5915 break; 5779 break;
5916 5780
5917#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5918 case CPU_UP_CANCELED:
5919 case CPU_UP_CANCELED_FROZEN:
5920 if (!cpu_rq(cpu)->migration_thread)
5921 break;
5922 /* Unbind it from offline cpu so it can run. Fall thru. */
5923 kthread_bind(cpu_rq(cpu)->migration_thread,
5924 cpumask_any(cpu_online_mask));
5925 kthread_stop(cpu_rq(cpu)->migration_thread);
5926 put_task_struct(cpu_rq(cpu)->migration_thread);
5927 cpu_rq(cpu)->migration_thread = NULL;
5928 break;
5929
5930 case CPU_DEAD: 5782 case CPU_DEAD:
5931 case CPU_DEAD_FROZEN: 5783 case CPU_DEAD_FROZEN:
5932 cpuset_lock(); /* around calls to cpuset_cpus_allowed_lock() */
5933 migrate_live_tasks(cpu); 5784 migrate_live_tasks(cpu);
5934 rq = cpu_rq(cpu);
5935 kthread_stop(rq->migration_thread);
5936 put_task_struct(rq->migration_thread);
5937 rq->migration_thread = NULL;
5938 /* Idle task back to normal (off runqueue, low prio) */ 5785 /* Idle task back to normal (off runqueue, low prio) */
5939 raw_spin_lock_irq(&rq->lock); 5786 raw_spin_lock_irq(&rq->lock);
5940 update_rq_clock(rq);
5941 deactivate_task(rq, rq->idle, 0); 5787 deactivate_task(rq, rq->idle, 0);
5942 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); 5788 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5943 rq->idle->sched_class = &idle_sched_class; 5789 rq->idle->sched_class = &idle_sched_class;
5944 migrate_dead_tasks(cpu); 5790 migrate_dead_tasks(cpu);
5945 raw_spin_unlock_irq(&rq->lock); 5791 raw_spin_unlock_irq(&rq->lock);
5946 cpuset_unlock();
5947 migrate_nr_uninterruptible(rq); 5792 migrate_nr_uninterruptible(rq);
5948 BUG_ON(rq->nr_running != 0); 5793 BUG_ON(rq->nr_running != 0);
5949 calc_global_load_remove(rq); 5794 calc_global_load_remove(rq);
5950 /*
5951 * No need to migrate the tasks: it was best-effort if
5952 * they didn't take sched_hotcpu_mutex. Just wake up
5953 * the requestors.
5954 */
5955 raw_spin_lock_irq(&rq->lock);
5956 while (!list_empty(&rq->migration_queue)) {
5957 struct migration_req *req;
5958
5959 req = list_entry(rq->migration_queue.next,
5960 struct migration_req, list);
5961 list_del_init(&req->list);
5962 raw_spin_unlock_irq(&rq->lock);
5963 complete(&req->done);
5964 raw_spin_lock_irq(&rq->lock);
5965 }
5966 raw_spin_unlock_irq(&rq->lock);
5967 break; 5795 break;
5968 5796
5969 case CPU_DYING: 5797 case CPU_DYING:
5970 case CPU_DYING_FROZEN: 5798 case CPU_DYING_FROZEN:
5971 /* Update our root-domain */ 5799 /* Update our root-domain */
5972 rq = cpu_rq(cpu);
5973 raw_spin_lock_irqsave(&rq->lock, flags); 5800 raw_spin_lock_irqsave(&rq->lock, flags);
5974 if (rq->rd) { 5801 if (rq->rd) {
5975 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 5802 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
@@ -6300,6 +6127,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6300 struct rq *rq = cpu_rq(cpu); 6127 struct rq *rq = cpu_rq(cpu);
6301 struct sched_domain *tmp; 6128 struct sched_domain *tmp;
6302 6129
6130 for (tmp = sd; tmp; tmp = tmp->parent)
6131 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6132
6303 /* Remove the sched domains which do not contribute to scheduling. */ 6133 /* Remove the sched domains which do not contribute to scheduling. */
6304 for (tmp = sd; tmp; ) { 6134 for (tmp = sd; tmp; ) {
6305 struct sched_domain *parent = tmp->parent; 6135 struct sched_domain *parent = tmp->parent;
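Caching cpumask_weight(sched_domain_span(tmp)) in the new span_weight field avoids recomputing a population count on hot wakeup and balance paths; the weight is simply the number of CPUs set in the span. For illustration only, the user-space equivalent of that count:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t span;

        CPU_ZERO(&span);
        CPU_SET(0, &span);
        CPU_SET(1, &span);
        CPU_SET(4, &span);

        /* "span weight" is just the population count of the mask; the
         * patch caches it once instead of recounting it every time. */
        printf("span_weight = %d\n", CPU_COUNT(&span));
        return 0;
}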
@@ -7777,16 +7607,15 @@ void __init sched_init(void)
7777#ifdef CONFIG_SMP 7607#ifdef CONFIG_SMP
7778 rq->sd = NULL; 7608 rq->sd = NULL;
7779 rq->rd = NULL; 7609 rq->rd = NULL;
7610 rq->cpu_power = SCHED_LOAD_SCALE;
7780 rq->post_schedule = 0; 7611 rq->post_schedule = 0;
7781 rq->active_balance = 0; 7612 rq->active_balance = 0;
7782 rq->next_balance = jiffies; 7613 rq->next_balance = jiffies;
7783 rq->push_cpu = 0; 7614 rq->push_cpu = 0;
7784 rq->cpu = i; 7615 rq->cpu = i;
7785 rq->online = 0; 7616 rq->online = 0;
7786 rq->migration_thread = NULL;
7787 rq->idle_stamp = 0; 7617 rq->idle_stamp = 0;
7788 rq->avg_idle = 2*sysctl_sched_migration_cost; 7618 rq->avg_idle = 2*sysctl_sched_migration_cost;
7789 INIT_LIST_HEAD(&rq->migration_queue);
7790 rq_attach_root(rq, &def_root_domain); 7619 rq_attach_root(rq, &def_root_domain);
7791#endif 7620#endif
7792 init_rq_hrtick(rq); 7621 init_rq_hrtick(rq);
@@ -7887,7 +7716,6 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7887{ 7716{
7888 int on_rq; 7717 int on_rq;
7889 7718
7890 update_rq_clock(rq);
7891 on_rq = p->se.on_rq; 7719 on_rq = p->se.on_rq;
7892 if (on_rq) 7720 if (on_rq)
7893 deactivate_task(rq, p, 0); 7721 deactivate_task(rq, p, 0);
@@ -7914,9 +7742,9 @@ void normalize_rt_tasks(void)
7914 7742
7915 p->se.exec_start = 0; 7743 p->se.exec_start = 0;
7916#ifdef CONFIG_SCHEDSTATS 7744#ifdef CONFIG_SCHEDSTATS
7917 p->se.wait_start = 0; 7745 p->se.statistics.wait_start = 0;
7918 p->se.sleep_start = 0; 7746 p->se.statistics.sleep_start = 0;
7919 p->se.block_start = 0; 7747 p->se.statistics.block_start = 0;
7920#endif 7748#endif
7921 7749
7922 if (!rt_task(p)) { 7750 if (!rt_task(p)) {
@@ -7943,9 +7771,9 @@ void normalize_rt_tasks(void)
7943 7771
7944#endif /* CONFIG_MAGIC_SYSRQ */ 7772#endif /* CONFIG_MAGIC_SYSRQ */
7945 7773
7946#ifdef CONFIG_IA64 7774#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
7947/* 7775/*
7948 * These functions are only useful for the IA64 MCA handling. 7776 * These functions are only useful for the IA64 MCA handling, or kdb.
7949 * 7777 *
7950 * They can only be called when the whole system has been 7778 * They can only be called when the whole system has been
7951 * stopped - every CPU needs to be quiescent, and no scheduling 7779 * stopped - every CPU needs to be quiescent, and no scheduling
@@ -7965,6 +7793,9 @@ struct task_struct *curr_task(int cpu)
7965 return cpu_curr(cpu); 7793 return cpu_curr(cpu);
7966} 7794}
7967 7795
7796#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
7797
7798#ifdef CONFIG_IA64
7968/** 7799/**
7969 * set_curr_task - set the current task for a given cpu. 7800 * set_curr_task - set the current task for a given cpu.
7970 * @cpu: the processor in question. 7801 * @cpu: the processor in question.
@@ -8249,8 +8080,6 @@ void sched_move_task(struct task_struct *tsk)
8249 8080
8250 rq = task_rq_lock(tsk, &flags); 8081 rq = task_rq_lock(tsk, &flags);
8251 8082
8252 update_rq_clock(rq);
8253
8254 running = task_current(rq, tsk); 8083 running = task_current(rq, tsk);
8255 on_rq = tsk->se.on_rq; 8084 on_rq = tsk->se.on_rq;
8256 8085
@@ -8269,7 +8098,7 @@ void sched_move_task(struct task_struct *tsk)
8269 if (unlikely(running)) 8098 if (unlikely(running))
8270 tsk->sched_class->set_curr_task(rq); 8099 tsk->sched_class->set_curr_task(rq);
8271 if (on_rq) 8100 if (on_rq)
8272 enqueue_task(rq, tsk, 0, false); 8101 enqueue_task(rq, tsk, 0);
8273 8102
8274 task_rq_unlock(rq, &flags); 8103 task_rq_unlock(rq, &flags);
8275} 8104}
@@ -9083,43 +8912,32 @@ struct cgroup_subsys cpuacct_subsys = {
9083 8912
9084#ifndef CONFIG_SMP 8913#ifndef CONFIG_SMP
9085 8914
9086int rcu_expedited_torture_stats(char *page)
9087{
9088 return 0;
9089}
9090EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9091
9092void synchronize_sched_expedited(void) 8915void synchronize_sched_expedited(void)
9093{ 8916{
8917 barrier();
9094} 8918}
9095EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8919EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9096 8920
9097#else /* #ifndef CONFIG_SMP */ 8921#else /* #ifndef CONFIG_SMP */
9098 8922
9099static DEFINE_PER_CPU(struct migration_req, rcu_migration_req); 8923static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9100static DEFINE_MUTEX(rcu_sched_expedited_mutex);
9101 8924
9102#define RCU_EXPEDITED_STATE_POST -2 8925static int synchronize_sched_expedited_cpu_stop(void *data)
9103#define RCU_EXPEDITED_STATE_IDLE -1
9104
9105static int rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9106
9107int rcu_expedited_torture_stats(char *page)
9108{ 8926{
9109 int cnt = 0; 8927 /*
9110 int cpu; 8928 * There must be a full memory barrier on each affected CPU
9111 8929 * between the time that try_stop_cpus() is called and the
9112 cnt += sprintf(&page[cnt], "state: %d /", rcu_expedited_state); 8930 * time that it returns.
9113 for_each_online_cpu(cpu) { 8931 *
9114 cnt += sprintf(&page[cnt], " %d:%d", 8932 * In the current initial implementation of cpu_stop, the
9115 cpu, per_cpu(rcu_migration_req, cpu).dest_cpu); 8933 * above condition is already met when the control reaches
9116 } 8934 * this point and the following smp_mb() is not strictly
9117 cnt += sprintf(&page[cnt], "\n"); 8935 * necessary. Do smp_mb() anyway for documentation and
9118 return cnt; 8936 * robustness against future implementation changes.
8937 */
8938 smp_mb(); /* See above comment block. */
8939 return 0;
9119} 8940}
9120EXPORT_SYMBOL_GPL(rcu_expedited_torture_stats);
9121
9122static long synchronize_sched_expedited_count;
9123 8941
9124/* 8942/*
9125 * Wait for an rcu-sched grace period to elapse, but use "big hammer" 8943 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
@@ -9133,18 +8951,14 @@ static long synchronize_sched_expedited_count;
9133 */ 8951 */
9134void synchronize_sched_expedited(void) 8952void synchronize_sched_expedited(void)
9135{ 8953{
9136 int cpu; 8954 int snap, trycount = 0;
9137 unsigned long flags;
9138 bool need_full_sync = 0;
9139 struct rq *rq;
9140 struct migration_req *req;
9141 long snap;
9142 int trycount = 0;
9143 8955
9144 smp_mb(); /* ensure prior mod happens before capturing snap. */ 8956 smp_mb(); /* ensure prior mod happens before capturing snap. */
9145 snap = ACCESS_ONCE(synchronize_sched_expedited_count) + 1; 8957 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9146 get_online_cpus(); 8958 get_online_cpus();
9147 while (!mutex_trylock(&rcu_sched_expedited_mutex)) { 8959 while (try_stop_cpus(cpu_online_mask,
8960 synchronize_sched_expedited_cpu_stop,
8961 NULL) == -EAGAIN) {
9148 put_online_cpus(); 8962 put_online_cpus();
9149 if (trycount++ < 10) 8963 if (trycount++ < 10)
9150 udelay(trycount * num_online_cpus()); 8964 udelay(trycount * num_online_cpus());
@@ -9152,41 +8966,15 @@ void synchronize_sched_expedited(void)
9152 synchronize_sched(); 8966 synchronize_sched();
9153 return; 8967 return;
9154 } 8968 }
9155 if (ACCESS_ONCE(synchronize_sched_expedited_count) - snap > 0) { 8969 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9156 smp_mb(); /* ensure test happens before caller kfree */ 8970 smp_mb(); /* ensure test happens before caller kfree */
9157 return; 8971 return;
9158 } 8972 }
9159 get_online_cpus(); 8973 get_online_cpus();
9160 } 8974 }
9161 rcu_expedited_state = RCU_EXPEDITED_STATE_POST; 8975 atomic_inc(&synchronize_sched_expedited_count);
9162 for_each_online_cpu(cpu) { 8976 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9163 rq = cpu_rq(cpu);
9164 req = &per_cpu(rcu_migration_req, cpu);
9165 init_completion(&req->done);
9166 req->task = NULL;
9167 req->dest_cpu = RCU_MIGRATION_NEED_QS;
9168 raw_spin_lock_irqsave(&rq->lock, flags);
9169 list_add(&req->list, &rq->migration_queue);
9170 raw_spin_unlock_irqrestore(&rq->lock, flags);
9171 wake_up_process(rq->migration_thread);
9172 }
9173 for_each_online_cpu(cpu) {
9174 rcu_expedited_state = cpu;
9175 req = &per_cpu(rcu_migration_req, cpu);
9176 rq = cpu_rq(cpu);
9177 wait_for_completion(&req->done);
9178 raw_spin_lock_irqsave(&rq->lock, flags);
9179 if (unlikely(req->dest_cpu == RCU_MIGRATION_MUST_SYNC))
9180 need_full_sync = 1;
9181 req->dest_cpu = RCU_MIGRATION_IDLE;
9182 raw_spin_unlock_irqrestore(&rq->lock, flags);
9183 }
9184 rcu_expedited_state = RCU_EXPEDITED_STATE_IDLE;
9185 synchronize_sched_expedited_count++;
9186 mutex_unlock(&rcu_sched_expedited_mutex);
9187 put_online_cpus(); 8977 put_online_cpus();
9188 if (need_full_sync)
9189 synchronize_sched();
9190} 8978}
9191EXPORT_SYMBOL_GPL(synchronize_sched_expedited); 8979EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9192 8980
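The rewritten fast path is a snapshot-and-retry scheme: record the counter plus one, attempt the expensive try_stop_cpus() pass, and if the counter has meanwhile moved past the snapshot, somebody else's completed pass already covers us. A stand-alone model of that counter trick in C11 atomics (try_heavy_operation() is an invented stand-in for try_stop_cpus()):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int expedited_count;

/* Invented stand-in: pretend the heavy pass fails a couple of times. */
static bool try_heavy_operation(void)
{
        static int attempts;
        return ++attempts >= 3;
}

static void expedited(void)
{
        int snap = atomic_load(&expedited_count) + 1;

        while (!try_heavy_operation()) {
                /* The counter moved past our snapshot: a whole pass
                 * started and finished after we began, so it covers us. */
                if (atomic_load(&expedited_count) - snap > 0)
                        return;
                /* otherwise back off and retry (backoff omitted) */
        }
        atomic_fetch_add(&expedited_count, 1);
}

int main(void)
{
        expedited();
        printf("count = %d\n", atomic_load(&expedited_count));
        return 0;
}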
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 5b496132c28a..906a0f718cb3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -41,6 +41,7 @@ unsigned long long __attribute__((weak)) sched_clock(void)
41 return (unsigned long long)(jiffies - INITIAL_JIFFIES) 41 return (unsigned long long)(jiffies - INITIAL_JIFFIES)
42 * (NSEC_PER_SEC / HZ); 42 * (NSEC_PER_SEC / HZ);
43} 43}
44EXPORT_SYMBOL_GPL(sched_clock);
44 45
45static __read_mostly int sched_clock_running; 46static __read_mostly int sched_clock_running;
46 47
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index fccf9fbb0d7b..e6871cb3fc83 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -27,6 +27,7 @@
27 * of the License. 27 * of the License.
28 */ 28 */
29 29
30#include <linux/gfp.h>
30#include "sched_cpupri.h" 31#include "sched_cpupri.h"
31 32
32/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 67f95aada4b9..35565395d00d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -70,16 +70,16 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu,
70 PN(se->vruntime); 70 PN(se->vruntime);
71 PN(se->sum_exec_runtime); 71 PN(se->sum_exec_runtime);
72#ifdef CONFIG_SCHEDSTATS 72#ifdef CONFIG_SCHEDSTATS
73 PN(se->wait_start); 73 PN(se->statistics.wait_start);
74 PN(se->sleep_start); 74 PN(se->statistics.sleep_start);
75 PN(se->block_start); 75 PN(se->statistics.block_start);
76 PN(se->sleep_max); 76 PN(se->statistics.sleep_max);
77 PN(se->block_max); 77 PN(se->statistics.block_max);
78 PN(se->exec_max); 78 PN(se->statistics.exec_max);
79 PN(se->slice_max); 79 PN(se->statistics.slice_max);
80 PN(se->wait_max); 80 PN(se->statistics.wait_max);
81 PN(se->wait_sum); 81 PN(se->statistics.wait_sum);
82 P(se->wait_count); 82 P(se->statistics.wait_count);
83#endif 83#endif
84 P(se->load.weight); 84 P(se->load.weight);
85#undef PN 85#undef PN
@@ -104,7 +104,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 104 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
105 SPLIT_NS(p->se.vruntime), 105 SPLIT_NS(p->se.vruntime),
106 SPLIT_NS(p->se.sum_exec_runtime), 106 SPLIT_NS(p->se.sum_exec_runtime),
107 SPLIT_NS(p->se.sum_sleep_runtime)); 107 SPLIT_NS(p->se.statistics.sum_sleep_runtime));
108#else 108#else
109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 109 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 110 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
@@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
114 { 114 {
115 char path[64]; 115 char path[64];
116 116
117 rcu_read_lock();
117 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); 118 cgroup_path(task_group(p)->css.cgroup, path, sizeof(path));
119 rcu_read_unlock();
118 SEQ_printf(m, " %s", path); 120 SEQ_printf(m, " %s", path);
119 } 121 }
120#endif 122#endif
@@ -173,11 +175,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
173 task_group_path(tg, path, sizeof(path)); 175 task_group_path(tg, path, sizeof(path));
174 176
175 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 177 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
176#elif defined(CONFIG_USER_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
177 {
178 uid_t uid = cfs_rq->tg->uid;
179 SEQ_printf(m, "\ncfs_rq[%d] for UID: %u\n", cpu, uid);
180 }
181#else 178#else
182 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu); 179 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
183#endif 180#endif
@@ -384,15 +381,9 @@ __initcall(init_sched_debug_procfs);
384void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 381void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
385{ 382{
386 unsigned long nr_switches; 383 unsigned long nr_switches;
387 unsigned long flags;
388 int num_threads = 1;
389
390 if (lock_task_sighand(p, &flags)) {
391 num_threads = atomic_read(&p->signal->count);
392 unlock_task_sighand(p, &flags);
393 }
394 384
395 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 385 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid,
386 get_nr_threads(p));
396 SEQ_printf(m, 387 SEQ_printf(m,
397 "---------------------------------------------------------\n"); 388 "---------------------------------------------------------\n");
398#define __P(F) \ 389#define __P(F) \
@@ -407,40 +398,38 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
407 PN(se.exec_start); 398 PN(se.exec_start);
408 PN(se.vruntime); 399 PN(se.vruntime);
409 PN(se.sum_exec_runtime); 400 PN(se.sum_exec_runtime);
410 PN(se.avg_overlap);
411 PN(se.avg_wakeup);
412 401
413 nr_switches = p->nvcsw + p->nivcsw; 402 nr_switches = p->nvcsw + p->nivcsw;
414 403
415#ifdef CONFIG_SCHEDSTATS 404#ifdef CONFIG_SCHEDSTATS
416 PN(se.wait_start); 405 PN(se.statistics.wait_start);
417 PN(se.sleep_start); 406 PN(se.statistics.sleep_start);
418 PN(se.block_start); 407 PN(se.statistics.block_start);
419 PN(se.sleep_max); 408 PN(se.statistics.sleep_max);
420 PN(se.block_max); 409 PN(se.statistics.block_max);
421 PN(se.exec_max); 410 PN(se.statistics.exec_max);
422 PN(se.slice_max); 411 PN(se.statistics.slice_max);
423 PN(se.wait_max); 412 PN(se.statistics.wait_max);
424 PN(se.wait_sum); 413 PN(se.statistics.wait_sum);
425 P(se.wait_count); 414 P(se.statistics.wait_count);
426 PN(se.iowait_sum); 415 PN(se.statistics.iowait_sum);
427 P(se.iowait_count); 416 P(se.statistics.iowait_count);
428 P(sched_info.bkl_count); 417 P(sched_info.bkl_count);
429 P(se.nr_migrations); 418 P(se.nr_migrations);
430 P(se.nr_migrations_cold); 419 P(se.statistics.nr_migrations_cold);
431 P(se.nr_failed_migrations_affine); 420 P(se.statistics.nr_failed_migrations_affine);
432 P(se.nr_failed_migrations_running); 421 P(se.statistics.nr_failed_migrations_running);
433 P(se.nr_failed_migrations_hot); 422 P(se.statistics.nr_failed_migrations_hot);
434 P(se.nr_forced_migrations); 423 P(se.statistics.nr_forced_migrations);
435 P(se.nr_wakeups); 424 P(se.statistics.nr_wakeups);
436 P(se.nr_wakeups_sync); 425 P(se.statistics.nr_wakeups_sync);
437 P(se.nr_wakeups_migrate); 426 P(se.statistics.nr_wakeups_migrate);
438 P(se.nr_wakeups_local); 427 P(se.statistics.nr_wakeups_local);
439 P(se.nr_wakeups_remote); 428 P(se.statistics.nr_wakeups_remote);
440 P(se.nr_wakeups_affine); 429 P(se.statistics.nr_wakeups_affine);
441 P(se.nr_wakeups_affine_attempts); 430 P(se.statistics.nr_wakeups_affine_attempts);
442 P(se.nr_wakeups_passive); 431 P(se.statistics.nr_wakeups_passive);
443 P(se.nr_wakeups_idle); 432 P(se.statistics.nr_wakeups_idle);
444 433
445 { 434 {
446 u64 avg_atom, avg_per_cpu; 435 u64 avg_atom, avg_per_cpu;
@@ -491,35 +480,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
491void proc_sched_set_task(struct task_struct *p) 480void proc_sched_set_task(struct task_struct *p)
492{ 481{
493#ifdef CONFIG_SCHEDSTATS 482#ifdef CONFIG_SCHEDSTATS
494 p->se.wait_max = 0; 483 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
495 p->se.wait_sum = 0;
496 p->se.wait_count = 0;
497 p->se.iowait_sum = 0;
498 p->se.iowait_count = 0;
499 p->se.sleep_max = 0;
500 p->se.sum_sleep_runtime = 0;
501 p->se.block_max = 0;
502 p->se.exec_max = 0;
503 p->se.slice_max = 0;
504 p->se.nr_migrations = 0;
505 p->se.nr_migrations_cold = 0;
506 p->se.nr_failed_migrations_affine = 0;
507 p->se.nr_failed_migrations_running = 0;
508 p->se.nr_failed_migrations_hot = 0;
509 p->se.nr_forced_migrations = 0;
510 p->se.nr_wakeups = 0;
511 p->se.nr_wakeups_sync = 0;
512 p->se.nr_wakeups_migrate = 0;
513 p->se.nr_wakeups_local = 0;
514 p->se.nr_wakeups_remote = 0;
515 p->se.nr_wakeups_affine = 0;
516 p->se.nr_wakeups_affine_attempts = 0;
517 p->se.nr_wakeups_passive = 0;
518 p->se.nr_wakeups_idle = 0;
519 p->sched_info.bkl_count = 0;
520#endif 484#endif
521 p->se.sum_exec_runtime = 0;
522 p->se.prev_sum_exec_runtime = 0;
523 p->nvcsw = 0;
524 p->nivcsw = 0;
525} 485}
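Collecting the schedstats fields into se->statistics is what lets proc_sched_set_task() above collapse roughly twenty-five assignments into a single memset. A cut-down illustration of the pattern (the field list is a reduced stand-in, not the full kernel struct):

#include <stdio.h>
#include <string.h>

struct sched_statistics {               /* reduced stand-in */
        unsigned long long wait_max;
        unsigned long long wait_sum;
        unsigned long long sleep_max;
        unsigned long long nr_wakeups;
};

struct sched_entity {
        unsigned long long sum_exec_runtime;    /* not a statistic: kept */
        struct sched_statistics statistics;     /* all resettable counters */
};

int main(void)
{
        struct sched_entity se = {
                .sum_exec_runtime = 123,
                .statistics = { .wait_max = 7, .nr_wakeups = 42 },
        };

        /* One memset replaces the long list of individual resets. */
        memset(&se.statistics, 0, sizeof(se.statistics));

        printf("wait_max=%llu sum_exec_runtime=%llu\n",
               se.statistics.wait_max, se.sum_exec_runtime);
        return 0;
}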
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5a5ea2cd924f..a878b5332daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -35,8 +35,8 @@
35 * (to see the precise effective timeslice length of your workload, 35 * (to see the precise effective timeslice length of your workload,
36 * run vmstat and monitor the context-switches (cs) field) 36 * run vmstat and monitor the context-switches (cs) field)
37 */ 37 */
38unsigned int sysctl_sched_latency = 5000000ULL; 38unsigned int sysctl_sched_latency = 6000000ULL;
39unsigned int normalized_sysctl_sched_latency = 5000000ULL; 39unsigned int normalized_sysctl_sched_latency = 6000000ULL;
40 40
41/* 41/*
42 * The initial- and re-scaling of tunables is configurable 42 * The initial- and re-scaling of tunables is configurable
@@ -52,15 +52,15 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
52 52
53/* 53/*
54 * Minimal preemption granularity for CPU-bound tasks: 54 * Minimal preemption granularity for CPU-bound tasks:
55 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds) 55 * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
56 */ 56 */
57unsigned int sysctl_sched_min_granularity = 1000000ULL; 57unsigned int sysctl_sched_min_granularity = 2000000ULL;
58unsigned int normalized_sysctl_sched_min_granularity = 1000000ULL; 58unsigned int normalized_sysctl_sched_min_granularity = 2000000ULL;
59 59
60/* 60/*
61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity 61 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
62 */ 62 */
63static unsigned int sched_nr_latency = 5; 63static unsigned int sched_nr_latency = 3;
64 64
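sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity, so the new 6 ms / 2 ms defaults give the value 3 above; both base values are then scaled at boot by roughly 1 + ilog2(ncpus) when the default logarithmic tunable scaling is active. A quick sketch of that arithmetic (the scaling rule is taken from the comment above and treated as an approximation):

#include <stdio.h>

static unsigned int ilog2_u(unsigned int n)
{
        unsigned int r = 0;

        while (n >>= 1)
                r++;
        return r;
}

int main(void)
{
        unsigned long long latency  = 6000000ULL;       /* 6 ms */
        unsigned long long min_gran = 2000000ULL;       /* 2 ms */
        unsigned int ncpus = 8;
        unsigned int factor = 1 + ilog2_u(ncpus);       /* log scaling */

        printf("sched_nr_latency         = %llu\n", latency / min_gran);
        printf("scaled latency  (8 cpus) = %llu ns\n", latency * factor);
        printf("scaled min_gran (8 cpus) = %llu ns\n", min_gran * factor);
        return 0;
}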
65/* 65/*
66 * After fork, child runs first. If set to 0 (default) then 66 * After fork, child runs first. If set to 0 (default) then
@@ -505,7 +505,8 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
505{ 505{
506 unsigned long delta_exec_weighted; 506 unsigned long delta_exec_weighted;
507 507
508 schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); 508 schedstat_set(curr->statistics.exec_max,
509 max((u64)delta_exec, curr->statistics.exec_max));
509 510
510 curr->sum_exec_runtime += delta_exec; 511 curr->sum_exec_runtime += delta_exec;
511 schedstat_add(cfs_rq, exec_clock, delta_exec); 512 schedstat_add(cfs_rq, exec_clock, delta_exec);
@@ -548,7 +549,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
548static inline void 549static inline void
549update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 550update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
550{ 551{
551 schedstat_set(se->wait_start, rq_of(cfs_rq)->clock); 552 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
552} 553}
553 554
554/* 555/*
@@ -567,18 +568,18 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
567static void 568static void
568update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 569update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
569{ 570{
570 schedstat_set(se->wait_max, max(se->wait_max, 571 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
571 rq_of(cfs_rq)->clock - se->wait_start)); 572 rq_of(cfs_rq)->clock - se->statistics.wait_start));
572 schedstat_set(se->wait_count, se->wait_count + 1); 573 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
573 schedstat_set(se->wait_sum, se->wait_sum + 574 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
574 rq_of(cfs_rq)->clock - se->wait_start); 575 rq_of(cfs_rq)->clock - se->statistics.wait_start);
575#ifdef CONFIG_SCHEDSTATS 576#ifdef CONFIG_SCHEDSTATS
576 if (entity_is_task(se)) { 577 if (entity_is_task(se)) {
577 trace_sched_stat_wait(task_of(se), 578 trace_sched_stat_wait(task_of(se),
578 rq_of(cfs_rq)->clock - se->wait_start); 579 rq_of(cfs_rq)->clock - se->statistics.wait_start);
579 } 580 }
580#endif 581#endif
581 schedstat_set(se->wait_start, 0); 582 schedstat_set(se->statistics.wait_start, 0);
582} 583}
583 584
584static inline void 585static inline void
@@ -657,39 +658,39 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
657 if (entity_is_task(se)) 658 if (entity_is_task(se))
658 tsk = task_of(se); 659 tsk = task_of(se);
659 660
660 if (se->sleep_start) { 661 if (se->statistics.sleep_start) {
661 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 662 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
662 663
663 if ((s64)delta < 0) 664 if ((s64)delta < 0)
664 delta = 0; 665 delta = 0;
665 666
666 if (unlikely(delta > se->sleep_max)) 667 if (unlikely(delta > se->statistics.sleep_max))
667 se->sleep_max = delta; 668 se->statistics.sleep_max = delta;
668 669
669 se->sleep_start = 0; 670 se->statistics.sleep_start = 0;
670 se->sum_sleep_runtime += delta; 671 se->statistics.sum_sleep_runtime += delta;
671 672
672 if (tsk) { 673 if (tsk) {
673 account_scheduler_latency(tsk, delta >> 10, 1); 674 account_scheduler_latency(tsk, delta >> 10, 1);
674 trace_sched_stat_sleep(tsk, delta); 675 trace_sched_stat_sleep(tsk, delta);
675 } 676 }
676 } 677 }
677 if (se->block_start) { 678 if (se->statistics.block_start) {
678 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 679 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
679 680
680 if ((s64)delta < 0) 681 if ((s64)delta < 0)
681 delta = 0; 682 delta = 0;
682 683
683 if (unlikely(delta > se->block_max)) 684 if (unlikely(delta > se->statistics.block_max))
684 se->block_max = delta; 685 se->statistics.block_max = delta;
685 686
686 se->block_start = 0; 687 se->statistics.block_start = 0;
687 se->sum_sleep_runtime += delta; 688 se->statistics.sum_sleep_runtime += delta;
688 689
689 if (tsk) { 690 if (tsk) {
690 if (tsk->in_iowait) { 691 if (tsk->in_iowait) {
691 se->iowait_sum += delta; 692 se->statistics.iowait_sum += delta;
692 se->iowait_count++; 693 se->statistics.iowait_count++;
693 trace_sched_stat_iowait(tsk, delta); 694 trace_sched_stat_iowait(tsk, delta);
694 } 695 }
695 696
@@ -737,20 +738,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
737 vruntime += sched_vslice(cfs_rq, se); 738 vruntime += sched_vslice(cfs_rq, se);
738 739
739 /* sleeps up to a single latency don't count. */ 740 /* sleeps up to a single latency don't count. */
740 if (!initial && sched_feat(FAIR_SLEEPERS)) { 741 if (!initial) {
741 unsigned long thresh = sysctl_sched_latency; 742 unsigned long thresh = sysctl_sched_latency;
742 743
743 /* 744 /*
744 * Convert the sleeper threshold into virtual time.
745 * SCHED_IDLE is a special sub-class. We care about
746 * fairness only relative to other SCHED_IDLE tasks,
747 * all of which have the same weight.
748 */
749 if (sched_feat(NORMALIZED_SLEEPER) && (!entity_is_task(se) ||
750 task_of(se)->policy != SCHED_IDLE))
751 thresh = calc_delta_fair(thresh, se);
752
753 /*
754 * Halve their sleep time's effect, to allow 745 * Halve their sleep time's effect, to allow
755 * for a gentler effect of sleepers: 746 * for a gentler effect of sleepers:
756 */ 747 */
@@ -766,9 +757,6 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
766 se->vruntime = vruntime; 757 se->vruntime = vruntime;
767} 758}
768 759
769#define ENQUEUE_WAKEUP 1
770#define ENQUEUE_MIGRATE 2
771
772static void 760static void
773enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 761enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
774{ 762{
@@ -776,7 +764,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
776 * Update the normalized vruntime before updating min_vruntime 764 * Update the normalized vruntime before updating min_vruntime
 777	 * through calling update_curr().				  765
778 */ 766 */
779 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATE)) 767 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
780 se->vruntime += cfs_rq->min_vruntime; 768 se->vruntime += cfs_rq->min_vruntime;
781 769
782 /* 770 /*
@@ -812,7 +800,7 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
812} 800}
813 801
814static void 802static void
815dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 803dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
816{ 804{
817 /* 805 /*
818 * Update run-time statistics of the 'current'. 806 * Update run-time statistics of the 'current'.
@@ -820,15 +808,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
820 update_curr(cfs_rq); 808 update_curr(cfs_rq);
821 809
822 update_stats_dequeue(cfs_rq, se); 810 update_stats_dequeue(cfs_rq, se);
823 if (sleep) { 811 if (flags & DEQUEUE_SLEEP) {
824#ifdef CONFIG_SCHEDSTATS 812#ifdef CONFIG_SCHEDSTATS
825 if (entity_is_task(se)) { 813 if (entity_is_task(se)) {
826 struct task_struct *tsk = task_of(se); 814 struct task_struct *tsk = task_of(se);
827 815
828 if (tsk->state & TASK_INTERRUPTIBLE) 816 if (tsk->state & TASK_INTERRUPTIBLE)
829 se->sleep_start = rq_of(cfs_rq)->clock; 817 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
830 if (tsk->state & TASK_UNINTERRUPTIBLE) 818 if (tsk->state & TASK_UNINTERRUPTIBLE)
831 se->block_start = rq_of(cfs_rq)->clock; 819 se->statistics.block_start = rq_of(cfs_rq)->clock;
832 } 820 }
833#endif 821#endif
834 } 822 }
@@ -845,7 +833,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
845 * update can refer to the ->curr item and we need to reflect this 833 * update can refer to the ->curr item and we need to reflect this
846 * movement in our normalized position. 834 * movement in our normalized position.
847 */ 835 */
848 if (!sleep) 836 if (!(flags & DEQUEUE_SLEEP))
849 se->vruntime -= cfs_rq->min_vruntime; 837 se->vruntime -= cfs_rq->min_vruntime;
850} 838}
851 839
@@ -912,7 +900,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
912 * when there are only lesser-weight tasks around): 900 * when there are only lesser-weight tasks around):
913 */ 901 */
914 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 902 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
915 se->slice_max = max(se->slice_max, 903 se->statistics.slice_max = max(se->statistics.slice_max,
916 se->sum_exec_runtime - se->prev_sum_exec_runtime); 904 se->sum_exec_runtime - se->prev_sum_exec_runtime);
917 } 905 }
918#endif 906#endif
@@ -1054,16 +1042,10 @@ static inline void hrtick_update(struct rq *rq)
1054 * then put the task into the rbtree: 1042 * then put the task into the rbtree:
1055 */ 1043 */
1056static void 1044static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head) 1045enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1058{ 1046{
1059 struct cfs_rq *cfs_rq; 1047 struct cfs_rq *cfs_rq;
1060 struct sched_entity *se = &p->se; 1048 struct sched_entity *se = &p->se;
1061 int flags = 0;
1062
1063 if (wakeup)
1064 flags |= ENQUEUE_WAKEUP;
1065 if (p->state == TASK_WAKING)
1066 flags |= ENQUEUE_MIGRATE;
1067 1049
1068 for_each_sched_entity(se) { 1050 for_each_sched_entity(se) {
1069 if (se->on_rq) 1051 if (se->on_rq)
@@ -1081,18 +1063,18 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1081 * decreased. We remove the task from the rbtree and 1063 * decreased. We remove the task from the rbtree and
1082 * update the fair scheduling stats: 1064 * update the fair scheduling stats:
1083 */ 1065 */
1084static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 1066static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1085{ 1067{
1086 struct cfs_rq *cfs_rq; 1068 struct cfs_rq *cfs_rq;
1087 struct sched_entity *se = &p->se; 1069 struct sched_entity *se = &p->se;
1088 1070
1089 for_each_sched_entity(se) { 1071 for_each_sched_entity(se) {
1090 cfs_rq = cfs_rq_of(se); 1072 cfs_rq = cfs_rq_of(se);
1091 dequeue_entity(cfs_rq, se, sleep); 1073 dequeue_entity(cfs_rq, se, flags);
1092 /* Don't dequeue parent if it has other entities besides us */ 1074 /* Don't dequeue parent if it has other entities besides us */
1093 if (cfs_rq->load.weight) 1075 if (cfs_rq->load.weight)
1094 break; 1076 break;
1095 sleep = 1; 1077 flags |= DEQUEUE_SLEEP;
1096 } 1078 }
1097 1079
1098 hrtick_update(rq); 1080 hrtick_update(rq);
@@ -1240,11 +1222,9 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1240 1222
1241static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 1223static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1242{ 1224{
1243 struct task_struct *curr = current;
1244 unsigned long this_load, load; 1225 unsigned long this_load, load;
1245 int idx, this_cpu, prev_cpu; 1226 int idx, this_cpu, prev_cpu;
1246 unsigned long tl_per_task; 1227 unsigned long tl_per_task;
1247 unsigned int imbalance;
1248 struct task_group *tg; 1228 struct task_group *tg;
1249 unsigned long weight; 1229 unsigned long weight;
1250 int balanced; 1230 int balanced;
@@ -1255,23 +1235,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1255 load = source_load(prev_cpu, idx); 1235 load = source_load(prev_cpu, idx);
1256 this_load = target_load(this_cpu, idx); 1236 this_load = target_load(this_cpu, idx);
1257 1237
1258 if (sync) {
1259 if (sched_feat(SYNC_LESS) &&
1260 (curr->se.avg_overlap > sysctl_sched_migration_cost ||
1261 p->se.avg_overlap > sysctl_sched_migration_cost))
1262 sync = 0;
1263 } else {
1264 if (sched_feat(SYNC_MORE) &&
1265 (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1266 p->se.avg_overlap < sysctl_sched_migration_cost))
1267 sync = 1;
1268 }
1269
1270 /* 1238 /*
1271 * If sync wakeup then subtract the (maximum possible) 1239 * If sync wakeup then subtract the (maximum possible)
1272 * effect of the currently running task from the load 1240 * effect of the currently running task from the load
1273 * of the current CPU: 1241 * of the current CPU:
1274 */ 1242 */
1243 rcu_read_lock();
1275 if (sync) { 1244 if (sync) {
1276 tg = task_group(current); 1245 tg = task_group(current);
1277 weight = current->se.load.weight; 1246 weight = current->se.load.weight;
@@ -1283,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1283 tg = task_group(p); 1252 tg = task_group(p);
1284 weight = p->se.load.weight; 1253 weight = p->se.load.weight;
1285 1254
1286 imbalance = 100 + (sd->imbalance_pct - 100) / 2;
1287
1288 /* 1255 /*
1289 * In low-load situations, where prev_cpu is idle and this_cpu is idle 1256 * In low-load situations, where prev_cpu is idle and this_cpu is idle
1290 * due to the sync cause above having dropped this_load to 0, we'll 1257 * due to the sync cause above having dropped this_load to 0, we'll
@@ -1294,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1294 * Otherwise check if either cpus are near enough in load to allow this 1261 * Otherwise check if either cpus are near enough in load to allow this
1295 * task to be woken on this_cpu. 1262 * task to be woken on this_cpu.
1296 */ 1263 */
1297 balanced = !this_load || 1264 if (this_load) {
1298 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= 1265 unsigned long this_eff_load, prev_eff_load;
1299 imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); 1266
1267 this_eff_load = 100;
1268 this_eff_load *= power_of(prev_cpu);
1269 this_eff_load *= this_load +
1270 effective_load(tg, this_cpu, weight, weight);
1271
1272 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
1273 prev_eff_load *= power_of(this_cpu);
1274 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
1275
1276 balanced = this_eff_load <= prev_eff_load;
1277 } else
1278 balanced = true;
1279 rcu_read_unlock();
1300 1280
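Instead of comparing raw loads against an imbalance percentage, the new test weights each side by the other CPU's power and declares the wakeup balanced when this_eff_load <= prev_eff_load. A toy calculation with invented numbers, dropping the group-scheduling effective_load() terms for brevity:

#include <stdbool.h>
#include <stdio.h>

int main(void)
{
        /* illustrative values only, not real kernel numbers */
        unsigned long this_load = 1024, prev_load = 2048;
        unsigned long this_power = 1024, prev_power = 589;  /* e.g. SMT sibling */
        unsigned long imbalance_pct = 125;

        unsigned long this_eff_load = 100 * prev_power * this_load;
        unsigned long prev_eff_load = (100 + (imbalance_pct - 100) / 2) *
                                      this_power * prev_load;

        bool balanced = this_eff_load <= prev_eff_load;
        printf("this_eff_load=%lu prev_eff_load=%lu -> %s\n",
               this_eff_load, prev_eff_load,
               balanced ? "affine wakeup allowed" : "stay near prev_cpu");
        return 0;
}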
1301 /* 1281 /*
1302 * If the currently running task will sleep within 1282 * If the currently running task will sleep within
@@ -1306,7 +1286,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1306 if (sync && balanced) 1286 if (sync && balanced)
1307 return 1; 1287 return 1;
1308 1288
1309 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1289 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
1310 tl_per_task = cpu_avg_load_per_task(this_cpu); 1290 tl_per_task = cpu_avg_load_per_task(this_cpu);
1311 1291
1312 if (balanced || 1292 if (balanced ||
@@ -1318,7 +1298,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1318 * there is no bad imbalance. 1298 * there is no bad imbalance.
1319 */ 1299 */
1320 schedstat_inc(sd, ttwu_move_affine); 1300 schedstat_inc(sd, ttwu_move_affine);
1321 schedstat_inc(p, se.nr_wakeups_affine); 1301 schedstat_inc(p, se.statistics.nr_wakeups_affine);
1322 1302
1323 return 1; 1303 return 1;
1324 } 1304 }
@@ -1406,29 +1386,48 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1406/* 1386/*
1407 * Try and locate an idle CPU in the sched_domain. 1387 * Try and locate an idle CPU in the sched_domain.
1408 */ 1388 */
1409static int 1389static int select_idle_sibling(struct task_struct *p, int target)
1410select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1411{ 1390{
1412 int cpu = smp_processor_id(); 1391 int cpu = smp_processor_id();
1413 int prev_cpu = task_cpu(p); 1392 int prev_cpu = task_cpu(p);
1393 struct sched_domain *sd;
1414 int i; 1394 int i;
1415 1395
1416 /* 1396 /*
1417 * If this domain spans both cpu and prev_cpu (see the SD_WAKE_AFFINE 1397 * If the task is going to be woken-up on this cpu and if it is
1418 * test in select_task_rq_fair) and the prev_cpu is idle then that's 1398 * already idle, then it is the right target.
1419 * always a better target than the current cpu. 1399 */
1400 if (target == cpu && idle_cpu(cpu))
1401 return cpu;
1402
1403 /*
1404 * If the task is going to be woken-up on the cpu where it previously
 1405	 * ran and if it is currently idle, then it is the right target.
1420 */ 1406 */
1421 if (target == cpu && !cpu_rq(prev_cpu)->cfs.nr_running) 1407 if (target == prev_cpu && idle_cpu(prev_cpu))
1422 return prev_cpu; 1408 return prev_cpu;
1423 1409
1424 /* 1410 /*
 1425	 * Otherwise, iterate the domain and find an eligible idle cpu.	 1411	 * Otherwise, iterate the domains and find an eligible idle cpu.
1426 */ 1412 */
1427 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 1413 for_each_domain(target, sd) {
1428 if (!cpu_rq(i)->cfs.nr_running) { 1414 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
1429 target = i;
1430 break; 1415 break;
1416
1417 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) {
1418 if (idle_cpu(i)) {
1419 target = i;
1420 break;
1421 }
1431 } 1422 }
1423
1424 /*
 1425	 * Let's stop looking for an idle sibling once we reach
1426 * the domain that spans the current cpu and prev_cpu.
1427 */
1428 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) &&
1429 cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
1430 break;
1432 } 1431 }
1433 1432
1434 return target; 1433 return target;
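The rewritten select_idle_sibling() walks up the domain hierarchy from the target CPU, searching each package-resource-sharing level for an idle CPU and giving up once sharing stops. A much-simplified stand-alone model of that walk (domains as plain spans; the prev_cpu stopping condition and the affinity mask are omitted, and all values are invented):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

struct domain {
        bool share_pkg;                 /* SD_SHARE_PKG_RESOURCES */
        bool span[NR_CPUS];             /* CPUs covered by this level */
};

static int pick_idle_sibling(const struct domain *domains, int nr_levels,
                             const bool *idle, int target)
{
        int level, cpu;

        for (level = 0; level < nr_levels; level++) {
                const struct domain *sd = &domains[level];

                if (!sd->share_pkg)     /* don't search beyond the package */
                        break;
                for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (sd->span[cpu] && idle[cpu])
                                return cpu;
        }
        return target;                  /* nothing idle nearby */
}

int main(void)
{
        /* level 0: SMT siblings {0,1}; level 1: whole package {0,1,2,3} */
        struct domain domains[2] = {
                { .share_pkg = true, .span = { [0] = true, [1] = true } },
                { .share_pkg = true, .span = { [0] = true, [1] = true,
                                               [2] = true, [3] = true } },
        };
        bool idle[NR_CPUS] = { [3] = true };

        printf("wake on CPU %d\n", pick_idle_sibling(domains, 2, idle, 0));
        return 0;
}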
@@ -1445,7 +1444,8 @@ select_idle_sibling(struct task_struct *p, struct sched_domain *sd, int target)
1445 * 1444 *
1446 * preempt must be disabled. 1445 * preempt must be disabled.
1447 */ 1446 */
1448static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 1447static int
1448select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
1449{ 1449{
1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 1450 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
1451 int cpu = smp_processor_id(); 1451 int cpu = smp_processor_id();
@@ -1456,8 +1456,7 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1456 int sync = wake_flags & WF_SYNC; 1456 int sync = wake_flags & WF_SYNC;
1457 1457
1458 if (sd_flag & SD_BALANCE_WAKE) { 1458 if (sd_flag & SD_BALANCE_WAKE) {
1459 if (sched_feat(AFFINE_WAKEUPS) && 1459 if (cpumask_test_cpu(cpu, &p->cpus_allowed))
1460 cpumask_test_cpu(cpu, &p->cpus_allowed))
1461 want_affine = 1; 1460 want_affine = 1;
1462 new_cpu = prev_cpu; 1461 new_cpu = prev_cpu;
1463 } 1462 }
@@ -1491,34 +1490,13 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1491 } 1490 }
1492 1491
1493 /* 1492 /*
1494 * While iterating the domains looking for a spanning 1493 * If both cpu and prev_cpu are part of this domain,
1495 * WAKE_AFFINE domain, adjust the affine target to any idle cpu 1494 * cpu is a valid SD_WAKE_AFFINE target.
1496 * in cache sharing domains along the way.
1497 */ 1495 */
1498 if (want_affine) { 1496 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
1499 int target = -1; 1497 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
1500 1498 affine_sd = tmp;
1501 /* 1499 want_affine = 0;
1502 * If both cpu and prev_cpu are part of this domain,
1503 * cpu is a valid SD_WAKE_AFFINE target.
1504 */
1505 if (cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
1506 target = cpu;
1507
1508 /*
1509 * If there's an idle sibling in this domain, make that
1510 * the wake_affine target instead of the current cpu.
1511 */
1512 if (tmp->flags & SD_SHARE_PKG_RESOURCES)
1513 target = select_idle_sibling(p, tmp, target);
1514
1515 if (target >= 0) {
1516 if (tmp->flags & SD_WAKE_AFFINE) {
1517 affine_sd = tmp;
1518 want_affine = 0;
1519 }
1520 cpu = target;
1521 }
1522 } 1500 }
1523 1501
1524 if (!want_sd && !want_affine) 1502 if (!want_sd && !want_affine)
@@ -1531,22 +1509,29 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1531 sd = tmp; 1509 sd = tmp;
1532 } 1510 }
1533 1511
1512#ifdef CONFIG_FAIR_GROUP_SCHED
1534 if (sched_feat(LB_SHARES_UPDATE)) { 1513 if (sched_feat(LB_SHARES_UPDATE)) {
1535 /* 1514 /*
1536 * Pick the largest domain to update shares over 1515 * Pick the largest domain to update shares over
1537 */ 1516 */
1538 tmp = sd; 1517 tmp = sd;
1539 if (affine_sd && (!tmp || 1518 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1540 cpumask_weight(sched_domain_span(affine_sd)) >
1541 cpumask_weight(sched_domain_span(sd))))
1542 tmp = affine_sd; 1519 tmp = affine_sd;
1543 1520
1544 if (tmp) 1521 if (tmp) {
1522 raw_spin_unlock(&rq->lock);
1545 update_shares(tmp); 1523 update_shares(tmp);
1524 raw_spin_lock(&rq->lock);
1525 }
1546 } 1526 }
1527#endif
1547 1528
1548 if (affine_sd && wake_affine(affine_sd, p, sync)) 1529 if (affine_sd) {
1549 return cpu; 1530 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1531 return select_idle_sibling(p, cpu);
1532 else
1533 return select_idle_sibling(p, prev_cpu);
1534 }
1550 1535
1551 while (sd) { 1536 while (sd) {
1552 int load_idx = sd->forkexec_idx; 1537 int load_idx = sd->forkexec_idx;
@@ -1576,10 +1561,10 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1576 1561
1577 /* Now try balancing at a lower domain level of new_cpu */ 1562 /* Now try balancing at a lower domain level of new_cpu */
1578 cpu = new_cpu; 1563 cpu = new_cpu;
1579 weight = cpumask_weight(sched_domain_span(sd)); 1564 weight = sd->span_weight;
1580 sd = NULL; 1565 sd = NULL;
1581 for_each_domain(cpu, tmp) { 1566 for_each_domain(cpu, tmp) {
1582 if (weight <= cpumask_weight(sched_domain_span(tmp))) 1567 if (weight <= tmp->span_weight)
1583 break; 1568 break;
1584 if (tmp->flags & sd_flag) 1569 if (tmp->flags & sd_flag)
1585 sd = tmp; 1570 sd = tmp;
@@ -1591,63 +1576,26 @@ static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flag
1591} 1576}
1592#endif /* CONFIG_SMP */ 1577#endif /* CONFIG_SMP */
1593 1578
1594/*
1595 * Adaptive granularity
1596 *
1597 * se->avg_wakeup gives the average time a task runs until it does a wakeup,
1598 * with the limit of wakeup_gran -- when it never does a wakeup.
1599 *
1600 * So the smaller avg_wakeup is the faster we want this task to preempt,
1601 * but we don't want to treat the preemptee unfairly and therefore allow it
1602 * to run for at least the amount of time we'd like to run.
1603 *
1604 * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
1605 *
1606 * NOTE: we use *nr_running to scale with load, this nicely matches the
1607 * degrading latency on load.
1608 */
1609static unsigned long
1610adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
1611{
1612 u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
1613 u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
1614 u64 gran = 0;
1615
1616 if (this_run < expected_wakeup)
1617 gran = expected_wakeup - this_run;
1618
1619 return min_t(s64, gran, sysctl_sched_wakeup_granularity);
1620}
1621
1622static unsigned long 1579static unsigned long
1623wakeup_gran(struct sched_entity *curr, struct sched_entity *se) 1580wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
1624{ 1581{
1625 unsigned long gran = sysctl_sched_wakeup_granularity; 1582 unsigned long gran = sysctl_sched_wakeup_granularity;
1626 1583
1627 if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
1628 gran = adaptive_gran(curr, se);
1629
1630 /* 1584 /*
 1631	 * Since it's curr that is running now, convert the gran from real-time	 1585	 * Since it's curr that is running now, convert the gran from real-time
 1632	 * to virtual-time in its units.	 1586	 *
1587 *
1588 * By using 'se' instead of 'curr' we penalize light tasks, so
1589 * they get preempted easier. That is, if 'se' < 'curr' then
1590 * the resulting gran will be larger, therefore penalizing the
1591 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1592 * be smaller, again penalizing the lighter task.
1593 *
1594 * This is especially important for buddies when the leftmost
1595 * task is higher priority than the buddy.
1633 */ 1596 */
1634 if (sched_feat(ASYM_GRAN)) { 1597 if (unlikely(se->load.weight != NICE_0_LOAD))
1635 /* 1598 gran = calc_delta_fair(gran, se);
1636 * By using 'se' instead of 'curr' we penalize light tasks, so
1637 * they get preempted easier. That is, if 'se' < 'curr' then
1638 * the resulting gran will be larger, therefore penalizing the
1639 * lighter, if otoh 'se' > 'curr' then the resulting gran will
1640 * be smaller, again penalizing the lighter task.
1641 *
1642 * This is especially important for buddies when the leftmost
1643 * task is higher priority than the buddy.
1644 */
1645 if (unlikely(se->load.weight != NICE_0_LOAD))
1646 gran = calc_delta_fair(gran, se);
1647 } else {
1648 if (unlikely(curr->load.weight != NICE_0_LOAD))
1649 gran = calc_delta_fair(gran, curr);
1650 }
1651 1599
1652 return gran; 1600 return gran;
1653} 1601}
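With the adaptive and asymmetric variants removed, wakeup_gran() always converts the granularity into the wakee's own virtual time, roughly gran * NICE_0_LOAD / se->load.weight. A quick calculation, taking 1 ms as the base granularity and the usual weight table values (nice 0 ~ 1024, nice +5 ~ 335) as assumptions:

#include <stdio.h>

int main(void)
{
        unsigned long gran = 1000000UL;         /* assume 1 ms wakeup gran */
        unsigned long nice0_weight = 1024;      /* NICE_0_LOAD */
        unsigned long nice5_weight = 335;       /* a lighter, nice +5 task */

        /* calc_delta_fair(gran, se) ~ gran * NICE_0_LOAD / se->load.weight */
        printf("nice  0 wakee: gran = %lu ns\n",
               gran * nice0_weight / nice0_weight);
        printf("nice +5 wakee: gran = %lu ns (needs a bigger vruntime lead)\n",
               gran * nice0_weight / nice5_weight);
        return 0;
}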
@@ -1705,7 +1653,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1705 struct task_struct *curr = rq->curr; 1653 struct task_struct *curr = rq->curr;
1706 struct sched_entity *se = &curr->se, *pse = &p->se; 1654 struct sched_entity *se = &curr->se, *pse = &p->se;
1707 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1655 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1708 int sync = wake_flags & WF_SYNC;
1709 int scale = cfs_rq->nr_running >= sched_nr_latency; 1656 int scale = cfs_rq->nr_running >= sched_nr_latency;
1710 1657
1711 if (unlikely(rt_prio(p->prio))) 1658 if (unlikely(rt_prio(p->prio)))
@@ -1738,14 +1685,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1738 if (unlikely(curr->policy == SCHED_IDLE)) 1685 if (unlikely(curr->policy == SCHED_IDLE))
1739 goto preempt; 1686 goto preempt;
1740 1687
1741 if (sched_feat(WAKEUP_SYNC) && sync)
1742 goto preempt;
1743
1744 if (sched_feat(WAKEUP_OVERLAP) &&
1745 se->avg_overlap < sysctl_sched_migration_cost &&
1746 pse->avg_overlap < sysctl_sched_migration_cost)
1747 goto preempt;
1748
1749 if (!sched_feat(WAKEUP_PREEMPT)) 1688 if (!sched_feat(WAKEUP_PREEMPT))
1750 return; 1689 return;
1751 1690
@@ -1844,13 +1783,13 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1844 * 3) are cache-hot on their current CPU. 1783 * 3) are cache-hot on their current CPU.
1845 */ 1784 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 1785 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine); 1786 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
1848 return 0; 1787 return 0;
1849 } 1788 }
1850 *all_pinned = 0; 1789 *all_pinned = 0;
1851 1790
1852 if (task_running(rq, p)) { 1791 if (task_running(rq, p)) {
1853 schedstat_inc(p, se.nr_failed_migrations_running); 1792 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1854 return 0; 1793 return 0;
1855 } 1794 }
1856 1795
@@ -1866,14 +1805,14 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1866#ifdef CONFIG_SCHEDSTATS 1805#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) { 1806 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]); 1807 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations); 1808 schedstat_inc(p, se.statistics.nr_forced_migrations);
1870 } 1809 }
1871#endif 1810#endif
1872 return 1; 1811 return 1;
1873 } 1812 }
1874 1813
1875 if (tsk_cache_hot) { 1814 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot); 1815 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
1877 return 0; 1816 return 0;
1878 } 1817 }
1879 return 1; 1818 return 1;
@@ -2311,7 +2250,7 @@ unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2311 2250
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) 2251unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{ 2252{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2253 unsigned long weight = sd->span_weight;
2315 unsigned long smt_gain = sd->smt_gain; 2254 unsigned long smt_gain = sd->smt_gain;
2316 2255
2317 smt_gain /= weight; 2256 smt_gain /= weight;
@@ -2344,7 +2283,7 @@ unsigned long scale_rt_power(int cpu)
2344 2283
2345static void update_cpu_power(struct sched_domain *sd, int cpu) 2284static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{ 2285{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd)); 2286 unsigned long weight = sd->span_weight;
2348 unsigned long power = SCHED_LOAD_SCALE; 2287 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups; 2288 struct sched_group *sdg = sd->groups;
2350 2289
@@ -2370,6 +2309,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
2370 if (!power) 2309 if (!power)
2371 power = 1; 2310 power = 1;
2372 2311
2312 cpu_rq(cpu)->cpu_power = power;
2373 sdg->cpu_power = power; 2313 sdg->cpu_power = power;
2374} 2314}
2375 2315
@@ -2870,6 +2810,8 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 2810 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871} 2811}
2872 2812
2813static int active_load_balance_cpu_stop(void *data);
2814
2873/* 2815/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2816 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance. 2817 * tasks if there is an imbalance.
@@ -2959,8 +2901,9 @@ redo:
2959 if (need_active_balance(sd, sd_idle, idle)) { 2901 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags); 2902 raw_spin_lock_irqsave(&busiest->lock, flags);
2961 2903
2962 /* don't kick the migration_thread, if the curr 2904 /* don't kick the active_load_balance_cpu_stop,
2963 * task on busiest cpu can't be moved to this_cpu 2905 * if the curr task on busiest cpu can't be
2906 * moved to this_cpu
2964 */ 2907 */
2965 if (!cpumask_test_cpu(this_cpu, 2908 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) { 2909 &busiest->curr->cpus_allowed)) {
@@ -2970,14 +2913,22 @@ redo:
2970 goto out_one_pinned; 2913 goto out_one_pinned;
2971 } 2914 }
2972 2915
2916 /*
2917 * ->active_balance synchronizes accesses to
2918 * ->active_balance_work. Once set, it's cleared
2919 * only after active load balance is finished.
2920 */
2973 if (!busiest->active_balance) { 2921 if (!busiest->active_balance) {
2974 busiest->active_balance = 1; 2922 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu; 2923 busiest->push_cpu = this_cpu;
2976 active_balance = 1; 2924 active_balance = 1;
2977 } 2925 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags); 2926 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2927
2979 if (active_balance) 2928 if (active_balance)
2980 wake_up_process(busiest->migration_thread); 2929 stop_one_cpu_nowait(cpu_of(busiest),
2930 active_load_balance_cpu_stop, busiest,
2931 &busiest->active_balance_work);
2981 2932
2982 /* 2933 /*
2983 * We've kicked active balancing, reset the failure 2934 * We've kicked active balancing, reset the failure
@@ -3084,24 +3035,29 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3084} 3035}
3085 3036
3086/* 3037/*
3087 * active_load_balance is run by migration threads. It pushes running tasks 3038 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be 3039 * running tasks off the busiest CPU onto idle CPUs. It requires at
3089 * running on each physical CPU where possible, and avoids physical / 3040 * least 1 task to be running on each physical CPU where possible, and
3090 * logical imbalances. 3041 * avoids physical / logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */ 3042 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) 3043static int active_load_balance_cpu_stop(void *data)
3095{ 3044{
3045 struct rq *busiest_rq = data;
3046 int busiest_cpu = cpu_of(busiest_rq);
3096 int target_cpu = busiest_rq->push_cpu; 3047 int target_cpu = busiest_rq->push_cpu;
3048 struct rq *target_rq = cpu_rq(target_cpu);
3097 struct sched_domain *sd; 3049 struct sched_domain *sd;
3098 struct rq *target_rq; 3050
3051 raw_spin_lock_irq(&busiest_rq->lock);
3052
3053 /* make sure the requested cpu hasn't gone down in the meantime */
3054 if (unlikely(busiest_cpu != smp_processor_id() ||
3055 !busiest_rq->active_balance))
3056 goto out_unlock;
3099 3057
3100 /* Is there any task to move? */ 3058 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1) 3059 if (busiest_rq->nr_running <= 1)
3102 return; 3060 goto out_unlock;
3103
3104 target_rq = cpu_rq(target_cpu);
3105 3061
3106 /* 3062 /*
3107 * This condition is "impossible", if it occurs 3063 * This condition is "impossible", if it occurs
@@ -3112,8 +3068,6 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3112 3068
3113 /* move a task from busiest_rq to target_rq */ 3069 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq); 3070 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117 3071
3118 /* Search for an sd spanning us and the target CPU. */ 3072 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) { 3073 for_each_domain(target_cpu, sd) {
@@ -3132,6 +3086,10 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3132 schedstat_inc(sd, alb_failed); 3086 schedstat_inc(sd, alb_failed);
3133 } 3087 }
3134 double_unlock_balance(busiest_rq, target_rq); 3088 double_unlock_balance(busiest_rq, target_rq);
3089out_unlock:
3090 busiest_rq->active_balance = 0;
3091 raw_spin_unlock_irq(&busiest_rq->lock);
3092 return 0;
3135} 3093}
3136 3094
3137#ifdef CONFIG_NO_HZ 3095#ifdef CONFIG_NO_HZ
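
The ->active_balance flag and ->active_balance_work buffer above form a one-shot pattern: the flag, set under the runqueue lock, marks the per-CPU work buffer as in use, and the stopper callback clears it only once it is finished. The following is just a userspace sketch of that pattern using pthreads; the struct and function names are invented for the illustration and are not scheduler code (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct one_shot {
	pthread_mutex_t lock;
	bool active;		/* plays the role of ->active_balance */
	int payload;		/* plays the role of ->active_balance_work */
};

static void *worker(void *arg)
{
	struct one_shot *os = arg;

	printf("worker consuming payload %d\n", os->payload);

	/* clearing the flag last is what makes the buffer reusable */
	pthread_mutex_lock(&os->lock);
	os->active = false;
	pthread_mutex_unlock(&os->lock);
	return NULL;
}

/* returns false if a previous request is still in flight */
static bool request_work(struct one_shot *os, pthread_t *tid, int payload)
{
	bool queued = false;

	pthread_mutex_lock(&os->lock);
	if (!os->active) {
		os->active = true;
		os->payload = payload;
		queued = true;
	}
	pthread_mutex_unlock(&os->lock);

	if (queued)
		pthread_create(tid, NULL, worker, os);
	return queued;
}

int main(void)
{
	struct one_shot os = { .lock = PTHREAD_MUTEX_INITIALIZER };
	pthread_t tid;

	if (request_work(&os, &tid, 42))
		pthread_join(tid, NULL);
	return 0;
}
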
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index d5059fd761d9..83c66e8ad3ee 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,11 +1,4 @@
1/* 1/*
2 * Disregards a certain amount of sleep time (sched_latency_ns) and
3 * considers the task to be running during that period. This gives it
4 * a service deficit on wakeup, allowing it to run sooner.
5 */
6SCHED_FEAT(FAIR_SLEEPERS, 1)
7
8/*
9 * Only give sleepers 50% of their service deficit. This allows 2 * Only give sleepers 50% of their service deficit. This allows
10 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
11 * rip the spread apart. 4 * rip the spread apart.
@@ -13,13 +6,6 @@ SCHED_FEAT(FAIR_SLEEPERS, 1)
13SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
14 7
15/* 8/*
16 * By not normalizing the sleep time, heavy tasks get an effective
17 * longer period, and lighter task an effective shorter period they
18 * are considered running.
19 */
20SCHED_FEAT(NORMALIZED_SLEEPER, 0)
21
22/*
23 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
24 * tasks 10 * tasks
25 */ 11 */
@@ -31,37 +17,6 @@ SCHED_FEAT(START_DEBIT, 1)
31SCHED_FEAT(WAKEUP_PREEMPT, 1) 17SCHED_FEAT(WAKEUP_PREEMPT, 1)
32 18
33/* 19/*
34 * Compute wakeup_gran based on task behaviour, clipped to
35 * [0, sched_wakeup_gran_ns]
36 */
37SCHED_FEAT(ADAPTIVE_GRAN, 1)
38
39/*
40 * When converting the wakeup granularity to virtual time, do it such
41 * that heavier tasks preempting a lighter task have an edge.
42 */
43SCHED_FEAT(ASYM_GRAN, 1)
44
45/*
46 * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS.
47 */
48SCHED_FEAT(WAKEUP_SYNC, 0)
49
50/*
51 * Wakeup preempt based on task behaviour. Tasks that do not overlap
52 * don't get preempted.
53 */
54SCHED_FEAT(WAKEUP_OVERLAP, 0)
55
56/*
57 * Use the SYNC wakeup hint, pipes and the likes use this to indicate
58 * the remote end is likely to consume the data we just wrote, and
59 * therefore has cache benefit from being placed on the same cpu, see
60 * also AFFINE_WAKEUPS.
61 */
62SCHED_FEAT(SYNC_WAKEUPS, 1)
63
64/*
65 * Based on load and program behaviour, see if it makes sense to place 20 * Based on load and program behaviour, see if it makes sense to place
66 * a newly woken task on the same cpu as the task that woke it -- 21 * a newly woken task on the same cpu as the task that woke it --
67 * improve cache locality. Typically used with SYNC wakeups as 22 * improve cache locality. Typically used with SYNC wakeups as
@@ -70,16 +25,6 @@ SCHED_FEAT(SYNC_WAKEUPS, 1)
70SCHED_FEAT(AFFINE_WAKEUPS, 1) 25SCHED_FEAT(AFFINE_WAKEUPS, 1)
71 26
72/* 27/*
73 * Weaken SYNC hint based on overlap
74 */
75SCHED_FEAT(SYNC_LESS, 1)
76
77/*
78 * Add SYNC hint based on overlap
79 */
80SCHED_FEAT(SYNC_MORE, 0)
81
82/*
83 * Prefer to schedule the task we woke last (assuming it failed 28 * Prefer to schedule the task we woke last (assuming it failed
84 * wakeup-preemption), since it's likely going to consume data we 29
85 * touched, increases cache locality. 30 * touched, increases cache locality.
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index a8a6d8a50947..9fa0f402c87c 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -6,7 +6,8 @@
6 */ 6 */
7 7
8#ifdef CONFIG_SMP 8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 9static int
10select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
10{ 11{
11 return task_cpu(p); /* IDLE tasks are never migrated */ 12
12} 13}
@@ -22,8 +23,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
22static struct task_struct *pick_next_task_idle(struct rq *rq) 23static struct task_struct *pick_next_task_idle(struct rq *rq)
23{ 24{
24 schedstat_inc(rq, sched_goidle); 25 schedstat_inc(rq, sched_goidle);
25 /* adjust the active tasks as we might go into a long sleep */ 26 calc_load_account_idle(rq);
26 calc_load_account_active(rq);
27 return rq->idle; 27 return rq->idle;
28} 28}
29 29
@@ -32,7 +32,7 @@ static struct task_struct *pick_next_task_idle(struct rq *rq)
32 * message if some code attempts to do it: 32 * message if some code attempts to do it:
33 */ 33 */
34static void 34static void
35dequeue_task_idle(struct rq *rq, struct task_struct *p, int sleep) 35dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 raw_spin_unlock_irq(&rq->lock); 37 raw_spin_unlock_irq(&rq->lock);
38 printk(KERN_ERR "bad: scheduling from the idle thread!\n"); 38 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b5b920ae2ea7..8afb953e31c6 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -613,7 +613,7 @@ static void update_curr_rt(struct rq *rq)
613 if (unlikely((s64)delta_exec < 0)) 613 if (unlikely((s64)delta_exec < 0))
614 delta_exec = 0; 614 delta_exec = 0;
615 615
616 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 616 schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec));
617 617
618 curr->se.sum_exec_runtime += delta_exec; 618 curr->se.sum_exec_runtime += delta_exec;
619 account_group_exec_runtime(curr, delta_exec); 619 account_group_exec_runtime(curr, delta_exec);
@@ -888,20 +888,20 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
888 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
889 */ 889 */
890static void 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head) 891enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
892{ 892{
893 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
894 894
895 if (wakeup) 895 if (flags & ENQUEUE_WAKEUP)
896 rt_se->timeout = 0; 896 rt_se->timeout = 0;
897 897
898 enqueue_rt_entity(rt_se, head); 898 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
899 899
900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
901 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
902} 902}
903 903
904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 904static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
905{ 905{
906 struct sched_rt_entity *rt_se = &p->rt; 906 struct sched_rt_entity *rt_se = &p->rt;
907 907
@@ -948,10 +948,9 @@ static void yield_task_rt(struct rq *rq)
948#ifdef CONFIG_SMP 948#ifdef CONFIG_SMP
949static int find_lowest_rq(struct task_struct *task); 949static int find_lowest_rq(struct task_struct *task);
950 950
951static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 951static int
952select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
952{ 953{
953 struct rq *rq = task_rq(p);
954
955 if (sd_flag != SD_BALANCE_WAKE) 954 if (sd_flag != SD_BALANCE_WAKE)
956 return smp_processor_id(); 955 return smp_processor_id();
957 956
diff --git a/kernel/signal.c b/kernel/signal.c
index dbd7fe073c55..bded65187780 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info)
637 637
638/* 638/*
639 * Bad permissions for sending the signal 639 * Bad permissions for sending the signal
640 * - the caller must hold at least the RCU read lock 640 * - the caller must hold the RCU read lock
641 */ 641 */
642static int check_kill_permission(int sig, struct siginfo *info, 642static int check_kill_permission(int sig, struct siginfo *info,
643 struct task_struct *t) 643 struct task_struct *t)
644{ 644{
645 const struct cred *cred = current_cred(), *tcred; 645 const struct cred *cred, *tcred;
646 struct pid *sid; 646 struct pid *sid;
647 int error; 647 int error;
648 648
@@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info,
656 if (error) 656 if (error)
657 return error; 657 return error;
658 658
659 cred = current_cred();
659 tcred = __task_cred(t); 660 tcred = __task_cred(t);
660 if ((cred->euid ^ tcred->suid) && 661 if (!same_thread_group(current, t) &&
662 (cred->euid ^ tcred->suid) &&
661 (cred->euid ^ tcred->uid) && 663 (cred->euid ^ tcred->uid) &&
662 (cred->uid ^ tcred->suid) && 664 (cred->uid ^ tcred->suid) &&
663 (cred->uid ^ tcred->uid) && 665 (cred->uid ^ tcred->uid) &&
@@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
1083/* 1085/*
1084 * Nuke all other threads in the group. 1086 * Nuke all other threads in the group.
1085 */ 1087 */
1086void zap_other_threads(struct task_struct *p) 1088int zap_other_threads(struct task_struct *p)
1087{ 1089{
1088 struct task_struct *t; 1090 struct task_struct *t = p;
1091 int count = 0;
1089 1092
1090 p->signal->group_stop_count = 0; 1093 p->signal->group_stop_count = 0;
1091 1094
1092 for (t = next_thread(p); t != p; t = next_thread(t)) { 1095 while_each_thread(p, t) {
1093 /* 1096 count++;
1094 * Don't bother with already dead threads 1097
1095 */ 1098 /* Don't bother with already dead threads */
1096 if (t->exit_state) 1099 if (t->exit_state)
1097 continue; 1100 continue;
1098
1099 /* SIGKILL will be handled before any pending SIGSTOP */
1100 sigaddset(&t->pending.signal, SIGKILL); 1101 sigaddset(&t->pending.signal, SIGKILL);
1101 signal_wake_up(t, 1); 1102 signal_wake_up(t, 1);
1102 } 1103 }
1104
1105 return count;
1103} 1106}
1104 1107
1105struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) 1108struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
@@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
1124 1127
1125/* 1128/*
1126 * send signal info to all the members of a group 1129 * send signal info to all the members of a group
1127 * - the caller must hold the RCU read lock at least
1128 */ 1130 */
1129int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) 1131int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
1130{ 1132{
1131 int ret = check_kill_permission(sig, info, p); 1133 int ret;
1134
1135 rcu_read_lock();
1136 ret = check_kill_permission(sig, info, p);
1137 rcu_read_unlock();
1132 1138
1133 if (!ret && sig) 1139 if (!ret && sig)
1134 ret = do_send_sig_info(sig, info, p, true); 1140 ret = do_send_sig_info(sig, info, p, true);
@@ -2735,3 +2741,43 @@ void __init signals_init(void)
2735{ 2741{
2736 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); 2742 sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC);
2737} 2743}
2744
2745#ifdef CONFIG_KGDB_KDB
2746#include <linux/kdb.h>
2747/*
2748 * kdb_send_sig_info - Allows kdb to send signals without exposing
2749 * signal internals. This function checks if the required locks are
2750 * available before calling the main signal code, to avoid kdb
2751 * deadlocks.
2752 */
2753void
2754kdb_send_sig_info(struct task_struct *t, struct siginfo *info)
2755{
2756 static struct task_struct *kdb_prev_t;
2757 int sig, new_t;
2758 if (!spin_trylock(&t->sighand->siglock)) {
2759 kdb_printf("Can't do kill command now.\n"
2760 "The sigmask lock is held somewhere else in "
2761 "kernel, try again later\n");
2762 return;
2763 }
2764 spin_unlock(&t->sighand->siglock);
2765 new_t = kdb_prev_t != t;
2766 kdb_prev_t = t;
2767 if (t->state != TASK_RUNNING && new_t) {
2768 kdb_printf("Process is not RUNNING, sending a signal from "
2769 "kdb risks deadlock\n"
2770 "on the run queue locks. "
2771 "The signal has _not_ been sent.\n"
2772 "Reissue the kill command if you want to risk "
2773 "the deadlock.\n");
2774 return;
2775 }
2776 sig = info->si_signo;
2777 if (send_sig_info(sig, info, t))
2778 kdb_printf("Fail to deliver Signal %d to process %d.\n",
2779 sig, t->pid);
2780 else
2781 kdb_printf("Signal %d is sent to process %d.\n", sig, t->pid);
2782}
2783#endif /* CONFIG_KGDB_KDB */
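
kdb_send_sig_info() above only probes the siglock: it trylocks to learn whether the lock is free and releases it again immediately. A userspace sketch of that probe-then-bail pattern (pthreads; the helper name is invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

/* returns true if 'lock' can currently be taken without blocking */
static bool lock_is_available(void)
{
	if (pthread_mutex_trylock(&lock) != 0)
		return false;		/* held elsewhere, caller should retry later */
	pthread_mutex_unlock(&lock);	/* only probing, release right away */
	return true;
}

int main(void)
{
	if (!lock_is_available()) {
		printf("lock busy, try again later\n");
		return 1;
	}
	printf("lock free, safe to go ahead\n");
	return 0;
}

As in kdb, such a probe is only meaningful when nothing else can take the lock in the meantime, which holds there because the machine is stopped in the debugger; in ordinary code the answer could be stale immediately.
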
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 7494bbf5a270..7d3f4fa9ef4f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
637 goto cancelled; 637 goto cancelled;
638 638
639 /* the timer holds a reference whilst it is pending */ 639 /* the timer holds a reference whilst it is pending */
640 ret = work->ops->get_ref(work); 640 ret = slow_work_get_ref(work);
641 if (ret < 0) 641 if (ret < 0)
642 goto cant_get_ref; 642 goto cant_get_ref;
643 643
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 321f3c59d732..a29ebd1ef41d 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
43 */ 43 */
44static inline void slow_work_set_thread_pid(int id, pid_t pid) 44static inline void slow_work_set_thread_pid(int id, pid_t pid)
45{ 45{
46#ifdef CONFIG_SLOW_WORK_PROC 46#ifdef CONFIG_SLOW_WORK_DEBUG
47 slow_work_pids[id] = pid; 47 slow_work_pids[id] = pid;
48#endif 48#endif
49} 49}
50 50
51static inline void slow_work_mark_time(struct slow_work *work) 51static inline void slow_work_mark_time(struct slow_work *work)
52{ 52{
53#ifdef CONFIG_SLOW_WORK_PROC 53#ifdef CONFIG_SLOW_WORK_DEBUG
54 work->mark = CURRENT_TIME; 54 work->mark = CURRENT_TIME;
55#endif 55#endif
56} 56}
57 57
58static inline void slow_work_begin_exec(int id, struct slow_work *work) 58static inline void slow_work_begin_exec(int id, struct slow_work *work)
59{ 59{
60#ifdef CONFIG_SLOW_WORK_PROC 60#ifdef CONFIG_SLOW_WORK_DEBUG
61 slow_work_execs[id] = work; 61 slow_work_execs[id] = work;
62#endif 62#endif
63} 63}
64 64
65static inline void slow_work_end_exec(int id, struct slow_work *work) 65static inline void slow_work_end_exec(int id, struct slow_work *work)
66{ 66{
67#ifdef CONFIG_SLOW_WORK_PROC 67#ifdef CONFIG_SLOW_WORK_DEBUG
68 write_lock(&slow_work_execs_lock); 68 write_lock(&slow_work_execs_lock);
69 slow_work_execs[id] = NULL; 69 slow_work_execs[id] = NULL;
70 write_unlock(&slow_work_execs_lock); 70 write_unlock(&slow_work_execs_lock);
diff --git a/kernel/smp.c b/kernel/smp.c
index 9867b6bfefce..75c970c715d3 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -9,6 +9,7 @@
9#include <linux/module.h> 9#include <linux/module.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h>
12#include <linux/smp.h> 13#include <linux/smp.h>
13#include <linux/cpu.h> 14#include <linux/cpu.h>
14 15
@@ -51,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
51 case CPU_UP_PREPARE_FROZEN: 52 case CPU_UP_PREPARE_FROZEN:
52 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, 53 if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
53 cpu_to_node(cpu))) 54 cpu_to_node(cpu)))
54 return NOTIFY_BAD; 55 return notifier_from_errno(-ENOMEM);
55 break; 56 break;
56 57
57#ifdef CONFIG_HOTPLUG_CPU 58#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7c1a67ef0274..07b4f1b1a73a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -716,7 +716,7 @@ static int run_ksoftirqd(void * __bind_cpu)
716 preempt_enable_no_resched(); 716 preempt_enable_no_resched();
717 cond_resched(); 717 cond_resched();
718 preempt_disable(); 718 preempt_disable();
719 rcu_sched_qs((long)__bind_cpu); 719 rcu_note_context_switch((long)__bind_cpu);
720 } 720 }
721 preempt_enable(); 721 preempt_enable();
722 set_current_state(TASK_INTERRUPTIBLE); 722 set_current_state(TASK_INTERRUPTIBLE);
@@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); 808 p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
809 if (IS_ERR(p)) { 809 if (IS_ERR(p)) {
810 printk("ksoftirqd for %i failed\n", hotcpu); 810 printk("ksoftirqd for %i failed\n", hotcpu);
811 return NOTIFY_BAD; 811 return notifier_from_errno(PTR_ERR(p));
812 } 812 }
813 kthread_bind(p, hotcpu); 813 kthread_bind(p, hotcpu);
814 per_cpu(ksoftirqd, hotcpu) = p; 814 per_cpu(ksoftirqd, hotcpu) = p;
@@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void)
850 void *cpu = (void *)(long)smp_processor_id(); 850 void *cpu = (void *)(long)smp_processor_id();
851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 851 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
852 852
853 BUG_ON(err == NOTIFY_BAD); 853 BUG_ON(err != NOTIFY_OK);
854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 854 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
855 register_cpu_notifier(&cpu_nfb); 855 register_cpu_notifier(&cpu_nfb);
856 return 0; 856 return 0;
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 0d4c7898ab80..4b493f67dcb5 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -155,11 +155,11 @@ void softlockup_tick(void)
155 * Wake up the high-prio watchdog task twice per 155 * Wake up the high-prio watchdog task twice per
156 * threshold timespan. 156 * threshold timespan.
157 */ 157 */
158 if (now > touch_ts + softlockup_thresh/2) 158 if (time_after(now - softlockup_thresh/2, touch_ts))
159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); 159 wake_up_process(per_cpu(softlockup_watchdog, this_cpu));
160 160
161 /* Warn about unreasonable delays: */ 161 /* Warn about unreasonable delays: */
162 if (now <= (touch_ts + softlockup_thresh)) 162 if (time_before_eq(now - softlockup_thresh, touch_ts))
163 return; 163 return;
164 164
165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts; 165 per_cpu(softlockup_print_ts, this_cpu) = touch_ts;
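
The switch to time_after()/time_before_eq() above matters because the timestamps are unsigned and eventually wrap; the macros compare through a signed difference instead of directly. A small stand-alone demonstration (simplified copies of the helpers, not the kernel headers):

#include <stdio.h>
#include <limits.h>

/* simplified versions of the kernel's time_after()/time_before() */
#define my_time_after(a, b)	((long)((b) - (a)) < 0)
#define my_time_before(a, b)	my_time_after(b, a)

int main(void)
{
	unsigned long before_wrap = ULONG_MAX - 5;	/* just before the counter wraps */
	unsigned long after_wrap  = 10;			/* just after the wrap */

	/* a naive comparison gets this wrong, the signed difference does not */
	printf("naive:      after_wrap > before_wrap ? %d\n",
	       after_wrap > before_wrap);
	printf("time_after: after_wrap later than before_wrap ? %d\n",
	       my_time_after(after_wrap, before_wrap));
	return 0;
}
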
diff --git a/kernel/srcu.c b/kernel/srcu.c
index bde4295774c8..2980da3fd509 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -30,7 +30,6 @@
30#include <linux/preempt.h> 30#include <linux/preempt.h>
31#include <linux/rcupdate.h> 31#include <linux/rcupdate.h>
32#include <linux/sched.h> 32#include <linux/sched.h>
33#include <linux/slab.h>
34#include <linux/smp.h> 33#include <linux/smp.h>
35#include <linux/srcu.h> 34#include <linux/srcu.h>
36 35
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 9bb9fb1bd79c..70f8d90331e9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,17 +1,384 @@
1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/*
2 * GPL v2 and any later version. 2 * kernel/stop_machine.c
3 *
4 * Copyright (C) 2008, 2005 IBM Corporation.
5 * Copyright (C) 2008, 2005 Rusty Russell rusty@rustcorp.com.au
6 * Copyright (C) 2010 SUSE Linux Products GmbH
7 * Copyright (C) 2010 Tejun Heo <tj@kernel.org>
8 *
9 * This file is released under the GPLv2 and any later version.
3 */ 10 */
11#include <linux/completion.h>
4#include <linux/cpu.h> 12#include <linux/cpu.h>
5#include <linux/err.h> 13#include <linux/init.h>
6#include <linux/kthread.h> 14#include <linux/kthread.h>
7#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/percpu.h>
8#include <linux/sched.h> 17#include <linux/sched.h>
9#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
10#include <linux/syscalls.h>
11#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/kallsyms.h>
12 21
13#include <asm/atomic.h> 22#include <asm/atomic.h>
14#include <asm/uaccess.h> 23
24/*
25 * Structure to determine completion condition and record errors. May
26 * be shared by works on different cpus.
27 */
28struct cpu_stop_done {
29 atomic_t nr_todo; /* nr left to execute */
30 bool executed; /* actually executed? */
31 int ret; /* collected return value */
32 struct completion completion; /* fired if nr_todo reaches 0 */
33};
34
35/* the actual stopper, one per every possible cpu, enabled on online cpus */
36struct cpu_stopper {
37 spinlock_t lock;
38 struct list_head works; /* list of pending works */
39 struct task_struct *thread; /* stopper thread */
40 bool enabled; /* is this stopper enabled? */
41};
42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{
47 memset(done, 0, sizeof(*done));
48 atomic_set(&done->nr_todo, nr_todo);
49 init_completion(&done->completion);
50}
51
52/* signal completion unless @done is NULL */
53static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed)
54{
55 if (done) {
56 if (executed)
57 done->executed = true;
58 if (atomic_dec_and_test(&done->nr_todo))
59 complete(&done->completion);
60 }
61}
62
63/* queue @work to @stopper. if offline, @work is completed immediately */
64static void cpu_stop_queue_work(struct cpu_stopper *stopper,
65 struct cpu_stop_work *work)
66{
67 unsigned long flags;
68
69 spin_lock_irqsave(&stopper->lock, flags);
70
71 if (stopper->enabled) {
72 list_add_tail(&work->list, &stopper->works);
73 wake_up_process(stopper->thread);
74 } else
75 cpu_stop_signal_done(work->done, false);
76
77 spin_unlock_irqrestore(&stopper->lock, flags);
78}
79
80/**
81 * stop_one_cpu - stop a cpu
82 * @cpu: cpu to stop
83 * @fn: function to execute
84 * @arg: argument to @fn
85 *
86 * Execute @fn(@arg) on @cpu. @fn is run in a process context with
87 * the highest priority preempting any task on the cpu and
88 * monopolizing it. This function returns after the execution is
89 * complete.
90 *
91 * This function doesn't guarantee @cpu stays online till @fn
92 * completes. If @cpu goes down in the middle, execution may happen
93 * partially or fully on different cpus. @fn should either be ready
94 * for that or the caller should ensure that @cpu stays online until
95 * this function completes.
96 *
97 * CONTEXT:
98 * Might sleep.
99 *
100 * RETURNS:
101 * -ENOENT if @fn(@arg) was not executed because @cpu was offline;
102 * otherwise, the return value of @fn.
103 */
104int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
105{
106 struct cpu_stop_done done;
107 struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done };
108
109 cpu_stop_init_done(&done, 1);
110 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), &work);
111 wait_for_completion(&done.completion);
112 return done.executed ? done.ret : -ENOENT;
113}
114
115/**
116 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
117 * @cpu: cpu to stop
118 * @fn: function to execute
119 * @arg: argument to @fn
120 *
121 * Similar to stop_one_cpu() but doesn't wait for completion. The
122 * caller is responsible for ensuring @work_buf is currently unused
123 * and will remain untouched until stopper starts executing @fn.
124 *
125 * CONTEXT:
126 * Don't care.
127 */
128void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
129 struct cpu_stop_work *work_buf)
130{
131 *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, };
132 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
133}
134
135/* static data for stop_cpus */
136static DEFINE_MUTEX(stop_cpus_mutex);
137static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
138
139int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
140{
141 struct cpu_stop_work *work;
142 struct cpu_stop_done done;
143 unsigned int cpu;
144
145 /* initialize works and done */
146 for_each_cpu(cpu, cpumask) {
147 work = &per_cpu(stop_cpus_work, cpu);
148 work->fn = fn;
149 work->arg = arg;
150 work->done = &done;
151 }
152 cpu_stop_init_done(&done, cpumask_weight(cpumask));
153
154 /*
155 * Disable preemption while queueing to avoid getting
156 * preempted by a stopper which might wait for other stoppers
157 * to enter @fn which can lead to deadlock.
158 */
159 preempt_disable();
160 for_each_cpu(cpu, cpumask)
161 cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu),
162 &per_cpu(stop_cpus_work, cpu));
163 preempt_enable();
164
165 wait_for_completion(&done.completion);
166 return done.executed ? done.ret : -ENOENT;
167}
168
169/**
170 * stop_cpus - stop multiple cpus
171 * @cpumask: cpus to stop
172 * @fn: function to execute
173 * @arg: argument to @fn
174 *
175 * Execute @fn(@arg) on online cpus in @cpumask. On each target cpu,
176 * @fn is run in a process context with the highest priority
177 * preempting any task on the cpu and monopolizing it. This function
178 * returns after all executions are complete.
179 *
180 * This function doesn't guarantee the cpus in @cpumask stay online
181 * till @fn completes. If some cpus go down in the middle, execution
182 * on the cpu may happen partially or fully on different cpus. @fn
183 * should either be ready for that or the caller should ensure that
184 * the cpus stay online until this function completes.
185 *
186 * All stop_cpus() calls are serialized making it safe for @fn to wait
187 * for all cpus to start executing it.
188 *
189 * CONTEXT:
190 * Might sleep.
191 *
192 * RETURNS:
193 * -ENOENT if @fn(@arg) was not executed at all because all cpus in
194 * @cpumask were offline; otherwise, 0 if all executions of @fn
195 * returned 0, any non zero return value if any returned non zero.
196 */
197int stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
198{
199 int ret;
200
201 /* static works are used, process one request at a time */
202 mutex_lock(&stop_cpus_mutex);
203 ret = __stop_cpus(cpumask, fn, arg);
204 mutex_unlock(&stop_cpus_mutex);
205 return ret;
206}
207
208/**
209 * try_stop_cpus - try to stop multiple cpus
210 * @cpumask: cpus to stop
211 * @fn: function to execute
212 * @arg: argument to @fn
213 *
214 * Identical to stop_cpus() except that it fails with -EAGAIN if
215 * someone else is already using the facility.
216 *
217 * CONTEXT:
218 * Might sleep.
219 *
220 * RETURNS:
221 * -EAGAIN if someone else is already stopping cpus, -ENOENT if
222 * @fn(@arg) was not executed at all because all cpus in @cpumask were
223 * offline; otherwise, 0 if all executions of @fn returned 0, any non
224 * zero return value if any returned non zero.
225 */
226int try_stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
227{
228 int ret;
229
230 /* static works are used, process one request at a time */
231 if (!mutex_trylock(&stop_cpus_mutex))
232 return -EAGAIN;
233 ret = __stop_cpus(cpumask, fn, arg);
234 mutex_unlock(&stop_cpus_mutex);
235 return ret;
236}
237
238static int cpu_stopper_thread(void *data)
239{
240 struct cpu_stopper *stopper = data;
241 struct cpu_stop_work *work;
242 int ret;
243
244repeat:
245 set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */
246
247 if (kthread_should_stop()) {
248 __set_current_state(TASK_RUNNING);
249 return 0;
250 }
251
252 work = NULL;
253 spin_lock_irq(&stopper->lock);
254 if (!list_empty(&stopper->works)) {
255 work = list_first_entry(&stopper->works,
256 struct cpu_stop_work, list);
257 list_del_init(&work->list);
258 }
259 spin_unlock_irq(&stopper->lock);
260
261 if (work) {
262 cpu_stop_fn_t fn = work->fn;
263 void *arg = work->arg;
264 struct cpu_stop_done *done = work->done;
265 char ksym_buf[KSYM_NAME_LEN];
266
267 __set_current_state(TASK_RUNNING);
268
269 /* cpu stop callbacks are not allowed to sleep */
270 preempt_disable();
271
272 ret = fn(arg);
273 if (ret)
274 done->ret = ret;
275
276 /* restore preemption and check it's still balanced */
277 preempt_enable();
278 WARN_ONCE(preempt_count(),
279 "cpu_stop: %s(%p) leaked preempt count\n",
280 kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL,
281 ksym_buf), arg);
282
283 cpu_stop_signal_done(done, true);
284 } else
285 schedule();
286
287 goto repeat;
288}
289
290/* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
291static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
292 unsigned long action, void *hcpu)
293{
294 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
295 unsigned int cpu = (unsigned long)hcpu;
296 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
297 struct task_struct *p;
298
299 switch (action & ~CPU_TASKS_FROZEN) {
300 case CPU_UP_PREPARE:
301 BUG_ON(stopper->thread || stopper->enabled ||
302 !list_empty(&stopper->works));
303 p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
304 cpu);
305 if (IS_ERR(p))
306 return NOTIFY_BAD;
307 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
308 get_task_struct(p);
309 stopper->thread = p;
310 break;
311
312 case CPU_ONLINE:
313 kthread_bind(stopper->thread, cpu);
314 /* strictly unnecessary, as first user will wake it */
315 wake_up_process(stopper->thread);
316 /* mark enabled */
317 spin_lock_irq(&stopper->lock);
318 stopper->enabled = true;
319 spin_unlock_irq(&stopper->lock);
320 break;
321
322#ifdef CONFIG_HOTPLUG_CPU
323 case CPU_UP_CANCELED:
324 case CPU_POST_DEAD:
325 {
326 struct cpu_stop_work *work;
327
328 /* kill the stopper */
329 kthread_stop(stopper->thread);
330 /* drain remaining works */
331 spin_lock_irq(&stopper->lock);
332 list_for_each_entry(work, &stopper->works, list)
333 cpu_stop_signal_done(work->done, false);
334 stopper->enabled = false;
335 spin_unlock_irq(&stopper->lock);
336 /* release the stopper */
337 put_task_struct(stopper->thread);
338 stopper->thread = NULL;
339 break;
340 }
341#endif
342 }
343
344 return NOTIFY_OK;
345}
346
347/*
348 * Give it a higher priority so that cpu stopper is available to other
349 * cpu notifiers. It currently shares the same priority as sched
350 * migration_notifier.
351 */
352static struct notifier_block __cpuinitdata cpu_stop_cpu_notifier = {
353 .notifier_call = cpu_stop_cpu_callback,
354 .priority = 10,
355};
356
357static int __init cpu_stop_init(void)
358{
359 void *bcpu = (void *)(long)smp_processor_id();
360 unsigned int cpu;
361 int err;
362
363 for_each_possible_cpu(cpu) {
364 struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
365
366 spin_lock_init(&stopper->lock);
367 INIT_LIST_HEAD(&stopper->works);
368 }
369
370 /* start one for the boot cpu */
371 err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
372 bcpu);
373 BUG_ON(err == NOTIFY_BAD);
374 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
375 register_cpu_notifier(&cpu_stop_cpu_notifier);
376
377 return 0;
378}
379early_initcall(cpu_stop_init);
380
381#ifdef CONFIG_STOP_MACHINE
15 382
16/* This controls the threads on each CPU. */ 383/* This controls the threads on each CPU. */
17enum stopmachine_state { 384enum stopmachine_state {
@@ -26,174 +393,94 @@ enum stopmachine_state {
26 /* Exit */ 393 /* Exit */
27 STOPMACHINE_EXIT, 394 STOPMACHINE_EXIT,
28}; 395};
29static enum stopmachine_state state;
30 396
31struct stop_machine_data { 397struct stop_machine_data {
32 int (*fn)(void *); 398 int (*fn)(void *);
33 void *data; 399 void *data;
34 int fnret; 400 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
401 unsigned int num_threads;
402 const struct cpumask *active_cpus;
403
404 enum stopmachine_state state;
405 atomic_t thread_ack;
35}; 406};
36 407
37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */ 408static void set_state(struct stop_machine_data *smdata,
38static unsigned int num_threads; 409 enum stopmachine_state newstate)
39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
41/* setup_lock protects refcount, stop_machine_wq and stop_machine_work. */
42static DEFINE_MUTEX(setup_lock);
43/* Users of stop_machine. */
44static int refcount;
45static struct workqueue_struct *stop_machine_wq;
46static struct stop_machine_data active, idle;
47static const struct cpumask *active_cpus;
48static void __percpu *stop_machine_work;
49
50static void set_state(enum stopmachine_state newstate)
51{ 410{
52 /* Reset ack counter. */ 411 /* Reset ack counter. */
53 atomic_set(&thread_ack, num_threads); 412 atomic_set(&smdata->thread_ack, smdata->num_threads);
54 smp_wmb(); 413 smp_wmb();
55 state = newstate; 414 smdata->state = newstate;
56} 415}
57 416
58/* Last one to ack a state moves to the next state. */ 417/* Last one to ack a state moves to the next state. */
59static void ack_state(void) 418static void ack_state(struct stop_machine_data *smdata)
60{ 419{
61 if (atomic_dec_and_test(&thread_ack)) 420 if (atomic_dec_and_test(&smdata->thread_ack))
62 set_state(state + 1); 421 set_state(smdata, smdata->state + 1);
63} 422}
64 423
65/* This is the actual function which stops the CPU. It runs 424/* This is the cpu_stop function which stops the CPU. */
66 * in the context of a dedicated stopmachine workqueue. */ 425static int stop_machine_cpu_stop(void *data)
67static void stop_cpu(struct work_struct *unused)
68{ 426{
427 struct stop_machine_data *smdata = data;
69 enum stopmachine_state curstate = STOPMACHINE_NONE; 428 enum stopmachine_state curstate = STOPMACHINE_NONE;
70 struct stop_machine_data *smdata = &idle; 429 int cpu = smp_processor_id(), err = 0;
71 int cpu = smp_processor_id(); 430 bool is_active;
72 int err; 431
432 if (!smdata->active_cpus)
433 is_active = cpu == cpumask_first(cpu_online_mask);
434 else
435 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
73 436
74 if (!active_cpus) {
75 if (cpu == cpumask_first(cpu_online_mask))
76 smdata = &active;
77 } else {
78 if (cpumask_test_cpu(cpu, active_cpus))
79 smdata = &active;
80 }
81 /* Simple state machine */ 437 /* Simple state machine */
82 do { 438 do {
83 /* Chill out and ensure we re-read stopmachine_state. */ 439 /* Chill out and ensure we re-read stopmachine_state. */
84 cpu_relax(); 440 cpu_relax();
85 if (state != curstate) { 441 if (smdata->state != curstate) {
86 curstate = state; 442 curstate = smdata->state;
87 switch (curstate) { 443 switch (curstate) {
88 case STOPMACHINE_DISABLE_IRQ: 444 case STOPMACHINE_DISABLE_IRQ:
89 local_irq_disable(); 445 local_irq_disable();
90 hard_irq_disable(); 446 hard_irq_disable();
91 break; 447 break;
92 case STOPMACHINE_RUN: 448 case STOPMACHINE_RUN:
93 /* On multiple CPUs only a single error code 449 if (is_active)
94 * is needed to tell that something failed. */ 450 err = smdata->fn(smdata->data);
95 err = smdata->fn(smdata->data);
96 if (err)
97 smdata->fnret = err;
98 break; 451 break;
99 default: 452 default:
100 break; 453 break;
101 } 454 }
102 ack_state(); 455 ack_state(smdata);
103 } 456 }
104 } while (curstate != STOPMACHINE_EXIT); 457 } while (curstate != STOPMACHINE_EXIT);
105 458
106 local_irq_enable(); 459 local_irq_enable();
460 return err;
107} 461}
108 462
109/* Callback for CPUs which aren't supposed to do anything. */
110static int chill(void *unused)
111{
112 return 0;
113}
114
115int stop_machine_create(void)
116{
117 mutex_lock(&setup_lock);
118 if (refcount)
119 goto done;
120 stop_machine_wq = create_rt_workqueue("kstop");
121 if (!stop_machine_wq)
122 goto err_out;
123 stop_machine_work = alloc_percpu(struct work_struct);
124 if (!stop_machine_work)
125 goto err_out;
126done:
127 refcount++;
128 mutex_unlock(&setup_lock);
129 return 0;
130
131err_out:
132 if (stop_machine_wq)
133 destroy_workqueue(stop_machine_wq);
134 mutex_unlock(&setup_lock);
135 return -ENOMEM;
136}
137EXPORT_SYMBOL_GPL(stop_machine_create);
138
139void stop_machine_destroy(void)
140{
141 mutex_lock(&setup_lock);
142 refcount--;
143 if (refcount)
144 goto done;
145 destroy_workqueue(stop_machine_wq);
146 free_percpu(stop_machine_work);
147done:
148 mutex_unlock(&setup_lock);
149}
150EXPORT_SYMBOL_GPL(stop_machine_destroy);
151
152int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 463int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
153{ 464{
154 struct work_struct *sm_work; 465 struct stop_machine_data smdata = { .fn = fn, .data = data,
155 int i, ret; 466 .num_threads = num_online_cpus(),
156 467 .active_cpus = cpus };
157 /* Set up initial state. */ 468
158 mutex_lock(&lock); 469 /* Set the initial state and stop all online cpus. */
159 num_threads = num_online_cpus(); 470 set_state(&smdata, STOPMACHINE_PREPARE);
160 active_cpus = cpus; 471 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
161 active.fn = fn;
162 active.data = data;
163 active.fnret = 0;
164 idle.fn = chill;
165 idle.data = NULL;
166
167 set_state(STOPMACHINE_PREPARE);
168
169 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
170 * doesn't hit this CPU until we're ready. */
171 get_cpu();
172 for_each_online_cpu(i) {
173 sm_work = per_cpu_ptr(stop_machine_work, i);
174 INIT_WORK(sm_work, stop_cpu);
175 queue_work_on(i, stop_machine_wq, sm_work);
176 }
177 /* This will release the thread on our CPU. */
178 put_cpu();
179 flush_workqueue(stop_machine_wq);
180 ret = active.fnret;
181 mutex_unlock(&lock);
182 return ret;
183} 472}
184 473
185int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 474int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
186{ 475{
187 int ret; 476 int ret;
188 477
189 ret = stop_machine_create();
190 if (ret)
191 return ret;
192 /* No CPUs can come up or down during this. */ 478 /* No CPUs can come up or down during this. */
193 get_online_cpus(); 479 get_online_cpus();
194 ret = __stop_machine(fn, data, cpus); 480 ret = __stop_machine(fn, data, cpus);
195 put_online_cpus(); 481 put_online_cpus();
196 stop_machine_destroy();
197 return ret; 482 return ret;
198} 483}
199EXPORT_SYMBOL_GPL(stop_machine); 484EXPORT_SYMBOL_GPL(stop_machine);
485
486#endif /* CONFIG_STOP_MACHINE */
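
Seen from a caller's side, the new interface boils down to handing a non-sleeping callback to the stopper of one CPU (or of several via stop_cpus()). The fragment below is only a sketch of how a hypothetical module might use stop_one_cpu(); the callback, counter and module are invented for the illustration and are not part of this patch:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/stop_machine.h>
#include <linux/smp.h>

/* runs on the target cpu with preemption disabled; must not sleep */
static int example_stop_fn(void *arg)
{
	unsigned int *counter = arg;

	(*counter)++;
	pr_info("cpu_stop callback on cpu %d\n", smp_processor_id());
	return 0;	/* becomes stop_one_cpu()'s return value */
}

static int __init cpu_stop_example_init(void)
{
	unsigned int counter = 0;
	int ret;

	/* blocks until the callback has run on cpu 0, or -ENOENT if it is offline */
	ret = stop_one_cpu(0, example_stop_fn, &counter);
	pr_info("stop_one_cpu() returned %d, counter=%u\n", ret, counter);
	return 0;
}
module_init(cpu_stop_example_init);

MODULE_LICENSE("GPL");
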
diff --git a/kernel/sys.c b/kernel/sys.c
index 8298878f4f71..e83ddbbaf89d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -36,6 +36,7 @@
36#include <linux/personality.h> 36#include <linux/personality.h>
37#include <linux/ptrace.h> 37#include <linux/ptrace.h>
38#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
39#include <linux/gfp.h>
39 40
40#include <linux/compat.h> 41#include <linux/compat.h>
41#include <linux/syscalls.h> 42#include <linux/syscalls.h>
@@ -491,10 +492,6 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
491 return -ENOMEM; 492 return -ENOMEM;
492 old = current_cred(); 493 old = current_cred();
493 494
494 retval = security_task_setgid(rgid, egid, (gid_t)-1, LSM_SETID_RE);
495 if (retval)
496 goto error;
497
498 retval = -EPERM; 495 retval = -EPERM;
499 if (rgid != (gid_t) -1) { 496 if (rgid != (gid_t) -1) {
500 if (old->gid == rgid || 497 if (old->gid == rgid ||
@@ -542,10 +539,6 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
542 return -ENOMEM; 539 return -ENOMEM;
543 old = current_cred(); 540 old = current_cred();
544 541
545 retval = security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_ID);
546 if (retval)
547 goto error;
548
549 retval = -EPERM; 542 retval = -EPERM;
550 if (capable(CAP_SETGID)) 543 if (capable(CAP_SETGID))
551 new->gid = new->egid = new->sgid = new->fsgid = gid; 544 new->gid = new->egid = new->sgid = new->fsgid = gid;
@@ -609,10 +602,6 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
609 return -ENOMEM; 602 return -ENOMEM;
610 old = current_cred(); 603 old = current_cred();
611 604
612 retval = security_task_setuid(ruid, euid, (uid_t)-1, LSM_SETID_RE);
613 if (retval)
614 goto error;
615
616 retval = -EPERM; 605 retval = -EPERM;
617 if (ruid != (uid_t) -1) { 606 if (ruid != (uid_t) -1) {
618 new->uid = ruid; 607 new->uid = ruid;
@@ -674,10 +663,6 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
674 return -ENOMEM; 663 return -ENOMEM;
675 old = current_cred(); 664 old = current_cred();
676 665
677 retval = security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_ID);
678 if (retval)
679 goto error;
680
681 retval = -EPERM; 666 retval = -EPERM;
682 if (capable(CAP_SETUID)) { 667 if (capable(CAP_SETUID)) {
683 new->suid = new->uid = uid; 668 new->suid = new->uid = uid;
@@ -718,9 +703,6 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
718 if (!new) 703 if (!new)
719 return -ENOMEM; 704 return -ENOMEM;
720 705
721 retval = security_task_setuid(ruid, euid, suid, LSM_SETID_RES);
722 if (retval)
723 goto error;
724 old = current_cred(); 706 old = current_cred();
725 707
726 retval = -EPERM; 708 retval = -EPERM;
@@ -787,10 +769,6 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
787 return -ENOMEM; 769 return -ENOMEM;
788 old = current_cred(); 770 old = current_cred();
789 771
790 retval = security_task_setgid(rgid, egid, sgid, LSM_SETID_RES);
791 if (retval)
792 goto error;
793
794 retval = -EPERM; 772 retval = -EPERM;
795 if (!capable(CAP_SETGID)) { 773 if (!capable(CAP_SETGID)) {
796 if (rgid != (gid_t) -1 && rgid != old->gid && 774 if (rgid != (gid_t) -1 && rgid != old->gid &&
@@ -850,9 +828,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
850 old = current_cred(); 828 old = current_cred();
851 old_fsuid = old->fsuid; 829 old_fsuid = old->fsuid;
852 830
853 if (security_task_setuid(uid, (uid_t)-1, (uid_t)-1, LSM_SETID_FS) < 0)
854 goto error;
855
856 if (uid == old->uid || uid == old->euid || 831 if (uid == old->uid || uid == old->euid ||
857 uid == old->suid || uid == old->fsuid || 832 uid == old->suid || uid == old->fsuid ||
858 capable(CAP_SETUID)) { 833 capable(CAP_SETUID)) {
@@ -863,7 +838,6 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
863 } 838 }
864 } 839 }
865 840
866error:
867 abort_creds(new); 841 abort_creds(new);
868 return old_fsuid; 842 return old_fsuid;
869 843
@@ -887,9 +861,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
887 old = current_cred(); 861 old = current_cred();
888 old_fsgid = old->fsgid; 862 old_fsgid = old->fsgid;
889 863
890 if (security_task_setgid(gid, (gid_t)-1, (gid_t)-1, LSM_SETID_FS))
891 goto error;
892
893 if (gid == old->gid || gid == old->egid || 864 if (gid == old->gid || gid == old->egid ||
894 gid == old->sgid || gid == old->fsgid || 865 gid == old->sgid || gid == old->fsgid ||
895 capable(CAP_SETGID)) { 866 capable(CAP_SETGID)) {
@@ -899,7 +870,6 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
899 } 870 }
900 } 871 }
901 872
902error:
903 abort_creds(new); 873 abort_creds(new);
904 return old_fsgid; 874 return old_fsgid;
905 875
@@ -1117,7 +1087,7 @@ DECLARE_RWSEM(uts_sem);
1117 1087
1118#ifdef COMPAT_UTS_MACHINE 1088#ifdef COMPAT_UTS_MACHINE
1119#define override_architecture(name) \ 1089#define override_architecture(name) \
1120 (current->personality == PER_LINUX32 && \ 1090 (personality(current->personality) == PER_LINUX32 && \
1121 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ 1091 copy_to_user(name->machine, COMPAT_UTS_MACHINE, \
1122 sizeof(COMPAT_UTS_MACHINE))) 1092 sizeof(COMPAT_UTS_MACHINE)))
1123#else 1093#else
@@ -1662,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
1662 1632
1663char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; 1633char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
1664 1634
1665static void argv_cleanup(char **argv, char **envp) 1635static void argv_cleanup(struct subprocess_info *info)
1666{ 1636{
1667 argv_free(argv); 1637 argv_free(info->argv);
1668} 1638}
1669 1639
1670/** 1640/**
@@ -1698,7 +1668,7 @@ int orderly_poweroff(bool force)
1698 goto out; 1668 goto out;
1699 } 1669 }
1700 1670
1701 call_usermodehelper_setcleanup(info, argv_cleanup); 1671 call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL);
1702 1672
1703 ret = call_usermodehelper_exec(info, UMH_NO_WAIT); 1673 ret = call_usermodehelper_exec(info, UMH_NO_WAIT);
1704 1674
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8686b0f5fc12..d24f761f4876 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -37,6 +37,7 @@
37#include <linux/highuid.h> 37#include <linux/highuid.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/ratelimit.h> 39#include <linux/ratelimit.h>
40#include <linux/compaction.h>
40#include <linux/hugetlb.h> 41#include <linux/hugetlb.h>
41#include <linux/initrd.h> 42#include <linux/initrd.h>
42#include <linux/key.h> 43#include <linux/key.h>
@@ -52,6 +53,7 @@
52#include <linux/slow-work.h> 53#include <linux/slow-work.h>
53#include <linux/perf_event.h> 54#include <linux/perf_event.h>
54#include <linux/kprobes.h> 55#include <linux/kprobes.h>
56#include <linux/pipe_fs_i.h>
55 57
56#include <asm/uaccess.h> 58#include <asm/uaccess.h>
57#include <asm/processor.h> 59#include <asm/processor.h>
@@ -163,6 +165,27 @@ static int proc_taint(struct ctl_table *table, int write,
163 void __user *buffer, size_t *lenp, loff_t *ppos); 165 void __user *buffer, size_t *lenp, loff_t *ppos);
164#endif 166#endif
165 167
168#ifdef CONFIG_MAGIC_SYSRQ
169static int __sysrq_enabled; /* Note: sysrq code uses its own private copy */
170
171static int sysrq_sysctl_handler(ctl_table *table, int write,
172 void __user *buffer, size_t *lenp,
173 loff_t *ppos)
174{
175 int error;
176
177 error = proc_dointvec(table, write, buffer, lenp, ppos);
178 if (error)
179 return error;
180
181 if (write)
182 sysrq_toggle_support(__sysrq_enabled);
183
184 return 0;
185}
186
187#endif
188
166static struct ctl_table root_table[]; 189static struct ctl_table root_table[];
167static struct ctl_table_root sysctl_table_root; 190static struct ctl_table_root sysctl_table_root;
168static struct ctl_table_header root_table_header = { 191static struct ctl_table_header root_table_header = {
@@ -240,6 +263,11 @@ static int min_sched_shares_ratelimit = 100000; /* 100 usec */
240static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ 263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
241#endif 264#endif
242 265
266#ifdef CONFIG_COMPACTION
267static int min_extfrag_threshold;
268static int max_extfrag_threshold = 1000;
269#endif
270
243static struct ctl_table kern_table[] = { 271static struct ctl_table kern_table[] = {
244 { 272 {
245 .procname = "sched_child_runs_first", 273 .procname = "sched_child_runs_first",
@@ -567,7 +595,7 @@ static struct ctl_table kern_table[] = {
567 .data = &__sysrq_enabled, 595 .data = &__sysrq_enabled,
568 .maxlen = sizeof (int), 596 .maxlen = sizeof (int),
569 .mode = 0644, 597 .mode = 0644,
570 .proc_handler = proc_dointvec, 598 .proc_handler = sysrq_sysctl_handler,
571 }, 599 },
572#endif 600#endif
573#ifdef CONFIG_PROC_SYSCTL 601#ifdef CONFIG_PROC_SYSCTL
@@ -621,7 +649,7 @@ static struct ctl_table kern_table[] = {
621#endif 649#endif
622 { 650 {
623 .procname = "userprocess_debug", 651 .procname = "userprocess_debug",
624 .data = &sysctl_userprocess_debug, 652 .data = &show_unhandled_signals,
625 .maxlen = sizeof(int), 653 .maxlen = sizeof(int),
626 .mode = 0644, 654 .mode = 0644,
627 .proc_handler = proc_dointvec, 655 .proc_handler = proc_dointvec,
@@ -1099,6 +1127,25 @@ static struct ctl_table vm_table[] = {
1099 .mode = 0644, 1127 .mode = 0644,
1100 .proc_handler = drop_caches_sysctl_handler, 1128 .proc_handler = drop_caches_sysctl_handler,
1101 }, 1129 },
1130#ifdef CONFIG_COMPACTION
1131 {
1132 .procname = "compact_memory",
1133 .data = &sysctl_compact_memory,
1134 .maxlen = sizeof(int),
1135 .mode = 0200,
1136 .proc_handler = sysctl_compaction_handler,
1137 },
1138 {
1139 .procname = "extfrag_threshold",
1140 .data = &sysctl_extfrag_threshold,
1141 .maxlen = sizeof(int),
1142 .mode = 0644,
1143 .proc_handler = sysctl_extfrag_handler,
1144 .extra1 = &min_extfrag_threshold,
1145 .extra2 = &max_extfrag_threshold,
1146 },
1147
1148#endif /* CONFIG_COMPACTION */
1102 { 1149 {
1103 .procname = "min_free_kbytes", 1150 .procname = "min_free_kbytes",
1104 .data = &min_free_kbytes, 1151 .data = &min_free_kbytes,
@@ -1423,6 +1470,14 @@ static struct ctl_table fs_table[] = {
1423 .child = binfmt_misc_table, 1470 .child = binfmt_misc_table,
1424 }, 1471 },
1425#endif 1472#endif
1473 {
1474 .procname = "pipe-max-size",
1475 .data = &pipe_max_size,
1476 .maxlen = sizeof(int),
1477 .mode = 0644,
1478 .proc_handler = &pipe_proc_fn,
1479 .extra1 = &pipe_min_size,
1480 },
1426/* 1481/*
1427 * NOTE: do not add new entries to this table unless you have read 1482 * NOTE: do not add new entries to this table unless you have read
1428 * Documentation/sysctl/ctl_unnumbered.txt 1483 * Documentation/sysctl/ctl_unnumbered.txt
@@ -1431,7 +1486,8 @@ static struct ctl_table fs_table[] = {
1431}; 1486};
1432 1487
1433static struct ctl_table debug_table[] = { 1488static struct ctl_table debug_table[] = {
1434#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) 1489#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
1490 defined(CONFIG_S390)
1435 { 1491 {
1436 .procname = "exception-trace", 1492 .procname = "exception-trace",
1437 .data = &show_unhandled_signals, 1493 .data = &show_unhandled_signals,
@@ -2040,8 +2096,132 @@ int proc_dostring(struct ctl_table *table, int write,
2040 buffer, lenp, ppos); 2096 buffer, lenp, ppos);
2041} 2097}
2042 2098
2099static size_t proc_skip_spaces(char **buf)
2100{
2101 size_t ret;
2102 char *tmp = skip_spaces(*buf);
2103 ret = tmp - *buf;
2104 *buf = tmp;
2105 return ret;
2106}
2107
2108static void proc_skip_char(char **buf, size_t *size, const char v)
2109{
2110 while (*size) {
2111 if (**buf != v)
2112 break;
2113 (*size)--;
2114 (*buf)++;
2115 }
2116}
2117
2118#define TMPBUFLEN 22
2119/**
2120 * proc_get_long - reads an ASCII formatted integer from a user buffer
2121 *
2122 * @buf: a kernel buffer
2123 * @size: size of the kernel buffer
2124 * @val: this is where the number will be stored
2125 * @neg: set to %TRUE if number is negative
2126 * @perm_tr: a vector which contains the allowed trailers
2127 * @perm_tr_len: size of the perm_tr vector
2128 * @tr: pointer to store the trailer character
2129 *
2130 * In case of success %0 is returned and @buf and @size are updated with
2131 * the amount of bytes read. If @tr is non-NULL and a trailing
2132 * character exists (size is non-zero after returning from this
2133 * function), @tr is updated with the trailing character.
2134 */
2135static int proc_get_long(char **buf, size_t *size,
2136 unsigned long *val, bool *neg,
2137 const char *perm_tr, unsigned perm_tr_len, char *tr)
2138{
2139 int len;
2140 char *p, tmp[TMPBUFLEN];
2141
2142 if (!*size)
2143 return -EINVAL;
2144
2145 len = *size;
2146 if (len > TMPBUFLEN - 1)
2147 len = TMPBUFLEN - 1;
2148
2149 memcpy(tmp, *buf, len);
2150
2151 tmp[len] = 0;
2152 p = tmp;
2153 if (*p == '-' && *size > 1) {
2154 *neg = true;
2155 p++;
2156 } else
2157 *neg = false;
2158 if (!isdigit(*p))
2159 return -EINVAL;
2160
2161 *val = simple_strtoul(p, &p, 0);
2162
2163 len = p - tmp;
2164
 2165		/* We don't know if the next char is whitespace, thus we may accept
 2166		 * invalid integers (e.g. 1234...a) or two integers instead of one
 2167		 * (e.g. 123...1). So let's not allow such large numbers. */
2168 if (len == TMPBUFLEN - 1)
2169 return -EINVAL;
2170
2171 if (len < *size && perm_tr_len && !memchr(perm_tr, *p, perm_tr_len))
2172 return -EINVAL;
2173
2174 if (tr && (len < *size))
2175 *tr = *p;
2176
2177 *buf += len;
2178 *size -= len;
2179
2180 return 0;
2181}
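proc_get_long() is the parsing core the rewritten handlers below build on: it consumes one (optionally signed) integer, rejects over-long input, insists that any character following the digits is one of the caller-supplied trailers, and advances @buf/@size past what it consumed. A standalone user-space model of that contract, using strtoul in place of simple_strtoul; all names here are illustrative, not kernel API:

/* Toy model of proc_get_long(): parse one value, require the character
 * after the digits to be a permitted trailer, advance buf/size. */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int toy_get_long(const char **buf, size_t *size, unsigned long *val,
			bool *neg, const char *tr, size_t tr_len, char *out_tr)
{
	const char *p = *buf;
	char *end;

	if (!*size)
		return -1;
	*neg = (*p == '-' && *size > 1);
	if (*neg)
		p++;
	if (!isdigit((unsigned char)*p))
		return -1;
	*val = strtoul(p, &end, 0);
	if ((size_t)(end - *buf) < *size && tr_len && !memchr(tr, *end, tr_len))
		return -1;			/* unexpected trailing character */
	if (out_tr && (size_t)(end - *buf) < *size)
		*out_tr = *end;
	*size -= end - *buf;
	*buf = end;
	return 0;
}

int main(void)
{
	const char *s = "12-34,";
	size_t left = strlen(s);
	unsigned long v;
	bool neg;
	char tr = 0;

	while (!toy_get_long(&s, &left, &v, &neg, "-,", 2, &tr)) {
		printf("%lu (trailer '%c')\n", v, tr);
		if (left) { s++; left--; }	/* step over the trailer */
	}
	return 0;
}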
2182
2183/**
2184 * proc_put_long - converts an integer to a decimal ASCII formatted string
2185 *
2186 * @buf: the user buffer
2187 * @size: the size of the user buffer
2188 * @val: the integer to be converted
2189 * @neg: sign of the number, %TRUE for negative
2190 *
2191 * In case of success %0 is returned and @buf and @size are updated with
2192 * the amount of bytes written.
2193 */
2194static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
2195 bool neg)
2196{
2197 int len;
2198 char tmp[TMPBUFLEN], *p = tmp;
2199
2200 sprintf(p, "%s%lu", neg ? "-" : "", val);
2201 len = strlen(tmp);
2202 if (len > *size)
2203 len = *size;
2204 if (copy_to_user(*buf, tmp, len))
2205 return -EFAULT;
2206 *size -= len;
2207 *buf += len;
2208 return 0;
2209}
2210#undef TMPBUFLEN
2211
2212static int proc_put_char(void __user **buf, size_t *size, char c)
2213{
2214 if (*size) {
2215 char __user **buffer = (char __user **)buf;
2216 if (put_user(c, *buffer))
2217 return -EFAULT;
2218 (*size)--, (*buffer)++;
2219 *buf = *buffer;
2220 }
2221 return 0;
2222}
2043 2223
2044static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, 2224static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
2045 int *valp, 2225 int *valp,
2046 int write, void *data) 2226 int write, void *data)
2047{ 2227{
@@ -2050,33 +2230,31 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp,
2050 } else { 2230 } else {
2051 int val = *valp; 2231 int val = *valp;
2052 if (val < 0) { 2232 if (val < 0) {
2053 *negp = -1; 2233 *negp = true;
2054 *lvalp = (unsigned long)-val; 2234 *lvalp = (unsigned long)-val;
2055 } else { 2235 } else {
2056 *negp = 0; 2236 *negp = false;
2057 *lvalp = (unsigned long)val; 2237 *lvalp = (unsigned long)val;
2058 } 2238 }
2059 } 2239 }
2060 return 0; 2240 return 0;
2061} 2241}
2062 2242
2243static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
2244
2063static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, 2245static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2064 int write, void __user *buffer, 2246 int write, void __user *buffer,
2065 size_t *lenp, loff_t *ppos, 2247 size_t *lenp, loff_t *ppos,
2066 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2248 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2067 int write, void *data), 2249 int write, void *data),
2068 void *data) 2250 void *data)
2069{ 2251{
2070#define TMPBUFLEN 21 2252 int *i, vleft, first = 1, err = 0;
2071 int *i, vleft, first = 1, neg; 2253 unsigned long page = 0;
2072 unsigned long lval; 2254 size_t left;
2073 size_t left, len; 2255 char *kbuf;
2074 2256
2075 char buf[TMPBUFLEN], *p; 2257 if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
2076 char __user *s = buffer;
2077
2078 if (!tbl_data || !table->maxlen || !*lenp ||
2079 (*ppos && !write)) {
2080 *lenp = 0; 2258 *lenp = 0;
2081 return 0; 2259 return 0;
2082 } 2260 }
@@ -2088,89 +2266,71 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
2088 if (!conv) 2266 if (!conv)
2089 conv = do_proc_dointvec_conv; 2267 conv = do_proc_dointvec_conv;
2090 2268
2269 if (write) {
2270 if (left > PAGE_SIZE - 1)
2271 left = PAGE_SIZE - 1;
2272 page = __get_free_page(GFP_TEMPORARY);
2273 kbuf = (char *) page;
2274 if (!kbuf)
2275 return -ENOMEM;
2276 if (copy_from_user(kbuf, buffer, left)) {
2277 err = -EFAULT;
2278 goto free;
2279 }
2280 kbuf[left] = 0;
2281 }
2282
2091 for (; left && vleft--; i++, first=0) { 2283 for (; left && vleft--; i++, first=0) {
2284 unsigned long lval;
2285 bool neg;
2286
2092 if (write) { 2287 if (write) {
2093 while (left) { 2288 left -= proc_skip_spaces(&kbuf);
2094 char c; 2289
2095 if (get_user(c, s))
2096 return -EFAULT;
2097 if (!isspace(c))
2098 break;
2099 left--;
2100 s++;
2101 }
2102 if (!left) 2290 if (!left)
2103 break; 2291 break;
2104 neg = 0; 2292 err = proc_get_long(&kbuf, &left, &lval, &neg,
2105 len = left; 2293 proc_wspace_sep,
2106 if (len > sizeof(buf) - 1) 2294 sizeof(proc_wspace_sep), NULL);
2107 len = sizeof(buf) - 1; 2295 if (err)
2108 if (copy_from_user(buf, s, len))
2109 return -EFAULT;
2110 buf[len] = 0;
2111 p = buf;
2112 if (*p == '-' && left > 1) {
2113 neg = 1;
2114 p++;
2115 }
2116 if (*p < '0' || *p > '9')
2117 break; 2296 break;
2118 2297 if (conv(&neg, &lval, i, 1, data)) {
2119 lval = simple_strtoul(p, &p, 0); 2298 err = -EINVAL;
2120
2121 len = p-buf;
2122 if ((len < left) && *p && !isspace(*p))
2123 break;
2124 s += len;
2125 left -= len;
2126
2127 if (conv(&neg, &lval, i, 1, data))
2128 break; 2299 break;
2300 }
2129 } else { 2301 } else {
2130 p = buf; 2302 if (conv(&neg, &lval, i, 0, data)) {
2303 err = -EINVAL;
2304 break;
2305 }
2131 if (!first) 2306 if (!first)
2132 *p++ = '\t'; 2307 err = proc_put_char(&buffer, &left, '\t');
2133 2308 if (err)
2134 if (conv(&neg, &lval, i, 0, data)) 2309 break;
2310 err = proc_put_long(&buffer, &left, lval, neg);
2311 if (err)
2135 break; 2312 break;
2136
2137 sprintf(p, "%s%lu", neg ? "-" : "", lval);
2138 len = strlen(buf);
2139 if (len > left)
2140 len = left;
2141 if(copy_to_user(s, buf, len))
2142 return -EFAULT;
2143 left -= len;
2144 s += len;
2145 } 2313 }
2146 } 2314 }
2147 2315
2148 if (!write && !first && left) { 2316 if (!write && !first && left && !err)
2149 if(put_user('\n', s)) 2317 err = proc_put_char(&buffer, &left, '\n');
2150 return -EFAULT; 2318 if (write && !err && left)
2151 left--, s++; 2319 left -= proc_skip_spaces(&kbuf);
2152 } 2320free:
2153 if (write) { 2321 if (write) {
2154 while (left) { 2322 free_page(page);
2155 char c; 2323 if (first)
2156 if (get_user(c, s++)) 2324 return err ? : -EINVAL;
2157 return -EFAULT;
2158 if (!isspace(c))
2159 break;
2160 left--;
2161 }
2162 } 2325 }
2163 if (write && first)
2164 return -EINVAL;
2165 *lenp -= left; 2326 *lenp -= left;
2166 *ppos += *lenp; 2327 *ppos += *lenp;
2167 return 0; 2328 return err;
2168#undef TMPBUFLEN
2169} 2329}
2170 2330
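The rewrite above changes the mechanics of integer sysctls (one copy_from_user() of at most a page, then proc_get_long()/proc_put_long() over a kernel buffer) without changing the user-visible contract: writes may carry several whitespace-separated values, reads come back tab-separated with a trailing newline. A user-space sketch against kernel.printk, which holds four integers; writing it requires root:

/* Sketch: exercise a multi-value integer sysctl handled by
 * __do_proc_dointvec(). Output is tab-separated, e.g. "4\t4\t1\t7\n". */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[128];
	ssize_t n;
	int fd = open("/proc/sys/kernel/printk", O_RDWR);

	if (fd < 0)
		return 1;
	write(fd, "4 4 1 7\n", 8);	/* each field parsed by proc_get_long() */
	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}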
2171static int do_proc_dointvec(struct ctl_table *table, int write, 2331static int do_proc_dointvec(struct ctl_table *table, int write,
2172 void __user *buffer, size_t *lenp, loff_t *ppos, 2332 void __user *buffer, size_t *lenp, loff_t *ppos,
2173 int (*conv)(int *negp, unsigned long *lvalp, int *valp, 2333 int (*conv)(bool *negp, unsigned long *lvalp, int *valp,
2174 int write, void *data), 2334 int write, void *data),
2175 void *data) 2335 void *data)
2176{ 2336{
@@ -2238,8 +2398,8 @@ struct do_proc_dointvec_minmax_conv_param {
2238 int *max; 2398 int *max;
2239}; 2399};
2240 2400
2241static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, 2401static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
2242 int *valp, 2402 int *valp,
2243 int write, void *data) 2403 int write, void *data)
2244{ 2404{
2245 struct do_proc_dointvec_minmax_conv_param *param = data; 2405 struct do_proc_dointvec_minmax_conv_param *param = data;
@@ -2252,10 +2412,10 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp,
2252 } else { 2412 } else {
2253 int val = *valp; 2413 int val = *valp;
2254 if (val < 0) { 2414 if (val < 0) {
2255 *negp = -1; 2415 *negp = true;
2256 *lvalp = (unsigned long)-val; 2416 *lvalp = (unsigned long)-val;
2257 } else { 2417 } else {
2258 *negp = 0; 2418 *negp = false;
2259 *lvalp = (unsigned long)val; 2419 *lvalp = (unsigned long)val;
2260 } 2420 }
2261 } 2421 }
@@ -2295,102 +2455,78 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
2295 unsigned long convmul, 2455 unsigned long convmul,
2296 unsigned long convdiv) 2456 unsigned long convdiv)
2297{ 2457{
2298#define TMPBUFLEN 21 2458 unsigned long *i, *min, *max;
2299 unsigned long *i, *min, *max, val; 2459 int vleft, first = 1, err = 0;
2300 int vleft, first=1, neg; 2460 unsigned long page = 0;
2301 size_t len, left; 2461 size_t left;
2302 char buf[TMPBUFLEN], *p; 2462 char *kbuf;
2303 char __user *s = buffer; 2463
2304 2464 if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
2305 if (!data || !table->maxlen || !*lenp ||
2306 (*ppos && !write)) {
2307 *lenp = 0; 2465 *lenp = 0;
2308 return 0; 2466 return 0;
2309 } 2467 }
2310 2468
2311 i = (unsigned long *) data; 2469 i = (unsigned long *) data;
2312 min = (unsigned long *) table->extra1; 2470 min = (unsigned long *) table->extra1;
2313 max = (unsigned long *) table->extra2; 2471 max = (unsigned long *) table->extra2;
2314 vleft = table->maxlen / sizeof(unsigned long); 2472 vleft = table->maxlen / sizeof(unsigned long);
2315 left = *lenp; 2473 left = *lenp;
2316 2474
2475 if (write) {
2476 if (left > PAGE_SIZE - 1)
2477 left = PAGE_SIZE - 1;
2478 page = __get_free_page(GFP_TEMPORARY);
2479 kbuf = (char *) page;
2480 if (!kbuf)
2481 return -ENOMEM;
2482 if (copy_from_user(kbuf, buffer, left)) {
2483 err = -EFAULT;
2484 goto free;
2485 }
2486 kbuf[left] = 0;
2487 }
2488
2317 for (; left && vleft--; i++, min++, max++, first=0) { 2489 for (; left && vleft--; i++, min++, max++, first=0) {
2490 unsigned long val;
2491
2318 if (write) { 2492 if (write) {
2319 while (left) { 2493 bool neg;
2320 char c; 2494
2321 if (get_user(c, s)) 2495 left -= proc_skip_spaces(&kbuf);
2322 return -EFAULT; 2496
2323 if (!isspace(c)) 2497 err = proc_get_long(&kbuf, &left, &val, &neg,
2324 break; 2498 proc_wspace_sep,
2325 left--; 2499 sizeof(proc_wspace_sep), NULL);
2326 s++; 2500 if (err)
2327 }
2328 if (!left)
2329 break;
2330 neg = 0;
2331 len = left;
2332 if (len > TMPBUFLEN-1)
2333 len = TMPBUFLEN-1;
2334 if (copy_from_user(buf, s, len))
2335 return -EFAULT;
2336 buf[len] = 0;
2337 p = buf;
2338 if (*p == '-' && left > 1) {
2339 neg = 1;
2340 p++;
2341 }
2342 if (*p < '0' || *p > '9')
2343 break;
2344 val = simple_strtoul(p, &p, 0) * convmul / convdiv ;
2345 len = p-buf;
2346 if ((len < left) && *p && !isspace(*p))
2347 break; 2501 break;
2348 if (neg) 2502 if (neg)
2349 val = -val;
2350 s += len;
2351 left -= len;
2352
2353 if(neg)
2354 continue; 2503 continue;
2355 if ((min && val < *min) || (max && val > *max)) 2504 if ((min && val < *min) || (max && val > *max))
2356 continue; 2505 continue;
2357 *i = val; 2506 *i = val;
2358 } else { 2507 } else {
2359 p = buf; 2508 val = convdiv * (*i) / convmul;
2360 if (!first) 2509 if (!first)
2361 *p++ = '\t'; 2510 err = proc_put_char(&buffer, &left, '\t');
2362 sprintf(p, "%lu", convdiv * (*i) / convmul); 2511 err = proc_put_long(&buffer, &left, val, false);
2363 len = strlen(buf); 2512 if (err)
2364 if (len > left) 2513 break;
2365 len = left;
2366 if(copy_to_user(s, buf, len))
2367 return -EFAULT;
2368 left -= len;
2369 s += len;
2370 } 2514 }
2371 } 2515 }
2372 2516
2373 if (!write && !first && left) { 2517 if (!write && !first && left && !err)
2374 if(put_user('\n', s)) 2518 err = proc_put_char(&buffer, &left, '\n');
2375 return -EFAULT; 2519 if (write && !err)
2376 left--, s++; 2520 left -= proc_skip_spaces(&kbuf);
2377 } 2521free:
2378 if (write) { 2522 if (write) {
2379 while (left) { 2523 free_page(page);
2380 char c; 2524 if (first)
2381 if (get_user(c, s++)) 2525 return err ? : -EINVAL;
2382 return -EFAULT;
2383 if (!isspace(c))
2384 break;
2385 left--;
2386 }
2387 } 2526 }
2388 if (write && first)
2389 return -EINVAL;
2390 *lenp -= left; 2527 *lenp -= left;
2391 *ppos += *lenp; 2528 *ppos += *lenp;
2392 return 0; 2529 return err;
2393#undef TMPBUFLEN
2394} 2530}
2395 2531
2396static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, 2532static int do_proc_doulongvec_minmax(struct ctl_table *table, int write,
@@ -2451,7 +2587,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2451} 2587}
2452 2588
2453 2589
2454static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp, 2590static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp,
2455 int *valp, 2591 int *valp,
2456 int write, void *data) 2592 int write, void *data)
2457{ 2593{
@@ -2463,10 +2599,10 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2463 int val = *valp; 2599 int val = *valp;
2464 unsigned long lval; 2600 unsigned long lval;
2465 if (val < 0) { 2601 if (val < 0) {
2466 *negp = -1; 2602 *negp = true;
2467 lval = (unsigned long)-val; 2603 lval = (unsigned long)-val;
2468 } else { 2604 } else {
2469 *negp = 0; 2605 *negp = false;
2470 lval = (unsigned long)val; 2606 lval = (unsigned long)val;
2471 } 2607 }
2472 *lvalp = lval / HZ; 2608 *lvalp = lval / HZ;
@@ -2474,7 +2610,7 @@ static int do_proc_dointvec_jiffies_conv(int *negp, unsigned long *lvalp,
2474 return 0; 2610 return 0;
2475} 2611}
2476 2612
2477static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp, 2613static int do_proc_dointvec_userhz_jiffies_conv(bool *negp, unsigned long *lvalp,
2478 int *valp, 2614 int *valp,
2479 int write, void *data) 2615 int write, void *data)
2480{ 2616{
@@ -2486,10 +2622,10 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2486 int val = *valp; 2622 int val = *valp;
2487 unsigned long lval; 2623 unsigned long lval;
2488 if (val < 0) { 2624 if (val < 0) {
2489 *negp = -1; 2625 *negp = true;
2490 lval = (unsigned long)-val; 2626 lval = (unsigned long)-val;
2491 } else { 2627 } else {
2492 *negp = 0; 2628 *negp = false;
2493 lval = (unsigned long)val; 2629 lval = (unsigned long)val;
2494 } 2630 }
2495 *lvalp = jiffies_to_clock_t(lval); 2631 *lvalp = jiffies_to_clock_t(lval);
@@ -2497,7 +2633,7 @@ static int do_proc_dointvec_userhz_jiffies_conv(int *negp, unsigned long *lvalp,
2497 return 0; 2633 return 0;
2498} 2634}
2499 2635
2500static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, 2636static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
2501 int *valp, 2637 int *valp,
2502 int write, void *data) 2638 int write, void *data)
2503{ 2639{
@@ -2507,10 +2643,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp,
2507 int val = *valp; 2643 int val = *valp;
2508 unsigned long lval; 2644 unsigned long lval;
2509 if (val < 0) { 2645 if (val < 0) {
2510 *negp = -1; 2646 *negp = true;
2511 lval = (unsigned long)-val; 2647 lval = (unsigned long)-val;
2512 } else { 2648 } else {
2513 *negp = 0; 2649 *negp = false;
2514 lval = (unsigned long)val; 2650 lval = (unsigned long)val;
2515 } 2651 }
2516 *lvalp = jiffies_to_msecs(lval); 2652 *lvalp = jiffies_to_msecs(lval);
@@ -2607,6 +2743,157 @@ static int proc_do_cad_pid(struct ctl_table *table, int write,
2607 return 0; 2743 return 0;
2608} 2744}
2609 2745
2746/**
2747 * proc_do_large_bitmap - read/write from/to a large bitmap
2748 * @table: the sysctl table
2749 * @write: %TRUE if this is a write to the sysctl file
2750 * @buffer: the user buffer
2751 * @lenp: the size of the user buffer
2752 * @ppos: file position
2753 *
2754 * The bitmap is stored at table->data and the bitmap length (in bits)
2755 * in table->maxlen.
2756 *
2757 * We use a range comma separated format (e.g. 1,3-4,10-10) so that
2758 * large bitmaps may be represented in a compact manner. Writing into
2759 * the file will clear the bitmap then update it with the given input.
2760 *
2761 * Returns 0 on success.
2762 */
2763int proc_do_large_bitmap(struct ctl_table *table, int write,
2764 void __user *buffer, size_t *lenp, loff_t *ppos)
2765{
2766 int err = 0;
2767 bool first = 1;
2768 size_t left = *lenp;
2769 unsigned long bitmap_len = table->maxlen;
2770 unsigned long *bitmap = (unsigned long *) table->data;
2771 unsigned long *tmp_bitmap = NULL;
2772 char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
2773
2774 if (!bitmap_len || !left || (*ppos && !write)) {
2775 *lenp = 0;
2776 return 0;
2777 }
2778
2779 if (write) {
2780 unsigned long page = 0;
2781 char *kbuf;
2782
2783 if (left > PAGE_SIZE - 1)
2784 left = PAGE_SIZE - 1;
2785
2786 page = __get_free_page(GFP_TEMPORARY);
2787 kbuf = (char *) page;
2788 if (!kbuf)
2789 return -ENOMEM;
2790 if (copy_from_user(kbuf, buffer, left)) {
2791 free_page(page);
2792 return -EFAULT;
2793 }
2794 kbuf[left] = 0;
2795
2796 tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long),
2797 GFP_KERNEL);
2798 if (!tmp_bitmap) {
2799 free_page(page);
2800 return -ENOMEM;
2801 }
2802 proc_skip_char(&kbuf, &left, '\n');
2803 while (!err && left) {
2804 unsigned long val_a, val_b;
2805 bool neg;
2806
2807 err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a,
2808 sizeof(tr_a), &c);
2809 if (err)
2810 break;
2811 if (val_a >= bitmap_len || neg) {
2812 err = -EINVAL;
2813 break;
2814 }
2815
2816 val_b = val_a;
2817 if (left) {
2818 kbuf++;
2819 left--;
2820 }
2821
2822 if (c == '-') {
2823 err = proc_get_long(&kbuf, &left, &val_b,
2824 &neg, tr_b, sizeof(tr_b),
2825 &c);
2826 if (err)
2827 break;
2828 if (val_b >= bitmap_len || neg ||
2829 val_a > val_b) {
2830 err = -EINVAL;
2831 break;
2832 }
2833 if (left) {
2834 kbuf++;
2835 left--;
2836 }
2837 }
2838
2839 while (val_a <= val_b)
2840 set_bit(val_a++, tmp_bitmap);
2841
2842 first = 0;
2843 proc_skip_char(&kbuf, &left, '\n');
2844 }
2845 free_page(page);
2846 } else {
2847 unsigned long bit_a, bit_b = 0;
2848
2849 while (left) {
2850 bit_a = find_next_bit(bitmap, bitmap_len, bit_b);
2851 if (bit_a >= bitmap_len)
2852 break;
2853 bit_b = find_next_zero_bit(bitmap, bitmap_len,
2854 bit_a + 1) - 1;
2855
2856 if (!first) {
2857 err = proc_put_char(&buffer, &left, ',');
2858 if (err)
2859 break;
2860 }
2861 err = proc_put_long(&buffer, &left, bit_a, false);
2862 if (err)
2863 break;
2864 if (bit_a != bit_b) {
2865 err = proc_put_char(&buffer, &left, '-');
2866 if (err)
2867 break;
2868 err = proc_put_long(&buffer, &left, bit_b, false);
2869 if (err)
2870 break;
2871 }
2872
2873 first = 0; bit_b++;
2874 }
2875 if (!err)
2876 err = proc_put_char(&buffer, &left, '\n');
2877 }
2878
2879 if (!err) {
2880 if (write) {
2881 if (*ppos)
2882 bitmap_or(bitmap, bitmap, tmp_bitmap, bitmap_len);
2883 else
2884 memcpy(bitmap, tmp_bitmap,
2885 BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long));
2886 }
2887 kfree(tmp_bitmap);
2888 *lenp -= left;
2889 *ppos += *lenp;
2890 return 0;
2891 } else {
2892 kfree(tmp_bitmap);
2893 return err;
2894 }
2895}
2896
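proc_do_large_bitmap() accepts a comma-separated list of values and ranges, builds the result in tmp_bitmap, and only commits it to the live bitmap when the whole input parsed cleanly (ORing it in on a continued write, i.e. when *ppos is non-zero). A hedged user-space sketch of the write syntax; the sysctl path is a placeholder, since which files are wired to this handler lies outside this hunk:

/* Sketch of the range syntax understood by proc_do_large_bitmap().
 * The path is hypothetical - substitute any sysctl using this handler. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/example/bitmap_knob", "w");	/* hypothetical */

	if (!f)
		return 1;
	/* Sets bits 1, 3, 4 and 100..199. Malformed input leaves the existing
	 * bitmap untouched, because the commit happens only on full success. */
	fputs("1,3-4,100-199\n", f);
	return fclose(f) ? 1 : 0;
}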
2610#else /* CONFIG_PROC_FS */ 2897#else /* CONFIG_PROC_FS */
2611 2898
2612int proc_dostring(struct ctl_table *table, int write, 2899int proc_dostring(struct ctl_table *table, int write,
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 8cd50d8f9bde..1357c5786064 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -13,6 +13,8 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/ctype.h> 14#include <linux/ctype.h>
15#include <linux/netdevice.h> 15#include <linux/netdevice.h>
16#include <linux/kernel.h>
17#include <linux/slab.h>
16 18
17#ifdef CONFIG_SYSCTL_SYSCALL 19#ifdef CONFIG_SYSCTL_SYSCALL
18 20
@@ -223,7 +225,6 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
223 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" }, 225 { CTL_INT, NET_IPV4_ROUTE_MTU_EXPIRES, "mtu_expires" },
224 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" }, 226 { CTL_INT, NET_IPV4_ROUTE_MIN_PMTU, "min_pmtu" },
225 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" }, 227 { CTL_INT, NET_IPV4_ROUTE_MIN_ADVMSS, "min_adv_mss" },
226 { CTL_INT, NET_IPV4_ROUTE_SECRET_INTERVAL, "secret_interval" },
227 {} 228 {}
228}; 229};
229 230
@@ -1124,11 +1125,6 @@ out:
1124 return result; 1125 return result;
1125} 1126}
1126 1127
1127static unsigned hex_value(int ch)
1128{
1129 return isdigit(ch) ? ch - '0' : ((ch | 0x20) - 'a') + 10;
1130}
1131
1132static ssize_t bin_uuid(struct file *file, 1128static ssize_t bin_uuid(struct file *file,
1133 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen) 1129 void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
1134{ 1130{
@@ -1156,7 +1152,8 @@ static ssize_t bin_uuid(struct file *file,
1156 if (!isxdigit(str[0]) || !isxdigit(str[1])) 1152 if (!isxdigit(str[0]) || !isxdigit(str[1]))
1157 goto out; 1153 goto out;
1158 1154
1159 uuid[i] = (hex_value(str[0]) << 4) | hex_value(str[1]); 1155 uuid[i] = (hex_to_bin(str[0]) << 4) |
1156 hex_to_bin(str[1]);
1160 str += 2; 1157 str += 2;
1161 if (*str == '-') 1158 if (*str == '-')
1162 str++; 1159 str++;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 899ca51be5e8..11281d5792bd 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -22,6 +22,7 @@
22#include <linux/delayacct.h> 22#include <linux/delayacct.h>
23#include <linux/cpumask.h> 23#include <linux/cpumask.h>
24#include <linux/percpu.h> 24#include <linux/percpu.h>
25#include <linux/slab.h>
25#include <linux/cgroupstats.h> 26#include <linux/cgroupstats.h>
26#include <linux/cgroup.h> 27#include <linux/cgroup.h>
27#include <linux/fs.h> 28#include <linux/fs.h>
diff --git a/kernel/time.c b/kernel/time.c
index 804798005d19..848b1c2ab09a 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -35,7 +35,6 @@
35#include <linux/syscalls.h> 35#include <linux/syscalls.h>
36#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/fs.h> 37#include <linux/fs.h>
38#include <linux/slab.h>
39#include <linux/math64.h> 38#include <linux/math64.h>
40#include <linux/ptrace.h> 39#include <linux/ptrace.h>
41 40
@@ -133,12 +132,11 @@ SYSCALL_DEFINE2(gettimeofday, struct timeval __user *, tv,
133 */ 132 */
134static inline void warp_clock(void) 133static inline void warp_clock(void)
135{ 134{
136 write_seqlock_irq(&xtime_lock); 135 struct timespec adjust;
137 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 136
138 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 137 adjust = current_kernel_time();
139 update_xtime_cache(0); 138 adjust.tv_sec += sys_tz.tz_minuteswest * 60;
140 write_sequnlock_irq(&xtime_lock); 139 do_settimeofday(&adjust);
141 clock_was_set();
142} 140}
143 141
144/* 142/*
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 1f5dde637457..f08e99c1d561 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -625,6 +625,54 @@ static void clocksource_enqueue(struct clocksource *cs)
625 list_add(&cs->list, entry); 625 list_add(&cs->list, entry);
626} 626}
627 627
628
629/*
630 * Maximum time we expect to go between ticks. This includes idle
631 * tickless time. It provides the trade off between selecting a
632 * mult/shift pair that is very precise but can only handle a short
633 * period of time, vs. a mult/shift pair that can handle long periods
634 * of time but isn't as precise.
635 *
636 * This is a subsystem constant, and actual hardware limitations
637 * may override it (ie: clocksources that wrap every 3 seconds).
638 */
639#define MAX_UPDATE_LENGTH 5 /* Seconds */
640
641/**
642 * __clocksource_register_scale - Used to install new clocksources
643 * @t: clocksource to be registered
644 * @scale: Scale factor multiplied against freq to get clocksource hz
645 * @freq: clocksource frequency (cycles per second) divided by scale
646 *
647 * Returns -EBUSY if registration fails, zero otherwise.
648 *
649 * This *SHOULD NOT* be called directly! Please use the
650 * clocksource_register_hz() or clocksource_register_khz helper functions.
651 */
652int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
653{
654
655 /*
656 * Ideally we want to use some of the limits used in
657 * clocksource_max_deferment, to provide a more informed
658 * MAX_UPDATE_LENGTH. But for now this just gets the
659 * register interface working properly.
660 */
661 clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
662 NSEC_PER_SEC/scale,
663 MAX_UPDATE_LENGTH*scale);
664 cs->max_idle_ns = clocksource_max_deferment(cs);
665
666 mutex_lock(&clocksource_mutex);
667 clocksource_enqueue(cs);
668 clocksource_select();
669 clocksource_enqueue_watchdog(cs);
670 mutex_unlock(&clocksource_mutex);
671 return 0;
672}
673EXPORT_SYMBOL_GPL(__clocksource_register_scale);
674
675
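__clocksource_register_scale() moves mult/shift selection into the core: given a frequency, clocks_calc_mult_shift() sizes the pair for roughly MAX_UPDATE_LENGTH seconds between updates, and max_idle_ns is derived from it. A hedged driver-side sketch using the clocksource_register_hz() wrapper the comment refers to; the MMIO counter, its base address, and the 24 MHz rate are assumptions for illustration, not part of this patch:

/* Sketch: register a free-running 32-bit counter by frequency instead of
 * hand-picked mult/shift values. my_timer_base/COUNTER_OFFSET and the
 * 24 MHz rate are hypothetical. */
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/io.h>

#define COUNTER_OFFSET	0x04
static void __iomem *my_timer_base;	/* assumed ioremap()ed elsewhere */

static cycle_t my_timer_read(struct clocksource *cs)
{
	return readl(my_timer_base + COUNTER_OFFSET);
}

static struct clocksource my_clocksource = {
	.name	= "my-timer",
	.rating	= 200,
	.read	= my_timer_read,
	.mask	= CLOCKSOURCE_MASK(32),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};

static int __init my_timer_clocksource_init(void)
{
	/* mult, shift and max_idle_ns are computed from the 24 MHz rate */
	return clocksource_register_hz(&my_clocksource, 24000000);
}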
628/** 676/**
629 * clocksource_register - Used to install new clocksources 677 * clocksource_register - Used to install new clocksources
630 * @t: clocksource to be registered 678 * @t: clocksource to be registered
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7c0f180d6e9d..c63116863a80 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -69,7 +69,7 @@ static s64 time_freq;
69/* time at last adjustment (secs): */ 69/* time at last adjustment (secs): */
70static long time_reftime; 70static long time_reftime;
71 71
72long time_adjust; 72static long time_adjust;
73 73
74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */ 74/* constant (boot-param configurable) NTP tick adjustment (upscaled) */
75static s64 ntp_tick_adj; 75static s64 ntp_tick_adj;
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..48b2761b5668 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -188,7 +188,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
188 /* 188 /*
189 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
190 * periodic mode. We read dev->next_event first and add to it 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event() 191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really 192 * sets dev->next_event only when the event is really
193 * programmed to the device. 193 * programmed to the device.
194 */ 194 */
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 0a8a213016f0..aada0e52680a 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -22,6 +22,29 @@
22 22
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/* Limit min_delta to a jiffie */
26#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ)
27
28static int tick_increase_min_delta(struct clock_event_device *dev)
29{
30 /* Nothing to do if we already reached the limit */
31 if (dev->min_delta_ns >= MIN_DELTA_LIMIT)
32 return -ETIME;
33
34 if (dev->min_delta_ns < 5000)
35 dev->min_delta_ns = 5000;
36 else
37 dev->min_delta_ns += dev->min_delta_ns >> 1;
38
39 if (dev->min_delta_ns > MIN_DELTA_LIMIT)
40 dev->min_delta_ns = MIN_DELTA_LIMIT;
41
42 printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n",
43 dev->name ? dev->name : "?",
44 (unsigned long long) dev->min_delta_ns);
45 return 0;
46}
47
25/** 48/**
26 * tick_program_event internal worker function 49 * tick_program_event internal worker function
27 */ 50 */
@@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
37 if (!ret || !force) 60 if (!ret || !force)
38 return ret; 61 return ret;
39 62
63 dev->retries++;
40 /* 64 /*
41 * We tried 2 times to program the device with the given 65 * We tried 3 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it 66 * min_delta_ns. If that's not working then we increase it
43 * and emit a warning. 67 * and emit a warning.
44 */ 68 */
45 if (++i > 2) { 69 if (++i > 2) {
46 /* Increase the min. delta and try again */ 70 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns) 71 if (tick_increase_min_delta(dev)) {
48 dev->min_delta_ns = 5000; 72 /*
49 else 73 * Get out of the loop if min_delta_ns
50 dev->min_delta_ns += dev->min_delta_ns >> 1; 74 * hit the limit already. That's
51 75 * better than staying here forever.
52 printk(KERN_WARNING 76 *
53 "CE: %s increasing min_delta_ns to %llu nsec\n", 77 * We clear next_event so we have a
54 dev->name ? dev->name : "?", 78 * chance that the box survives.
55 (unsigned long long) dev->min_delta_ns << 1); 79 */
56 80 printk(KERN_WARNING
81 "CE: Reprogramming failure. Giving up\n");
82 dev->next_event.tv64 = KTIME_MAX;
83 return -ETIME;
84 }
57 i = 0; 85 i = 0;
58 } 86 }
59 87
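tick_increase_min_delta() replaces the old unbounded doubling: min_delta_ns starts at 5 us, grows by 50% per failed reprogram, and the reprogramming loop now gives up once the value has been capped at one jiffy (MIN_DELTA_LIMIT) instead of retrying forever. A standalone sketch of the resulting sequence; HZ=250 is an assumption for the example, making the cap 4,000,000 ns:

/* Sketch: evolution of min_delta_ns under repeated programming failures,
 * mirroring tick_increase_min_delta(). HZ=250 is assumed. */
#include <stdio.h>

#define NSEC_PER_SEC	1000000000ULL
#define HZ		250
#define MIN_DELTA_LIMIT	(NSEC_PER_SEC / HZ)

int main(void)
{
	unsigned long long min_delta_ns = 0;

	while (min_delta_ns < MIN_DELTA_LIMIT) {
		if (min_delta_ns < 5000)
			min_delta_ns = 5000;
		else
			min_delta_ns += min_delta_ns >> 1;
		if (min_delta_ns > MIN_DELTA_LIMIT)
			min_delta_ns = MIN_DELTA_LIMIT;
		printf("%llu\n", min_delta_ns);	/* 5000, 7500, 11250, ..., 4000000 */
	}
	return 0;
}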
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f992762d7f51..813993b5fb61 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -150,35 +150,65 @@ static void tick_nohz_update_jiffies(ktime_t now)
150 touch_softlockup_watchdog(); 150 touch_softlockup_watchdog();
151} 151}
152 152
153/*
154 * Updates the per cpu time idle statistics counters
155 */
156static void
157update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time)
158{
159 ktime_t delta;
160
161 if (ts->idle_active) {
162 delta = ktime_sub(now, ts->idle_entrytime);
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 if (nr_iowait_cpu(cpu) > 0)
165 ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta);
166 ts->idle_entrytime = now;
167 }
168
169 if (last_update_time)
170 *last_update_time = ktime_to_us(now);
171
172}
173
153static void tick_nohz_stop_idle(int cpu, ktime_t now) 174static void tick_nohz_stop_idle(int cpu, ktime_t now)
154{ 175{
155 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 176 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
156 ktime_t delta;
157 177
158 delta = ktime_sub(now, ts->idle_entrytime); 178 update_ts_time_stats(cpu, ts, now, NULL);
159 ts->idle_lastupdate = now;
160 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
161 ts->idle_active = 0; 179 ts->idle_active = 0;
162 180
163 sched_clock_idle_wakeup_event(0); 181 sched_clock_idle_wakeup_event(0);
164} 182}
165 183
166static ktime_t tick_nohz_start_idle(struct tick_sched *ts) 184static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts)
167{ 185{
168 ktime_t now, delta; 186 ktime_t now;
169 187
170 now = ktime_get(); 188 now = ktime_get();
171 if (ts->idle_active) { 189
172 delta = ktime_sub(now, ts->idle_entrytime); 190 update_ts_time_stats(cpu, ts, now, NULL);
173 ts->idle_lastupdate = now; 191
174 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
175 }
176 ts->idle_entrytime = now; 192 ts->idle_entrytime = now;
177 ts->idle_active = 1; 193 ts->idle_active = 1;
178 sched_clock_idle_sleep_event(); 194 sched_clock_idle_sleep_event();
179 return now; 195 return now;
180} 196}
181 197
198/**
199 * get_cpu_idle_time_us - get the total idle time of a cpu
200 * @cpu: CPU number to query
201 * @last_update_time: variable to store update time in
202 *
 203 * Return the cumulative idle time (since boot) for a given
204 * CPU, in microseconds. The idle time returned includes
205 * the iowait time (unlike what "top" and co report).
206 *
207 * This time is measured via accounting rather than sampling,
208 * and is as accurate as ktime_get() is.
209 *
210 * This function returns -1 if NOHZ is not enabled.
211 */
182u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) 212u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
183{ 213{
184 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 214 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -186,15 +216,38 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
186 if (!tick_nohz_enabled) 216 if (!tick_nohz_enabled)
187 return -1; 217 return -1;
188 218
189 if (ts->idle_active) 219 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
190 *last_update_time = ktime_to_us(ts->idle_lastupdate);
191 else
192 *last_update_time = ktime_to_us(ktime_get());
193 220
194 return ktime_to_us(ts->idle_sleeptime); 221 return ktime_to_us(ts->idle_sleeptime);
195} 222}
196EXPORT_SYMBOL_GPL(get_cpu_idle_time_us); 223EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
197 224
225/*
226 * get_cpu_iowait_time_us - get the total iowait time of a cpu
227 * @cpu: CPU number to query
228 * @last_update_time: variable to store update time in
229 *
 230 * Return the cumulative iowait time (since boot) for a given
231 * CPU, in microseconds.
232 *
233 * This time is measured via accounting rather than sampling,
234 * and is as accurate as ktime_get() is.
235 *
236 * This function returns -1 if NOHZ is not enabled.
237 */
238u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
239{
240 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
241
242 if (!tick_nohz_enabled)
243 return -1;
244
245 update_ts_time_stats(cpu, ts, ktime_get(), last_update_time);
246
247 return ktime_to_us(ts->iowait_sleeptime);
248}
249EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
250
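get_cpu_idle_time_us() and the new get_cpu_iowait_time_us() return cumulative per-CPU counters together with the timestamp of the last update, so callers derive utilization by differencing two samples rather than sampling the run state. A hedged kernel-side sketch of that pattern, in the spirit of how cpufreq-style governors consume these counters; the helper name and the per-mille scaling are illustrative, not code from this patch:

/* Sketch: busy time (per mille) of a CPU between two samples.
 * Note the idle counter includes iowait, as the comment above states. */
#include <linux/tick.h>

static unsigned int busy_permille(int cpu, u64 *prev_wall, u64 *prev_idle)
{
	u64 wall, idle, dwall, didle;

	idle = get_cpu_idle_time_us(cpu, &wall);	/* wall = last update, in us */
	dwall = wall - *prev_wall;
	didle = idle - *prev_idle;
	*prev_wall = wall;
	*prev_idle = idle;

	if (!dwall || didle > dwall)
		return 0;
	return (unsigned int)(((dwall - didle) * 1000) / dwall);
}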
198/** 251/**
199 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 252 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
200 * 253 *
@@ -231,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle)
231 */ 284 */
232 ts->inidle = 1; 285 ts->inidle = 1;
233 286
234 now = tick_nohz_start_idle(ts); 287 now = tick_nohz_start_idle(cpu, ts);
235 288
236 /* 289 /*
237 * If this cpu is offline and it is the one which updates 290 * If this cpu is offline and it is the one which updates
@@ -272,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
272 } while (read_seqretry(&xtime_lock, seq)); 325 } while (read_seqretry(&xtime_lock, seq));
273 326
274 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || 327 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
275 arch_needs_cpu(cpu)) { 328 arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
276 next_jiffies = last_jiffies + 1; 329 next_jiffies = last_jiffies + 1;
277 delta_jiffies = 1; 330 delta_jiffies = 1;
278 } else { 331 } else {
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
index 12f5c55090be..ac38fbb176cc 100644
--- a/kernel/time/timecompare.c
+++ b/kernel/time/timecompare.c
@@ -19,6 +19,7 @@
19 19
20#include <linux/timecompare.h> 20#include <linux/timecompare.h>
21#include <linux/module.h> 21#include <linux/module.h>
22#include <linux/slab.h>
22#include <linux/math64.h> 23#include <linux/math64.h>
23 24
24/* 25/*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 16736379a9ca..caf8d4d4f5c8 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -165,13 +165,6 @@ struct timespec raw_time;
165/* flag for if timekeeping is suspended */ 165/* flag for if timekeeping is suspended */
166int __read_mostly timekeeping_suspended; 166int __read_mostly timekeeping_suspended;
167 167
168static struct timespec xtime_cache __attribute__ ((aligned (16)));
169void update_xtime_cache(u64 nsec)
170{
171 xtime_cache = xtime;
172 timespec_add_ns(&xtime_cache, nsec);
173}
174
175/* must hold xtime_lock */ 168/* must hold xtime_lock */
176void timekeeping_leap_insert(int leapsecond) 169void timekeeping_leap_insert(int leapsecond)
177{ 170{
@@ -332,8 +325,6 @@ int do_settimeofday(struct timespec *tv)
332 325
333 xtime = *tv; 326 xtime = *tv;
334 327
335 update_xtime_cache(0);
336
337 timekeeper.ntp_error = 0; 328 timekeeper.ntp_error = 0;
338 ntp_clear(); 329 ntp_clear();
339 330
@@ -559,7 +550,6 @@ void __init timekeeping_init(void)
559 } 550 }
560 set_normalized_timespec(&wall_to_monotonic, 551 set_normalized_timespec(&wall_to_monotonic,
561 -boot.tv_sec, -boot.tv_nsec); 552 -boot.tv_sec, -boot.tv_nsec);
562 update_xtime_cache(0);
563 total_sleep_time.tv_sec = 0; 553 total_sleep_time.tv_sec = 0;
564 total_sleep_time.tv_nsec = 0; 554 total_sleep_time.tv_nsec = 0;
565 write_sequnlock_irqrestore(&xtime_lock, flags); 555 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -593,7 +583,6 @@ static int timekeeping_resume(struct sys_device *dev)
593 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts); 583 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
594 total_sleep_time = timespec_add_safe(total_sleep_time, ts); 584 total_sleep_time = timespec_add_safe(total_sleep_time, ts);
595 } 585 }
596 update_xtime_cache(0);
597 /* re-base the last cycle value */ 586 /* re-base the last cycle value */
598 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock); 587 timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
599 timekeeper.ntp_error = 0; 588 timekeeper.ntp_error = 0;
@@ -788,7 +777,6 @@ void update_wall_time(void)
788{ 777{
789 struct clocksource *clock; 778 struct clocksource *clock;
790 cycle_t offset; 779 cycle_t offset;
791 u64 nsecs;
792 int shift = 0, maxshift; 780 int shift = 0, maxshift;
793 781
794 /* Make sure we're fully resumed: */ 782 /* Make sure we're fully resumed: */
@@ -818,7 +806,8 @@ void update_wall_time(void)
818 shift = min(shift, maxshift); 806 shift = min(shift, maxshift);
819 while (offset >= timekeeper.cycle_interval) { 807 while (offset >= timekeeper.cycle_interval) {
820 offset = logarithmic_accumulation(offset, shift); 808 offset = logarithmic_accumulation(offset, shift);
821 shift--; 809 if(offset < timekeeper.cycle_interval<<shift)
810 shift--;
822 } 811 }
823 812
824 /* correct the clock when NTP error is too big */ 813 /* correct the clock when NTP error is too big */
@@ -846,7 +835,9 @@ void update_wall_time(void)
846 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift; 835 timekeeper.ntp_error += neg << timekeeper.ntp_error_shift;
847 } 836 }
848 837
849 /* store full nanoseconds into xtime after rounding it up and 838
839 /*
840 * Store full nanoseconds into xtime after rounding it up and
850 * add the remainder to the error difference. 841 * add the remainder to the error difference.
851 */ 842 */
852 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1; 843 xtime.tv_nsec = ((s64) timekeeper.xtime_nsec >> timekeeper.shift) + 1;
@@ -854,8 +845,15 @@ void update_wall_time(void)
854 timekeeper.ntp_error += timekeeper.xtime_nsec << 845 timekeeper.ntp_error += timekeeper.xtime_nsec <<
855 timekeeper.ntp_error_shift; 846 timekeeper.ntp_error_shift;
856 847
857 nsecs = clocksource_cyc2ns(offset, timekeeper.mult, timekeeper.shift); 848 /*
858 update_xtime_cache(nsecs); 849 * Finally, make sure that after the rounding
 850 * xtime.tv_nsec isn't larger than NSEC_PER_SEC
851 */
852 if (unlikely(xtime.tv_nsec >= NSEC_PER_SEC)) {
853 xtime.tv_nsec -= NSEC_PER_SEC;
854 xtime.tv_sec++;
855 second_overflow();
856 }
859 857
860 /* check to see if there is a new clocksource to use */ 858 /* check to see if there is a new clocksource to use */
861 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult); 859 update_vsyscall(&xtime, timekeeper.clock, timekeeper.mult);
@@ -895,13 +893,13 @@ EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
895 893
896unsigned long get_seconds(void) 894unsigned long get_seconds(void)
897{ 895{
898 return xtime_cache.tv_sec; 896 return xtime.tv_sec;
899} 897}
900EXPORT_SYMBOL(get_seconds); 898EXPORT_SYMBOL(get_seconds);
901 899
902struct timespec __current_kernel_time(void) 900struct timespec __current_kernel_time(void)
903{ 901{
904 return xtime_cache; 902 return xtime;
905} 903}
906 904
907struct timespec current_kernel_time(void) 905struct timespec current_kernel_time(void)
@@ -912,7 +910,7 @@ struct timespec current_kernel_time(void)
912 do { 910 do {
913 seq = read_seqbegin(&xtime_lock); 911 seq = read_seqbegin(&xtime_lock);
914 912
915 now = xtime_cache; 913 now = xtime;
916 } while (read_seqretry(&xtime_lock, seq)); 914 } while (read_seqretry(&xtime_lock, seq));
917 915
918 return now; 916 return now;
@@ -927,7 +925,7 @@ struct timespec get_monotonic_coarse(void)
927 do { 925 do {
928 seq = read_seqbegin(&xtime_lock); 926 seq = read_seqbegin(&xtime_lock);
929 927
930 now = xtime_cache; 928 now = xtime;
931 mono = wall_to_monotonic; 929 mono = wall_to_monotonic;
932 } while (read_seqretry(&xtime_lock, seq)); 930 } while (read_seqretry(&xtime_lock, seq));
933 931
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdfb8dd1050c..ab8f5e33fa92 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -176,6 +176,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
176 P_ns(idle_waketime); 176 P_ns(idle_waketime);
177 P_ns(idle_exittime); 177 P_ns(idle_exittime);
178 P_ns(idle_sleeptime); 178 P_ns(idle_sleeptime);
179 P_ns(iowait_sleeptime);
179 P(last_jiffies); 180 P(last_jiffies);
180 P(next_jiffies); 181 P(next_jiffies);
181 P_ns(idle_expires); 182 P_ns(idle_expires);
@@ -228,6 +229,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
228 SEQ_printf(m, " event_handler: "); 229 SEQ_printf(m, " event_handler: ");
229 print_name_offset(m, dev->event_handler); 230 print_name_offset(m, dev->event_handler);
230 SEQ_printf(m, "\n"); 231 SEQ_printf(m, "\n");
232 SEQ_printf(m, " retries: %lu\n", dev->retries);
231} 233}
232 234
233static void timer_list_show_tickdevices(struct seq_file *m) 235static void timer_list_show_tickdevices(struct seq_file *m)
@@ -257,7 +259,7 @@ static int timer_list_show(struct seq_file *m, void *v)
257 u64 now = ktime_to_ns(ktime_get()); 259 u64 now = ktime_to_ns(ktime_get());
258 int cpu; 260 int cpu;
259 261
260 SEQ_printf(m, "Timer List Version: v0.5\n"); 262 SEQ_printf(m, "Timer List Version: v0.6\n");
261 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 263 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
262 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 264 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
263 265
diff --git a/kernel/timer.c b/kernel/timer.c
index c61a7949387f..efde11e197c4 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -39,6 +39,7 @@
39#include <linux/kallsyms.h> 39#include <linux/kallsyms.h>
40#include <linux/perf_event.h> 40#include <linux/perf_event.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/slab.h>
42 43
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/unistd.h> 45#include <asm/unistd.h>
@@ -318,6 +319,24 @@ unsigned long round_jiffies_up_relative(unsigned long j)
318} 319}
319EXPORT_SYMBOL_GPL(round_jiffies_up_relative); 320EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
320 321
322/**
323 * set_timer_slack - set the allowed slack for a timer
324 * @slack_hz: the amount of time (in jiffies) allowed for rounding
325 *
326 * Set the amount of time, in jiffies, that a certain timer has
327 * in terms of slack. By setting this value, the timer subsystem
328 * will schedule the actual timer somewhere between
329 * the time mod_timer() asks for, and that time plus the slack.
330 *
331 * By setting the slack to -1, a percentage of the delay is used
332 * instead.
333 */
334void set_timer_slack(struct timer_list *timer, int slack_hz)
335{
336 timer->slack = slack_hz;
337}
338EXPORT_SYMBOL_GPL(set_timer_slack);
339
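set_timer_slack() lets a driver state how late a timer may reasonably fire so the timer wheel can coalesce wakeups; with the default slack of -1, apply_slack() (added further down in this file) picks roughly 0.4% of the remaining delay automatically. A hedged driver-side sketch; my_timer and the 100 ms figure are illustrative, not part of this patch:

/* Sketch: a housekeeping timer that tolerates 100 ms of deferral. */
#include <linux/jiffies.h>
#include <linux/timer.h>

static struct timer_list my_timer;

static void my_timer_fn(unsigned long data)
{
	/* ... periodic housekeeping ... */
	mod_timer(&my_timer, jiffies + HZ);		/* re-arm in ~1 s */
}

static void my_timer_start(void)
{
	setup_timer(&my_timer, my_timer_fn, 0);
	set_timer_slack(&my_timer, msecs_to_jiffies(100));
	mod_timer(&my_timer, jiffies + HZ);
}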
321 340
322static inline void set_running_timer(struct tvec_base *base, 341static inline void set_running_timer(struct tvec_base *base,
323 struct timer_list *timer) 342 struct timer_list *timer)
@@ -549,6 +568,7 @@ static void __init_timer(struct timer_list *timer,
549{ 568{
550 timer->entry.next = NULL; 569 timer->entry.next = NULL;
551 timer->base = __raw_get_cpu_var(tvec_bases); 570 timer->base = __raw_get_cpu_var(tvec_bases);
571 timer->slack = -1;
552#ifdef CONFIG_TIMER_STATS 572#ifdef CONFIG_TIMER_STATS
553 timer->start_site = NULL; 573 timer->start_site = NULL;
554 timer->start_pid = -1; 574 timer->start_pid = -1;
@@ -557,6 +577,19 @@ static void __init_timer(struct timer_list *timer,
557 lockdep_init_map(&timer->lockdep_map, name, key, 0); 577 lockdep_init_map(&timer->lockdep_map, name, key, 0);
558} 578}
559 579
580void setup_deferrable_timer_on_stack_key(struct timer_list *timer,
581 const char *name,
582 struct lock_class_key *key,
583 void (*function)(unsigned long),
584 unsigned long data)
585{
586 timer->function = function;
587 timer->data = data;
588 init_timer_on_stack_key(timer, name, key);
589 timer_set_deferrable(timer);
590}
591EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key);
592
560/** 593/**
561 * init_timer_key - initialize a timer 594 * init_timer_key - initialize a timer
562 * @timer: the timer to be initialized 595 * @timer: the timer to be initialized
@@ -714,6 +747,46 @@ int mod_timer_pending(struct timer_list *timer, unsigned long expires)
714} 747}
715EXPORT_SYMBOL(mod_timer_pending); 748EXPORT_SYMBOL(mod_timer_pending);
716 749
750/*
751 * Decide where to put the timer while taking the slack into account
752 *
753 * Algorithm:
754 * 1) calculate the maximum (absolute) time
755 * 2) calculate the highest bit where the expires and new max are different
756 * 3) use this bit to make a mask
757 * 4) use the bitmask to round down the maximum time, so that all last
758 * bits are zeros
759 */
760static inline
761unsigned long apply_slack(struct timer_list *timer, unsigned long expires)
762{
763 unsigned long expires_limit, mask;
764 int bit;
765
766 expires_limit = expires;
767
768 if (timer->slack >= 0) {
769 expires_limit = expires + timer->slack;
770 } else {
771 unsigned long now = jiffies;
772
773 /* No slack, if already expired else auto slack 0.4% */
774 if (time_after(expires, now))
775 expires_limit = expires + (expires - now)/256;
776 }
777 mask = expires ^ expires_limit;
778 if (mask == 0)
779 return expires;
780
781 bit = find_last_bit(&mask, BITS_PER_LONG);
782
783 mask = (1 << bit) - 1;
784
785 expires_limit = expires_limit & ~(mask);
786
787 return expires_limit;
788}
789
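apply_slack() widens the requested expiry into the window [expires, expires + slack] and then clears the bits below the highest bit in which the two ends differ, so timers whose windows overlap tend to round to the same jiffy and fire together. A standalone worked example with the default slack of -1 and an expiry 1000 jiffies out; the concrete jiffies values are made up for illustration:

/* Sketch: the rounding apply_slack() performs for expires = now + 1000
 * with automatic slack (delta/256, here 3 jiffies). */
#include <stdio.h>

int main(void)
{
	unsigned long now = 100000, expires = now + 1000;
	unsigned long limit = expires + (expires - now) / 256;
	unsigned long mask = expires ^ limit;
	int bit;

	/* index of the highest bit where expires and the slack limit differ */
	for (bit = 0; mask >> (bit + 1); bit++)
		;
	mask = (1UL << bit) - 1;
	printf("expires=%lu limit=%lu rounded=%lu\n",
	       expires, limit, limit & ~mask);	/* 101000 101003 101002 */
	return 0;
}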
717/** 790/**
718 * mod_timer - modify a timer's timeout 791 * mod_timer - modify a timer's timeout
719 * @timer: the timer to be modified 792 * @timer: the timer to be modified
@@ -744,6 +817,8 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
744 if (timer_pending(timer) && timer->expires == expires) 817 if (timer_pending(timer) && timer->expires == expires)
745 return 1; 818 return 1;
746 819
820 expires = apply_slack(timer, expires);
821
747 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED); 822 return __mod_timer(timer, expires, false, TIMER_NOT_PINNED);
748} 823}
749EXPORT_SYMBOL(mod_timer); 824EXPORT_SYMBOL(mod_timer);
@@ -880,6 +955,7 @@ int try_to_del_timer_sync(struct timer_list *timer)
880 if (base->running_timer == timer) 955 if (base->running_timer == timer)
881 goto out; 956 goto out;
882 957
958 timer_stats_timer_clear_start_info(timer);
883 ret = 0; 959 ret = 0;
884 if (timer_pending(timer)) { 960 if (timer_pending(timer)) {
885 detach_timer(timer, 1); 961 detach_timer(timer, 1);
@@ -953,6 +1029,47 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
953 return index; 1029 return index;
954} 1030}
955 1031
1032static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1033 unsigned long data)
1034{
1035 int preempt_count = preempt_count();
1036
1037#ifdef CONFIG_LOCKDEP
1038 /*
1039 * It is permissible to free the timer from inside the
1040 * function that is called from it, this we need to take into
1041 * account for lockdep too. To avoid bogus "held lock freed"
1042 * warnings as well as problems when looking into
1043 * timer->lockdep_map, make a copy and use that here.
1044 */
1045 struct lockdep_map lockdep_map = timer->lockdep_map;
1046#endif
1047 /*
1048 * Couple the lock chain with the lock chain at
1049 * del_timer_sync() by acquiring the lock_map around the fn()
1050 * call here and in del_timer_sync().
1051 */
1052 lock_map_acquire(&lockdep_map);
1053
1054 trace_timer_expire_entry(timer);
1055 fn(data);
1056 trace_timer_expire_exit(timer);
1057
1058 lock_map_release(&lockdep_map);
1059
1060 if (preempt_count != preempt_count()) {
1061 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1062 fn, preempt_count, preempt_count());
1063 /*
1064 * Restore the preempt count. That gives us a decent
1065 * chance to survive and extract information. If the
1066 * callback kept a lock held, bad luck, but not worse
1067 * than the BUG() we had.
1068 */
1069 preempt_count() = preempt_count;
1070 }
1071}
1072
956#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK) 1073#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
957 1074
958/** 1075/**
@@ -996,45 +1113,7 @@ static inline void __run_timers(struct tvec_base *base)
996 detach_timer(timer, 1); 1113 detach_timer(timer, 1);
997 1114
998 spin_unlock_irq(&base->lock); 1115 spin_unlock_irq(&base->lock);
999 { 1116 call_timer_fn(timer, fn, data);
1000 int preempt_count = preempt_count();
1001
1002#ifdef CONFIG_LOCKDEP
1003 /*
1004 * It is permissible to free the timer from
1005 * inside the function that is called from
1006 * it, this we need to take into account for
1007 * lockdep too. To avoid bogus "held lock
1008 * freed" warnings as well as problems when
1009 * looking into timer->lockdep_map, make a
1010 * copy and use that here.
1011 */
1012 struct lockdep_map lockdep_map =
1013 timer->lockdep_map;
1014#endif
1015 /*
1016 * Couple the lock chain with the lock chain at
1017 * del_timer_sync() by acquiring the lock_map
1018 * around the fn() call here and in
1019 * del_timer_sync().
1020 */
1021 lock_map_acquire(&lockdep_map);
1022
1023 trace_timer_expire_entry(timer);
1024 fn(data);
1025 trace_timer_expire_exit(timer);
1026
1027 lock_map_release(&lockdep_map);
1028
1029 if (preempt_count != preempt_count()) {
1030 printk(KERN_ERR "huh, entered %p "
1031 "with preempt_count %08x, exited"
1032 " with %08x?\n",
1033 fn, preempt_count,
1034 preempt_count());
1035 BUG();
1036 }
1037 }
1038 spin_lock_irq(&base->lock); 1117 spin_lock_irq(&base->lock);
1039 } 1118 }
1040 } 1119 }
@@ -1618,11 +1697,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1618 unsigned long action, void *hcpu) 1697 unsigned long action, void *hcpu)
1619{ 1698{
1620 long cpu = (long)hcpu; 1699 long cpu = (long)hcpu;
1700 int err;
1701
1621 switch(action) { 1702 switch(action) {
1622 case CPU_UP_PREPARE: 1703 case CPU_UP_PREPARE:
1623 case CPU_UP_PREPARE_FROZEN: 1704 case CPU_UP_PREPARE_FROZEN:
1624 if (init_timers_cpu(cpu) < 0) 1705 err = init_timers_cpu(cpu);
1625 return NOTIFY_BAD; 1706 if (err < 0)
1707 return notifier_from_errno(err);
1626 break; 1708 break;
1627#ifdef CONFIG_HOTPLUG_CPU 1709#ifdef CONFIG_HOTPLUG_CPU
1628 case CPU_DEAD: 1710 case CPU_DEAD:
@@ -1648,7 +1730,7 @@ void __init init_timers(void)
1648 1730
1649 init_timer_stats(); 1731 init_timer_stats();
1650 1732
1651 BUG_ON(err == NOTIFY_BAD); 1733 BUG_ON(err != NOTIFY_OK);
1652 register_cpu_notifier(&timers_nb); 1734 register_cpu_notifier(&timers_nb);
1653 open_softirq(TIMER_SOFTIRQ, run_timer_softirq); 1735 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1654} 1736}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 13e13d428cd3..8b1797c4545b 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -44,9 +44,6 @@ config HAVE_FTRACE_MCOUNT_RECORD
44 help 44 help
45 See Documentation/trace/ftrace-design.txt 45 See Documentation/trace/ftrace-design.txt
46 46
47config HAVE_HW_BRANCH_TRACER
48 bool
49
50config HAVE_SYSCALL_TRACEPOINTS 47config HAVE_SYSCALL_TRACEPOINTS
51 bool 48 bool
52 help 49 help
@@ -374,14 +371,6 @@ config STACK_TRACER
374 371
375 Say N if unsure. 372 Say N if unsure.
376 373
377config HW_BRANCH_TRACER
378 depends on HAVE_HW_BRANCH_TRACER
379 bool "Trace hw branches"
380 select GENERIC_TRACER
381 help
382 This tracer records all branches on the system in a circular
383 buffer, giving access to the last N branches for each cpu.
384
385config KMEMTRACE 374config KMEMTRACE
386 bool "Trace SLAB allocations" 375 bool "Trace SLAB allocations"
387 select GENERIC_TRACER 376 select GENERIC_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 78edc6490038..ffb1a5b0550e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o 41obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o 42obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o 43obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
44obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
45obj-$(CONFIG_KMEMTRACE) += kmemtrace.o 44obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
46obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o 45obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
47obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o 46obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 07f945a99430..638711c17504 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/init.h> 22#include <linux/init.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h>
24#include <linux/debugfs.h> 25#include <linux/debugfs.h>
25#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
26#include <linux/time.h> 27#include <linux/time.h>
@@ -674,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
674 } 675 }
675} 676}
676 677
677static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) 678static void blk_add_trace_rq_abort(void *ignore,
679 struct request_queue *q, struct request *rq)
678{ 680{
679 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 681 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
680} 682}
681 683
682static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) 684static void blk_add_trace_rq_insert(void *ignore,
685 struct request_queue *q, struct request *rq)
683{ 686{
684 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 687 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
685} 688}
686 689
687static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) 690static void blk_add_trace_rq_issue(void *ignore,
691 struct request_queue *q, struct request *rq)
688{ 692{
689 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 693 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
690} 694}
691 695
692static void blk_add_trace_rq_requeue(struct request_queue *q, 696static void blk_add_trace_rq_requeue(void *ignore,
697 struct request_queue *q,
693 struct request *rq) 698 struct request *rq)
694{ 699{
695 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 700 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
696} 701}
697 702
698static void blk_add_trace_rq_complete(struct request_queue *q, 703static void blk_add_trace_rq_complete(void *ignore,
704 struct request_queue *q,
699 struct request *rq) 705 struct request *rq)
700{ 706{
701 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 707 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
@@ -723,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
723 !bio_flagged(bio, BIO_UPTODATE), 0, NULL); 729 !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
724} 730}
725 731
726static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) 732static void blk_add_trace_bio_bounce(void *ignore,
733 struct request_queue *q, struct bio *bio)
727{ 734{
728 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); 735 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
729} 736}
730 737
731static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) 738static void blk_add_trace_bio_complete(void *ignore,
739 struct request_queue *q, struct bio *bio)
732{ 740{
733 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); 741 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
734} 742}
735 743
736static void blk_add_trace_bio_backmerge(struct request_queue *q, 744static void blk_add_trace_bio_backmerge(void *ignore,
745 struct request_queue *q,
737 struct bio *bio) 746 struct bio *bio)
738{ 747{
739 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 748 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
740} 749}
741 750
742static void blk_add_trace_bio_frontmerge(struct request_queue *q, 751static void blk_add_trace_bio_frontmerge(void *ignore,
752 struct request_queue *q,
743 struct bio *bio) 753 struct bio *bio)
744{ 754{
745 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 755 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
746} 756}
747 757
748static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) 758static void blk_add_trace_bio_queue(void *ignore,
759 struct request_queue *q, struct bio *bio)
749{ 760{
750 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 761 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
751} 762}
752 763
753static void blk_add_trace_getrq(struct request_queue *q, 764static void blk_add_trace_getrq(void *ignore,
765 struct request_queue *q,
754 struct bio *bio, int rw) 766 struct bio *bio, int rw)
755{ 767{
756 if (bio) 768 if (bio)
@@ -764,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q,
764} 776}
765 777
766 778
767static void blk_add_trace_sleeprq(struct request_queue *q, 779static void blk_add_trace_sleeprq(void *ignore,
780 struct request_queue *q,
768 struct bio *bio, int rw) 781 struct bio *bio, int rw)
769{ 782{
770 if (bio) 783 if (bio)
@@ -778,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q,
778 } 791 }
779} 792}
780 793
781static void blk_add_trace_plug(struct request_queue *q) 794static void blk_add_trace_plug(void *ignore, struct request_queue *q)
782{ 795{
783 struct blk_trace *bt = q->blk_trace; 796 struct blk_trace *bt = q->blk_trace;
784 797
@@ -786,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q)
786 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 799 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
787} 800}
788 801
789static void blk_add_trace_unplug_io(struct request_queue *q) 802static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
790{ 803{
791 struct blk_trace *bt = q->blk_trace; 804 struct blk_trace *bt = q->blk_trace;
792 805
@@ -799,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q)
799 } 812 }
800} 813}
801 814
802static void blk_add_trace_unplug_timer(struct request_queue *q) 815static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
803{ 816{
804 struct blk_trace *bt = q->blk_trace; 817 struct blk_trace *bt = q->blk_trace;
805 818
@@ -812,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q)
812 } 825 }
813} 826}
814 827
815static void blk_add_trace_split(struct request_queue *q, struct bio *bio, 828static void blk_add_trace_split(void *ignore,
829 struct request_queue *q, struct bio *bio,
816 unsigned int pdu) 830 unsigned int pdu)
817{ 831{
818 struct blk_trace *bt = q->blk_trace; 832 struct blk_trace *bt = q->blk_trace;
@@ -828,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
828 842
829/** 843/**
830 * blk_add_trace_remap - Add a trace for a remap operation 844 * blk_add_trace_remap - Add a trace for a remap operation
845 * @ignore: trace callback data parameter (not used)
831 * @q: queue the io is for 846 * @q: queue the io is for
832 * @bio: the source bio 847 * @bio: the source bio
833 * @dev: target device 848 * @dev: target device
@@ -838,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
838 * it spans a stripe (or similar). Add a trace for that action. 853 * it spans a stripe (or similar). Add a trace for that action.
839 * 854 *
840 **/ 855 **/
841static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, 856static void blk_add_trace_remap(void *ignore,
842 dev_t dev, sector_t from) 857 struct request_queue *q, struct bio *bio,
858 dev_t dev, sector_t from)
843{ 859{
844 struct blk_trace *bt = q->blk_trace; 860 struct blk_trace *bt = q->blk_trace;
845 struct blk_io_trace_remap r; 861 struct blk_io_trace_remap r;
@@ -858,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
858 874
859/** 875/**
860 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 876 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
877 * @ignore: trace callback data parameter (not used)
861 * @q: queue the io is for 878 * @q: queue the io is for
862 * @rq: the source request 879 * @rq: the source request
863 * @dev: target device 880 * @dev: target device
@@ -868,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
868 * Add a trace for that action. 885 * Add a trace for that action.
869 * 886 *
870 **/ 887 **/
871static void blk_add_trace_rq_remap(struct request_queue *q, 888static void blk_add_trace_rq_remap(void *ignore,
889 struct request_queue *q,
872 struct request *rq, dev_t dev, 890 struct request *rq, dev_t dev,
873 sector_t from) 891 sector_t from)
874{ 892{
@@ -920,64 +938,64 @@ static void blk_register_tracepoints(void)
920{ 938{
921 int ret; 939 int ret;
922 940
923 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); 941 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
924 WARN_ON(ret); 942 WARN_ON(ret);
925 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); 943 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
926 WARN_ON(ret); 944 WARN_ON(ret);
927 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); 945 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
928 WARN_ON(ret); 946 WARN_ON(ret);
929 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); 947 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
930 WARN_ON(ret); 948 WARN_ON(ret);
931 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); 949 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
932 WARN_ON(ret); 950 WARN_ON(ret);
933 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); 951 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
934 WARN_ON(ret); 952 WARN_ON(ret);
935 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); 953 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
936 WARN_ON(ret); 954 WARN_ON(ret);
937 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 955 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
938 WARN_ON(ret); 956 WARN_ON(ret);
939 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 957 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
940 WARN_ON(ret); 958 WARN_ON(ret);
941 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); 959 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
942 WARN_ON(ret); 960 WARN_ON(ret);
943 ret = register_trace_block_getrq(blk_add_trace_getrq); 961 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
944 WARN_ON(ret); 962 WARN_ON(ret);
945 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); 963 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
946 WARN_ON(ret); 964 WARN_ON(ret);
947 ret = register_trace_block_plug(blk_add_trace_plug); 965 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
948 WARN_ON(ret); 966 WARN_ON(ret);
949 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); 967 ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
950 WARN_ON(ret); 968 WARN_ON(ret);
951 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); 969 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
952 WARN_ON(ret); 970 WARN_ON(ret);
953 ret = register_trace_block_split(blk_add_trace_split); 971 ret = register_trace_block_split(blk_add_trace_split, NULL);
954 WARN_ON(ret); 972 WARN_ON(ret);
955 ret = register_trace_block_remap(blk_add_trace_remap); 973 ret = register_trace_block_remap(blk_add_trace_remap, NULL);
956 WARN_ON(ret); 974 WARN_ON(ret);
957 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); 975 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
958 WARN_ON(ret); 976 WARN_ON(ret);
959} 977}
960 978
961static void blk_unregister_tracepoints(void) 979static void blk_unregister_tracepoints(void)
962{ 980{
963 unregister_trace_block_rq_remap(blk_add_trace_rq_remap); 981 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
964 unregister_trace_block_remap(blk_add_trace_remap); 982 unregister_trace_block_remap(blk_add_trace_remap, NULL);
965 unregister_trace_block_split(blk_add_trace_split); 983 unregister_trace_block_split(blk_add_trace_split, NULL);
966 unregister_trace_block_unplug_io(blk_add_trace_unplug_io); 984 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
967 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); 985 unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
968 unregister_trace_block_plug(blk_add_trace_plug); 986 unregister_trace_block_plug(blk_add_trace_plug, NULL);
969 unregister_trace_block_sleeprq(blk_add_trace_sleeprq); 987 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
970 unregister_trace_block_getrq(blk_add_trace_getrq); 988 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
971 unregister_trace_block_bio_queue(blk_add_trace_bio_queue); 989 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
972 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); 990 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
973 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); 991 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
974 unregister_trace_block_bio_complete(blk_add_trace_bio_complete); 992 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
975 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); 993 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
976 unregister_trace_block_rq_complete(blk_add_trace_rq_complete); 994 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
977 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); 995 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
978 unregister_trace_block_rq_issue(blk_add_trace_rq_issue); 996 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
979 unregister_trace_block_rq_insert(blk_add_trace_rq_insert); 997 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
980 unregister_trace_block_rq_abort(blk_add_trace_rq_abort); 998 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
981 999
982 tracepoint_synchronize_unregister(); 1000 tracepoint_synchronize_unregister();
983} 1001}
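All of the probes above follow the same new tracepoint convention: each probe gains a leading private-data pointer, and the register/unregister calls take that pointer as an extra argument (blktrace simply passes NULL). A minimal sketch of the pattern, with my_probe, my_ctx and my_ctx_instance as illustrative names rather than anything in this patch:

	/* The first argument is whatever pointer was supplied at registration. */
	static void my_probe(void *data, struct request_queue *q, struct request *rq)
	{
		struct my_ctx *ctx = data;	/* hypothetical per-probe state; blktrace passes NULL */

		/* ... inspect q and rq, update ctx ... */
	}

	/* Registration and removal carry the same data pointer. */
	ret = register_trace_block_rq_insert(my_probe, &my_ctx_instance);
	WARN_ON(ret);
	...
	unregister_trace_block_rq_insert(my_probe, &my_ctx_instance);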
@@ -1320,7 +1338,7 @@ out:
1320} 1338}
1321 1339
1322static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1340static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1323 int flags) 1341 int flags, struct trace_event *event)
1324{ 1342{
1325 return print_one_line(iter, false); 1343 return print_one_line(iter, false);
1326} 1344}
@@ -1342,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1342} 1360}
1343 1361
1344static enum print_line_t 1362static enum print_line_t
1345blk_trace_event_print_binary(struct trace_iterator *iter, int flags) 1363blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1364 struct trace_event *event)
1346{ 1365{
1347 return blk_trace_synthesize_old_trace(iter) ? 1366 return blk_trace_synthesize_old_trace(iter) ?
1348 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1367 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
@@ -1380,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = {
1380 .set_flag = blk_tracer_set_flag, 1399 .set_flag = blk_tracer_set_flag,
1381}; 1400};
1382 1401
1383static struct trace_event trace_blk_event = { 1402static struct trace_event_functions trace_blk_event_funcs = {
1384 .type = TRACE_BLK,
1385 .trace = blk_trace_event_print, 1403 .trace = blk_trace_event_print,
1386 .binary = blk_trace_event_print_binary, 1404 .binary = blk_trace_event_print_binary,
1387}; 1405};
1388 1406
1407static struct trace_event trace_blk_event = {
1408 .type = TRACE_BLK,
1409 .funcs = &trace_blk_event_funcs,
1410};
1411
1389static int __init init_blk_tracer(void) 1412static int __init init_blk_tracer(void)
1390{ 1413{
1391 if (!register_ftrace_event(&trace_blk_event)) { 1414 if (!register_ftrace_event(&trace_blk_event)) {
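This hunk shows the general shape of the trace_event rework that recurs throughout the patch: the print callbacks move into a shared struct trace_event_functions, struct trace_event keeps only the type id plus a ->funcs pointer, and every callback now receives the event itself as a third argument. A condensed sketch (my_print_* are placeholder names):

	static struct trace_event_functions my_event_funcs = {
		.trace	= my_print_trace,	/* enum print_line_t (*)(iter, flags, event) */
		.binary	= my_print_binary,
	};

	static struct trace_event my_event = {
		.type	= TRACE_BLK,		/* any trace entry type id */
		.funcs	= &my_event_funcs,
	};

	/* Callers go through the indirection, passing the event back in: */
	return event->funcs->trace(iter, sym_flags, event);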
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d9062f5cc0c0..6d2cb14f9449 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -24,6 +24,7 @@
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/ftrace.h> 25#include <linux/ftrace.h>
26#include <linux/sysctl.h> 26#include <linux/sysctl.h>
27#include <linux/slab.h>
27#include <linux/ctype.h> 28#include <linux/ctype.h>
28#include <linux/list.h> 29#include <linux/list.h>
29#include <linux/hash.h> 30#include <linux/hash.h>
@@ -263,6 +264,7 @@ struct ftrace_profile {
263 unsigned long counter; 264 unsigned long counter;
264#ifdef CONFIG_FUNCTION_GRAPH_TRACER 265#ifdef CONFIG_FUNCTION_GRAPH_TRACER
265 unsigned long long time; 266 unsigned long long time;
267 unsigned long long time_squared;
266#endif 268#endif
267}; 269};
268 270
@@ -365,9 +367,9 @@ static int function_stat_headers(struct seq_file *m)
365{ 367{
366#ifdef CONFIG_FUNCTION_GRAPH_TRACER 368#ifdef CONFIG_FUNCTION_GRAPH_TRACER
367 seq_printf(m, " Function " 369 seq_printf(m, " Function "
368 "Hit Time Avg\n" 370 "Hit Time Avg s^2\n"
369 " -------- " 371 " -------- "
370 "--- ---- ---\n"); 372 "--- ---- --- ---\n");
371#else 373#else
372 seq_printf(m, " Function Hit\n" 374 seq_printf(m, " Function Hit\n"
373 " -------- ---\n"); 375 " -------- ---\n");
@@ -383,6 +385,7 @@ static int function_stat_show(struct seq_file *m, void *v)
383 static DEFINE_MUTEX(mutex); 385 static DEFINE_MUTEX(mutex);
384 static struct trace_seq s; 386 static struct trace_seq s;
385 unsigned long long avg; 387 unsigned long long avg;
388 unsigned long long stddev;
386#endif 389#endif
387 390
388 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); 391 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -393,11 +396,25 @@ static int function_stat_show(struct seq_file *m, void *v)
393 avg = rec->time; 396 avg = rec->time;
394 do_div(avg, rec->counter); 397 do_div(avg, rec->counter);
395 398
                                                      399 /* Sample variance (s^2) */
400 if (rec->counter <= 1)
401 stddev = 0;
402 else {
403 stddev = rec->time_squared - rec->counter * avg * avg;
404 /*
                                                      405 * Divide only by 1000 for ns^2 -> us^2 conversion.
406 * trace_print_graph_duration will divide 1000 again.
407 */
408 do_div(stddev, (rec->counter - 1) * 1000);
409 }
410
396 mutex_lock(&mutex); 411 mutex_lock(&mutex);
397 trace_seq_init(&s); 412 trace_seq_init(&s);
398 trace_print_graph_duration(rec->time, &s); 413 trace_print_graph_duration(rec->time, &s);
399 trace_seq_puts(&s, " "); 414 trace_seq_puts(&s, " ");
400 trace_print_graph_duration(avg, &s); 415 trace_print_graph_duration(avg, &s);
416 trace_seq_puts(&s, " ");
417 trace_print_graph_duration(stddev, &s);
401 trace_print_seq(m, &s); 418 trace_print_seq(m, &s);
402 mutex_unlock(&mutex); 419 mutex_unlock(&mutex);
403#endif 420#endif
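To make the new s^2 column concrete: with time_squared accumulated alongside time (see the profile_graph_return hunk below), the per-function sample variance is (sum(t^2) - n*avg^2) / (n - 1). For three calls taking 1000, 2000 and 3000 ns, avg = 2000 ns and sum(t^2) = 14,000,000 ns^2, so s^2 = (14,000,000 - 3*4,000,000) / 2 = 1,000,000 ns^2; the do_div by 1000 here plus the second divide inside trace_print_graph_duration turn that into the 1 us^2 shown in the column.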
@@ -649,6 +666,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
649 if (!stat->hash || !ftrace_profile_enabled) 666 if (!stat->hash || !ftrace_profile_enabled)
650 goto out; 667 goto out;
651 668
669 /* If the calltime was zero'd ignore it */
670 if (!trace->calltime)
671 goto out;
672
652 calltime = trace->rettime - trace->calltime; 673 calltime = trace->rettime - trace->calltime;
653 674
654 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { 675 if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
@@ -667,8 +688,10 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
667 } 688 }
668 689
669 rec = ftrace_find_profiled_func(stat, trace->func); 690 rec = ftrace_find_profiled_func(stat, trace->func);
670 if (rec) 691 if (rec) {
671 rec->time += calltime; 692 rec->time += calltime;
693 rec->time_squared += calltime * calltime;
694 }
672 695
673 out: 696 out:
674 local_irq_restore(flags); 697 local_irq_restore(flags);

@@ -3211,8 +3234,8 @@ free:
3211} 3234}
3212 3235
3213static void 3236static void
3214ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev, 3237ftrace_graph_probe_sched_switch(void *ignore,
3215 struct task_struct *next) 3238 struct task_struct *prev, struct task_struct *next)
3216{ 3239{
3217 unsigned long long timestamp; 3240 unsigned long long timestamp;
3218 int index; 3241 int index;
@@ -3266,7 +3289,7 @@ static int start_graph_tracing(void)
3266 } while (ret == -EAGAIN); 3289 } while (ret == -EAGAIN);
3267 3290
3268 if (!ret) { 3291 if (!ret) {
3269 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); 3292 ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3270 if (ret) 3293 if (ret)
3271 pr_info("ftrace_graph: Couldn't activate tracepoint" 3294 pr_info("ftrace_graph: Couldn't activate tracepoint"
3272 " probe to kernel_sched_switch\n"); 3295 " probe to kernel_sched_switch\n");
@@ -3338,11 +3361,11 @@ void unregister_ftrace_graph(void)
3338 goto out; 3361 goto out;
3339 3362
3340 ftrace_graph_active--; 3363 ftrace_graph_active--;
3341 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
3342 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; 3364 ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
3343 ftrace_graph_entry = ftrace_graph_entry_stub; 3365 ftrace_graph_entry = ftrace_graph_entry_stub;
3344 ftrace_shutdown(FTRACE_STOP_FUNC_RET); 3366 ftrace_shutdown(FTRACE_STOP_FUNC_RET);
3345 unregister_pm_notifier(&ftrace_suspend_notifier); 3367 unregister_pm_notifier(&ftrace_suspend_notifier);
3368 unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
3346 3369
3347 out: 3370 out:
3348 mutex_unlock(&ftrace_lock); 3371 mutex_unlock(&ftrace_lock);
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index a91da69f153a..bbfc1bb1660b 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
95 trace_wake_up(); 95 trace_wake_up();
96} 96}
97 97
98static void kmemtrace_kmalloc(unsigned long call_site, 98static void kmemtrace_kmalloc(void *ignore,
99 unsigned long call_site,
99 const void *ptr, 100 const void *ptr,
100 size_t bytes_req, 101 size_t bytes_req,
101 size_t bytes_alloc, 102 size_t bytes_alloc,
@@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site,
105 bytes_req, bytes_alloc, gfp_flags, -1); 106 bytes_req, bytes_alloc, gfp_flags, -1);
106} 107}
107 108
108static void kmemtrace_kmem_cache_alloc(unsigned long call_site, 109static void kmemtrace_kmem_cache_alloc(void *ignore,
110 unsigned long call_site,
109 const void *ptr, 111 const void *ptr,
110 size_t bytes_req, 112 size_t bytes_req,
111 size_t bytes_alloc, 113 size_t bytes_alloc,
@@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
115 bytes_req, bytes_alloc, gfp_flags, -1); 117 bytes_req, bytes_alloc, gfp_flags, -1);
116} 118}
117 119
118static void kmemtrace_kmalloc_node(unsigned long call_site, 120static void kmemtrace_kmalloc_node(void *ignore,
121 unsigned long call_site,
119 const void *ptr, 122 const void *ptr,
120 size_t bytes_req, 123 size_t bytes_req,
121 size_t bytes_alloc, 124 size_t bytes_alloc,
@@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site,
126 bytes_req, bytes_alloc, gfp_flags, node); 129 bytes_req, bytes_alloc, gfp_flags, node);
127} 130}
128 131
129static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, 132static void kmemtrace_kmem_cache_alloc_node(void *ignore,
133 unsigned long call_site,
130 const void *ptr, 134 const void *ptr,
131 size_t bytes_req, 135 size_t bytes_req,
132 size_t bytes_alloc, 136 size_t bytes_alloc,
@@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
137 bytes_req, bytes_alloc, gfp_flags, node); 141 bytes_req, bytes_alloc, gfp_flags, node);
138} 142}
139 143
140static void kmemtrace_kfree(unsigned long call_site, const void *ptr) 144static void
145kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr)
141{ 146{
142 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); 147 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
143} 148}
144 149
145static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) 150static void kmemtrace_kmem_cache_free(void *ignore,
151 unsigned long call_site, const void *ptr)
146{ 152{
147 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); 153 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
148} 154}
@@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void)
151{ 157{
152 int err; 158 int err;
153 159
154 err = register_trace_kmalloc(kmemtrace_kmalloc); 160 err = register_trace_kmalloc(kmemtrace_kmalloc, NULL);
155 if (err) 161 if (err)
156 return err; 162 return err;
157 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 163 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
158 if (err) 164 if (err)
159 return err; 165 return err;
160 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); 166 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
161 if (err) 167 if (err)
162 return err; 168 return err;
163 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 169 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
164 if (err) 170 if (err)
165 return err; 171 return err;
166 err = register_trace_kfree(kmemtrace_kfree); 172 err = register_trace_kfree(kmemtrace_kfree, NULL);
167 if (err) 173 if (err)
168 return err; 174 return err;
169 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 175 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
170 176
171 return err; 177 return err;
172} 178}
173 179
174static void kmemtrace_stop_probes(void) 180static void kmemtrace_stop_probes(void)
175{ 181{
176 unregister_trace_kmalloc(kmemtrace_kmalloc); 182 unregister_trace_kmalloc(kmemtrace_kmalloc, NULL);
177 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); 183 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL);
178 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); 184 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL);
179 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); 185 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL);
180 unregister_trace_kfree(kmemtrace_kfree); 186 unregister_trace_kfree(kmemtrace_kfree, NULL);
181 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); 187 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL);
182} 188}
183 189
184static int kmem_trace_init(struct trace_array *tr) 190static int kmem_trace_init(struct trace_array *tr)
@@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc {
237}; 243};
238 244
239static enum print_line_t 245static enum print_line_t
240kmemtrace_print_alloc(struct trace_iterator *iter, int flags) 246kmemtrace_print_alloc(struct trace_iterator *iter, int flags,
247 struct trace_event *event)
241{ 248{
242 struct trace_seq *s = &iter->seq; 249 struct trace_seq *s = &iter->seq;
243 struct kmemtrace_alloc_entry *entry; 250 struct kmemtrace_alloc_entry *entry;
@@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags)
257} 264}
258 265
259static enum print_line_t 266static enum print_line_t
260kmemtrace_print_free(struct trace_iterator *iter, int flags) 267kmemtrace_print_free(struct trace_iterator *iter, int flags,
268 struct trace_event *event)
261{ 269{
262 struct trace_seq *s = &iter->seq; 270 struct trace_seq *s = &iter->seq;
263 struct kmemtrace_free_entry *entry; 271 struct kmemtrace_free_entry *entry;
@@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags)
275} 283}
276 284
277static enum print_line_t 285static enum print_line_t
278kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) 286kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags,
287 struct trace_event *event)
279{ 288{
280 struct trace_seq *s = &iter->seq; 289 struct trace_seq *s = &iter->seq;
281 struct kmemtrace_alloc_entry *entry; 290 struct kmemtrace_alloc_entry *entry;
@@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags)
309} 318}
310 319
311static enum print_line_t 320static enum print_line_t
312kmemtrace_print_free_user(struct trace_iterator *iter, int flags) 321kmemtrace_print_free_user(struct trace_iterator *iter, int flags,
322 struct trace_event *event)
313{ 323{
314 struct trace_seq *s = &iter->seq; 324 struct trace_seq *s = &iter->seq;
315 struct kmemtrace_free_entry *entry; 325 struct kmemtrace_free_entry *entry;
@@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
463 } 473 }
464} 474}
465 475
466static struct trace_event kmem_trace_alloc = { 476static struct trace_event_functions kmem_trace_alloc_funcs = {
467 .type = TRACE_KMEM_ALLOC,
468 .trace = kmemtrace_print_alloc, 477 .trace = kmemtrace_print_alloc,
469 .binary = kmemtrace_print_alloc_user, 478 .binary = kmemtrace_print_alloc_user,
470}; 479};
471 480
472static struct trace_event kmem_trace_free = { 481static struct trace_event kmem_trace_alloc = {
473 .type = TRACE_KMEM_FREE, 482 .type = TRACE_KMEM_ALLOC,
483 .funcs = &kmem_trace_alloc_funcs,
484};
485
486static struct trace_event_functions kmem_trace_free_funcs = {
474 .trace = kmemtrace_print_free, 487 .trace = kmemtrace_print_free,
475 .binary = kmemtrace_print_free_user, 488 .binary = kmemtrace_print_free_user,
476}; 489};
477 490
491static struct trace_event kmem_trace_free = {
492 .type = TRACE_KMEM_FREE,
493 .funcs = &kmem_trace_free_funcs,
494};
495
478static struct tracer kmem_tracer __read_mostly = { 496static struct tracer kmem_tracer __read_mostly = {
479 .name = "kmemtrace", 497 .name = "kmemtrace",
480 .init = kmem_trace_init, 498 .init = kmem_trace_init,
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
index 9f4f565b01e6..a22582a06161 100644
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -9,7 +9,6 @@
9#include <linux/workqueue.h> 9#include <linux/workqueue.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
13 12
14#define CREATE_TRACE_POINTS 13#define CREATE_TRACE_POINTS
15#include <trace/events/power.h> 14#include <trace/events/power.h>
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 05a9f83b8819..1da7b6ea8b85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/percpu.h> 15#include <linux/percpu.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/hash.h> 19#include <linux/hash.h>
19#include <linux/list.h> 20#include <linux/list.h>
@@ -207,6 +208,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
207#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) 208#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
208#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ 209#define RB_EVNT_MIN_SIZE 8U /* two 32bit words */
209 210
211#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
212# define RB_FORCE_8BYTE_ALIGNMENT 0
213# define RB_ARCH_ALIGNMENT RB_ALIGNMENT
214#else
215# define RB_FORCE_8BYTE_ALIGNMENT 1
216# define RB_ARCH_ALIGNMENT 8U
217#endif
218
210/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ 219/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
211#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX 220#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
212 221
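A quick worked example of what the forced alignment costs, assuming a 64-bit configuration without CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS: a 10-byte payload now always carries the array[0] length word, so it occupies 10 + 4 (array[0]) + 4 (event header) = 18 bytes, rounded up to 24 by RB_ARCH_ALIGNMENT = 8; on configurations that keep RB_FORCE_8BYTE_ALIGNMENT = 0, the same payload fits the type_len encoding and pads to 16 bytes with the 4-byte RB_ALIGNMENT.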
@@ -310,6 +319,11 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
310#define TS_MASK ((1ULL << TS_SHIFT) - 1) 319#define TS_MASK ((1ULL << TS_SHIFT) - 1)
311#define TS_DELTA_TEST (~TS_MASK) 320#define TS_DELTA_TEST (~TS_MASK)
312 321
322/* Flag when events were overwritten */
323#define RB_MISSED_EVENTS (1 << 31)
324/* Missed count stored at end */
325#define RB_MISSED_STORED (1 << 30)
326
313struct buffer_data_page { 327struct buffer_data_page {
314 u64 time_stamp; /* page time stamp */ 328 u64 time_stamp; /* page time stamp */
315 local_t commit; /* write committed index */ 329 local_t commit; /* write committed index */
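These two flags piggyback on the reader page's commit word, so after this change that word carries the committed data length in its low bits plus status in bits 30 and 31. Assumed layout, for orientation (the writer side appears in ring_buffer_read_page further down):

	/* bpage->commit as handed to a page reader (sketch, not a kernel struct):
	 *   bit 31  RB_MISSED_EVENTS - events were overwritten before this page
	 *   bit 30  RB_MISSED_STORED - the overwritten count is appended after the data
	 *   bits 0-29                - length of the committed data in the page
	 */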
@@ -329,6 +343,7 @@ struct buffer_page {
329 local_t write; /* index for next write */ 343 local_t write; /* index for next write */
330 unsigned read; /* index for next read */ 344 unsigned read; /* index for next read */
331 local_t entries; /* entries on this page */ 345 local_t entries; /* entries on this page */
346 unsigned long real_end; /* real end of data */
332 struct buffer_data_page *page; /* Actual data page */ 347 struct buffer_data_page *page; /* Actual data page */
333}; 348};
334 349
@@ -408,6 +423,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
408 (unsigned int)sizeof(field.commit), 423 (unsigned int)sizeof(field.commit),
409 (unsigned int)is_signed_type(long)); 424 (unsigned int)is_signed_type(long));
410 425
426 ret = trace_seq_printf(s, "\tfield: int overwrite;\t"
427 "offset:%u;\tsize:%u;\tsigned:%u;\n",
428 (unsigned int)offsetof(typeof(field), commit),
429 1,
430 (unsigned int)is_signed_type(long));
431
411 ret = trace_seq_printf(s, "\tfield: char data;\t" 432 ret = trace_seq_printf(s, "\tfield: char data;\t"
412 "offset:%u;\tsize:%u;\tsigned:%u;\n", 433 "offset:%u;\tsize:%u;\tsigned:%u;\n",
413 (unsigned int)offsetof(typeof(field), data), 434 (unsigned int)offsetof(typeof(field), data),
@@ -431,6 +452,8 @@ struct ring_buffer_per_cpu {
431 struct buffer_page *tail_page; /* write to tail */ 452 struct buffer_page *tail_page; /* write to tail */
432 struct buffer_page *commit_page; /* committed pages */ 453 struct buffer_page *commit_page; /* committed pages */
433 struct buffer_page *reader_page; 454 struct buffer_page *reader_page;
455 unsigned long lost_events;
456 unsigned long last_overrun;
434 local_t commit_overrun; 457 local_t commit_overrun;
435 local_t overrun; 458 local_t overrun;
436 local_t entries; 459 local_t entries;
@@ -1201,18 +1224,19 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1201 1224
1202 for (i = 0; i < nr_pages; i++) { 1225 for (i = 0; i < nr_pages; i++) {
1203 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1226 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1204 return; 1227 goto out;
1205 p = cpu_buffer->pages->next; 1228 p = cpu_buffer->pages->next;
1206 bpage = list_entry(p, struct buffer_page, list); 1229 bpage = list_entry(p, struct buffer_page, list);
1207 list_del_init(&bpage->list); 1230 list_del_init(&bpage->list);
1208 free_buffer_page(bpage); 1231 free_buffer_page(bpage);
1209 } 1232 }
1210 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages))) 1233 if (RB_WARN_ON(cpu_buffer, list_empty(cpu_buffer->pages)))
1211 return; 1234 goto out;
1212 1235
1213 rb_reset_cpu(cpu_buffer); 1236 rb_reset_cpu(cpu_buffer);
1214 rb_check_pages(cpu_buffer); 1237 rb_check_pages(cpu_buffer);
1215 1238
1239out:
1216 spin_unlock_irq(&cpu_buffer->reader_lock); 1240 spin_unlock_irq(&cpu_buffer->reader_lock);
1217} 1241}
1218 1242
@@ -1229,7 +1253,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1229 1253
1230 for (i = 0; i < nr_pages; i++) { 1254 for (i = 0; i < nr_pages; i++) {
1231 if (RB_WARN_ON(cpu_buffer, list_empty(pages))) 1255 if (RB_WARN_ON(cpu_buffer, list_empty(pages)))
1232 return; 1256 goto out;
1233 p = pages->next; 1257 p = pages->next;
1234 bpage = list_entry(p, struct buffer_page, list); 1258 bpage = list_entry(p, struct buffer_page, list);
1235 list_del_init(&bpage->list); 1259 list_del_init(&bpage->list);
@@ -1238,6 +1262,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1238 rb_reset_cpu(cpu_buffer); 1262 rb_reset_cpu(cpu_buffer);
1239 rb_check_pages(cpu_buffer); 1263 rb_check_pages(cpu_buffer);
1240 1264
1265out:
1241 spin_unlock_irq(&cpu_buffer->reader_lock); 1266 spin_unlock_irq(&cpu_buffer->reader_lock);
1242} 1267}
1243 1268
@@ -1547,7 +1572,7 @@ rb_update_event(struct ring_buffer_event *event,
1547 1572
1548 case 0: 1573 case 0:
1549 length -= RB_EVNT_HDR_SIZE; 1574 length -= RB_EVNT_HDR_SIZE;
1550 if (length > RB_MAX_SMALL_DATA) 1575 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1551 event->array[0] = length; 1576 event->array[0] = length;
1552 else 1577 else
1553 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); 1578 event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
@@ -1722,11 +1747,11 @@ static unsigned rb_calculate_event_length(unsigned length)
1722 if (!length) 1747 if (!length)
1723 length = 1; 1748 length = 1;
1724 1749
1725 if (length > RB_MAX_SMALL_DATA) 1750 if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
1726 length += sizeof(event.array[0]); 1751 length += sizeof(event.array[0]);
1727 1752
1728 length += RB_EVNT_HDR_SIZE; 1753 length += RB_EVNT_HDR_SIZE;
1729 length = ALIGN(length, RB_ALIGNMENT); 1754 length = ALIGN(length, RB_ARCH_ALIGNMENT);
1730 1755
1731 return length; 1756 return length;
1732} 1757}
@@ -1743,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1743 * must fill the old tail_page with padding. 1768 * must fill the old tail_page with padding.
1744 */ 1769 */
1745 if (tail >= BUF_PAGE_SIZE) { 1770 if (tail >= BUF_PAGE_SIZE) {
1771 /*
1772 * If the page was filled, then we still need
1773 * to update the real_end. Reset it to zero
1774 * and the reader will ignore it.
1775 */
1776 if (tail == BUF_PAGE_SIZE)
1777 tail_page->real_end = 0;
1778
1746 local_sub(length, &tail_page->write); 1779 local_sub(length, &tail_page->write);
1747 return; 1780 return;
1748 } 1781 }
@@ -1751,6 +1784,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1751 kmemcheck_annotate_bitfield(event, bitfield); 1784 kmemcheck_annotate_bitfield(event, bitfield);
1752 1785
1753 /* 1786 /*
1787 * Save the original length to the meta data.
1788 * This will be used by the reader to add lost event
1789 * counter.
1790 */
1791 tail_page->real_end = tail;
1792
1793 /*
1754 * If this event is bigger than the minimum size, then 1794 * If this event is bigger than the minimum size, then
1755 * we need to be careful that we don't subtract the 1795 * we need to be careful that we don't subtract the
1756 * write counter enough to allow another writer to slip 1796 * write counter enough to allow another writer to slip
@@ -1968,17 +2008,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1968 u64 *ts, u64 *delta) 2008 u64 *ts, u64 *delta)
1969{ 2009{
1970 struct ring_buffer_event *event; 2010 struct ring_buffer_event *event;
1971 static int once;
1972 int ret; 2011 int ret;
1973 2012
1974 if (unlikely(*delta > (1ULL << 59) && !once++)) { 2013 WARN_ONCE(*delta > (1ULL << 59),
1975 printk(KERN_WARNING "Delta way too big! %llu" 2014 KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
1976 " ts=%llu write stamp = %llu\n", 2015 (unsigned long long)*delta,
1977 (unsigned long long)*delta, 2016 (unsigned long long)*ts,
1978 (unsigned long long)*ts, 2017 (unsigned long long)cpu_buffer->write_stamp);
1979 (unsigned long long)cpu_buffer->write_stamp);
1980 WARN_ON(1);
1981 }
1982 2018
1983 /* 2019 /*
1984 * The delta is too big, we need to add a 2020 * The delta is too big, we need to add a
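The open-coded 'static int once' guard is replaced by WARN_ONCE(), which evaluates its condition, prints the formatted message plus a backtrace the first time the condition is true, and stays silent afterwards. General form, not specific to this file:

	WARN_ONCE(condition, "message: %llu\n", value);

	/* roughly equivalent to the removed pattern:
	 *	static int once;
	 *	if (unlikely(condition) && !once++) {
	 *		printk(KERN_WARNING "message: %llu\n", value);
	 *		WARN_ON(1);
	 *	}
	 */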
@@ -2827,6 +2863,7 @@ static struct buffer_page *
2827rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) 2863rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2828{ 2864{
2829 struct buffer_page *reader = NULL; 2865 struct buffer_page *reader = NULL;
2866 unsigned long overwrite;
2830 unsigned long flags; 2867 unsigned long flags;
2831 int nr_loops = 0; 2868 int nr_loops = 0;
2832 int ret; 2869 int ret;
@@ -2868,6 +2905,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2868 local_set(&cpu_buffer->reader_page->write, 0); 2905 local_set(&cpu_buffer->reader_page->write, 0);
2869 local_set(&cpu_buffer->reader_page->entries, 0); 2906 local_set(&cpu_buffer->reader_page->entries, 0);
2870 local_set(&cpu_buffer->reader_page->page->commit, 0); 2907 local_set(&cpu_buffer->reader_page->page->commit, 0);
2908 cpu_buffer->reader_page->real_end = 0;
2871 2909
2872 spin: 2910 spin:
2873 /* 2911 /*
@@ -2888,6 +2926,18 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2888 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list); 2926 rb_set_list_to_head(cpu_buffer, &cpu_buffer->reader_page->list);
2889 2927
2890 /* 2928 /*
2929 * We want to make sure we read the overruns after we set up our
2930 * pointers to the next object. The writer side does a
2931 * cmpxchg to cross pages which acts as the mb on the writer
2932 * side. Note, the reader will constantly fail the swap
2933 * while the writer is updating the pointers, so this
2934 * guarantees that the overwrite recorded here is the one we
2935 * want to compare with the last_overrun.
2936 */
2937 smp_mb();
2938 overwrite = local_read(&(cpu_buffer->overrun));
2939
2940 /*
2891 * Here's the tricky part. 2941 * Here's the tricky part.
2892 * 2942 *
2893 * We need to move the pointer past the header page. 2943 * We need to move the pointer past the header page.
@@ -2918,6 +2968,11 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
2918 cpu_buffer->reader_page = reader; 2968 cpu_buffer->reader_page = reader;
2919 rb_reset_reader_page(cpu_buffer); 2969 rb_reset_reader_page(cpu_buffer);
2920 2970
2971 if (overwrite != cpu_buffer->last_overrun) {
2972 cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun;
2973 cpu_buffer->last_overrun = overwrite;
2974 }
2975
2921 goto again; 2976 goto again;
2922 2977
2923 out: 2978 out:
@@ -2994,8 +3049,14 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
2994 rb_advance_iter(iter); 3049 rb_advance_iter(iter);
2995} 3050}
2996 3051
3052static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
3053{
3054 return cpu_buffer->lost_events;
3055}
3056
2997static struct ring_buffer_event * 3057static struct ring_buffer_event *
2998rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts) 3058rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
3059 unsigned long *lost_events)
2999{ 3060{
3000 struct ring_buffer_event *event; 3061 struct ring_buffer_event *event;
3001 struct buffer_page *reader; 3062 struct buffer_page *reader;
@@ -3047,6 +3108,8 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)
3047 ring_buffer_normalize_time_stamp(cpu_buffer->buffer, 3108 ring_buffer_normalize_time_stamp(cpu_buffer->buffer,
3048 cpu_buffer->cpu, ts); 3109 cpu_buffer->cpu, ts);
3049 } 3110 }
3111 if (lost_events)
3112 *lost_events = rb_lost_events(cpu_buffer);
3050 return event; 3113 return event;
3051 3114
3052 default: 3115 default:
@@ -3157,12 +3220,14 @@ static inline int rb_ok_to_lock(void)
3157 * @buffer: The ring buffer to read 3220 * @buffer: The ring buffer to read
3158 * @cpu: The cpu to peek at 3221 * @cpu: The cpu to peek at
3159 * @ts: The timestamp counter of this event. 3222 * @ts: The timestamp counter of this event.
3223 * @lost_events: a variable to store if events were lost (may be NULL)
3160 * 3224 *
3161 * This will return the event that will be read next, but does 3225 * This will return the event that will be read next, but does
3162 * not consume the data. 3226 * not consume the data.
3163 */ 3227 */
3164struct ring_buffer_event * 3228struct ring_buffer_event *
3165ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) 3229ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3230 unsigned long *lost_events)
3166{ 3231{
3167 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; 3232 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
3168 struct ring_buffer_event *event; 3233 struct ring_buffer_event *event;
@@ -3177,7 +3242,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
3177 local_irq_save(flags); 3242 local_irq_save(flags);
3178 if (dolock) 3243 if (dolock)
3179 spin_lock(&cpu_buffer->reader_lock); 3244 spin_lock(&cpu_buffer->reader_lock);
3180 event = rb_buffer_peek(cpu_buffer, ts); 3245 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3181 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3246 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3182 rb_advance_reader(cpu_buffer); 3247 rb_advance_reader(cpu_buffer);
3183 if (dolock) 3248 if (dolock)
@@ -3219,13 +3284,17 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3219/** 3284/**
3220 * ring_buffer_consume - return an event and consume it 3285 * ring_buffer_consume - return an event and consume it
3221 * @buffer: The ring buffer to get the next event from 3286 * @buffer: The ring buffer to get the next event from
3287 * @cpu: the cpu to read the buffer from
3288 * @ts: a variable to store the timestamp (may be NULL)
3289 * @lost_events: a variable to store if events were lost (may be NULL)
3222 * 3290 *
3223 * Returns the next event in the ring buffer, and that event is consumed. 3291 * Returns the next event in the ring buffer, and that event is consumed.
3224 * Meaning, that sequential reads will keep returning a different event, 3292 * Meaning, that sequential reads will keep returning a different event,
3225 * and eventually empty the ring buffer if the producer is slower. 3293 * and eventually empty the ring buffer if the producer is slower.
3226 */ 3294 */
3227struct ring_buffer_event * 3295struct ring_buffer_event *
3228ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) 3296ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3297 unsigned long *lost_events)
3229{ 3298{
3230 struct ring_buffer_per_cpu *cpu_buffer; 3299 struct ring_buffer_per_cpu *cpu_buffer;
3231 struct ring_buffer_event *event = NULL; 3300 struct ring_buffer_event *event = NULL;
@@ -3246,9 +3315,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3246 if (dolock) 3315 if (dolock)
3247 spin_lock(&cpu_buffer->reader_lock); 3316 spin_lock(&cpu_buffer->reader_lock);
3248 3317
3249 event = rb_buffer_peek(cpu_buffer, ts); 3318 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3250 if (event) 3319 if (event) {
3320 cpu_buffer->lost_events = 0;
3251 rb_advance_reader(cpu_buffer); 3321 rb_advance_reader(cpu_buffer);
3322 }
3252 3323
3253 if (dolock) 3324 if (dolock)
3254 spin_unlock(&cpu_buffer->reader_lock); 3325 spin_unlock(&cpu_buffer->reader_lock);
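With the extra parameter, a consumer can detect drops inline. An illustrative loop (buffer, cpu and process() are placeholders, not taken from this patch):

	struct ring_buffer_event *event;
	unsigned long lost = 0;
	u64 ts;

	while ((event = ring_buffer_consume(buffer, cpu, &ts, &lost)) != NULL) {
		if (lost)
			pr_info("%lu events were overwritten before this one\n", lost);
		process(ring_buffer_event_data(event));	/* process() is hypothetical */
	}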
@@ -3265,23 +3336,30 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
3265EXPORT_SYMBOL_GPL(ring_buffer_consume); 3336EXPORT_SYMBOL_GPL(ring_buffer_consume);
3266 3337
3267/** 3338/**
3268 * ring_buffer_read_start - start a non consuming read of the buffer 3339 * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
3269 * @buffer: The ring buffer to read from 3340 * @buffer: The ring buffer to read from
3270 * @cpu: The cpu buffer to iterate over 3341 * @cpu: The cpu buffer to iterate over
3271 * 3342 *
3272 * This starts up an iteration through the buffer. It also disables 3343 * This performs the initial preparations necessary to iterate
3273 * the recording to the buffer until the reading is finished. 3344 * through the buffer. Memory is allocated, buffer recording
3274 * This prevents the reading from being corrupted. This is not 3345 * is disabled, and the iterator pointer is returned to the caller.
3275 * a consuming read, so a producer is not expected.
3276 * 3346 *
3277 * Must be paired with ring_buffer_finish. 3347 * Disabling buffer recording prevents the reading from being
3348 * corrupted. This is not a consuming read, so a producer is not
3349 * expected.
3350 *
3351 * After a sequence of ring_buffer_read_prepare calls, the user is
3352 * expected to make at least one call to ring_buffer_prepare_sync.
3353 * Afterwards, ring_buffer_read_start is invoked to get things going
3354 * for real.
3355 *
3356 * This overall must be paired with ring_buffer_finish.
3278 */ 3357 */
3279struct ring_buffer_iter * 3358struct ring_buffer_iter *
3280ring_buffer_read_start(struct ring_buffer *buffer, int cpu) 3359ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
3281{ 3360{
3282 struct ring_buffer_per_cpu *cpu_buffer; 3361 struct ring_buffer_per_cpu *cpu_buffer;
3283 struct ring_buffer_iter *iter; 3362 struct ring_buffer_iter *iter;
3284 unsigned long flags;
3285 3363
3286 if (!cpumask_test_cpu(cpu, buffer->cpumask)) 3364 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3287 return NULL; 3365 return NULL;
@@ -3295,15 +3373,52 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
3295 iter->cpu_buffer = cpu_buffer; 3373 iter->cpu_buffer = cpu_buffer;
3296 3374
3297 atomic_inc(&cpu_buffer->record_disabled); 3375 atomic_inc(&cpu_buffer->record_disabled);
3376
3377 return iter;
3378}
3379EXPORT_SYMBOL_GPL(ring_buffer_read_prepare);
3380
3381/**
3382 * ring_buffer_read_prepare_sync - Synchronize a set of prepare calls
3383 *
3384 * All previously invoked ring_buffer_read_prepare calls to prepare
 3385 * iterators will be synchronized. Afterwards, ring_buffer_read_start
3386 * calls on those iterators are allowed.
3387 */
3388void
3389ring_buffer_read_prepare_sync(void)
3390{
3298 synchronize_sched(); 3391 synchronize_sched();
3392}
3393EXPORT_SYMBOL_GPL(ring_buffer_read_prepare_sync);
3394
3395/**
3396 * ring_buffer_read_start - start a non consuming read of the buffer
3397 * @iter: The iterator returned by ring_buffer_read_prepare
3398 *
3399 * This finalizes the startup of an iteration through the buffer.
3400 * The iterator comes from a call to ring_buffer_read_prepare and
3401 * an intervening ring_buffer_read_prepare_sync must have been
3402 * performed.
3403 *
3404 * Must be paired with ring_buffer_finish.
3405 */
3406void
3407ring_buffer_read_start(struct ring_buffer_iter *iter)
3408{
3409 struct ring_buffer_per_cpu *cpu_buffer;
3410 unsigned long flags;
3411
3412 if (!iter)
3413 return;
3414
3415 cpu_buffer = iter->cpu_buffer;
3299 3416
3300 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3417 spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3301 arch_spin_lock(&cpu_buffer->lock); 3418 arch_spin_lock(&cpu_buffer->lock);
3302 rb_iter_reset(iter); 3419 rb_iter_reset(iter);
3303 arch_spin_unlock(&cpu_buffer->lock); 3420 arch_spin_unlock(&cpu_buffer->lock);
3304 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3421 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3305
3306 return iter;
3307} 3422}
3308EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3423EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3309 3424
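Splitting the old ring_buffer_read_start() into prepare/sync/start lets a caller set up iterators for every cpu and pay the synchronize_sched() grace period once rather than per cpu. Illustrative usage (the buffer variable, the fixed-size array and the for_each_tracing_cpu() iterator are assumptions borrowed from how trace.c drives this API):

	struct ring_buffer_iter *iter[NR_CPUS];
	int cpu;

	for_each_tracing_cpu(cpu)
		iter[cpu] = ring_buffer_read_prepare(buffer, cpu);

	ring_buffer_read_prepare_sync();	/* one grace period covers all cpus */

	for_each_tracing_cpu(cpu)
		ring_buffer_read_start(iter[cpu]);

	/* ... read via ring_buffer_iter_peek()/ring_buffer_read(), then call
	 * ring_buffer_read_finish(iter[cpu]) for each cpu when done. */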
@@ -3397,6 +3512,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3397 cpu_buffer->write_stamp = 0; 3512 cpu_buffer->write_stamp = 0;
3398 cpu_buffer->read_stamp = 0; 3513 cpu_buffer->read_stamp = 0;
3399 3514
3515 cpu_buffer->lost_events = 0;
3516 cpu_buffer->last_overrun = 0;
3517
3400 rb_head_page_activate(cpu_buffer); 3518 rb_head_page_activate(cpu_buffer);
3401} 3519}
3402 3520
@@ -3672,6 +3790,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3672 struct ring_buffer_event *event; 3790 struct ring_buffer_event *event;
3673 struct buffer_data_page *bpage; 3791 struct buffer_data_page *bpage;
3674 struct buffer_page *reader; 3792 struct buffer_page *reader;
3793 unsigned long missed_events;
3675 unsigned long flags; 3794 unsigned long flags;
3676 unsigned int commit; 3795 unsigned int commit;
3677 unsigned int read; 3796 unsigned int read;
@@ -3708,6 +3827,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3708 read = reader->read; 3827 read = reader->read;
3709 commit = rb_page_commit(reader); 3828 commit = rb_page_commit(reader);
3710 3829
3830 /* Check if any events were dropped */
3831 missed_events = cpu_buffer->lost_events;
3832
3711 /* 3833 /*
3712 * If this page has been partially read or 3834 * If this page has been partially read or
3713 * if len is not big enough to read the rest of the page or 3835 * if len is not big enough to read the rest of the page or
@@ -3768,9 +3890,42 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3768 local_set(&reader->entries, 0); 3890 local_set(&reader->entries, 0);
3769 reader->read = 0; 3891 reader->read = 0;
3770 *data_page = bpage; 3892 *data_page = bpage;
3893
3894 /*
3895 * Use the real_end for the data size,
3896 * This gives us a chance to store the lost events
3897 * on the page.
3898 */
3899 if (reader->real_end)
3900 local_set(&bpage->commit, reader->real_end);
3771 } 3901 }
3772 ret = read; 3902 ret = read;
3773 3903
3904 cpu_buffer->lost_events = 0;
3905
3906 commit = local_read(&bpage->commit);
3907 /*
3908 * Set a flag in the commit field if we lost events
3909 */
3910 if (missed_events) {
3911 /* If there is room at the end of the page to save the
3912 * missed events, then record it there.
3913 */
3914 if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
3915 memcpy(&bpage->data[commit], &missed_events,
3916 sizeof(missed_events));
3917 local_add(RB_MISSED_STORED, &bpage->commit);
3918 commit += sizeof(missed_events);
3919 }
3920 local_add(RB_MISSED_EVENTS, &bpage->commit);
3921 }
3922
3923 /*
3924 * This page may be off to user land. Zero it out here.
3925 */
3926 if (commit < BUF_PAGE_SIZE)
3927 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3928
3774 out_unlock: 3929 out_unlock:
3775 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3930 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3776 3931
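On the read-page path the drop count travels inside the page itself, so a reader of the returned page would decode it roughly as below (a sketch that assumes the bit layout noted earlier; the RB_MISSED_* macros are private to ring_buffer.c):

	unsigned long commit = local_read(&bpage->commit);
	unsigned long len = commit & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);

	if (commit & RB_MISSED_STORED) {
		unsigned long missed;

		memcpy(&missed, &bpage->data[len], sizeof(missed));
		/* 'missed' events were lost before the data in this page */
	} else if (commit & RB_MISSED_EVENTS) {
		/* events were lost, but there was no room to store the count */
	}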
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index df74c7982255..302f8a614635 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -81,7 +81,7 @@ static enum event_status read_event(int cpu)
81 int *entry; 81 int *entry;
82 u64 ts; 82 u64 ts;
83 83
84 event = ring_buffer_consume(buffer, cpu, &ts); 84 event = ring_buffer_consume(buffer, cpu, &ts, NULL);
85 if (!event) 85 if (!event)
86 return EVENT_DROPPED; 86 return EVENT_DROPPED;
87 87
@@ -113,7 +113,8 @@ static enum event_status read_page(int cpu)
113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); 113 ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
114 if (ret >= 0) { 114 if (ret >= 0) {
115 rpage = bpage; 115 rpage = bpage;
116 commit = local_read(&rpage->commit); 116 /* The commit may have missed event flags set, clear them */
117 commit = local_read(&rpage->commit) & 0xfffff;
117 for (i = 0; i < commit && !kill_test; i += inc) { 118 for (i = 0; i < commit && !kill_test; i += inc) {
118 119
119 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { 120 if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
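The 0xfffff mask keeps only the low 20 bits of the commit word, which is enough because a page's commit count is far smaller than 2^20 and the mask also clears the two new flag bits; the benchmark cannot name those bits because RB_MISSED_EVENTS and RB_MISSED_STORED are private to ring_buffer.c, otherwise the line could equivalently strip just the flags:

	commit = local_read(&rpage->commit) & ~(RB_MISSED_EVENTS | RB_MISSED_STORED);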
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3ec2ee6f6560..086d36316805 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -33,10 +33,10 @@
33#include <linux/kdebug.h> 33#include <linux/kdebug.h>
34#include <linux/string.h> 34#include <linux/string.h>
35#include <linux/rwsem.h> 35#include <linux/rwsem.h>
36#include <linux/slab.h>
36#include <linux/ctype.h> 37#include <linux/ctype.h>
37#include <linux/init.h> 38#include <linux/init.h>
38#include <linux/poll.h> 39#include <linux/poll.h>
39#include <linux/gfp.h>
40#include <linux/fs.h> 40#include <linux/fs.h>
41 41
42#include "trace.h" 42#include "trace.h"
@@ -117,9 +117,12 @@ static cpumask_var_t __read_mostly tracing_buffer_mask;
117 * 117 *
118 * It is default off, but you can enable it with either specifying 118 * It is default off, but you can enable it with either specifying
119 * "ftrace_dump_on_oops" in the kernel command line, or setting 119 * "ftrace_dump_on_oops" in the kernel command line, or setting
120 * /proc/sys/kernel/ftrace_dump_on_oops to true. 120 * /proc/sys/kernel/ftrace_dump_on_oops
121 * Set 1 if you want to dump buffers of all CPUs
122 * Set 2 if you want to dump the buffer of the CPU that triggered oops
121 */ 123 */
122int ftrace_dump_on_oops; 124
125enum ftrace_dump_mode ftrace_dump_on_oops;
123 126
124static int tracing_set_tracer(const char *buf); 127static int tracing_set_tracer(const char *buf);
125 128
@@ -139,8 +142,17 @@ __setup("ftrace=", set_cmdline_ftrace);
139 142
140static int __init set_ftrace_dump_on_oops(char *str) 143static int __init set_ftrace_dump_on_oops(char *str)
141{ 144{
142 ftrace_dump_on_oops = 1; 145 if (*str++ != '=' || !*str) {
143 return 1; 146 ftrace_dump_on_oops = DUMP_ALL;
147 return 1;
148 }
149
150 if (!strcmp("orig_cpu", str)) {
151 ftrace_dump_on_oops = DUMP_ORIG;
152 return 1;
153 }
154
155 return 0;
144} 156}
145__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 157__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
146 158
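For reference, the command-line spellings accepted by the parser above and the mode each selects (my summary, not text from the patch):

	/* ftrace_dump_on_oops            -> DUMP_ALL  (dump every cpu's buffer)
	 * ftrace_dump_on_oops=           -> DUMP_ALL  (empty value, same as above)
	 * ftrace_dump_on_oops=orig_cpu   -> DUMP_ORIG (only the cpu that oopsed)
	 * ftrace_dump_on_oops=<anything else> is rejected (the parser returns 0)
	 */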
@@ -1545,7 +1557,8 @@ static void trace_iterator_increment(struct trace_iterator *iter)
1545} 1557}
1546 1558
1547static struct trace_entry * 1559static struct trace_entry *
1548peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts) 1560peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
1561 unsigned long *lost_events)
1549{ 1562{
1550 struct ring_buffer_event *event; 1563 struct ring_buffer_event *event;
1551 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu]; 1564 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
@@ -1556,7 +1569,8 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1556 if (buf_iter) 1569 if (buf_iter)
1557 event = ring_buffer_iter_peek(buf_iter, ts); 1570 event = ring_buffer_iter_peek(buf_iter, ts);
1558 else 1571 else
1559 event = ring_buffer_peek(iter->tr->buffer, cpu, ts); 1572 event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
1573 lost_events);
1560 1574
1561 ftrace_enable_cpu(); 1575 ftrace_enable_cpu();
1562 1576
@@ -1564,10 +1578,12 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1564} 1578}
1565 1579
1566static struct trace_entry * 1580static struct trace_entry *
1567__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts) 1581__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
1582 unsigned long *missing_events, u64 *ent_ts)
1568{ 1583{
1569 struct ring_buffer *buffer = iter->tr->buffer; 1584 struct ring_buffer *buffer = iter->tr->buffer;
1570 struct trace_entry *ent, *next = NULL; 1585 struct trace_entry *ent, *next = NULL;
1586 unsigned long lost_events = 0, next_lost = 0;
1571 int cpu_file = iter->cpu_file; 1587 int cpu_file = iter->cpu_file;
1572 u64 next_ts = 0, ts; 1588 u64 next_ts = 0, ts;
1573 int next_cpu = -1; 1589 int next_cpu = -1;
@@ -1580,7 +1596,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1580 if (cpu_file > TRACE_PIPE_ALL_CPU) { 1596 if (cpu_file > TRACE_PIPE_ALL_CPU) {
1581 if (ring_buffer_empty_cpu(buffer, cpu_file)) 1597 if (ring_buffer_empty_cpu(buffer, cpu_file))
1582 return NULL; 1598 return NULL;
1583 ent = peek_next_entry(iter, cpu_file, ent_ts); 1599 ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
1584 if (ent_cpu) 1600 if (ent_cpu)
1585 *ent_cpu = cpu_file; 1601 *ent_cpu = cpu_file;
1586 1602
@@ -1592,7 +1608,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1592 if (ring_buffer_empty_cpu(buffer, cpu)) 1608 if (ring_buffer_empty_cpu(buffer, cpu))
1593 continue; 1609 continue;
1594 1610
1595 ent = peek_next_entry(iter, cpu, &ts); 1611 ent = peek_next_entry(iter, cpu, &ts, &lost_events);
1596 1612
1597 /* 1613 /*
1598 * Pick the entry with the smallest timestamp: 1614 * Pick the entry with the smallest timestamp:
@@ -1601,6 +1617,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1601 next = ent; 1617 next = ent;
1602 next_cpu = cpu; 1618 next_cpu = cpu;
1603 next_ts = ts; 1619 next_ts = ts;
1620 next_lost = lost_events;
1604 } 1621 }
1605 } 1622 }
1606 1623
@@ -1610,6 +1627,9 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1610 if (ent_ts) 1627 if (ent_ts)
1611 *ent_ts = next_ts; 1628 *ent_ts = next_ts;
1612 1629
1630 if (missing_events)
1631 *missing_events = next_lost;
1632
1613 return next; 1633 return next;
1614} 1634}
1615 1635
@@ -1617,13 +1637,14 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1617struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, 1637struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
1618 int *ent_cpu, u64 *ent_ts) 1638 int *ent_cpu, u64 *ent_ts)
1619{ 1639{
1620 return __find_next_entry(iter, ent_cpu, ent_ts); 1640 return __find_next_entry(iter, ent_cpu, NULL, ent_ts);
1621} 1641}
1622 1642
1623/* Find the next real entry, and increment the iterator to the next entry */ 1643/* Find the next real entry, and increment the iterator to the next entry */
1624static void *find_next_entry_inc(struct trace_iterator *iter) 1644static void *find_next_entry_inc(struct trace_iterator *iter)
1625{ 1645{
1626 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts); 1646 iter->ent = __find_next_entry(iter, &iter->cpu,
1647 &iter->lost_events, &iter->ts);
1627 1648
1628 if (iter->ent) 1649 if (iter->ent)
1629 trace_iterator_increment(iter); 1650 trace_iterator_increment(iter);
@@ -1635,7 +1656,8 @@ static void trace_consume(struct trace_iterator *iter)
1635{ 1656{
1636 /* Don't allow ftrace to trace into the ring buffers */ 1657 /* Don't allow ftrace to trace into the ring buffers */
1637 ftrace_disable_cpu(); 1658 ftrace_disable_cpu();
1638 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts); 1659 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
1660 &iter->lost_events);
1639 ftrace_enable_cpu(); 1661 ftrace_enable_cpu();
1640} 1662}
1641 1663
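
ring_buffer_peek()/ring_buffer_consume() grow a lost_events out-parameter in these hunks, and the iterator carries it as iter->lost_events so the output path can report overwritten entries. A small stand-alone sketch of the calling convention, with the ring buffer replaced by a stub (the stub and its numbers are purely illustrative):

#include <stdio.h>

struct entry { int type; };

static struct entry sample = { 1 };

/* Stub standing in for ring_buffer_peek(): reports how many events were
 * overwritten since the last read, but only if the caller asked for it. */
static struct entry *peek(int cpu, unsigned long *lost_events)
{
        if (lost_events)
                *lost_events = 3;
        return &sample;
}

int main(void)
{
        unsigned long lost = 0;

        /* find_next_entry_inc() now passes &iter->lost_events ... */
        if (peek(0, &lost) && lost)
                printf("CPU:0 [LOST %lu EVENTS]\n", lost);

        /* ... while trace_find_next_entry() simply passes NULL. */
        peek(0, NULL);
        return 0;
}
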
@@ -1786,7 +1808,7 @@ static void print_func_help_header(struct seq_file *m)
1786} 1808}
1787 1809
1788 1810
1789static void 1811void
1790print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1812print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1791{ 1813{
1792 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1814 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1914,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1914 } 1936 }
1915 1937
1916 if (event) 1938 if (event)
1917 return event->trace(iter, sym_flags); 1939 return event->funcs->trace(iter, sym_flags, event);
1918 1940
1919 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) 1941 if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
1920 goto partial; 1942 goto partial;
@@ -1940,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1940 1962
1941 event = ftrace_find_event(entry->type); 1963 event = ftrace_find_event(entry->type);
1942 if (event) 1964 if (event)
1943 return event->raw(iter, 0); 1965 return event->funcs->raw(iter, 0, event);
1944 1966
1945 if (!trace_seq_printf(s, "%d ?\n", entry->type)) 1967 if (!trace_seq_printf(s, "%d ?\n", entry->type))
1946 goto partial; 1968 goto partial;
@@ -1967,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1967 1989
1968 event = ftrace_find_event(entry->type); 1990 event = ftrace_find_event(entry->type);
1969 if (event) { 1991 if (event) {
1970 enum print_line_t ret = event->hex(iter, 0); 1992 enum print_line_t ret = event->funcs->hex(iter, 0, event);
1971 if (ret != TRACE_TYPE_HANDLED) 1993 if (ret != TRACE_TYPE_HANDLED)
1972 return ret; 1994 return ret;
1973 } 1995 }
@@ -1992,10 +2014,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1992 } 2014 }
1993 2015
1994 event = ftrace_find_event(entry->type); 2016 event = ftrace_find_event(entry->type);
1995 return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; 2017 return event ? event->funcs->binary(iter, 0, event) :
2018 TRACE_TYPE_HANDLED;
1996} 2019}
1997 2020
1998static int trace_empty(struct trace_iterator *iter) 2021int trace_empty(struct trace_iterator *iter)
1999{ 2022{
2000 int cpu; 2023 int cpu;
2001 2024
@@ -2030,6 +2053,10 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2030{ 2053{
2031 enum print_line_t ret; 2054 enum print_line_t ret;
2032 2055
2056 if (iter->lost_events)
2057 trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
2058 iter->cpu, iter->lost_events);
2059
2033 if (iter->trace && iter->trace->print_line) { 2060 if (iter->trace && iter->trace->print_line) {
2034 ret = iter->trace->print_line(iter); 2061 ret = iter->trace->print_line(iter);
2035 if (ret != TRACE_TYPE_UNHANDLED) 2062 if (ret != TRACE_TYPE_UNHANDLED)
@@ -2058,6 +2085,23 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
2058 return print_trace_fmt(iter); 2085 return print_trace_fmt(iter);
2059} 2086}
2060 2087
2088void trace_default_header(struct seq_file *m)
2089{
2090 struct trace_iterator *iter = m->private;
2091
2092 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
2093 /* print nothing if the buffers are empty */
2094 if (trace_empty(iter))
2095 return;
2096 print_trace_header(m, iter);
2097 if (!(trace_flags & TRACE_ITER_VERBOSE))
2098 print_lat_help_header(m);
2099 } else {
2100 if (!(trace_flags & TRACE_ITER_VERBOSE))
2101 print_func_help_header(m);
2102 }
2103}
2104
2061static int s_show(struct seq_file *m, void *v) 2105static int s_show(struct seq_file *m, void *v)
2062{ 2106{
2063 struct trace_iterator *iter = v; 2107 struct trace_iterator *iter = v;
@@ -2070,17 +2114,9 @@ static int s_show(struct seq_file *m, void *v)
2070 } 2114 }
2071 if (iter->trace && iter->trace->print_header) 2115 if (iter->trace && iter->trace->print_header)
2072 iter->trace->print_header(m); 2116 iter->trace->print_header(m);
2073 else if (iter->iter_flags & TRACE_FILE_LAT_FMT) { 2117 else
2074 /* print nothing if the buffers are empty */ 2118 trace_default_header(m);
2075 if (trace_empty(iter)) 2119
2076 return 0;
2077 print_trace_header(m, iter);
2078 if (!(trace_flags & TRACE_ITER_VERBOSE))
2079 print_lat_help_header(m);
2080 } else {
2081 if (!(trace_flags & TRACE_ITER_VERBOSE))
2082 print_func_help_header(m);
2083 }
2084 } else if (iter->leftover) { 2120 } else if (iter->leftover) {
2085 /* 2121 /*
2086 * If we filled the seq_file buffer earlier, we 2122 * If we filled the seq_file buffer earlier, we
@@ -2166,15 +2202,20 @@ __tracing_open(struct inode *inode, struct file *file)
2166 2202
2167 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2203 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2168 for_each_tracing_cpu(cpu) { 2204 for_each_tracing_cpu(cpu) {
2169
2170 iter->buffer_iter[cpu] = 2205 iter->buffer_iter[cpu] =
2171 ring_buffer_read_start(iter->tr->buffer, cpu); 2206 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2207 }
2208 ring_buffer_read_prepare_sync();
2209 for_each_tracing_cpu(cpu) {
2210 ring_buffer_read_start(iter->buffer_iter[cpu]);
2172 tracing_iter_reset(iter, cpu); 2211 tracing_iter_reset(iter, cpu);
2173 } 2212 }
2174 } else { 2213 } else {
2175 cpu = iter->cpu_file; 2214 cpu = iter->cpu_file;
2176 iter->buffer_iter[cpu] = 2215 iter->buffer_iter[cpu] =
2177 ring_buffer_read_start(iter->tr->buffer, cpu); 2216 ring_buffer_read_prepare(iter->tr->buffer, cpu);
2217 ring_buffer_read_prepare_sync();
2218 ring_buffer_read_start(iter->buffer_iter[cpu]);
2178 tracing_iter_reset(iter, cpu); 2219 tracing_iter_reset(iter, cpu);
2179 } 2220 }
2180 2221
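
__tracing_open() now splits iterator creation into ring_buffer_read_prepare(), a single ring_buffer_read_prepare_sync(), and ring_buffer_read_start(), presumably so the expensive synchronization is paid once for all CPUs rather than once per CPU. Condensed from the hunk above (kernel-context fragment, not a stand-alone unit):

        /* 1) allocate and prepare one iterator per CPU */
        for_each_tracing_cpu(cpu)
                iter->buffer_iter[cpu] =
                        ring_buffer_read_prepare(iter->tr->buffer, cpu);

        /* 2) one synchronization pass covers every prepared iterator */
        ring_buffer_read_prepare_sync();

        /* 3) only now start reading and reset the per-CPU state */
        for_each_tracing_cpu(cpu) {
                ring_buffer_read_start(iter->buffer_iter[cpu]);
                tracing_iter_reset(iter, cpu);
        }
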
@@ -3269,12 +3310,12 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3269 size_t len, 3310 size_t len,
3270 unsigned int flags) 3311 unsigned int flags)
3271{ 3312{
3272 struct page *pages[PIPE_BUFFERS]; 3313 struct page *pages_def[PIPE_DEF_BUFFERS];
3273 struct partial_page partial[PIPE_BUFFERS]; 3314 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3274 struct trace_iterator *iter = filp->private_data; 3315 struct trace_iterator *iter = filp->private_data;
3275 struct splice_pipe_desc spd = { 3316 struct splice_pipe_desc spd = {
3276 .pages = pages, 3317 .pages = pages_def,
3277 .partial = partial, 3318 .partial = partial_def,
3278 .nr_pages = 0, /* This gets updated below. */ 3319 .nr_pages = 0, /* This gets updated below. */
3279 .flags = flags, 3320 .flags = flags,
3280 .ops = &tracing_pipe_buf_ops, 3321 .ops = &tracing_pipe_buf_ops,
@@ -3285,6 +3326,9 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3285 size_t rem; 3326 size_t rem;
3286 unsigned int i; 3327 unsigned int i;
3287 3328
3329 if (splice_grow_spd(pipe, &spd))
3330 return -ENOMEM;
3331
3288 /* copy the tracer to avoid using a global lock all around */ 3332 /* copy the tracer to avoid using a global lock all around */
3289 mutex_lock(&trace_types_lock); 3333 mutex_lock(&trace_types_lock);
3290 if (unlikely(old_tracer != current_trace && current_trace)) { 3334 if (unlikely(old_tracer != current_trace && current_trace)) {
@@ -3315,23 +3359,23 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3315 trace_access_lock(iter->cpu_file); 3359 trace_access_lock(iter->cpu_file);
3316 3360
3317 /* Fill as many pages as possible. */ 3361 /* Fill as many pages as possible. */
3318 for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { 3362 for (i = 0, rem = len; i < pipe->buffers && rem; i++) {
3319 pages[i] = alloc_page(GFP_KERNEL); 3363 spd.pages[i] = alloc_page(GFP_KERNEL);
3320 if (!pages[i]) 3364 if (!spd.pages[i])
3321 break; 3365 break;
3322 3366
3323 rem = tracing_fill_pipe_page(rem, iter); 3367 rem = tracing_fill_pipe_page(rem, iter);
3324 3368
3325 /* Copy the data into the page, so we can start over. */ 3369 /* Copy the data into the page, so we can start over. */
3326 ret = trace_seq_to_buffer(&iter->seq, 3370 ret = trace_seq_to_buffer(&iter->seq,
3327 page_address(pages[i]), 3371 page_address(spd.pages[i]),
3328 iter->seq.len); 3372 iter->seq.len);
3329 if (ret < 0) { 3373 if (ret < 0) {
3330 __free_page(pages[i]); 3374 __free_page(spd.pages[i]);
3331 break; 3375 break;
3332 } 3376 }
3333 partial[i].offset = 0; 3377 spd.partial[i].offset = 0;
3334 partial[i].len = iter->seq.len; 3378 spd.partial[i].len = iter->seq.len;
3335 3379
3336 trace_seq_init(&iter->seq); 3380 trace_seq_init(&iter->seq);
3337 } 3381 }
@@ -3342,12 +3386,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3342 3386
3343 spd.nr_pages = i; 3387 spd.nr_pages = i;
3344 3388
3345 return splice_to_pipe(pipe, &spd); 3389 ret = splice_to_pipe(pipe, &spd);
3390out:
3391 splice_shrink_spd(pipe, &spd);
3392 return ret;
3346 3393
3347out_err: 3394out_err:
3348 mutex_unlock(&iter->mutex); 3395 mutex_unlock(&iter->mutex);
3349 3396 goto out;
3350 return ret;
3351} 3397}
3352 3398
3353static ssize_t 3399static ssize_t
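
Both splice paths stop sizing their page arrays with the fixed PIPE_BUFFERS constant: they start from PIPE_DEF_BUFFERS on-stack arrays, call splice_grow_spd() to cover pipes that were resized larger, index everything through spd, and release with splice_shrink_spd() on every exit path. A simplified user-space analogue of that grow/shrink pairing (names and sizes here are illustrative, not the kernel API):

#include <stdio.h>
#include <stdlib.h>

#define DEF_BUFFERS 16                  /* small on-stack default */

struct spd {
        void **pages;                   /* starts out pointing at the stack array */
        unsigned int nr;
};

/* Grow to the pipe's real slot count when it exceeds the default. */
static int grow_spd(struct spd *spd, unsigned int pipe_buffers)
{
        if (pipe_buffers <= DEF_BUFFERS)
                return 0;
        spd->pages = calloc(pipe_buffers, sizeof(*spd->pages));
        return spd->pages ? 0 : -1;
}

/* Always paired with grow_spd(), even on error paths. */
static void shrink_spd(struct spd *spd, void **stack_pages)
{
        if (spd->pages != stack_pages)
                free(spd->pages);
}

int main(void)
{
        void *pages_def[DEF_BUFFERS];
        struct spd spd = { .pages = pages_def, .nr = 0 };
        unsigned int pipe_buffers = 64; /* pipes are no longer fixed-size */
        unsigned int i;

        if (grow_spd(&spd, pipe_buffers))
                return 1;
        for (i = 0; i < pipe_buffers; i++)      /* was: i < PIPE_BUFFERS */
                spd.pages[i] = NULL;            /* fill via spd, not the stack array */
        spd.nr = i;
        printf("filled %u slots\n", spd.nr);
        shrink_spd(&spd, pages_def);
        return 0;
}
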
@@ -3620,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3620 size_t count, loff_t *ppos) 3666 size_t count, loff_t *ppos)
3621{ 3667{
3622 struct ftrace_buffer_info *info = filp->private_data; 3668 struct ftrace_buffer_info *info = filp->private_data;
3623 unsigned int pos;
3624 ssize_t ret; 3669 ssize_t ret;
3625 size_t size; 3670 size_t size;
3626 3671
@@ -3647,11 +3692,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3647 if (ret < 0) 3692 if (ret < 0)
3648 return 0; 3693 return 0;
3649 3694
3650 pos = ring_buffer_page_len(info->spare);
3651
3652 if (pos < PAGE_SIZE)
3653 memset(info->spare + pos, 0, PAGE_SIZE - pos);
3654
3655read: 3695read:
3656 size = PAGE_SIZE - info->read; 3696 size = PAGE_SIZE - info->read;
3657 if (size > count) 3697 if (size > count)
@@ -3746,11 +3786,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3746 unsigned int flags) 3786 unsigned int flags)
3747{ 3787{
3748 struct ftrace_buffer_info *info = file->private_data; 3788 struct ftrace_buffer_info *info = file->private_data;
3749 struct partial_page partial[PIPE_BUFFERS]; 3789 struct partial_page partial_def[PIPE_DEF_BUFFERS];
3750 struct page *pages[PIPE_BUFFERS]; 3790 struct page *pages_def[PIPE_DEF_BUFFERS];
3751 struct splice_pipe_desc spd = { 3791 struct splice_pipe_desc spd = {
3752 .pages = pages, 3792 .pages = pages_def,
3753 .partial = partial, 3793 .partial = partial_def,
3754 .flags = flags, 3794 .flags = flags,
3755 .ops = &buffer_pipe_buf_ops, 3795 .ops = &buffer_pipe_buf_ops,
3756 .spd_release = buffer_spd_release, 3796 .spd_release = buffer_spd_release,
@@ -3759,22 +3799,28 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3759 int entries, size, i; 3799 int entries, size, i;
3760 size_t ret; 3800 size_t ret;
3761 3801
3802 if (splice_grow_spd(pipe, &spd))
3803 return -ENOMEM;
3804
3762 if (*ppos & (PAGE_SIZE - 1)) { 3805 if (*ppos & (PAGE_SIZE - 1)) {
3763 WARN_ONCE(1, "Ftrace: previous read must page-align\n"); 3806 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3764 return -EINVAL; 3807 ret = -EINVAL;
3808 goto out;
3765 } 3809 }
3766 3810
3767 if (len & (PAGE_SIZE - 1)) { 3811 if (len & (PAGE_SIZE - 1)) {
3768 WARN_ONCE(1, "Ftrace: splice_read should page-align\n"); 3812 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3769 if (len < PAGE_SIZE) 3813 if (len < PAGE_SIZE) {
3770 return -EINVAL; 3814 ret = -EINVAL;
3815 goto out;
3816 }
3771 len &= PAGE_MASK; 3817 len &= PAGE_MASK;
3772 } 3818 }
3773 3819
3774 trace_access_lock(info->cpu); 3820 trace_access_lock(info->cpu);
3775 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); 3821 entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
3776 3822
3777 for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { 3823 for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
3778 struct page *page; 3824 struct page *page;
3779 int r; 3825 int r;
3780 3826
@@ -3829,11 +3875,12 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3829 else 3875 else
3830 ret = 0; 3876 ret = 0;
3831 /* TODO: block */ 3877 /* TODO: block */
3832 return ret; 3878 goto out;
3833 } 3879 }
3834 3880
3835 ret = splice_to_pipe(pipe, &spd); 3881 ret = splice_to_pipe(pipe, &spd);
3836 3882 splice_shrink_spd(pipe, &spd);
3883out:
3837 return ret; 3884 return ret;
3838} 3885}
3839 3886
@@ -4324,7 +4371,7 @@ static int trace_panic_handler(struct notifier_block *this,
4324 unsigned long event, void *unused) 4371 unsigned long event, void *unused)
4325{ 4372{
4326 if (ftrace_dump_on_oops) 4373 if (ftrace_dump_on_oops)
4327 ftrace_dump(); 4374 ftrace_dump(ftrace_dump_on_oops);
4328 return NOTIFY_OK; 4375 return NOTIFY_OK;
4329} 4376}
4330 4377
@@ -4341,7 +4388,7 @@ static int trace_die_handler(struct notifier_block *self,
4341 switch (val) { 4388 switch (val) {
4342 case DIE_OOPS: 4389 case DIE_OOPS:
4343 if (ftrace_dump_on_oops) 4390 if (ftrace_dump_on_oops)
4344 ftrace_dump(); 4391 ftrace_dump(ftrace_dump_on_oops);
4345 break; 4392 break;
4346 default: 4393 default:
4347 break; 4394 break;
@@ -4382,7 +4429,8 @@ trace_printk_seq(struct trace_seq *s)
4382 trace_seq_init(s); 4429 trace_seq_init(s);
4383} 4430}
4384 4431
4385static void __ftrace_dump(bool disable_tracing) 4432static void
4433__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4386{ 4434{
4387 static arch_spinlock_t ftrace_dump_lock = 4435 static arch_spinlock_t ftrace_dump_lock =
4388 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 4436 (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
@@ -4415,12 +4463,25 @@ static void __ftrace_dump(bool disable_tracing)
4415 /* don't look at user memory in panic mode */ 4463 /* don't look at user memory in panic mode */
4416 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 4464 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
4417 4465
4418 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4419
4420 /* Simulate the iterator */ 4466 /* Simulate the iterator */
4421 iter.tr = &global_trace; 4467 iter.tr = &global_trace;
4422 iter.trace = current_trace; 4468 iter.trace = current_trace;
4423 iter.cpu_file = TRACE_PIPE_ALL_CPU; 4469
4470 switch (oops_dump_mode) {
4471 case DUMP_ALL:
4472 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4473 break;
4474 case DUMP_ORIG:
4475 iter.cpu_file = raw_smp_processor_id();
4476 break;
4477 case DUMP_NONE:
4478 goto out_enable;
4479 default:
4480 printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
4481 iter.cpu_file = TRACE_PIPE_ALL_CPU;
4482 }
4483
4484 printk(KERN_TRACE "Dumping ftrace buffer:\n");
4424 4485
4425 /* 4486 /*
4426 * We need to stop all tracing on all CPUS to read the 4487 * We need to stop all tracing on all CPUS to read the
@@ -4459,6 +4520,7 @@ static void __ftrace_dump(bool disable_tracing)
4459 else 4520 else
4460 printk(KERN_TRACE "---------------------------------\n"); 4521 printk(KERN_TRACE "---------------------------------\n");
4461 4522
4523 out_enable:
4462 /* Re-enable tracing if requested */ 4524 /* Re-enable tracing if requested */
4463 if (!disable_tracing) { 4525 if (!disable_tracing) {
4464 trace_flags |= old_userobj; 4526 trace_flags |= old_userobj;
@@ -4475,9 +4537,9 @@ static void __ftrace_dump(bool disable_tracing)
4475} 4537}
4476 4538
4477/* By default: disable tracing after the dump */ 4539/* By default: disable tracing after the dump */
4478void ftrace_dump(void) 4540void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4479{ 4541{
4480 __ftrace_dump(true); 4542 __ftrace_dump(true, oops_dump_mode);
4481} 4543}
4482 4544
4483__init static int tracer_alloc_buffers(void) 4545__init static int tracer_alloc_buffers(void)
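
ftrace_dump() now takes the dump mode explicitly, and __ftrace_dump() maps it onto iter.cpu_file: DUMP_ALL keeps the old every-CPU dump, DUMP_ORIG restricts it to the oopsing CPU, and DUMP_NONE bails out before printing anything. A hedged fragment of what a caller looks like after this change (the caller itself is invented for illustration):

/* Kernel-context fragment, not a stand-alone program. */
static void dump_trace_on_failure(bool only_this_cpu)
{
        /* Passing DUMP_NONE would make __ftrace_dump() return early. */
        ftrace_dump(only_this_cpu ? DUMP_ORIG : DUMP_ALL);
}
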
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2825ef2c0b15..2cd96399463f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -34,7 +34,6 @@ enum trace_type {
34 TRACE_GRAPH_RET, 34 TRACE_GRAPH_RET,
35 TRACE_GRAPH_ENT, 35 TRACE_GRAPH_ENT,
36 TRACE_USER_STACK, 36 TRACE_USER_STACK,
37 TRACE_HW_BRANCHES,
38 TRACE_KMEM_ALLOC, 37 TRACE_KMEM_ALLOC,
39 TRACE_KMEM_FREE, 38 TRACE_KMEM_FREE,
40 TRACE_BLK, 39 TRACE_BLK,
@@ -103,29 +102,17 @@ struct syscall_trace_exit {
103 long ret; 102 long ret;
104}; 103};
105 104
106struct kprobe_trace_entry { 105struct kprobe_trace_entry_head {
107 struct trace_entry ent; 106 struct trace_entry ent;
108 unsigned long ip; 107 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111}; 108};
112 109
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \ 110struct kretprobe_trace_entry_head {
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent; 111 struct trace_entry ent;
119 unsigned long func; 112 unsigned long func;
120 unsigned long ret_ip; 113 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123}; 114};
124 115
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
129/* 116/*
130 * trace_flag_type is an enumeration that holds different 117 * trace_flag_type is an enumeration that holds different
131 * states when a trace occurs. These are: 118 * states when a trace occurs. These are:
@@ -229,7 +216,6 @@ extern void __ftrace_bad_type(void);
229 TRACE_GRAPH_ENT); \ 216 TRACE_GRAPH_ENT); \
230 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ 217 IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \
231 TRACE_GRAPH_RET); \ 218 TRACE_GRAPH_RET); \
232 IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
233 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ 219 IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \
234 TRACE_KMEM_ALLOC); \ 220 TRACE_KMEM_ALLOC); \
235 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ 221 IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \
@@ -378,6 +364,9 @@ void trace_function(struct trace_array *tr,
378 unsigned long ip, 364 unsigned long ip,
379 unsigned long parent_ip, 365 unsigned long parent_ip,
380 unsigned long flags, int pc); 366 unsigned long flags, int pc);
367void trace_default_header(struct seq_file *m);
368void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
369int trace_empty(struct trace_iterator *iter);
381 370
382void trace_graph_return(struct ftrace_graph_ret *trace); 371void trace_graph_return(struct ftrace_graph_ret *trace);
383int trace_graph_entry(struct ftrace_graph_ent *trace); 372int trace_graph_entry(struct ftrace_graph_ent *trace);
@@ -416,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
416void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, 405void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
417 int pc); 406 int pc);
418#else 407#else
419static inline void ftrace_trace_stack(struct trace_array *tr, 408static inline void ftrace_trace_stack(struct ring_buffer *buffer,
420 unsigned long flags, int skip, int pc) 409 unsigned long flags, int skip, int pc)
421{ 410{
422} 411}
423 412
424static inline void ftrace_trace_userstack(struct trace_array *tr, 413static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
425 unsigned long flags, int pc) 414 unsigned long flags, int pc)
426{ 415{
427} 416}
@@ -467,8 +456,6 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
467 struct trace_array *tr); 456 struct trace_array *tr);
468extern int trace_selftest_startup_branch(struct tracer *trace, 457extern int trace_selftest_startup_branch(struct tracer *trace,
469 struct trace_array *tr); 458 struct trace_array *tr);
470extern int trace_selftest_startup_hw_branches(struct tracer *trace,
471 struct trace_array *tr);
472extern int trace_selftest_startup_ksym(struct tracer *trace, 459extern int trace_selftest_startup_ksym(struct tracer *trace,
473 struct trace_array *tr); 460 struct trace_array *tr);
474#endif /* CONFIG_FTRACE_STARTUP_TEST */ 461#endif /* CONFIG_FTRACE_STARTUP_TEST */
@@ -491,9 +478,29 @@ extern int trace_clock_id;
491 478
492/* Standard output formatting function used for function return traces */ 479/* Standard output formatting function used for function return traces */
493#ifdef CONFIG_FUNCTION_GRAPH_TRACER 480#ifdef CONFIG_FUNCTION_GRAPH_TRACER
494extern enum print_line_t print_graph_function(struct trace_iterator *iter); 481
482/* Flag options */
483#define TRACE_GRAPH_PRINT_OVERRUN 0x1
484#define TRACE_GRAPH_PRINT_CPU 0x2
485#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
486#define TRACE_GRAPH_PRINT_PROC 0x8
487#define TRACE_GRAPH_PRINT_DURATION 0x10
488#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
489
490extern enum print_line_t
491print_graph_function_flags(struct trace_iterator *iter, u32 flags);
492extern void print_graph_headers_flags(struct seq_file *s, u32 flags);
495extern enum print_line_t 493extern enum print_line_t
496trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); 494trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
495extern void graph_trace_open(struct trace_iterator *iter);
496extern void graph_trace_close(struct trace_iterator *iter);
497extern int __trace_graph_entry(struct trace_array *tr,
498 struct ftrace_graph_ent *trace,
499 unsigned long flags, int pc);
500extern void __trace_graph_return(struct trace_array *tr,
501 struct ftrace_graph_ret *trace,
502 unsigned long flags, int pc);
503
497 504
498#ifdef CONFIG_DYNAMIC_FTRACE 505#ifdef CONFIG_DYNAMIC_FTRACE
499/* TODO: make this variable */ 506/* TODO: make this variable */
@@ -524,7 +531,7 @@ static inline int ftrace_graph_addr(unsigned long addr)
524#endif /* CONFIG_DYNAMIC_FTRACE */ 531#endif /* CONFIG_DYNAMIC_FTRACE */
525#else /* CONFIG_FUNCTION_GRAPH_TRACER */ 532#else /* CONFIG_FUNCTION_GRAPH_TRACER */
526static inline enum print_line_t 533static inline enum print_line_t
527print_graph_function(struct trace_iterator *iter) 534print_graph_function_flags(struct trace_iterator *iter, u32 flags)
528{ 535{
529 return TRACE_TYPE_UNHANDLED; 536 return TRACE_TYPE_UNHANDLED;
530} 537}
@@ -771,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system,
771 struct trace_seq *s); 778 struct trace_seq *s);
772extern int filter_assign_type(const char *type); 779extern int filter_assign_type(const char *type);
773 780
781struct list_head *
782trace_get_fields(struct ftrace_event_call *event_call);
783
774static inline int 784static inline int
775filter_check_discard(struct ftrace_event_call *call, void *rec, 785filter_check_discard(struct ftrace_event_call *call, void *rec,
776 struct ring_buffer *buffer, 786 struct ring_buffer *buffer,
777 struct ring_buffer_event *event) 787 struct ring_buffer_event *event)
778{ 788{
779 if (unlikely(call->filter_active) && 789 if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) &&
780 !filter_match_preds(call->filter, rec)) { 790 !filter_match_preds(call->filter, rec)) {
781 ring_buffer_discard_commit(buffer, event); 791 ring_buffer_discard_commit(buffer, event);
782 return 1; 792 return 1;
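
In trace.h the kprobe/kretprobe records lose their inline nargs/args[] tail and the SIZEOF_* macros that went with it; only a fixed *_head remains, and the probe arguments follow as ordinary event data. A stand-alone sketch of how a record is sized under that layout (struct trace_entry is replaced by an opaque stand-in, and the helper name is made up for illustration):

#include <stdio.h>
#include <stddef.h>

struct trace_entry { char opaque[12]; };        /* stand-in, not the real layout */

/* Fixed head only, as in the new kprobe_trace_entry_head. */
struct kprobe_trace_entry_head {
        struct trace_entry ent;
        unsigned long ip;
};

/* The argument words now trail the head as dynamically sized data, so
 * the record size is computed where the event is reserved. */
static size_t kprobe_record_size(int nr_args)
{
        return sizeof(struct kprobe_trace_entry_head) +
               (size_t)nr_args * sizeof(unsigned long);
}

int main(void)
{
        printf("3-arg record: %zu bytes\n", kprobe_record_size(3));
        return 0;
}
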
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index b9bc4d470177..8d3538b4ea5f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr)
143} 143}
144 144
145static enum print_line_t trace_branch_print(struct trace_iterator *iter, 145static enum print_line_t trace_branch_print(struct trace_iterator *iter,
146 int flags) 146 int flags, struct trace_event *event)
147{ 147{
148 struct trace_branch *field; 148 struct trace_branch *field;
149 149
@@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s)
167 " |\n"); 167 " |\n");
168} 168}
169 169
170static struct trace_event_functions trace_branch_funcs = {
171 .trace = trace_branch_print,
172};
173
170static struct trace_event trace_branch_event = { 174static struct trace_event trace_branch_event = {
171 .type = TRACE_BRANCH, 175 .type = TRACE_BRANCH,
172 .trace = trace_branch_print, 176 .funcs = &trace_branch_funcs,
173}; 177};
174 178
175static struct tracer branch_trace __read_mostly = 179static struct tracer branch_trace __read_mostly =
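
trace_branch.c shows the conversion pattern applied across the output plugins: the print callbacks move into a shared struct trace_event_functions hung off .funcs, and each callback gains a struct trace_event * argument, which lets a shared handler recover its enclosing event (typically via container_of()). A stand-alone sketch of that indirection with deliberately simplified structure definitions:

#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct trace_iterator;                  /* opaque here */
struct trace_event;

struct trace_event_functions {
        int (*trace)(struct trace_iterator *iter, int flags,
                     struct trace_event *event);
};

struct trace_event {
        int type;
        struct trace_event_functions *funcs;
};

/* A wrapper object embeds struct trace_event; the new third argument
 * lets the shared handler find the wrapper again. */
struct my_event {
        const char *name;
        struct trace_event event;
};

static int my_trace(struct trace_iterator *iter, int flags,
                    struct trace_event *event)
{
        struct my_event *me = container_of(event, struct my_event, event);

        (void)iter; (void)flags;
        printf("handler for %s event\n", me->name);
        return 0;
}

static struct trace_event_functions my_funcs = { .trace = my_trace };
static struct my_event branch = {
        .name  = "branch",
        .event = { .type = 8, .funcs = &my_funcs },
};

int main(void)
{
        /* was: event->trace(iter, flags); now the event itself is passed */
        return branch.event.funcs->trace(NULL, 0, &branch.event);
}
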
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6fbfb8f417b9..9d589d8dcd1a 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -84,7 +84,7 @@ u64 notrace trace_clock_global(void)
84 int this_cpu; 84 int this_cpu;
85 u64 now; 85 u64 now;
86 86
87 raw_local_irq_save(flags); 87 local_irq_save(flags);
88 88
89 this_cpu = raw_smp_processor_id(); 89 this_cpu = raw_smp_processor_id();
90 now = cpu_clock(this_cpu); 90 now = cpu_clock(this_cpu);
@@ -110,7 +110,7 @@ u64 notrace trace_clock_global(void)
110 arch_spin_unlock(&trace_clock_struct.lock); 110 arch_spin_unlock(&trace_clock_struct.lock);
111 111
112 out: 112 out:
113 raw_local_irq_restore(flags); 113 local_irq_restore(flags);
114 114
115 return now; 115 return now;
116} 116}
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c16a08f399df..dc008c1240da 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -318,18 +318,6 @@ FTRACE_ENTRY(branch, trace_branch,
318 __entry->func, __entry->file, __entry->correct) 318 __entry->func, __entry->file, __entry->correct)
319); 319);
320 320
321FTRACE_ENTRY(hw_branch, hw_branch_entry,
322
323 TRACE_HW_BRANCHES,
324
325 F_STRUCT(
326 __field( u64, from )
327 __field( u64, to )
328 ),
329
330 F_printk("from: %llx to: %llx", __entry->from, __entry->to)
331);
332
333FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, 321FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
334 322
335 TRACE_KMEM_ALLOC, 323 TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 81f691eb3a30..8a2b73f7c068 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -9,70 +9,98 @@
9#include <linux/kprobes.h> 9#include <linux/kprobes.h>
10#include "trace.h" 10#include "trace.h"
11 11
12DEFINE_PER_CPU(struct pt_regs, perf_trace_regs);
13EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs);
14
15EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); 12EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs);
16 13
17static char *perf_trace_buf; 14static char *perf_trace_buf[4];
18static char *perf_trace_buf_nmi;
19 15
20typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ; 16/*
17 * Force it to be aligned to unsigned long to avoid misaligned accesses
 18 * surprises
19 */
20typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
21 perf_trace_t;
21 22
22/* Count the events in use (per event id, not per instance) */ 23/* Count the events in use (per event id, not per instance) */
23static int total_ref_count; 24static int total_ref_count;
24 25
25static int perf_trace_event_enable(struct ftrace_event_call *event) 26static int perf_trace_event_init(struct ftrace_event_call *tp_event,
27 struct perf_event *p_event)
26{ 28{
27 char *buf; 29 struct hlist_head *list;
28 int ret = -ENOMEM; 30 int ret = -ENOMEM;
31 int cpu;
29 32
30 if (event->perf_refcount++ > 0) 33 p_event->tp_event = tp_event;
34 if (tp_event->perf_refcount++ > 0)
31 return 0; 35 return 0;
32 36
33 if (!total_ref_count) { 37 list = alloc_percpu(struct hlist_head);
34 buf = (char *)alloc_percpu(perf_trace_t); 38 if (!list)
35 if (!buf) 39 goto fail;
36 goto fail_buf;
37 40
38 rcu_assign_pointer(perf_trace_buf, buf); 41 for_each_possible_cpu(cpu)
42 INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));
39 43
40 buf = (char *)alloc_percpu(perf_trace_t); 44 tp_event->perf_events = list;
41 if (!buf)
42 goto fail_buf_nmi;
43 45
44 rcu_assign_pointer(perf_trace_buf_nmi, buf); 46 if (!total_ref_count) {
45 } 47 char *buf;
48 int i;
46 49
47 ret = event->perf_event_enable(event); 50 for (i = 0; i < 4; i++) {
48 if (!ret) { 51 buf = (char *)alloc_percpu(perf_trace_t);
49 total_ref_count++; 52 if (!buf)
50 return 0; 53 goto fail;
54
55 perf_trace_buf[i] = buf;
56 }
51 } 57 }
52 58
53fail_buf_nmi: 59 if (tp_event->class->reg)
60 ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER);
61 else
62 ret = tracepoint_probe_register(tp_event->name,
63 tp_event->class->perf_probe,
64 tp_event);
65
66 if (ret)
67 goto fail;
68
69 total_ref_count++;
70 return 0;
71
72fail:
54 if (!total_ref_count) { 73 if (!total_ref_count) {
55 free_percpu(perf_trace_buf_nmi); 74 int i;
56 free_percpu(perf_trace_buf); 75
57 perf_trace_buf_nmi = NULL; 76 for (i = 0; i < 4; i++) {
58 perf_trace_buf = NULL; 77 free_percpu(perf_trace_buf[i]);
78 perf_trace_buf[i] = NULL;
79 }
80 }
81
82 if (!--tp_event->perf_refcount) {
83 free_percpu(tp_event->perf_events);
84 tp_event->perf_events = NULL;
59 } 85 }
60fail_buf:
61 event->perf_refcount--;
62 86
63 return ret; 87 return ret;
64} 88}
65 89
66int perf_trace_enable(int event_id) 90int perf_trace_init(struct perf_event *p_event)
67{ 91{
68 struct ftrace_event_call *event; 92 struct ftrace_event_call *tp_event;
93 int event_id = p_event->attr.config;
69 int ret = -EINVAL; 94 int ret = -EINVAL;
70 95
71 mutex_lock(&event_mutex); 96 mutex_lock(&event_mutex);
72 list_for_each_entry(event, &ftrace_events, list) { 97 list_for_each_entry(tp_event, &ftrace_events, list) {
73 if (event->id == event_id && event->perf_event_enable && 98 if (tp_event->event.type == event_id &&
74 try_module_get(event->mod)) { 99 tp_event->class &&
75 ret = perf_trace_event_enable(event); 100 (tp_event->class->perf_probe ||
101 tp_event->class->reg) &&
102 try_module_get(tp_event->mod)) {
103 ret = perf_trace_event_init(tp_event, p_event);
76 break; 104 break;
77 } 105 }
78 } 106 }
@@ -81,88 +109,87 @@ int perf_trace_enable(int event_id)
81 return ret; 109 return ret;
82} 110}
83 111
84static void perf_trace_event_disable(struct ftrace_event_call *event) 112int perf_trace_enable(struct perf_event *p_event)
85{ 113{
86 char *buf, *nmi_buf; 114 struct ftrace_event_call *tp_event = p_event->tp_event;
87 115 struct hlist_head *list;
88 if (--event->perf_refcount > 0)
89 return;
90 116
91 event->perf_event_disable(event); 117 list = tp_event->perf_events;
118 if (WARN_ON_ONCE(!list))
119 return -EINVAL;
92 120
93 if (!--total_ref_count) { 121 list = this_cpu_ptr(list);
94 buf = perf_trace_buf; 122 hlist_add_head_rcu(&p_event->hlist_entry, list);
95 rcu_assign_pointer(perf_trace_buf, NULL);
96
97 nmi_buf = perf_trace_buf_nmi;
98 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
99 123
100 /* 124 return 0;
101 * Ensure every events in profiling have finished before 125}
102 * releasing the buffers
103 */
104 synchronize_sched();
105 126
106 free_percpu(buf); 127void perf_trace_disable(struct perf_event *p_event)
107 free_percpu(nmi_buf); 128{
108 } 129 hlist_del_rcu(&p_event->hlist_entry);
109} 130}
110 131
111void perf_trace_disable(int event_id) 132void perf_trace_destroy(struct perf_event *p_event)
112{ 133{
113 struct ftrace_event_call *event; 134 struct ftrace_event_call *tp_event = p_event->tp_event;
135 int i;
114 136
115 mutex_lock(&event_mutex); 137 mutex_lock(&event_mutex);
116 list_for_each_entry(event, &ftrace_events, list) { 138 if (--tp_event->perf_refcount > 0)
117 if (event->id == event_id) { 139 goto out;
118 perf_trace_event_disable(event); 140
119 module_put(event->mod); 141 if (tp_event->class->reg)
120 break; 142 tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER);
143 else
144 tracepoint_probe_unregister(tp_event->name,
145 tp_event->class->perf_probe,
146 tp_event);
147
148 /*
149 * Ensure our callback won't be called anymore. See
150 * tracepoint_probe_unregister() and __DO_TRACE().
151 */
152 synchronize_sched();
153
154 free_percpu(tp_event->perf_events);
155 tp_event->perf_events = NULL;
156
157 if (!--total_ref_count) {
158 for (i = 0; i < 4; i++) {
159 free_percpu(perf_trace_buf[i]);
160 perf_trace_buf[i] = NULL;
121 } 161 }
122 } 162 }
163out:
123 mutex_unlock(&event_mutex); 164 mutex_unlock(&event_mutex);
124} 165}
125 166
126__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, 167__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
127 int *rctxp, unsigned long *irq_flags) 168 struct pt_regs *regs, int *rctxp)
128{ 169{
129 struct trace_entry *entry; 170 struct trace_entry *entry;
130 char *trace_buf, *raw_data; 171 unsigned long flags;
131 int pc, cpu; 172 char *raw_data;
173 int pc;
132 174
133 pc = preempt_count(); 175 BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));
134 176
135 /* Protect the per cpu buffer, begin the rcu read side */ 177 pc = preempt_count();
136 local_irq_save(*irq_flags);
137 178
138 *rctxp = perf_swevent_get_recursion_context(); 179 *rctxp = perf_swevent_get_recursion_context();
139 if (*rctxp < 0) 180 if (*rctxp < 0)
140 goto err_recursion; 181 return NULL;
141
142 cpu = smp_processor_id();
143
144 if (in_nmi())
145 trace_buf = rcu_dereference_sched(perf_trace_buf_nmi);
146 else
147 trace_buf = rcu_dereference_sched(perf_trace_buf);
148
149 if (!trace_buf)
150 goto err;
151 182
152 raw_data = per_cpu_ptr(trace_buf, cpu); 183 raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]);
153 184
154 /* zero the dead bytes from align to not leak stack to user */ 185 /* zero the dead bytes from align to not leak stack to user */
155 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 186 memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
156 187
157 entry = (struct trace_entry *)raw_data; 188 entry = (struct trace_entry *)raw_data;
158 tracing_generic_entry_update(entry, *irq_flags, pc); 189 local_save_flags(flags);
190 tracing_generic_entry_update(entry, flags, pc);
159 entry->type = type; 191 entry->type = type;
160 192
161 return raw_data; 193 return raw_data;
162err:
163 perf_swevent_put_recursion_context(*rctxp);
164err_recursion:
165 local_irq_restore(*irq_flags);
166 return NULL;
167} 194}
168EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); 195EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
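
perf_trace_buf_prepare() drops the irq_flags dance and the separate NMI buffer: there are now four per-cpu buffers, one per recursion context, indexed by the value perf_swevent_get_recursion_context() returns, and events hang on per-cpu hlists instead of being looked up by id each time. A stand-alone sketch of the buffer-indexing idea (the context numbering and the stub functions are illustrative):

#include <stdio.h>

#define PERF_MAX_TRACE_SIZE 2048
#define NR_CONTEXTS 4           /* e.g. task, softirq, hardirq, NMI */

/* One buffer per recursion context (and per CPU in the real code; a
 * single CPU is modelled here), sized in unsigned longs to stay aligned. */
static unsigned long trace_buf[NR_CONTEXTS]
                              [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)];
static int ctx_busy[NR_CONTEXTS];

/* Stand-in for perf_swevent_get_recursion_context(): the index of the
 * current context, or -1 if that context is already writing a record. */
static int get_recursion_context(int ctx)
{
        if (ctx_busy[ctx])
                return -1;
        ctx_busy[ctx] = 1;
        return ctx;
}

static void put_recursion_context(int rctx)
{
        ctx_busy[rctx] = 0;
}

int main(void)
{
        int rctx = get_recursion_context(2);    /* pretend we are in hardirq */

        if (rctx < 0)
                return 1;
        trace_buf[rctx][0] = 42;        /* build the event in this context's buffer */
        printf("wrote into buffer %d\n", rctx);
        put_recursion_context(rctx);
        return 0;
}
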
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index beab8bf2f310..53cffc0b0801 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -15,6 +15,7 @@
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/slab.h>
18#include <linux/delay.h> 19#include <linux/delay.h>
19 20
20#include <asm/setup.h> 21#include <asm/setup.h>
@@ -28,11 +29,23 @@ DEFINE_MUTEX(event_mutex);
28 29
29LIST_HEAD(ftrace_events); 30LIST_HEAD(ftrace_events);
30 31
32struct list_head *
33trace_get_fields(struct ftrace_event_call *event_call)
34{
35 if (!event_call->class->get_fields)
36 return &event_call->class->fields;
37 return event_call->class->get_fields(event_call);
38}
39
31int trace_define_field(struct ftrace_event_call *call, const char *type, 40int trace_define_field(struct ftrace_event_call *call, const char *type,
32 const char *name, int offset, int size, int is_signed, 41 const char *name, int offset, int size, int is_signed,
33 int filter_type) 42 int filter_type)
34{ 43{
35 struct ftrace_event_field *field; 44 struct ftrace_event_field *field;
45 struct list_head *head;
46
47 if (WARN_ON(!call->class))
48 return 0;
36 49
37 field = kzalloc(sizeof(*field), GFP_KERNEL); 50 field = kzalloc(sizeof(*field), GFP_KERNEL);
38 if (!field) 51 if (!field)
@@ -55,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type,
55 field->size = size; 68 field->size = size;
56 field->is_signed = is_signed; 69 field->is_signed = is_signed;
57 70
58 list_add(&field->link, &call->fields); 71 head = trace_get_fields(call);
72 list_add(&field->link, head);
59 73
60 return 0; 74 return 0;
61 75
@@ -93,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call)
93void trace_destroy_fields(struct ftrace_event_call *call) 107void trace_destroy_fields(struct ftrace_event_call *call)
94{ 108{
95 struct ftrace_event_field *field, *next; 109 struct ftrace_event_field *field, *next;
110 struct list_head *head;
96 111
97 list_for_each_entry_safe(field, next, &call->fields, link) { 112 head = trace_get_fields(call);
113 list_for_each_entry_safe(field, next, head, link) {
98 list_del(&field->link); 114 list_del(&field->link);
99 kfree(field->type); 115 kfree(field->type);
100 kfree(field->name); 116 kfree(field->name);
@@ -106,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call)
106{ 122{
107 int id; 123 int id;
108 124
109 id = register_ftrace_event(call->event); 125 id = register_ftrace_event(&call->event);
110 if (!id) 126 if (!id)
111 return -ENODEV; 127 return -ENODEV;
112 call->id = id;
113 INIT_LIST_HEAD(&call->fields);
114 128
115 return 0; 129 return 0;
116} 130}
@@ -123,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
123 137
124 switch (enable) { 138 switch (enable) {
125 case 0: 139 case 0:
126 if (call->enabled) { 140 if (call->flags & TRACE_EVENT_FL_ENABLED) {
127 call->enabled = 0; 141 call->flags &= ~TRACE_EVENT_FL_ENABLED;
128 tracing_stop_cmdline_record(); 142 tracing_stop_cmdline_record();
129 call->unregfunc(call); 143 if (call->class->reg)
144 call->class->reg(call, TRACE_REG_UNREGISTER);
145 else
146 tracepoint_probe_unregister(call->name,
147 call->class->probe,
148 call);
130 } 149 }
131 break; 150 break;
132 case 1: 151 case 1:
133 if (!call->enabled) { 152 if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
134 tracing_start_cmdline_record(); 153 tracing_start_cmdline_record();
135 ret = call->regfunc(call); 154 if (call->class->reg)
155 ret = call->class->reg(call, TRACE_REG_REGISTER);
156 else
157 ret = tracepoint_probe_register(call->name,
158 call->class->probe,
159 call);
136 if (ret) { 160 if (ret) {
137 tracing_stop_cmdline_record(); 161 tracing_stop_cmdline_record();
138 pr_info("event trace: Could not enable event " 162 pr_info("event trace: Could not enable event "
139 "%s\n", call->name); 163 "%s\n", call->name);
140 break; 164 break;
141 } 165 }
142 call->enabled = 1; 166 call->flags |= TRACE_EVENT_FL_ENABLED;
143 } 167 }
144 break; 168 break;
145 } 169 }
@@ -170,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
170 mutex_lock(&event_mutex); 194 mutex_lock(&event_mutex);
171 list_for_each_entry(call, &ftrace_events, list) { 195 list_for_each_entry(call, &ftrace_events, list) {
172 196
173 if (!call->name || !call->regfunc) 197 if (!call->name || !call->class ||
198 (!call->class->probe && !call->class->reg))
174 continue; 199 continue;
175 200
176 if (match && 201 if (match &&
177 strcmp(match, call->name) != 0 && 202 strcmp(match, call->name) != 0 &&
178 strcmp(match, call->system) != 0) 203 strcmp(match, call->class->system) != 0)
179 continue; 204 continue;
180 205
181 if (sub && strcmp(sub, call->system) != 0) 206 if (sub && strcmp(sub, call->class->system) != 0)
182 continue; 207 continue;
183 208
184 if (event && strcmp(event, call->name) != 0) 209 if (event && strcmp(event, call->name) != 0)
@@ -296,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
296 * The ftrace subsystem is for showing formats only. 321 * The ftrace subsystem is for showing formats only.
297 * They can not be enabled or disabled via the event files. 322 * They can not be enabled or disabled via the event files.
298 */ 323 */
299 if (call->regfunc) 324 if (call->class && (call->class->probe || call->class->reg))
300 return call; 325 return call;
301 } 326 }
302 327
@@ -327,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
327 (*pos)++; 352 (*pos)++;
328 353
329 list_for_each_entry_continue(call, &ftrace_events, list) { 354 list_for_each_entry_continue(call, &ftrace_events, list) {
330 if (call->enabled) 355 if (call->flags & TRACE_EVENT_FL_ENABLED)
331 return call; 356 return call;
332 } 357 }
333 358
@@ -354,8 +379,8 @@ static int t_show(struct seq_file *m, void *v)
354{ 379{
355 struct ftrace_event_call *call = v; 380 struct ftrace_event_call *call = v;
356 381
357 if (strcmp(call->system, TRACE_SYSTEM) != 0) 382 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
358 seq_printf(m, "%s:", call->system); 383 seq_printf(m, "%s:", call->class->system);
359 seq_printf(m, "%s\n", call->name); 384 seq_printf(m, "%s\n", call->name);
360 385
361 return 0; 386 return 0;
@@ -386,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
386 struct ftrace_event_call *call = filp->private_data; 411 struct ftrace_event_call *call = filp->private_data;
387 char *buf; 412 char *buf;
388 413
389 if (call->enabled) 414 if (call->flags & TRACE_EVENT_FL_ENABLED)
390 buf = "1\n"; 415 buf = "1\n";
391 else 416 else
392 buf = "0\n"; 417 buf = "0\n";
@@ -449,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
449 474
450 mutex_lock(&event_mutex); 475 mutex_lock(&event_mutex);
451 list_for_each_entry(call, &ftrace_events, list) { 476 list_for_each_entry(call, &ftrace_events, list) {
452 if (!call->name || !call->regfunc) 477 if (!call->name || !call->class ||
478 (!call->class->probe && !call->class->reg))
453 continue; 479 continue;
454 480
455 if (system && strcmp(call->system, system) != 0) 481 if (system && strcmp(call->class->system, system) != 0)
456 continue; 482 continue;
457 483
458 /* 484 /*
@@ -460,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
460 * or if all events or cleared, or if we have 486 * or if all events or cleared, or if we have
461 * a mixture. 487 * a mixture.
462 */ 488 */
463 set |= (1 << !!call->enabled); 489 set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
464 490
465 /* 491 /*
466 * If we have a mixture, no need to look further. 492 * If we have a mixture, no need to look further.
@@ -524,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
524{ 550{
525 struct ftrace_event_call *call = filp->private_data; 551 struct ftrace_event_call *call = filp->private_data;
526 struct ftrace_event_field *field; 552 struct ftrace_event_field *field;
553 struct list_head *head;
527 struct trace_seq *s; 554 struct trace_seq *s;
528 int common_field_count = 5; 555 int common_field_count = 5;
529 char *buf; 556 char *buf;
@@ -539,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
539 trace_seq_init(s); 566 trace_seq_init(s);
540 567
541 trace_seq_printf(s, "name: %s\n", call->name); 568 trace_seq_printf(s, "name: %s\n", call->name);
542 trace_seq_printf(s, "ID: %d\n", call->id); 569 trace_seq_printf(s, "ID: %d\n", call->event.type);
543 trace_seq_printf(s, "format:\n"); 570 trace_seq_printf(s, "format:\n");
544 571
545 list_for_each_entry_reverse(field, &call->fields, link) { 572 head = trace_get_fields(call);
573 list_for_each_entry_reverse(field, head, link) {
546 /* 574 /*
547 * Smartly shows the array type(except dynamic array). 575 * Smartly shows the array type(except dynamic array).
548 * Normal: 576 * Normal:
@@ -612,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
612 return -ENOMEM; 640 return -ENOMEM;
613 641
614 trace_seq_init(s); 642 trace_seq_init(s);
615 trace_seq_printf(s, "%d\n", call->id); 643 trace_seq_printf(s, "%d\n", call->event.type);
616 644
617 r = simple_read_from_buffer(ubuf, cnt, ppos, 645 r = simple_read_from_buffer(ubuf, cnt, ppos,
618 s->buffer, s->len); 646 s->buffer, s->len);
@@ -918,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
918 const struct file_operations *filter, 946 const struct file_operations *filter,
919 const struct file_operations *format) 947 const struct file_operations *format)
920{ 948{
949 struct list_head *head;
921 int ret; 950 int ret;
922 951
923 /* 952 /*
924 * If the trace point header did not define TRACE_SYSTEM 953 * If the trace point header did not define TRACE_SYSTEM
925 * then the system would be called "TRACE_SYSTEM". 954 * then the system would be called "TRACE_SYSTEM".
926 */ 955 */
927 if (strcmp(call->system, TRACE_SYSTEM) != 0) 956 if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
928 d_events = event_subsystem_dir(call->system, d_events); 957 d_events = event_subsystem_dir(call->class->system, d_events);
929 958
930 call->dir = debugfs_create_dir(call->name, d_events); 959 call->dir = debugfs_create_dir(call->name, d_events);
931 if (!call->dir) { 960 if (!call->dir) {
@@ -934,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
934 return -1; 963 return -1;
935 } 964 }
936 965
937 if (call->regfunc) 966 if (call->class->probe || call->class->reg)
938 trace_create_file("enable", 0644, call->dir, call, 967 trace_create_file("enable", 0644, call->dir, call,
939 enable); 968 enable);
940 969
941 if (call->id && call->perf_event_enable) 970#ifdef CONFIG_PERF_EVENTS
971 if (call->event.type && (call->class->perf_probe || call->class->reg))
942 trace_create_file("id", 0444, call->dir, call, 972 trace_create_file("id", 0444, call->dir, call,
943 id); 973 id);
974#endif
944 975
945 if (call->define_fields) { 976 if (call->class->define_fields) {
946 ret = trace_define_common_fields(call); 977 /*
947 if (!ret) 978 * Other events may have the same class. Only update
948 ret = call->define_fields(call); 979 * the fields if they are not already defined.
949 if (ret < 0) { 980 */
950 pr_warning("Could not initialize trace point" 981 head = trace_get_fields(call);
951 " events/%s\n", call->name); 982 if (list_empty(head)) {
952 return ret; 983 ret = trace_define_common_fields(call);
984 if (!ret)
985 ret = call->class->define_fields(call);
986 if (ret < 0) {
987 pr_warning("Could not initialize trace point"
988 " events/%s\n", call->name);
989 return ret;
990 }
953 } 991 }
954 trace_create_file("filter", 0644, call->dir, call, 992 trace_create_file("filter", 0644, call->dir, call,
955 filter); 993 filter);
@@ -969,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call)
969 if (!call->name) 1007 if (!call->name)
970 return -EINVAL; 1008 return -EINVAL;
971 1009
972 if (call->raw_init) { 1010 if (call->class->raw_init) {
973 ret = call->raw_init(call); 1011 ret = call->class->raw_init(call);
974 if (ret < 0) { 1012 if (ret < 0) {
975 if (ret != -ENOSYS) 1013 if (ret != -ENOSYS)
976 pr_warning("Could not initialize trace " 1014 pr_warning("Could not initialize trace "
@@ -1034,13 +1072,13 @@ static void remove_subsystem_dir(const char *name)
1034static void __trace_remove_event_call(struct ftrace_event_call *call) 1072static void __trace_remove_event_call(struct ftrace_event_call *call)
1035{ 1073{
1036 ftrace_event_enable_disable(call, 0); 1074 ftrace_event_enable_disable(call, 0);
1037 if (call->event) 1075 if (call->event.funcs)
1038 __unregister_ftrace_event(call->event); 1076 __unregister_ftrace_event(&call->event);
1039 debugfs_remove_recursive(call->dir); 1077 debugfs_remove_recursive(call->dir);
1040 list_del(&call->list); 1078 list_del(&call->list);
1041 trace_destroy_fields(call); 1079 trace_destroy_fields(call);
1042 destroy_preds(call); 1080 destroy_preds(call);
1043 remove_subsystem_dir(call->system); 1081 remove_subsystem_dir(call->class->system);
1044} 1082}
1045 1083
1046/* Remove an event_call */ 1084/* Remove an event_call */
@@ -1131,8 +1169,8 @@ static void trace_module_add_events(struct module *mod)
1131 /* The linker may leave blanks */ 1169 /* The linker may leave blanks */
1132 if (!call->name) 1170 if (!call->name)
1133 continue; 1171 continue;
1134 if (call->raw_init) { 1172 if (call->class->raw_init) {
1135 ret = call->raw_init(call); 1173 ret = call->class->raw_init(call);
1136 if (ret < 0) { 1174 if (ret < 0) {
1137 if (ret != -ENOSYS) 1175 if (ret != -ENOSYS)
1138 pr_warning("Could not initialize trace " 1176 pr_warning("Could not initialize trace "
@@ -1285,8 +1323,8 @@ static __init int event_trace_init(void)
1285 /* The linker may leave blanks */ 1323 /* The linker may leave blanks */
1286 if (!call->name) 1324 if (!call->name)
1287 continue; 1325 continue;
1288 if (call->raw_init) { 1326 if (call->class->raw_init) {
1289 ret = call->raw_init(call); 1327 ret = call->class->raw_init(call);
1290 if (ret < 0) { 1328 if (ret < 0) {
1291 if (ret != -ENOSYS) 1329 if (ret != -ENOSYS)
1292 pr_warning("Could not initialize trace " 1330 pr_warning("Could not initialize trace "
@@ -1387,8 +1425,8 @@ static __init void event_trace_self_tests(void)
1387 1425
1388 list_for_each_entry(call, &ftrace_events, list) { 1426 list_for_each_entry(call, &ftrace_events, list) {
1389 1427
1390 /* Only test those that have a regfunc */ 1428 /* Only test those that have a probe */
1391 if (!call->regfunc) 1429 if (!call->class || !call->class->probe)
1392 continue; 1430 continue;
1393 1431
1394/* 1432/*
@@ -1398,8 +1436,8 @@ static __init void event_trace_self_tests(void)
1398 * syscalls as we test. 1436 * syscalls as we test.
1399 */ 1437 */
1400#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS 1438#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS
1401 if (call->system && 1439 if (call->class->system &&
1402 strcmp(call->system, "syscalls") == 0) 1440 strcmp(call->class->system, "syscalls") == 0)
1403 continue; 1441 continue;
1404#endif 1442#endif
1405 1443
@@ -1409,7 +1447,7 @@ static __init void event_trace_self_tests(void)
1409 * If an event is already enabled, someone is using 1447 * If an event is already enabled, someone is using
1410 * it and the self test should not be on. 1448 * it and the self test should not be on.
1411 */ 1449 */
1412 if (call->enabled) { 1450 if (call->flags & TRACE_EVENT_FL_ENABLED) {
1413 pr_warning("Enabled event during self test!\n"); 1451 pr_warning("Enabled event during self test!\n");
1414 WARN_ON_ONCE(1); 1452 WARN_ON_ONCE(1);
1415 continue; 1453 continue;
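
Most of the churn in trace_events.c comes from the new ftrace_event_class: the per-call regfunc/unregfunc, system, define_fields and field list move into the class, enabled becomes a flag bit, and enable/disable prefers a class-level reg() hook before falling back to registering the tracepoint probe directly. Condensed from the enable path above (kernel-context fragment; the wrapper name is made up for illustration):

static int event_register(struct ftrace_event_call *call)
{
        /* Classes that need special handling provide a reg() callback... */
        if (call->class->reg)
                return call->class->reg(call, TRACE_REG_REGISTER);

        /* ...ordinary events still just hook their tracepoint. */
        return tracepoint_probe_register(call->name,
                                         call->class->probe, call);
}
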
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4615f62a04f1..57bb1bb32999 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,6 +22,7 @@
22#include <linux/ctype.h> 22#include <linux/ctype.h>
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/perf_event.h> 24#include <linux/perf_event.h>
25#include <linux/slab.h>
25 26
26#include "trace.h" 27#include "trace.h"
27#include "trace_output.h" 28#include "trace_output.h"
@@ -499,8 +500,10 @@ static struct ftrace_event_field *
499find_event_field(struct ftrace_event_call *call, char *name) 500find_event_field(struct ftrace_event_call *call, char *name)
500{ 501{
501 struct ftrace_event_field *field; 502 struct ftrace_event_field *field;
503 struct list_head *head;
502 504
503 list_for_each_entry(field, &call->fields, link) { 505 head = trace_get_fields(call);
506 list_for_each_entry(field, head, link) {
504 if (!strcmp(field->name, name)) 507 if (!strcmp(field->name, name))
505 return field; 508 return field;
506 } 509 }
@@ -544,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call)
544 struct event_filter *filter = call->filter; 547 struct event_filter *filter = call->filter;
545 int i; 548 int i;
546 549
547 call->filter_active = 0; 550 call->flags &= ~TRACE_EVENT_FL_FILTERED;
548 filter->n_preds = 0; 551 filter->n_preds = 0;
549 552
550 for (i = 0; i < MAX_FILTER_PRED; i++) 553 for (i = 0; i < MAX_FILTER_PRED; i++)
@@ -571,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call)
571{ 574{
572 __free_preds(call->filter); 575 __free_preds(call->filter);
573 call->filter = NULL; 576 call->filter = NULL;
574 call->filter_active = 0; 577 call->flags &= ~TRACE_EVENT_FL_FILTERED;
575} 578}
576 579
577static struct event_filter *__alloc_preds(void) 580static struct event_filter *__alloc_preds(void)
@@ -610,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call)
610 if (call->filter) 613 if (call->filter)
611 return 0; 614 return 0;
612 615
613 call->filter_active = 0; 616 call->flags &= ~TRACE_EVENT_FL_FILTERED;
614 call->filter = __alloc_preds(); 617 call->filter = __alloc_preds();
615 if (IS_ERR(call->filter)) 618 if (IS_ERR(call->filter))
616 return PTR_ERR(call->filter); 619 return PTR_ERR(call->filter);
@@ -624,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system)
624 int err; 627 int err;
625 628
626 list_for_each_entry(call, &ftrace_events, list) { 629 list_for_each_entry(call, &ftrace_events, list) {
627 if (!call->define_fields) 630 if (!call->class || !call->class->define_fields)
628 continue; 631 continue;
629 632
630 if (strcmp(call->system, system->name) != 0) 633 if (strcmp(call->class->system, system->name) != 0)
631 continue; 634 continue;
632 635
633 err = init_preds(call); 636 err = init_preds(call);
@@ -643,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
643 struct ftrace_event_call *call; 646 struct ftrace_event_call *call;
644 647
645 list_for_each_entry(call, &ftrace_events, list) { 648 list_for_each_entry(call, &ftrace_events, list) {
646 if (!call->define_fields) 649 if (!call->class || !call->class->define_fields)
647 continue; 650 continue;
648 651
649 if (strcmp(call->system, system->name) != 0) 652 if (strcmp(call->class->system, system->name) != 0)
650 continue; 653 continue;
651 654
652 filter_disable_preds(call); 655 filter_disable_preds(call);
@@ -1248,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system,
1248 list_for_each_entry(call, &ftrace_events, list) { 1251 list_for_each_entry(call, &ftrace_events, list) {
1249 struct event_filter *filter = call->filter; 1252 struct event_filter *filter = call->filter;
1250 1253
1251 if (!call->define_fields) 1254 if (!call->class || !call->class->define_fields)
1252 continue; 1255 continue;
1253 1256
1254 if (strcmp(call->system, system->name) != 0) 1257 if (strcmp(call->class->system, system->name) != 0)
1255 continue; 1258 continue;
1256 1259
1257 /* try to see if the filter can be applied */ 1260 /* try to see if the filter can be applied */
@@ -1265,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system,
1265 if (err) 1268 if (err)
1266 filter_disable_preds(call); 1269 filter_disable_preds(call);
1267 else { 1270 else {
1268 call->filter_active = 1; 1271 call->flags |= TRACE_EVENT_FL_FILTERED;
1269 replace_filter_string(filter, filter_string); 1272 replace_filter_string(filter, filter_string);
1270 } 1273 }
1271 fail = false; 1274 fail = false;
@@ -1314,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
1314 if (err) 1317 if (err)
1315 append_filter_err(ps, call->filter); 1318 append_filter_err(ps, call->filter);
1316 else 1319 else
1317 call->filter_active = 1; 1320 call->flags |= TRACE_EVENT_FL_FILTERED;
1318out: 1321out:
1319 filter_opstack_clear(ps); 1322 filter_opstack_clear(ps);
1320 postfix_clear(ps); 1323 postfix_clear(ps);
@@ -1392,12 +1395,12 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1392 mutex_lock(&event_mutex); 1395 mutex_lock(&event_mutex);
1393 1396
1394 list_for_each_entry(call, &ftrace_events, list) { 1397 list_for_each_entry(call, &ftrace_events, list) {
1395 if (call->id == event_id) 1398 if (call->event.type == event_id)
1396 break; 1399 break;
1397 } 1400 }
1398 1401
1399 err = -EINVAL; 1402 err = -EINVAL;
1400 if (!call) 1403 if (&call->list == &ftrace_events)
1401 goto out_unlock; 1404 goto out_unlock;
1402 1405
1403 err = -EEXIST; 1406 err = -EEXIST;
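
The hunks above fold the per-event filter state into a flag bit (TRACE_EVENT_FL_FILTERED in call->flags) and reach system/define_fields through call->class; the lookup in ftrace_profile_set_filter() also switches from testing !call to comparing against the list head, since list_for_each_entry() never leaves its cursor NULL when the list is exhausted. A minimal sketch of the new flag handling; the helpers below are hypothetical, only the field and flag names are taken from the diff:

#include <linux/ftrace_event.h>

/* Hypothetical helpers: the filtered state is now a bit in call->flags. */
static inline bool event_is_filtered(struct ftrace_event_call *call)
{
	return call->flags & TRACE_EVENT_FL_FILTERED;
}

static inline void event_set_filtered(struct ftrace_event_call *call, bool on)
{
	if (on)
		call->flags |= TRACE_EVENT_FL_FILTERED;
	else
		call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
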
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e091f64ba6ce..8536e2a65969 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
127 127
128static int ftrace_raw_init_event(struct ftrace_event_call *call) 128static int ftrace_raw_init_event(struct ftrace_event_call *call)
129{ 129{
130 INIT_LIST_HEAD(&call->fields); 130 INIT_LIST_HEAD(&call->class->fields);
131 return 0; 131 return 0;
132} 132}
133 133
@@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call)
153#define F_printk(fmt, args...) #fmt ", " __stringify(args) 153#define F_printk(fmt, args...) #fmt ", " __stringify(args)
154 154
155#undef FTRACE_ENTRY 155#undef FTRACE_ENTRY
156#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 156#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \
157 \
158struct ftrace_event_class event_class_ftrace_##call = { \
159 .system = __stringify(TRACE_SYSTEM), \
160 .define_fields = ftrace_define_fields_##call, \
161 .raw_init = ftrace_raw_init_event, \
162}; \
157 \ 163 \
158struct ftrace_event_call __used \ 164struct ftrace_event_call __used \
159__attribute__((__aligned__(4))) \ 165__attribute__((__aligned__(4))) \
160__attribute__((section("_ftrace_events"))) event_##call = { \ 166__attribute__((section("_ftrace_events"))) event_##call = { \
161 .name = #call, \ 167 .name = #call, \
162 .id = type, \ 168 .event.type = etype, \
163 .system = __stringify(TRACE_SYSTEM), \ 169 .class = &event_class_ftrace_##call, \
164 .raw_init = ftrace_raw_init_event, \
165 .print_fmt = print, \ 170 .print_fmt = print, \
166 .define_fields = ftrace_define_fields_##call, \
167}; \ 171}; \
168 172
169#include "trace_entries.h" 173#include "trace_entries.h"
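
With this change each FTRACE_ENTRY() user shares one ftrace_event_class carrying system, define_fields and raw_init, while the event call itself keeps only the name, event type, class pointer and print format. Roughly what a single expansion now produces; the entry name "foo" and type TRACE_FOO are illustrative stand-ins for the real entries in trace_entries.h:

/* Illustrative expansion of FTRACE_ENTRY(foo, ..., TRACE_FOO, ...). */
struct ftrace_event_class event_class_ftrace_foo = {
	.system		= __stringify(TRACE_SYSTEM),
	.define_fields	= ftrace_define_fields_foo,
	.raw_init	= ftrace_raw_init_event,
};

struct ftrace_event_call __used
__attribute__((__aligned__(4)))
__attribute__((section("_ftrace_events"))) event_foo = {
	.name		= "foo",
	.event.type	= TRACE_FOO,
	.class		= &event_class_ftrace_foo,
	.print_fmt	= "\"foo fmt\", args",	/* the F_printk() string */
};
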
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e6989d9b44da..79f4bac99a94 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/ftrace.h> 11#include <linux/ftrace.h>
12#include <linux/slab.h>
12#include <linux/fs.h> 13#include <linux/fs.h>
13 14
14#include "trace.h" 15#include "trace.h"
@@ -39,7 +40,7 @@ struct fgraph_data {
39#define TRACE_GRAPH_PRINT_OVERHEAD 0x4 40#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
40#define TRACE_GRAPH_PRINT_PROC 0x8 41#define TRACE_GRAPH_PRINT_PROC 0x8
41#define TRACE_GRAPH_PRINT_DURATION 0x10 42#define TRACE_GRAPH_PRINT_DURATION 0x10
42#define TRACE_GRAPH_PRINT_ABS_TIME 0X20 43#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
43 44
44static struct tracer_opt trace_opts[] = { 45static struct tracer_opt trace_opts[] = {
45 /* Display overruns? (for self-debug purpose) */ 46 /* Display overruns? (for self-debug purpose) */
@@ -178,7 +179,7 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
178 return ret; 179 return ret;
179} 180}
180 181
181static int __trace_graph_entry(struct trace_array *tr, 182int __trace_graph_entry(struct trace_array *tr,
182 struct ftrace_graph_ent *trace, 183 struct ftrace_graph_ent *trace,
183 unsigned long flags, 184 unsigned long flags,
184 int pc) 185 int pc)
@@ -245,7 +246,7 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace)
245 return trace_graph_entry(trace); 246 return trace_graph_entry(trace);
246} 247}
247 248
248static void __trace_graph_return(struct trace_array *tr, 249void __trace_graph_return(struct trace_array *tr,
249 struct ftrace_graph_ret *trace, 250 struct ftrace_graph_ret *trace,
250 unsigned long flags, 251 unsigned long flags,
251 int pc) 252 int pc)
@@ -489,9 +490,10 @@ get_return_for_leaf(struct trace_iterator *iter,
489 * We need to consume the current entry to see 490 * We need to consume the current entry to see
490 * the next one. 491 * the next one.
491 */ 492 */
492 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL); 493 ring_buffer_consume(iter->tr->buffer, iter->cpu,
494 NULL, NULL);
493 event = ring_buffer_peek(iter->tr->buffer, iter->cpu, 495 event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
494 NULL); 496 NULL, NULL);
495 } 497 }
496 498
497 if (!event) 499 if (!event)
@@ -525,17 +527,18 @@ get_return_for_leaf(struct trace_iterator *iter,
525 527
526/* Signal a overhead of time execution to the output */ 528/* Signal a overhead of time execution to the output */
527static int 529static int
528print_graph_overhead(unsigned long long duration, struct trace_seq *s) 530print_graph_overhead(unsigned long long duration, struct trace_seq *s,
531 u32 flags)
529{ 532{
530 /* If duration disappear, we don't need anything */ 533 /* If duration disappear, we don't need anything */
531 if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)) 534 if (!(flags & TRACE_GRAPH_PRINT_DURATION))
532 return 1; 535 return 1;
533 536
534 /* Non nested entry or return */ 537 /* Non nested entry or return */
535 if (duration == -1) 538 if (duration == -1)
536 return trace_seq_printf(s, " "); 539 return trace_seq_printf(s, " ");
537 540
538 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) { 541 if (flags & TRACE_GRAPH_PRINT_OVERHEAD) {
539 /* Duration exceeded 100 msecs */ 542 /* Duration exceeded 100 msecs */
540 if (duration > 100000ULL) 543 if (duration > 100000ULL)
541 return trace_seq_printf(s, "! "); 544 return trace_seq_printf(s, "! ");
@@ -561,7 +564,7 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
561 564
562static enum print_line_t 565static enum print_line_t
563print_graph_irq(struct trace_iterator *iter, unsigned long addr, 566print_graph_irq(struct trace_iterator *iter, unsigned long addr,
564 enum trace_type type, int cpu, pid_t pid) 567 enum trace_type type, int cpu, pid_t pid, u32 flags)
565{ 568{
566 int ret; 569 int ret;
567 struct trace_seq *s = &iter->seq; 570 struct trace_seq *s = &iter->seq;
@@ -571,21 +574,21 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
571 return TRACE_TYPE_UNHANDLED; 574 return TRACE_TYPE_UNHANDLED;
572 575
573 /* Absolute time */ 576 /* Absolute time */
574 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 577 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
575 ret = print_graph_abs_time(iter->ts, s); 578 ret = print_graph_abs_time(iter->ts, s);
576 if (!ret) 579 if (!ret)
577 return TRACE_TYPE_PARTIAL_LINE; 580 return TRACE_TYPE_PARTIAL_LINE;
578 } 581 }
579 582
580 /* Cpu */ 583 /* Cpu */
581 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 584 if (flags & TRACE_GRAPH_PRINT_CPU) {
582 ret = print_graph_cpu(s, cpu); 585 ret = print_graph_cpu(s, cpu);
583 if (ret == TRACE_TYPE_PARTIAL_LINE) 586 if (ret == TRACE_TYPE_PARTIAL_LINE)
584 return TRACE_TYPE_PARTIAL_LINE; 587 return TRACE_TYPE_PARTIAL_LINE;
585 } 588 }
586 589
587 /* Proc */ 590 /* Proc */
588 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 591 if (flags & TRACE_GRAPH_PRINT_PROC) {
589 ret = print_graph_proc(s, pid); 592 ret = print_graph_proc(s, pid);
590 if (ret == TRACE_TYPE_PARTIAL_LINE) 593 if (ret == TRACE_TYPE_PARTIAL_LINE)
591 return TRACE_TYPE_PARTIAL_LINE; 594 return TRACE_TYPE_PARTIAL_LINE;
@@ -595,7 +598,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
595 } 598 }
596 599
597 /* No overhead */ 600 /* No overhead */
598 ret = print_graph_overhead(-1, s); 601 ret = print_graph_overhead(-1, s, flags);
599 if (!ret) 602 if (!ret)
600 return TRACE_TYPE_PARTIAL_LINE; 603 return TRACE_TYPE_PARTIAL_LINE;
601 604
@@ -608,7 +611,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
608 return TRACE_TYPE_PARTIAL_LINE; 611 return TRACE_TYPE_PARTIAL_LINE;
609 612
610 /* Don't close the duration column if haven't one */ 613 /* Don't close the duration column if haven't one */
611 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 614 if (flags & TRACE_GRAPH_PRINT_DURATION)
612 trace_seq_printf(s, " |"); 615 trace_seq_printf(s, " |");
613 ret = trace_seq_printf(s, "\n"); 616 ret = trace_seq_printf(s, "\n");
614 617
@@ -678,7 +681,8 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
678static enum print_line_t 681static enum print_line_t
679print_graph_entry_leaf(struct trace_iterator *iter, 682print_graph_entry_leaf(struct trace_iterator *iter,
680 struct ftrace_graph_ent_entry *entry, 683 struct ftrace_graph_ent_entry *entry,
681 struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s) 684 struct ftrace_graph_ret_entry *ret_entry,
685 struct trace_seq *s, u32 flags)
682{ 686{
683 struct fgraph_data *data = iter->private; 687 struct fgraph_data *data = iter->private;
684 struct ftrace_graph_ret *graph_ret; 688 struct ftrace_graph_ret *graph_ret;
@@ -710,12 +714,12 @@ print_graph_entry_leaf(struct trace_iterator *iter,
710 } 714 }
711 715
712 /* Overhead */ 716 /* Overhead */
713 ret = print_graph_overhead(duration, s); 717 ret = print_graph_overhead(duration, s, flags);
714 if (!ret) 718 if (!ret)
715 return TRACE_TYPE_PARTIAL_LINE; 719 return TRACE_TYPE_PARTIAL_LINE;
716 720
717 /* Duration */ 721 /* Duration */
718 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 722 if (flags & TRACE_GRAPH_PRINT_DURATION) {
719 ret = print_graph_duration(duration, s); 723 ret = print_graph_duration(duration, s);
720 if (ret == TRACE_TYPE_PARTIAL_LINE) 724 if (ret == TRACE_TYPE_PARTIAL_LINE)
721 return TRACE_TYPE_PARTIAL_LINE; 725 return TRACE_TYPE_PARTIAL_LINE;
@@ -738,7 +742,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
738static enum print_line_t 742static enum print_line_t
739print_graph_entry_nested(struct trace_iterator *iter, 743print_graph_entry_nested(struct trace_iterator *iter,
740 struct ftrace_graph_ent_entry *entry, 744 struct ftrace_graph_ent_entry *entry,
741 struct trace_seq *s, int cpu) 745 struct trace_seq *s, int cpu, u32 flags)
742{ 746{
743 struct ftrace_graph_ent *call = &entry->graph_ent; 747 struct ftrace_graph_ent *call = &entry->graph_ent;
744 struct fgraph_data *data = iter->private; 748 struct fgraph_data *data = iter->private;
@@ -758,12 +762,12 @@ print_graph_entry_nested(struct trace_iterator *iter,
758 } 762 }
759 763
760 /* No overhead */ 764 /* No overhead */
761 ret = print_graph_overhead(-1, s); 765 ret = print_graph_overhead(-1, s, flags);
762 if (!ret) 766 if (!ret)
763 return TRACE_TYPE_PARTIAL_LINE; 767 return TRACE_TYPE_PARTIAL_LINE;
764 768
765 /* No time */ 769 /* No time */
766 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 770 if (flags & TRACE_GRAPH_PRINT_DURATION) {
767 ret = trace_seq_printf(s, " | "); 771 ret = trace_seq_printf(s, " | ");
768 if (!ret) 772 if (!ret)
769 return TRACE_TYPE_PARTIAL_LINE; 773 return TRACE_TYPE_PARTIAL_LINE;
@@ -789,7 +793,7 @@ print_graph_entry_nested(struct trace_iterator *iter,
789 793
790static enum print_line_t 794static enum print_line_t
791print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, 795print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
792 int type, unsigned long addr) 796 int type, unsigned long addr, u32 flags)
793{ 797{
794 struct fgraph_data *data = iter->private; 798 struct fgraph_data *data = iter->private;
795 struct trace_entry *ent = iter->ent; 799 struct trace_entry *ent = iter->ent;
@@ -802,27 +806,27 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
802 806
803 if (type) { 807 if (type) {
804 /* Interrupt */ 808 /* Interrupt */
805 ret = print_graph_irq(iter, addr, type, cpu, ent->pid); 809 ret = print_graph_irq(iter, addr, type, cpu, ent->pid, flags);
806 if (ret == TRACE_TYPE_PARTIAL_LINE) 810 if (ret == TRACE_TYPE_PARTIAL_LINE)
807 return TRACE_TYPE_PARTIAL_LINE; 811 return TRACE_TYPE_PARTIAL_LINE;
808 } 812 }
809 813
810 /* Absolute time */ 814 /* Absolute time */
811 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) { 815 if (flags & TRACE_GRAPH_PRINT_ABS_TIME) {
812 ret = print_graph_abs_time(iter->ts, s); 816 ret = print_graph_abs_time(iter->ts, s);
813 if (!ret) 817 if (!ret)
814 return TRACE_TYPE_PARTIAL_LINE; 818 return TRACE_TYPE_PARTIAL_LINE;
815 } 819 }
816 820
817 /* Cpu */ 821 /* Cpu */
818 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) { 822 if (flags & TRACE_GRAPH_PRINT_CPU) {
819 ret = print_graph_cpu(s, cpu); 823 ret = print_graph_cpu(s, cpu);
820 if (ret == TRACE_TYPE_PARTIAL_LINE) 824 if (ret == TRACE_TYPE_PARTIAL_LINE)
821 return TRACE_TYPE_PARTIAL_LINE; 825 return TRACE_TYPE_PARTIAL_LINE;
822 } 826 }
823 827
824 /* Proc */ 828 /* Proc */
825 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) { 829 if (flags & TRACE_GRAPH_PRINT_PROC) {
826 ret = print_graph_proc(s, ent->pid); 830 ret = print_graph_proc(s, ent->pid);
827 if (ret == TRACE_TYPE_PARTIAL_LINE) 831 if (ret == TRACE_TYPE_PARTIAL_LINE)
828 return TRACE_TYPE_PARTIAL_LINE; 832 return TRACE_TYPE_PARTIAL_LINE;
@@ -844,7 +848,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
844 848
845static enum print_line_t 849static enum print_line_t
846print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, 850print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
847 struct trace_iterator *iter) 851 struct trace_iterator *iter, u32 flags)
848{ 852{
849 struct fgraph_data *data = iter->private; 853 struct fgraph_data *data = iter->private;
850 struct ftrace_graph_ent *call = &field->graph_ent; 854 struct ftrace_graph_ent *call = &field->graph_ent;
@@ -852,14 +856,14 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
852 static enum print_line_t ret; 856 static enum print_line_t ret;
853 int cpu = iter->cpu; 857 int cpu = iter->cpu;
854 858
855 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func)) 859 if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags))
856 return TRACE_TYPE_PARTIAL_LINE; 860 return TRACE_TYPE_PARTIAL_LINE;
857 861
858 leaf_ret = get_return_for_leaf(iter, field); 862 leaf_ret = get_return_for_leaf(iter, field);
859 if (leaf_ret) 863 if (leaf_ret)
860 ret = print_graph_entry_leaf(iter, field, leaf_ret, s); 864 ret = print_graph_entry_leaf(iter, field, leaf_ret, s, flags);
861 else 865 else
862 ret = print_graph_entry_nested(iter, field, s, cpu); 866 ret = print_graph_entry_nested(iter, field, s, cpu, flags);
863 867
864 if (data) { 868 if (data) {
865 /* 869 /*
@@ -878,7 +882,8 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
878 882
879static enum print_line_t 883static enum print_line_t
880print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, 884print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
881 struct trace_entry *ent, struct trace_iterator *iter) 885 struct trace_entry *ent, struct trace_iterator *iter,
886 u32 flags)
882{ 887{
883 unsigned long long duration = trace->rettime - trace->calltime; 888 unsigned long long duration = trace->rettime - trace->calltime;
884 struct fgraph_data *data = iter->private; 889 struct fgraph_data *data = iter->private;
@@ -908,16 +913,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
908 } 913 }
909 } 914 }
910 915
911 if (print_graph_prologue(iter, s, 0, 0)) 916 if (print_graph_prologue(iter, s, 0, 0, flags))
912 return TRACE_TYPE_PARTIAL_LINE; 917 return TRACE_TYPE_PARTIAL_LINE;
913 918
914 /* Overhead */ 919 /* Overhead */
915 ret = print_graph_overhead(duration, s); 920 ret = print_graph_overhead(duration, s, flags);
916 if (!ret) 921 if (!ret)
917 return TRACE_TYPE_PARTIAL_LINE; 922 return TRACE_TYPE_PARTIAL_LINE;
918 923
919 /* Duration */ 924 /* Duration */
920 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 925 if (flags & TRACE_GRAPH_PRINT_DURATION) {
921 ret = print_graph_duration(duration, s); 926 ret = print_graph_duration(duration, s);
922 if (ret == TRACE_TYPE_PARTIAL_LINE) 927 if (ret == TRACE_TYPE_PARTIAL_LINE)
923 return TRACE_TYPE_PARTIAL_LINE; 928 return TRACE_TYPE_PARTIAL_LINE;
@@ -947,14 +952,15 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
947 } 952 }
948 953
949 /* Overrun */ 954 /* Overrun */
950 if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { 955 if (flags & TRACE_GRAPH_PRINT_OVERRUN) {
951 ret = trace_seq_printf(s, " (Overruns: %lu)\n", 956 ret = trace_seq_printf(s, " (Overruns: %lu)\n",
952 trace->overrun); 957 trace->overrun);
953 if (!ret) 958 if (!ret)
954 return TRACE_TYPE_PARTIAL_LINE; 959 return TRACE_TYPE_PARTIAL_LINE;
955 } 960 }
956 961
957 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid); 962 ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET,
963 cpu, pid, flags);
958 if (ret == TRACE_TYPE_PARTIAL_LINE) 964 if (ret == TRACE_TYPE_PARTIAL_LINE)
959 return TRACE_TYPE_PARTIAL_LINE; 965 return TRACE_TYPE_PARTIAL_LINE;
960 966
@@ -962,8 +968,8 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
962} 968}
963 969
964static enum print_line_t 970static enum print_line_t
965print_graph_comment(struct trace_seq *s, struct trace_entry *ent, 971print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
966 struct trace_iterator *iter) 972 struct trace_iterator *iter, u32 flags)
967{ 973{
968 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 974 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
969 struct fgraph_data *data = iter->private; 975 struct fgraph_data *data = iter->private;
@@ -975,16 +981,16 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
975 if (data) 981 if (data)
976 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth; 982 depth = per_cpu_ptr(data->cpu_data, iter->cpu)->depth;
977 983
978 if (print_graph_prologue(iter, s, 0, 0)) 984 if (print_graph_prologue(iter, s, 0, 0, flags))
979 return TRACE_TYPE_PARTIAL_LINE; 985 return TRACE_TYPE_PARTIAL_LINE;
980 986
981 /* No overhead */ 987 /* No overhead */
982 ret = print_graph_overhead(-1, s); 988 ret = print_graph_overhead(-1, s, flags);
983 if (!ret) 989 if (!ret)
984 return TRACE_TYPE_PARTIAL_LINE; 990 return TRACE_TYPE_PARTIAL_LINE;
985 991
986 /* No time */ 992 /* No time */
987 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) { 993 if (flags & TRACE_GRAPH_PRINT_DURATION) {
988 ret = trace_seq_printf(s, " | "); 994 ret = trace_seq_printf(s, " | ");
989 if (!ret) 995 if (!ret)
990 return TRACE_TYPE_PARTIAL_LINE; 996 return TRACE_TYPE_PARTIAL_LINE;
@@ -1019,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1019 if (!event) 1025 if (!event)
1020 return TRACE_TYPE_UNHANDLED; 1026 return TRACE_TYPE_UNHANDLED;
1021 1027
1022 ret = event->trace(iter, sym_flags); 1028 ret = event->funcs->trace(iter, sym_flags, event);
1023 if (ret != TRACE_TYPE_HANDLED) 1029 if (ret != TRACE_TYPE_HANDLED)
1024 return ret; 1030 return ret;
1025 } 1031 }
@@ -1039,7 +1045,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent,
1039 1045
1040 1046
1041enum print_line_t 1047enum print_line_t
1042print_graph_function(struct trace_iterator *iter) 1048print_graph_function_flags(struct trace_iterator *iter, u32 flags)
1043{ 1049{
1044 struct ftrace_graph_ent_entry *field; 1050 struct ftrace_graph_ent_entry *field;
1045 struct fgraph_data *data = iter->private; 1051 struct fgraph_data *data = iter->private;
@@ -1060,7 +1066,7 @@ print_graph_function(struct trace_iterator *iter)
1060 if (data && data->failed) { 1066 if (data && data->failed) {
1061 field = &data->ent; 1067 field = &data->ent;
1062 iter->cpu = data->cpu; 1068 iter->cpu = data->cpu;
1063 ret = print_graph_entry(field, s, iter); 1069 ret = print_graph_entry(field, s, iter, flags);
1064 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) { 1070 if (ret == TRACE_TYPE_HANDLED && iter->cpu != cpu) {
1065 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1; 1071 per_cpu_ptr(data->cpu_data, iter->cpu)->ignore = 1;
1066 ret = TRACE_TYPE_NO_CONSUME; 1072 ret = TRACE_TYPE_NO_CONSUME;
@@ -1080,32 +1086,50 @@ print_graph_function(struct trace_iterator *iter)
1080 struct ftrace_graph_ent_entry saved; 1086 struct ftrace_graph_ent_entry saved;
1081 trace_assign_type(field, entry); 1087 trace_assign_type(field, entry);
1082 saved = *field; 1088 saved = *field;
1083 return print_graph_entry(&saved, s, iter); 1089 return print_graph_entry(&saved, s, iter, flags);
1084 } 1090 }
1085 case TRACE_GRAPH_RET: { 1091 case TRACE_GRAPH_RET: {
1086 struct ftrace_graph_ret_entry *field; 1092 struct ftrace_graph_ret_entry *field;
1087 trace_assign_type(field, entry); 1093 trace_assign_type(field, entry);
1088 return print_graph_return(&field->ret, s, entry, iter); 1094 return print_graph_return(&field->ret, s, entry, iter, flags);
1089 } 1095 }
1096 case TRACE_STACK:
1097 case TRACE_FN:
1098 /* dont trace stack and functions as comments */
1099 return TRACE_TYPE_UNHANDLED;
1100
1090 default: 1101 default:
1091 return print_graph_comment(s, entry, iter); 1102 return print_graph_comment(s, entry, iter, flags);
1092 } 1103 }
1093 1104
1094 return TRACE_TYPE_HANDLED; 1105 return TRACE_TYPE_HANDLED;
1095} 1106}
1096 1107
1097static void print_lat_header(struct seq_file *s) 1108static enum print_line_t
1109print_graph_function(struct trace_iterator *iter)
1110{
1111 return print_graph_function_flags(iter, tracer_flags.val);
1112}
1113
1114static enum print_line_t
1115print_graph_function_event(struct trace_iterator *iter, int flags,
1116 struct trace_event *event)
1117{
1118 return print_graph_function(iter);
1119}
1120
1121static void print_lat_header(struct seq_file *s, u32 flags)
1098{ 1122{
1099 static const char spaces[] = " " /* 16 spaces */ 1123 static const char spaces[] = " " /* 16 spaces */
1100 " " /* 4 spaces */ 1124 " " /* 4 spaces */
1101 " "; /* 17 spaces */ 1125 " "; /* 17 spaces */
1102 int size = 0; 1126 int size = 0;
1103 1127
1104 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1128 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1105 size += 16; 1129 size += 16;
1106 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1130 if (flags & TRACE_GRAPH_PRINT_CPU)
1107 size += 4; 1131 size += 4;
1108 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1132 if (flags & TRACE_GRAPH_PRINT_PROC)
1109 size += 17; 1133 size += 17;
1110 1134
1111 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces); 1135 seq_printf(s, "#%.*s _-----=> irqs-off \n", size, spaces);
@@ -1116,43 +1140,48 @@ static void print_lat_header(struct seq_file *s)
1116 seq_printf(s, "#%.*s|||| / \n", size, spaces); 1140 seq_printf(s, "#%.*s|||| / \n", size, spaces);
1117} 1141}
1118 1142
1119static void print_graph_headers(struct seq_file *s) 1143void print_graph_headers_flags(struct seq_file *s, u32 flags)
1120{ 1144{
1121 int lat = trace_flags & TRACE_ITER_LATENCY_FMT; 1145 int lat = trace_flags & TRACE_ITER_LATENCY_FMT;
1122 1146
1123 if (lat) 1147 if (lat)
1124 print_lat_header(s); 1148 print_lat_header(s, flags);
1125 1149
1126 /* 1st line */ 1150 /* 1st line */
1127 seq_printf(s, "#"); 1151 seq_printf(s, "#");
1128 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1152 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1129 seq_printf(s, " TIME "); 1153 seq_printf(s, " TIME ");
1130 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1154 if (flags & TRACE_GRAPH_PRINT_CPU)
1131 seq_printf(s, " CPU"); 1155 seq_printf(s, " CPU");
1132 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1156 if (flags & TRACE_GRAPH_PRINT_PROC)
1133 seq_printf(s, " TASK/PID "); 1157 seq_printf(s, " TASK/PID ");
1134 if (lat) 1158 if (lat)
1135 seq_printf(s, "|||||"); 1159 seq_printf(s, "|||||");
1136 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1160 if (flags & TRACE_GRAPH_PRINT_DURATION)
1137 seq_printf(s, " DURATION "); 1161 seq_printf(s, " DURATION ");
1138 seq_printf(s, " FUNCTION CALLS\n"); 1162 seq_printf(s, " FUNCTION CALLS\n");
1139 1163
1140 /* 2nd line */ 1164 /* 2nd line */
1141 seq_printf(s, "#"); 1165 seq_printf(s, "#");
1142 if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) 1166 if (flags & TRACE_GRAPH_PRINT_ABS_TIME)
1143 seq_printf(s, " | "); 1167 seq_printf(s, " | ");
1144 if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) 1168 if (flags & TRACE_GRAPH_PRINT_CPU)
1145 seq_printf(s, " | "); 1169 seq_printf(s, " | ");
1146 if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) 1170 if (flags & TRACE_GRAPH_PRINT_PROC)
1147 seq_printf(s, " | | "); 1171 seq_printf(s, " | | ");
1148 if (lat) 1172 if (lat)
1149 seq_printf(s, "|||||"); 1173 seq_printf(s, "|||||");
1150 if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) 1174 if (flags & TRACE_GRAPH_PRINT_DURATION)
1151 seq_printf(s, " | | "); 1175 seq_printf(s, " | | ");
1152 seq_printf(s, " | | | |\n"); 1176 seq_printf(s, " | | | |\n");
1153} 1177}
1154 1178
1155static void graph_trace_open(struct trace_iterator *iter) 1179void print_graph_headers(struct seq_file *s)
1180{
1181 print_graph_headers_flags(s, tracer_flags.val);
1182}
1183
1184void graph_trace_open(struct trace_iterator *iter)
1156{ 1185{
1157 /* pid and depth on the last trace processed */ 1186 /* pid and depth on the last trace processed */
1158 struct fgraph_data *data; 1187 struct fgraph_data *data;
@@ -1187,7 +1216,7 @@ static void graph_trace_open(struct trace_iterator *iter)
1187 pr_warning("function graph tracer: not enough memory\n"); 1216 pr_warning("function graph tracer: not enough memory\n");
1188} 1217}
1189 1218
1190static void graph_trace_close(struct trace_iterator *iter) 1219void graph_trace_close(struct trace_iterator *iter)
1191{ 1220{
1192 struct fgraph_data *data = iter->private; 1221 struct fgraph_data *data = iter->private;
1193 1222
@@ -1197,6 +1226,20 @@ static void graph_trace_close(struct trace_iterator *iter)
1197 } 1226 }
1198} 1227}
1199 1228
1229static struct trace_event_functions graph_functions = {
1230 .trace = print_graph_function_event,
1231};
1232
1233static struct trace_event graph_trace_entry_event = {
1234 .type = TRACE_GRAPH_ENT,
1235 .funcs = &graph_functions,
1236};
1237
1238static struct trace_event graph_trace_ret_event = {
1239 .type = TRACE_GRAPH_RET,
1240 .funcs = &graph_functions
1241};
1242
1200static struct tracer graph_trace __read_mostly = { 1243static struct tracer graph_trace __read_mostly = {
1201 .name = "function_graph", 1244 .name = "function_graph",
1202 .open = graph_trace_open, 1245 .open = graph_trace_open,
@@ -1218,6 +1261,16 @@ static __init int init_graph_trace(void)
1218{ 1261{
1219 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1262 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
1220 1263
1264 if (!register_ftrace_event(&graph_trace_entry_event)) {
1265 pr_warning("Warning: could not register graph trace events\n");
1266 return 1;
1267 }
1268
1269 if (!register_ftrace_event(&graph_trace_ret_event)) {
1270 pr_warning("Warning: could not register graph trace events\n");
1271 return 1;
1272 }
1273
1221 return register_tracer(&graph_trace); 1274 return register_tracer(&graph_trace);
1222} 1275}
1223 1276
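
Taking the flag mask as an explicit argument (print_graph_function_flags, print_graph_headers_flags) and registering the graph entry/return records as ordinary trace events lets other tracers reuse this output path. A minimal sketch of a tracer hooking into it; the my_* names are hypothetical and the flag combination is illustrative:

/* Sketch: a tracer borrowing the graph output with its own column set. */
static enum print_line_t my_print_line(struct trace_iterator *iter)
{
	u32 flags = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_DURATION;

	return print_graph_function_flags(iter, flags);
}

static void my_print_header(struct seq_file *s)
{
	print_graph_headers_flags(s, TRACE_GRAPH_PRINT_CPU |
				     TRACE_GRAPH_PRINT_DURATION);
}
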
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
deleted file mode 100644
index 7b97000745f5..000000000000
--- a/kernel/trace/trace_hw_branches.c
+++ /dev/null
@@ -1,312 +0,0 @@
1/*
2 * h/w branch tracer for x86 based on BTS
3 *
4 * Copyright (C) 2008-2009 Intel Corporation.
5 * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
6 */
7#include <linux/kallsyms.h>
8#include <linux/debugfs.h>
9#include <linux/ftrace.h>
10#include <linux/module.h>
11#include <linux/cpu.h>
12#include <linux/smp.h>
13#include <linux/fs.h>
14
15#include <asm/ds.h>
16
17#include "trace_output.h"
18#include "trace.h"
19
20
21#define BTS_BUFFER_SIZE (1 << 13)
22
23static DEFINE_PER_CPU(struct bts_tracer *, hwb_tracer);
24static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], hwb_buffer);
25
26#define this_tracer per_cpu(hwb_tracer, smp_processor_id())
27
28static int trace_hw_branches_enabled __read_mostly;
29static int trace_hw_branches_suspended __read_mostly;
30static struct trace_array *hw_branch_trace __read_mostly;
31
32
33static void bts_trace_init_cpu(int cpu)
34{
35 per_cpu(hwb_tracer, cpu) =
36 ds_request_bts_cpu(cpu, per_cpu(hwb_buffer, cpu),
37 BTS_BUFFER_SIZE, NULL, (size_t)-1,
38 BTS_KERNEL);
39
40 if (IS_ERR(per_cpu(hwb_tracer, cpu)))
41 per_cpu(hwb_tracer, cpu) = NULL;
42}
43
44static int bts_trace_init(struct trace_array *tr)
45{
46 int cpu;
47
48 hw_branch_trace = tr;
49 trace_hw_branches_enabled = 0;
50
51 get_online_cpus();
52 for_each_online_cpu(cpu) {
53 bts_trace_init_cpu(cpu);
54
55 if (likely(per_cpu(hwb_tracer, cpu)))
56 trace_hw_branches_enabled = 1;
57 }
58 trace_hw_branches_suspended = 0;
59 put_online_cpus();
60
61 /* If we could not enable tracing on a single cpu, we fail. */
62 return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
63}
64
65static void bts_trace_reset(struct trace_array *tr)
66{
67 int cpu;
68
69 get_online_cpus();
70 for_each_online_cpu(cpu) {
71 if (likely(per_cpu(hwb_tracer, cpu))) {
72 ds_release_bts(per_cpu(hwb_tracer, cpu));
73 per_cpu(hwb_tracer, cpu) = NULL;
74 }
75 }
76 trace_hw_branches_enabled = 0;
77 trace_hw_branches_suspended = 0;
78 put_online_cpus();
79}
80
81static void bts_trace_start(struct trace_array *tr)
82{
83 int cpu;
84
85 get_online_cpus();
86 for_each_online_cpu(cpu)
87 if (likely(per_cpu(hwb_tracer, cpu)))
88 ds_resume_bts(per_cpu(hwb_tracer, cpu));
89 trace_hw_branches_suspended = 0;
90 put_online_cpus();
91}
92
93static void bts_trace_stop(struct trace_array *tr)
94{
95 int cpu;
96
97 get_online_cpus();
98 for_each_online_cpu(cpu)
99 if (likely(per_cpu(hwb_tracer, cpu)))
100 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
101 trace_hw_branches_suspended = 1;
102 put_online_cpus();
103}
104
105static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
106 unsigned long action, void *hcpu)
107{
108 int cpu = (long)hcpu;
109
110 switch (action) {
111 case CPU_ONLINE:
112 case CPU_DOWN_FAILED:
113 /* The notification is sent with interrupts enabled. */
114 if (trace_hw_branches_enabled) {
115 bts_trace_init_cpu(cpu);
116
117 if (trace_hw_branches_suspended &&
118 likely(per_cpu(hwb_tracer, cpu)))
119 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
120 }
121 break;
122
123 case CPU_DOWN_PREPARE:
124 /* The notification is sent with interrupts enabled. */
125 if (likely(per_cpu(hwb_tracer, cpu))) {
126 ds_release_bts(per_cpu(hwb_tracer, cpu));
127 per_cpu(hwb_tracer, cpu) = NULL;
128 }
129 }
130
131 return NOTIFY_DONE;
132}
133
134static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
135 .notifier_call = bts_hotcpu_handler
136};
137
138static void bts_trace_print_header(struct seq_file *m)
139{
140 seq_puts(m, "# CPU# TO <- FROM\n");
141}
142
143static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
144{
145 unsigned long symflags = TRACE_ITER_SYM_OFFSET;
146 struct trace_entry *entry = iter->ent;
147 struct trace_seq *seq = &iter->seq;
148 struct hw_branch_entry *it;
149
150 trace_assign_type(it, entry);
151
152 if (entry->type == TRACE_HW_BRANCHES) {
153 if (trace_seq_printf(seq, "%4d ", iter->cpu) &&
154 seq_print_ip_sym(seq, it->to, symflags) &&
155 trace_seq_printf(seq, "\t <- ") &&
156 seq_print_ip_sym(seq, it->from, symflags) &&
157 trace_seq_printf(seq, "\n"))
158 return TRACE_TYPE_HANDLED;
159 return TRACE_TYPE_PARTIAL_LINE;
160 }
161 return TRACE_TYPE_UNHANDLED;
162}
163
164void trace_hw_branch(u64 from, u64 to)
165{
166 struct ftrace_event_call *call = &event_hw_branch;
167 struct trace_array *tr = hw_branch_trace;
168 struct ring_buffer_event *event;
169 struct ring_buffer *buf;
170 struct hw_branch_entry *entry;
171 unsigned long irq1;
172 int cpu;
173
174 if (unlikely(!tr))
175 return;
176
177 if (unlikely(!trace_hw_branches_enabled))
178 return;
179
180 local_irq_save(irq1);
181 cpu = raw_smp_processor_id();
182 if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
183 goto out;
184
185 buf = tr->buffer;
186 event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
187 sizeof(*entry), 0, 0);
188 if (!event)
189 goto out;
190 entry = ring_buffer_event_data(event);
191 tracing_generic_entry_update(&entry->ent, 0, from);
192 entry->ent.type = TRACE_HW_BRANCHES;
193 entry->from = from;
194 entry->to = to;
195 if (!filter_check_discard(call, entry, buf, event))
196 trace_buffer_unlock_commit(buf, event, 0, 0);
197
198 out:
199 atomic_dec(&tr->data[cpu]->disabled);
200 local_irq_restore(irq1);
201}
202
203static void trace_bts_at(const struct bts_trace *trace, void *at)
204{
205 struct bts_struct bts;
206 int err = 0;
207
208 WARN_ON_ONCE(!trace->read);
209 if (!trace->read)
210 return;
211
212 err = trace->read(this_tracer, at, &bts);
213 if (err < 0)
214 return;
215
216 switch (bts.qualifier) {
217 case BTS_BRANCH:
218 trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
219 break;
220 }
221}
222
223/*
224 * Collect the trace on the current cpu and write it into the ftrace buffer.
225 *
226 * pre: tracing must be suspended on the current cpu
227 */
228static void trace_bts_cpu(void *arg)
229{
230 struct trace_array *tr = (struct trace_array *)arg;
231 const struct bts_trace *trace;
232 unsigned char *at;
233
234 if (unlikely(!tr))
235 return;
236
237 if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
238 return;
239
240 if (unlikely(!this_tracer))
241 return;
242
243 trace = ds_read_bts(this_tracer);
244 if (!trace)
245 return;
246
247 for (at = trace->ds.top; (void *)at < trace->ds.end;
248 at += trace->ds.size)
249 trace_bts_at(trace, at);
250
251 for (at = trace->ds.begin; (void *)at < trace->ds.top;
252 at += trace->ds.size)
253 trace_bts_at(trace, at);
254}
255
256static void trace_bts_prepare(struct trace_iterator *iter)
257{
258 int cpu;
259
260 get_online_cpus();
261 for_each_online_cpu(cpu)
262 if (likely(per_cpu(hwb_tracer, cpu)))
263 ds_suspend_bts(per_cpu(hwb_tracer, cpu));
264 /*
265 * We need to collect the trace on the respective cpu since ftrace
266 * implicitly adds the record for the current cpu.
267 * Once that is more flexible, we could collect the data from any cpu.
268 */
269 on_each_cpu(trace_bts_cpu, iter->tr, 1);
270
271 for_each_online_cpu(cpu)
272 if (likely(per_cpu(hwb_tracer, cpu)))
273 ds_resume_bts(per_cpu(hwb_tracer, cpu));
274 put_online_cpus();
275}
276
277static void trace_bts_close(struct trace_iterator *iter)
278{
279 tracing_reset_online_cpus(iter->tr);
280}
281
282void trace_hw_branch_oops(void)
283{
284 if (this_tracer) {
285 ds_suspend_bts_noirq(this_tracer);
286 trace_bts_cpu(hw_branch_trace);
287 ds_resume_bts_noirq(this_tracer);
288 }
289}
290
291struct tracer bts_tracer __read_mostly =
292{
293 .name = "hw-branch-tracer",
294 .init = bts_trace_init,
295 .reset = bts_trace_reset,
296 .print_header = bts_trace_print_header,
297 .print_line = bts_trace_print_line,
298 .start = bts_trace_start,
299 .stop = bts_trace_stop,
300 .open = trace_bts_prepare,
301 .close = trace_bts_close,
302#ifdef CONFIG_FTRACE_SELFTEST
303 .selftest = trace_selftest_startup_hw_branches,
304#endif /* CONFIG_FTRACE_SELFTEST */
305};
306
307__init static int init_bts_trace(void)
308{
309 register_hotcpu_notifier(&bts_hotcpu_notifier);
310 return register_tracer(&bts_tracer);
311}
312device_initcall(init_bts_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 2974bc7538c7..6fd486e0cef4 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -34,6 +34,9 @@ static int trace_type __read_mostly;
34 34
35static int save_lat_flag; 35static int save_lat_flag;
36 36
37static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
38static int start_irqsoff_tracer(struct trace_array *tr, int graph);
39
37#ifdef CONFIG_PREEMPT_TRACER 40#ifdef CONFIG_PREEMPT_TRACER
38static inline int 41static inline int
39preempt_trace(void) 42preempt_trace(void)
@@ -55,6 +58,23 @@ irq_trace(void)
55# define irq_trace() (0) 58# define irq_trace() (0)
56#endif 59#endif
57 60
61#define TRACE_DISPLAY_GRAPH 1
62
63static struct tracer_opt trace_opts[] = {
64#ifdef CONFIG_FUNCTION_GRAPH_TRACER
65 /* display latency trace as call graph */
66 { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) },
67#endif
68 { } /* Empty entry */
69};
70
71static struct tracer_flags tracer_flags = {
72 .val = 0,
73 .opts = trace_opts,
74};
75
76#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH)
77
58/* 78/*
59 * Sequence count - we record it when starting a measurement and 79 * Sequence count - we record it when starting a measurement and
60 * skip the latency if the sequence has changed - some other section 80 * skip the latency if the sequence has changed - some other section
@@ -108,6 +128,202 @@ static struct ftrace_ops trace_ops __read_mostly =
108}; 128};
109#endif /* CONFIG_FUNCTION_TRACER */ 129#endif /* CONFIG_FUNCTION_TRACER */
110 130
131#ifdef CONFIG_FUNCTION_GRAPH_TRACER
132static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
133{
134 int cpu;
135
136 if (!(bit & TRACE_DISPLAY_GRAPH))
137 return -EINVAL;
138
139 if (!(is_graph() ^ set))
140 return 0;
141
142 stop_irqsoff_tracer(irqsoff_trace, !set);
143
144 for_each_possible_cpu(cpu)
145 per_cpu(tracing_cpu, cpu) = 0;
146
147 tracing_max_latency = 0;
148 tracing_reset_online_cpus(irqsoff_trace);
149
150 return start_irqsoff_tracer(irqsoff_trace, set);
151}
152
153static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
154{
155 struct trace_array *tr = irqsoff_trace;
156 struct trace_array_cpu *data;
157 unsigned long flags;
158 long disabled;
159 int ret;
160 int cpu;
161 int pc;
162
163 cpu = raw_smp_processor_id();
164 if (likely(!per_cpu(tracing_cpu, cpu)))
165 return 0;
166
167 local_save_flags(flags);
168 /* slight chance to get a false positive on tracing_cpu */
169 if (!irqs_disabled_flags(flags))
170 return 0;
171
172 data = tr->data[cpu];
173 disabled = atomic_inc_return(&data->disabled);
174
175 if (likely(disabled == 1)) {
176 pc = preempt_count();
177 ret = __trace_graph_entry(tr, trace, flags, pc);
178 } else
179 ret = 0;
180
181 atomic_dec(&data->disabled);
182 return ret;
183}
184
185static void irqsoff_graph_return(struct ftrace_graph_ret *trace)
186{
187 struct trace_array *tr = irqsoff_trace;
188 struct trace_array_cpu *data;
189 unsigned long flags;
190 long disabled;
191 int cpu;
192 int pc;
193
194 cpu = raw_smp_processor_id();
195 if (likely(!per_cpu(tracing_cpu, cpu)))
196 return;
197
198 local_save_flags(flags);
199 /* slight chance to get a false positive on tracing_cpu */
200 if (!irqs_disabled_flags(flags))
201 return;
202
203 data = tr->data[cpu];
204 disabled = atomic_inc_return(&data->disabled);
205
206 if (likely(disabled == 1)) {
207 pc = preempt_count();
208 __trace_graph_return(tr, trace, flags, pc);
209 }
210
211 atomic_dec(&data->disabled);
212}
213
214static void irqsoff_trace_open(struct trace_iterator *iter)
215{
216 if (is_graph())
217 graph_trace_open(iter);
218
219}
220
221static void irqsoff_trace_close(struct trace_iterator *iter)
222{
223 if (iter->private)
224 graph_trace_close(iter);
225}
226
227#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_CPU | \
228 TRACE_GRAPH_PRINT_PROC)
229
230static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
231{
232 u32 flags = GRAPH_TRACER_FLAGS;
233
234 if (trace_flags & TRACE_ITER_LATENCY_FMT)
235 flags |= TRACE_GRAPH_PRINT_DURATION;
236 else
237 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
238
239 /*
240 * In graph mode call the graph tracer output function,
241 * otherwise go with the TRACE_FN event handler
242 */
243 if (is_graph())
244 return print_graph_function_flags(iter, flags);
245
246 return TRACE_TYPE_UNHANDLED;
247}
248
249static void irqsoff_print_header(struct seq_file *s)
250{
251 if (is_graph()) {
252 struct trace_iterator *iter = s->private;
253 u32 flags = GRAPH_TRACER_FLAGS;
254
255 if (trace_flags & TRACE_ITER_LATENCY_FMT) {
256 /* print nothing if the buffers are empty */
257 if (trace_empty(iter))
258 return;
259
260 print_trace_header(s, iter);
261 flags |= TRACE_GRAPH_PRINT_DURATION;
262 } else
263 flags |= TRACE_GRAPH_PRINT_ABS_TIME;
264
265 print_graph_headers_flags(s, flags);
266 } else
267 trace_default_header(s);
268}
269
270static void
271trace_graph_function(struct trace_array *tr,
272 unsigned long ip, unsigned long flags, int pc)
273{
274 u64 time = trace_clock_local();
275 struct ftrace_graph_ent ent = {
276 .func = ip,
277 .depth = 0,
278 };
279 struct ftrace_graph_ret ret = {
280 .func = ip,
281 .depth = 0,
282 .calltime = time,
283 .rettime = time,
284 };
285
286 __trace_graph_entry(tr, &ent, flags, pc);
287 __trace_graph_return(tr, &ret, flags, pc);
288}
289
290static void
291__trace_function(struct trace_array *tr,
292 unsigned long ip, unsigned long parent_ip,
293 unsigned long flags, int pc)
294{
295 if (!is_graph())
296 trace_function(tr, ip, parent_ip, flags, pc);
297 else {
298 trace_graph_function(tr, parent_ip, flags, pc);
299 trace_graph_function(tr, ip, flags, pc);
300 }
301}
302
303#else
304#define __trace_function trace_function
305
306static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
307{
308 return -EINVAL;
309}
310
311static int irqsoff_graph_entry(struct ftrace_graph_ent *trace)
312{
313 return -1;
314}
315
316static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
317{
318 return TRACE_TYPE_UNHANDLED;
319}
320
321static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
322static void irqsoff_print_header(struct seq_file *s) { }
323static void irqsoff_trace_open(struct trace_iterator *iter) { }
324static void irqsoff_trace_close(struct trace_iterator *iter) { }
325#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
326
111/* 327/*
112 * Should this new latency be reported/recorded? 328 * Should this new latency be reported/recorded?
113 */ 329 */
@@ -150,7 +366,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 366 if (!report_latency(delta))
151 goto out_unlock; 367 goto out_unlock;
152 368
153 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 369 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
154 /* Skip 5 functions to get to the irq/preempt enable function */ 370 /* Skip 5 functions to get to the irq/preempt enable function */
155 __trace_stack(tr, flags, 5, pc); 371 __trace_stack(tr, flags, 5, pc);
156 372
@@ -172,7 +388,7 @@ out_unlock:
172out: 388out:
173 data->critical_sequence = max_sequence; 389 data->critical_sequence = max_sequence;
174 data->preempt_timestamp = ftrace_now(cpu); 390 data->preempt_timestamp = ftrace_now(cpu);
175 trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc); 391 __trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
176} 392}
177 393
178static inline void 394static inline void
@@ -204,7 +420,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
204 420
205 local_save_flags(flags); 421 local_save_flags(flags);
206 422
207 trace_function(tr, ip, parent_ip, flags, preempt_count()); 423 __trace_function(tr, ip, parent_ip, flags, preempt_count());
208 424
209 per_cpu(tracing_cpu, cpu) = 1; 425 per_cpu(tracing_cpu, cpu) = 1;
210 426
@@ -238,7 +454,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
238 atomic_inc(&data->disabled); 454 atomic_inc(&data->disabled);
239 455
240 local_save_flags(flags); 456 local_save_flags(flags);
241 trace_function(tr, ip, parent_ip, flags, preempt_count()); 457 __trace_function(tr, ip, parent_ip, flags, preempt_count());
242 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 458 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
243 data->critical_start = 0; 459 data->critical_start = 0;
244 atomic_dec(&data->disabled); 460 atomic_dec(&data->disabled);
@@ -347,19 +563,32 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
347} 563}
348#endif /* CONFIG_PREEMPT_TRACER */ 564#endif /* CONFIG_PREEMPT_TRACER */
349 565
350static void start_irqsoff_tracer(struct trace_array *tr) 566static int start_irqsoff_tracer(struct trace_array *tr, int graph)
351{ 567{
352 register_ftrace_function(&trace_ops); 568 int ret = 0;
353 if (tracing_is_enabled()) 569
570 if (!graph)
571 ret = register_ftrace_function(&trace_ops);
572 else
573 ret = register_ftrace_graph(&irqsoff_graph_return,
574 &irqsoff_graph_entry);
575
576 if (!ret && tracing_is_enabled())
354 tracer_enabled = 1; 577 tracer_enabled = 1;
355 else 578 else
356 tracer_enabled = 0; 579 tracer_enabled = 0;
580
581 return ret;
357} 582}
358 583
359static void stop_irqsoff_tracer(struct trace_array *tr) 584static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
360{ 585{
361 tracer_enabled = 0; 586 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops); 587
588 if (!graph)
589 unregister_ftrace_function(&trace_ops);
590 else
591 unregister_ftrace_graph();
363} 592}
364 593
365static void __irqsoff_tracer_init(struct trace_array *tr) 594static void __irqsoff_tracer_init(struct trace_array *tr)
@@ -372,12 +601,14 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
372 /* make sure that the tracer is visible */ 601 /* make sure that the tracer is visible */
373 smp_wmb(); 602 smp_wmb();
374 tracing_reset_online_cpus(tr); 603 tracing_reset_online_cpus(tr);
375 start_irqsoff_tracer(tr); 604
605 if (start_irqsoff_tracer(tr, is_graph()))
606 printk(KERN_ERR "failed to start irqsoff tracer\n");
376} 607}
377 608
378static void irqsoff_tracer_reset(struct trace_array *tr) 609static void irqsoff_tracer_reset(struct trace_array *tr)
379{ 610{
380 stop_irqsoff_tracer(tr); 611 stop_irqsoff_tracer(tr, is_graph());
381 612
382 if (!save_lat_flag) 613 if (!save_lat_flag)
383 trace_flags &= ~TRACE_ITER_LATENCY_FMT; 614 trace_flags &= ~TRACE_ITER_LATENCY_FMT;
@@ -409,9 +640,15 @@ static struct tracer irqsoff_tracer __read_mostly =
409 .start = irqsoff_tracer_start, 640 .start = irqsoff_tracer_start,
410 .stop = irqsoff_tracer_stop, 641 .stop = irqsoff_tracer_stop,
411 .print_max = 1, 642 .print_max = 1,
643 .print_header = irqsoff_print_header,
644 .print_line = irqsoff_print_line,
645 .flags = &tracer_flags,
646 .set_flag = irqsoff_set_flag,
412#ifdef CONFIG_FTRACE_SELFTEST 647#ifdef CONFIG_FTRACE_SELFTEST
413 .selftest = trace_selftest_startup_irqsoff, 648 .selftest = trace_selftest_startup_irqsoff,
414#endif 649#endif
650 .open = irqsoff_trace_open,
651 .close = irqsoff_trace_close,
415}; 652};
416# define register_irqsoff(trace) register_tracer(&trace) 653# define register_irqsoff(trace) register_tracer(&trace)
417#else 654#else
@@ -435,9 +672,15 @@ static struct tracer preemptoff_tracer __read_mostly =
435 .start = irqsoff_tracer_start, 672 .start = irqsoff_tracer_start,
436 .stop = irqsoff_tracer_stop, 673 .stop = irqsoff_tracer_stop,
437 .print_max = 1, 674 .print_max = 1,
675 .print_header = irqsoff_print_header,
676 .print_line = irqsoff_print_line,
677 .flags = &tracer_flags,
678 .set_flag = irqsoff_set_flag,
438#ifdef CONFIG_FTRACE_SELFTEST 679#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_preemptoff, 680 .selftest = trace_selftest_startup_preemptoff,
440#endif 681#endif
682 .open = irqsoff_trace_open,
683 .close = irqsoff_trace_close,
441}; 684};
442# define register_preemptoff(trace) register_tracer(&trace) 685# define register_preemptoff(trace) register_tracer(&trace)
443#else 686#else
@@ -463,9 +706,15 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
463 .start = irqsoff_tracer_start, 706 .start = irqsoff_tracer_start,
464 .stop = irqsoff_tracer_stop, 707 .stop = irqsoff_tracer_stop,
465 .print_max = 1, 708 .print_max = 1,
709 .print_header = irqsoff_print_header,
710 .print_line = irqsoff_print_line,
711 .flags = &tracer_flags,
712 .set_flag = irqsoff_set_flag,
466#ifdef CONFIG_FTRACE_SELFTEST 713#ifdef CONFIG_FTRACE_SELFTEST
467 .selftest = trace_selftest_startup_preemptirqsoff, 714 .selftest = trace_selftest_startup_preemptirqsoff,
468#endif 715#endif
716 .open = irqsoff_trace_open,
717 .close = irqsoff_trace_close,
469}; 718};
470 719
471# define register_preemptirqsoff(trace) register_tracer(&trace) 720# define register_preemptirqsoff(trace) register_tracer(&trace)
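
From user space the new graph mode is reached through the tracer's option file. A small sketch, assuming debugfs is mounted at /sys/kernel/debug; only the option name display-graph comes from the hunk above:

/* Userspace sketch: select the irqsoff tracer and enable graph output. */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	write_str("/sys/kernel/debug/tracing/current_tracer", "irqsoff");
	write_str("/sys/kernel/debug/tracing/options/display-graph", "1");
	return 0;
}
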
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 1251e367bae9..f52b5f50299d 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -29,6 +29,8 @@
29#include <linux/ctype.h> 29#include <linux/ctype.h>
30#include <linux/ptrace.h> 30#include <linux/ptrace.h>
31#include <linux/perf_event.h> 31#include <linux/perf_event.h>
32#include <linux/stringify.h>
33#include <asm/bitsperlong.h>
32 34
33#include "trace.h" 35#include "trace.h"
34#include "trace_output.h" 36#include "trace_output.h"
@@ -40,7 +42,6 @@
40 42
41/* Reserved field names */ 43/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip" 44#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip" 45#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func" 46#define FIELD_STRING_FUNC "__probe_func"
46 47
@@ -52,56 +53,102 @@ const char *reserved_field_names[] = {
52 "common_tgid", 53 "common_tgid",
53 "common_lock_depth", 54 "common_lock_depth",
54 FIELD_STRING_IP, 55 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP, 56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC, 57 FIELD_STRING_FUNC,
58}; 58};
59 59
60struct fetch_func { 60/* Printing function type */
61 unsigned long (*func)(struct pt_regs *, void *); 61typedef int (*print_type_func_t)(struct trace_seq *, const char *, void *);
62#define PRINT_TYPE_FUNC_NAME(type) print_type_##type
63#define PRINT_TYPE_FMT_NAME(type) print_type_format_##type
64
65/* Printing in basic type function template */
66#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt, cast) \
67static __kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
68 const char *name, void *data)\
69{ \
70 return trace_seq_printf(s, " %s=" fmt, name, (cast)*(type *)data);\
71} \
72static const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
73
74DEFINE_BASIC_PRINT_TYPE_FUNC(u8, "%x", unsigned int)
75DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "%x", unsigned int)
76DEFINE_BASIC_PRINT_TYPE_FUNC(u32, "%lx", unsigned long)
77DEFINE_BASIC_PRINT_TYPE_FUNC(u64, "%llx", unsigned long long)
78DEFINE_BASIC_PRINT_TYPE_FUNC(s8, "%d", int)
79DEFINE_BASIC_PRINT_TYPE_FUNC(s16, "%d", int)
80DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%ld", long)
81DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%lld", long long)
82
83/* Data fetch function type */
84typedef void (*fetch_func_t)(struct pt_regs *, void *, void *);
85
86struct fetch_param {
87 fetch_func_t fn;
62 void *data; 88 void *data;
63}; 89};
64 90
65static __kprobes unsigned long call_fetch(struct fetch_func *f, 91static __kprobes void call_fetch(struct fetch_param *fprm,
66 struct pt_regs *regs) 92 struct pt_regs *regs, void *dest)
67{ 93{
68 return f->func(regs, f->data); 94 return fprm->fn(regs, fprm->data, dest);
69} 95}
70 96
71/* fetch handlers */ 97#define FETCH_FUNC_NAME(kind, type) fetch_##kind##_##type
72static __kprobes unsigned long fetch_register(struct pt_regs *regs, 98/*
73 void *offset) 99 * Define macro for basic types - we don't need to define s* types, because
74{ 100 * we have to care only about bitwidth at recording time.
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset)); 101 */
102#define DEFINE_BASIC_FETCH_FUNCS(kind) \
103DEFINE_FETCH_##kind(u8) \
104DEFINE_FETCH_##kind(u16) \
105DEFINE_FETCH_##kind(u32) \
106DEFINE_FETCH_##kind(u64)
107
108#define CHECK_BASIC_FETCH_FUNCS(kind, fn) \
109 ((FETCH_FUNC_NAME(kind, u8) == fn) || \
110 (FETCH_FUNC_NAME(kind, u16) == fn) || \
111 (FETCH_FUNC_NAME(kind, u32) == fn) || \
112 (FETCH_FUNC_NAME(kind, u64) == fn))
113
114/* Data fetch function templates */
115#define DEFINE_FETCH_reg(type) \
116static __kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
117 void *offset, void *dest) \
118{ \
119 *(type *)dest = (type)regs_get_register(regs, \
120 (unsigned int)((unsigned long)offset)); \
76} 121}
77 122DEFINE_BASIC_FETCH_FUNCS(reg)
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs, 123
79 void *num) 124#define DEFINE_FETCH_stack(type) \
80{ 125static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
81 return regs_get_kernel_stack_nth(regs, 126 void *offset, void *dest) \
82 (unsigned int)((unsigned long)num)); 127{ \
128 *(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
129 (unsigned int)((unsigned long)offset)); \
83} 130}
131DEFINE_BASIC_FETCH_FUNCS(stack)
84 132
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) 133#define DEFINE_FETCH_retval(type) \
86{ 134static __kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs,\
87 unsigned long retval; 135 void *dummy, void *dest) \
88 136{ \
89 if (probe_kernel_address(addr, retval)) 137 *(type *)dest = (type)regs_return_value(regs); \
90 return 0;
91 return retval;
92} 138}
93 139DEFINE_BASIC_FETCH_FUNCS(retval)
94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 140
95 void *dummy) 141#define DEFINE_FETCH_memory(type) \
96{ 142static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
97 return regs_return_value(regs); 143 void *addr, void *dest) \
98} 144{ \
99 145 type retval; \
100static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, 146 if (probe_kernel_address(addr, retval)) \
101 void *dummy) 147 *(type *)dest = 0; \
102{ 148 else \
103 return kernel_stack_pointer(regs); 149 *(type *)dest = retval; \
104} 150}
151DEFINE_BASIC_FETCH_FUNCS(memory)
105 152
106/* Memory fetching by symbol */ 153/* Memory fetching by symbol */
107struct symbol_cache { 154struct symbol_cache {
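
The single unsigned-long fetch_func is replaced here by per-width fetch functions generated from templates, which write through a destination pointer instead of returning a value. A sketch of what one instantiation expands to and how a caller inside trace_kprobe.c would invoke it, assuming the fetch_param was set up with a u32 fetcher:

/* Roughly the expansion of DEFINE_FETCH_reg(u32). */
static __kprobes void fetch_reg_u32(struct pt_regs *regs,
				    void *offset, void *dest)
{
	*(u32 *)dest = (u32)regs_get_register(regs,
				(unsigned int)((unsigned long)offset));
}

/* Caller side: the value comes back through the last argument. */
static void fetch_one_u32(struct fetch_param *fprm, struct pt_regs *regs,
			  u32 *val)
{
	call_fetch(fprm, regs, val);	/* fprm->fn(regs, fprm->data, val) */
}
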
@@ -145,51 +192,126 @@ static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
145 return sc; 192 return sc;
146} 193}
147 194
148static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) 195#define DEFINE_FETCH_symbol(type) \
149{ 196static __kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs,\
150 struct symbol_cache *sc = data; 197 void *data, void *dest) \
151 198{ \
152 if (sc->addr) 199 struct symbol_cache *sc = data; \
153 return fetch_memory(regs, (void *)sc->addr); 200 if (sc->addr) \
154 else 201 fetch_memory_##type(regs, (void *)sc->addr, dest); \
155 return 0; 202 else \
203 *(type *)dest = 0; \
156} 204}
205DEFINE_BASIC_FETCH_FUNCS(symbol)
157 206
158/* Special indirect memory access interface */ 207/* Dereference memory access function */
159struct indirect_fetch_data { 208struct deref_fetch_param {
160 struct fetch_func orig; 209 struct fetch_param orig;
161 long offset; 210 long offset;
162}; 211};
163 212
164static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) 213#define DEFINE_FETCH_deref(type) \
165{ 214static __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs,\
166 struct indirect_fetch_data *ind = data; 215 void *data, void *dest) \
167 unsigned long addr; 216{ \
168 217 struct deref_fetch_param *dprm = data; \
169 addr = call_fetch(&ind->orig, regs); 218 unsigned long addr; \
170 if (addr) { 219 call_fetch(&dprm->orig, regs, &addr); \
171 addr += ind->offset; 220 if (addr) { \
172 return fetch_memory(regs, (void *)addr); 221 addr += dprm->offset; \
173 } else 222 fetch_memory_##type(regs, (void *)addr, dest); \
174 return 0; 223 } else \
224 *(type *)dest = 0; \
175} 225}
226DEFINE_BASIC_FETCH_FUNCS(deref)
176 227
177static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) 228static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
178{ 229{
179 if (data->orig.func == fetch_indirect) 230 if (CHECK_BASIC_FETCH_FUNCS(deref, data->orig.fn))
180 free_indirect_fetch_data(data->orig.data); 231 free_deref_fetch_param(data->orig.data);
181 else if (data->orig.func == fetch_symbol) 232 else if (CHECK_BASIC_FETCH_FUNCS(symbol, data->orig.fn))
182 free_symbol_cache(data->orig.data); 233 free_symbol_cache(data->orig.data);
183 kfree(data); 234 kfree(data);
184} 235}
185 236
237/* Default (unsigned long) fetch type */
238#define __DEFAULT_FETCH_TYPE(t) u##t
239#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
240#define DEFAULT_FETCH_TYPE _DEFAULT_FETCH_TYPE(BITS_PER_LONG)
241#define DEFAULT_FETCH_TYPE_STR __stringify(DEFAULT_FETCH_TYPE)
242
243#define ASSIGN_FETCH_FUNC(kind, type) \
244 .kind = FETCH_FUNC_NAME(kind, type)
245
246#define ASSIGN_FETCH_TYPE(ptype, ftype, sign) \
247 {.name = #ptype, \
248 .size = sizeof(ftype), \
249 .is_signed = sign, \
250 .print = PRINT_TYPE_FUNC_NAME(ptype), \
251 .fmt = PRINT_TYPE_FMT_NAME(ptype), \
252ASSIGN_FETCH_FUNC(reg, ftype), \
253ASSIGN_FETCH_FUNC(stack, ftype), \
254ASSIGN_FETCH_FUNC(retval, ftype), \
255ASSIGN_FETCH_FUNC(memory, ftype), \
256ASSIGN_FETCH_FUNC(symbol, ftype), \
257ASSIGN_FETCH_FUNC(deref, ftype), \
258 }
259
260/* Fetch type information table */
261static const struct fetch_type {
262 const char *name; /* Name of type */
263 size_t size; /* Byte size of type */
264 int is_signed; /* Signed flag */
265 print_type_func_t print; /* Print functions */
 266	const char	*fmt;			/* Format string */
267 /* Fetch functions */
268 fetch_func_t reg;
269 fetch_func_t stack;
270 fetch_func_t retval;
271 fetch_func_t memory;
272 fetch_func_t symbol;
273 fetch_func_t deref;
274} fetch_type_table[] = {
275 ASSIGN_FETCH_TYPE(u8, u8, 0),
276 ASSIGN_FETCH_TYPE(u16, u16, 0),
277 ASSIGN_FETCH_TYPE(u32, u32, 0),
278 ASSIGN_FETCH_TYPE(u64, u64, 0),
279 ASSIGN_FETCH_TYPE(s8, u8, 1),
280 ASSIGN_FETCH_TYPE(s16, u16, 1),
281 ASSIGN_FETCH_TYPE(s32, u32, 1),
282 ASSIGN_FETCH_TYPE(s64, u64, 1),
283};
284
285static const struct fetch_type *find_fetch_type(const char *type)
286{
287 int i;
288
289 if (!type)
290 type = DEFAULT_FETCH_TYPE_STR;
291
292 for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
293 if (strcmp(type, fetch_type_table[i].name) == 0)
294 return &fetch_type_table[i];
295 return NULL;
296}
297
298/* Special function : only accept unsigned long */
299static __kprobes void fetch_stack_address(struct pt_regs *regs,
300 void *dummy, void *dest)
301{
302 *(unsigned long *)dest = kernel_stack_pointer(regs);
303}
304
186/** 305/**
187 * Kprobe event core functions 306 * Kprobe event core functions
188 */ 307 */
189 308
190struct probe_arg { 309struct probe_arg {
191 struct fetch_func fetch; 310 struct fetch_param fetch;
192 const char *name; 311 unsigned int offset; /* Offset from argument entry */
312 const char *name; /* Name of this argument */
313 const char *comm; /* Command of this argument */
314 const struct fetch_type *type; /* Type of this argument */
193}; 315};
194 316
195/* Flags for trace_probe */ 317/* Flags for trace_probe */
@@ -202,8 +324,9 @@ struct trace_probe {
202 unsigned long nhit; 324 unsigned long nhit;
203 unsigned int flags; /* For TP_FLAG_* */ 325 unsigned int flags; /* For TP_FLAG_* */
204 const char *symbol; /* symbol name */ 326 const char *symbol; /* symbol name */
327 struct ftrace_event_class class;
205 struct ftrace_event_call call; 328 struct ftrace_event_call call;
206 struct trace_event event; 329 ssize_t size; /* trace entry size */
207 unsigned int nr_args; 330 unsigned int nr_args;
208 struct probe_arg args[]; 331 struct probe_arg args[];
209}; 332};
@@ -212,6 +335,7 @@ struct trace_probe {
212 (offsetof(struct trace_probe, args) + \ 335 (offsetof(struct trace_probe, args) + \
213 (sizeof(struct probe_arg) * (n))) 336 (sizeof(struct probe_arg) * (n)))
214 337
338
215static __kprobes int probe_is_return(struct trace_probe *tp) 339static __kprobes int probe_is_return(struct trace_probe *tp)
216{ 340{
217 return tp->rp.handler != NULL; 341 return tp->rp.handler != NULL;
@@ -222,49 +346,6 @@ static __kprobes const char *probe_symbol(struct trace_probe *tp)
222 return tp->symbol ? tp->symbol : "unknown"; 346 return tp->symbol ? tp->symbol : "unknown";
223} 347}
224 348
225static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
226{
227 int ret = -EINVAL;
228
229 if (ff->func == fetch_register) {
230 const char *name;
231 name = regs_query_register_name((unsigned int)((long)ff->data));
232 ret = snprintf(buf, n, "%%%s", name);
233 } else if (ff->func == fetch_stack)
234 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
235 else if (ff->func == fetch_memory)
236 ret = snprintf(buf, n, "@0x%p", ff->data);
237 else if (ff->func == fetch_symbol) {
238 struct symbol_cache *sc = ff->data;
239 if (sc->offset)
240 ret = snprintf(buf, n, "@%s%+ld", sc->symbol,
241 sc->offset);
242 else
243 ret = snprintf(buf, n, "@%s", sc->symbol);
244 } else if (ff->func == fetch_retvalue)
245 ret = snprintf(buf, n, "$retval");
246 else if (ff->func == fetch_stack_address)
247 ret = snprintf(buf, n, "$stack");
248 else if (ff->func == fetch_indirect) {
249 struct indirect_fetch_data *id = ff->data;
250 size_t l = 0;
251 ret = snprintf(buf, n, "%+ld(", id->offset);
252 if (ret >= n)
253 goto end;
254 l += ret;
255 ret = probe_arg_string(buf + l, n - l, &id->orig);
256 if (ret < 0)
257 goto end;
258 l += ret;
259 ret = snprintf(buf + l, n - l, ")");
260 ret += l;
261 }
262end:
263 if (ret >= n)
264 return -ENOSPC;
265 return ret;
266}
267
268static int register_probe_event(struct trace_probe *tp); 349static int register_probe_event(struct trace_probe *tp);
269static void unregister_probe_event(struct trace_probe *tp); 350static void unregister_probe_event(struct trace_probe *tp);
270 351
@@ -323,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group,
323 goto error; 404 goto error;
324 } 405 }
325 406
407 tp->call.class = &tp->class;
326 tp->call.name = kstrdup(event, GFP_KERNEL); 408 tp->call.name = kstrdup(event, GFP_KERNEL);
327 if (!tp->call.name) 409 if (!tp->call.name)
328 goto error; 410 goto error;
@@ -332,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group,
332 goto error; 414 goto error;
333 } 415 }
334 416
335 tp->call.system = kstrdup(group, GFP_KERNEL); 417 tp->class.system = kstrdup(group, GFP_KERNEL);
336 if (!tp->call.system) 418 if (!tp->class.system)
337 goto error; 419 goto error;
338 420
339 INIT_LIST_HEAD(&tp->list); 421 INIT_LIST_HEAD(&tp->list);
@@ -347,11 +429,12 @@ error:
347 429
348static void free_probe_arg(struct probe_arg *arg) 430static void free_probe_arg(struct probe_arg *arg)
349{ 431{
350 if (arg->fetch.func == fetch_symbol) 432 if (CHECK_BASIC_FETCH_FUNCS(deref, arg->fetch.fn))
433 free_deref_fetch_param(arg->fetch.data);
434 else if (CHECK_BASIC_FETCH_FUNCS(symbol, arg->fetch.fn))
351 free_symbol_cache(arg->fetch.data); 435 free_symbol_cache(arg->fetch.data);
352 else if (arg->fetch.func == fetch_indirect)
353 free_indirect_fetch_data(arg->fetch.data);
354 kfree(arg->name); 436 kfree(arg->name);
437 kfree(arg->comm);
355} 438}
356 439
357static void free_trace_probe(struct trace_probe *tp) 440static void free_trace_probe(struct trace_probe *tp)
@@ -361,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp)
361 for (i = 0; i < tp->nr_args; i++) 444 for (i = 0; i < tp->nr_args; i++)
362 free_probe_arg(&tp->args[i]); 445 free_probe_arg(&tp->args[i]);
363 446
364 kfree(tp->call.system); 447 kfree(tp->call.class->system);
365 kfree(tp->call.name); 448 kfree(tp->call.name);
366 kfree(tp->symbol); 449 kfree(tp->symbol);
367 kfree(tp); 450 kfree(tp);
@@ -374,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event,
374 457
375 list_for_each_entry(tp, &probe_list, list) 458 list_for_each_entry(tp, &probe_list, list)
376 if (strcmp(tp->call.name, event) == 0 && 459 if (strcmp(tp->call.name, event) == 0 &&
377 strcmp(tp->call.system, group) == 0) 460 strcmp(tp->call.class->system, group) == 0)
378 return tp; 461 return tp;
379 return NULL; 462 return NULL;
380} 463}
@@ -399,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp)
399 mutex_lock(&probe_lock); 482 mutex_lock(&probe_lock);
400 483
401 /* register as an event */ 484 /* register as an event */
402 old_tp = find_probe_event(tp->call.name, tp->call.system); 485 old_tp = find_probe_event(tp->call.name, tp->call.class->system);
403 if (old_tp) { 486 if (old_tp) {
404 /* delete old event */ 487 /* delete old event */
405 unregister_trace_probe(old_tp); 488 unregister_trace_probe(old_tp);
@@ -457,28 +540,30 @@ static int split_symbol_offset(char *symbol, unsigned long *offset)
457#define PARAM_MAX_ARGS 16 540#define PARAM_MAX_ARGS 16
458#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) 541#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
459 542
460static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) 543static int parse_probe_vars(char *arg, const struct fetch_type *t,
544 struct fetch_param *f, int is_return)
461{ 545{
462 int ret = 0; 546 int ret = 0;
463 unsigned long param; 547 unsigned long param;
464 548
465 if (strcmp(arg, "retval") == 0) { 549 if (strcmp(arg, "retval") == 0) {
466 if (is_return) { 550 if (is_return)
467 ff->func = fetch_retvalue; 551 f->fn = t->retval;
468 ff->data = NULL; 552 else
469 } else
470 ret = -EINVAL; 553 ret = -EINVAL;
471 } else if (strncmp(arg, "stack", 5) == 0) { 554 } else if (strncmp(arg, "stack", 5) == 0) {
472 if (arg[5] == '\0') { 555 if (arg[5] == '\0') {
473 ff->func = fetch_stack_address; 556 if (strcmp(t->name, DEFAULT_FETCH_TYPE_STR) == 0)
474 ff->data = NULL; 557 f->fn = fetch_stack_address;
558 else
559 ret = -EINVAL;
475 } else if (isdigit(arg[5])) { 560 } else if (isdigit(arg[5])) {
476 ret = strict_strtoul(arg + 5, 10, &param); 561 ret = strict_strtoul(arg + 5, 10, &param);
477 if (ret || param > PARAM_MAX_STACK) 562 if (ret || param > PARAM_MAX_STACK)
478 ret = -EINVAL; 563 ret = -EINVAL;
479 else { 564 else {
480 ff->func = fetch_stack; 565 f->fn = t->stack;
481 ff->data = (void *)param; 566 f->data = (void *)param;
482 } 567 }
483 } else 568 } else
484 ret = -EINVAL; 569 ret = -EINVAL;
@@ -488,7 +573,8 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
488} 573}
489 574
490/* Recursive argument parser */ 575/* Recursive argument parser */
491static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 576static int __parse_probe_arg(char *arg, const struct fetch_type *t,
577 struct fetch_param *f, int is_return)
492{ 578{
493 int ret = 0; 579 int ret = 0;
494 unsigned long param; 580 unsigned long param;
@@ -497,13 +583,13 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
497 583
498 switch (arg[0]) { 584 switch (arg[0]) {
499 case '$': 585 case '$':
500 ret = parse_probe_vars(arg + 1, ff, is_return); 586 ret = parse_probe_vars(arg + 1, t, f, is_return);
501 break; 587 break;
502 case '%': /* named register */ 588 case '%': /* named register */
503 ret = regs_query_register_offset(arg + 1); 589 ret = regs_query_register_offset(arg + 1);
504 if (ret >= 0) { 590 if (ret >= 0) {
505 ff->func = fetch_register; 591 f->fn = t->reg;
506 ff->data = (void *)(unsigned long)ret; 592 f->data = (void *)(unsigned long)ret;
507 ret = 0; 593 ret = 0;
508 } 594 }
509 break; 595 break;
@@ -512,26 +598,22 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
512 ret = strict_strtoul(arg + 1, 0, &param); 598 ret = strict_strtoul(arg + 1, 0, &param);
513 if (ret) 599 if (ret)
514 break; 600 break;
515 ff->func = fetch_memory; 601 f->fn = t->memory;
516 ff->data = (void *)param; 602 f->data = (void *)param;
517 } else { 603 } else {
518 ret = split_symbol_offset(arg + 1, &offset); 604 ret = split_symbol_offset(arg + 1, &offset);
519 if (ret) 605 if (ret)
520 break; 606 break;
521 ff->data = alloc_symbol_cache(arg + 1, offset); 607 f->data = alloc_symbol_cache(arg + 1, offset);
522 if (ff->data) 608 if (f->data)
523 ff->func = fetch_symbol; 609 f->fn = t->symbol;
524 else
525 ret = -EINVAL;
526 } 610 }
527 break; 611 break;
528 case '+': /* indirect memory */ 612 case '+': /* deref memory */
529 case '-': 613 case '-':
530 tmp = strchr(arg, '('); 614 tmp = strchr(arg, '(');
531 if (!tmp) { 615 if (!tmp)
532 ret = -EINVAL;
533 break; 616 break;
534 }
535 *tmp = '\0'; 617 *tmp = '\0';
536 ret = strict_strtol(arg + 1, 0, &offset); 618 ret = strict_strtol(arg + 1, 0, &offset);
537 if (ret) 619 if (ret)
@@ -541,38 +623,58 @@ static int __parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
541 arg = tmp + 1; 623 arg = tmp + 1;
542 tmp = strrchr(arg, ')'); 624 tmp = strrchr(arg, ')');
543 if (tmp) { 625 if (tmp) {
544 struct indirect_fetch_data *id; 626 struct deref_fetch_param *dprm;
627 const struct fetch_type *t2 = find_fetch_type(NULL);
545 *tmp = '\0'; 628 *tmp = '\0';
546 id = kzalloc(sizeof(struct indirect_fetch_data), 629 dprm = kzalloc(sizeof(struct deref_fetch_param),
547 GFP_KERNEL); 630 GFP_KERNEL);
548 if (!id) 631 if (!dprm)
549 return -ENOMEM; 632 return -ENOMEM;
550 id->offset = offset; 633 dprm->offset = offset;
551 ret = __parse_probe_arg(arg, &id->orig, is_return); 634 ret = __parse_probe_arg(arg, t2, &dprm->orig,
635 is_return);
552 if (ret) 636 if (ret)
553 kfree(id); 637 kfree(dprm);
554 else { 638 else {
555 ff->func = fetch_indirect; 639 f->fn = t->deref;
556 ff->data = (void *)id; 640 f->data = (void *)dprm;
557 } 641 }
558 } else 642 }
559 ret = -EINVAL;
560 break; 643 break;
561 default:
562 /* TODO: support custom handler */
563 ret = -EINVAL;
564 } 644 }
645 if (!ret && !f->fn)
646 ret = -EINVAL;
565 return ret; 647 return ret;
566} 648}
567 649
568/* String length checking wrapper */ 650/* String length checking wrapper */
569static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) 651static int parse_probe_arg(char *arg, struct trace_probe *tp,
652 struct probe_arg *parg, int is_return)
570{ 653{
654 const char *t;
655
571 if (strlen(arg) > MAX_ARGSTR_LEN) { 656 if (strlen(arg) > MAX_ARGSTR_LEN) {
572 pr_info("Argument is too long: %s\n", arg); 657 pr_info("Argument is too long: %s\n", arg);
573 return -ENOSPC; 658 return -ENOSPC;
574 } 659 }
575 return __parse_probe_arg(arg, ff, is_return); 660 parg->comm = kstrdup(arg, GFP_KERNEL);
661 if (!parg->comm) {
662 pr_info("Failed to allocate memory for command '%s'.\n", arg);
663 return -ENOMEM;
664 }
665 t = strchr(parg->comm, ':');
666 if (t) {
667 arg[t - parg->comm] = '\0';
668 t++;
669 }
670 parg->type = find_fetch_type(t);
671 if (!parg->type) {
672 pr_info("Unsupported type: %s\n", t);
673 return -EINVAL;
674 }
675 parg->offset = tp->size;
676 tp->size += parg->type->size;
677 return __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
576} 678}
577 679
578/* Return 1 if name is reserved or already used by another argument */ 680/* Return 1 if name is reserved or already used by another argument */
@@ -602,15 +704,18 @@ static int create_trace_probe(int argc, char **argv)
602 * @ADDR : fetch memory at ADDR (ADDR should be in kernel) 704 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
603 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) 705 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
604 * %REG : fetch register REG 706 * %REG : fetch register REG
605 * Indirect memory fetch: 707 * Dereferencing memory fetch:
606 * +|-offs(ARG) : fetch memory at ARG +|- offs address. 708 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
607 * Alias name of args: 709 * Alias name of args:
608 * NAME=FETCHARG : set NAME as alias of FETCHARG. 710 * NAME=FETCHARG : set NAME as alias of FETCHARG.
711 * Type of args:
712 * FETCHARG:TYPE : use TYPE instead of unsigned long.
609 */ 713 */
610 struct trace_probe *tp; 714 struct trace_probe *tp;
611 int i, ret = 0; 715 int i, ret = 0;
612 int is_return = 0, is_delete = 0; 716 int is_return = 0, is_delete = 0;
613 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; 717 char *symbol = NULL, *event = NULL, *group = NULL;
718 char *arg, *tmp;
614 unsigned long offset = 0; 719 unsigned long offset = 0;
615 void *addr = NULL; 720 void *addr = NULL;
616 char buf[MAX_EVENT_NAME_LEN]; 721 char buf[MAX_EVENT_NAME_LEN];
@@ -723,13 +828,6 @@ static int create_trace_probe(int argc, char **argv)
723 else 828 else
724 arg = argv[i]; 829 arg = argv[i];
725 830
726 if (conflict_field_name(argv[i], tp->args, i)) {
727 pr_info("Argument%d name '%s' conflicts with "
728 "another field.\n", i, argv[i]);
729 ret = -EINVAL;
730 goto error;
731 }
732
733 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); 831 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
734 if (!tp->args[i].name) { 832 if (!tp->args[i].name) {
735 pr_info("Failed to allocate argument%d name '%s'.\n", 833 pr_info("Failed to allocate argument%d name '%s'.\n",
@@ -737,9 +835,19 @@ static int create_trace_probe(int argc, char **argv)
737 ret = -ENOMEM; 835 ret = -ENOMEM;
738 goto error; 836 goto error;
739 } 837 }
838 tmp = strchr(tp->args[i].name, ':');
839 if (tmp)
840 *tmp = '_'; /* convert : to _ */
841
842 if (conflict_field_name(tp->args[i].name, tp->args, i)) {
843 pr_info("Argument%d name '%s' conflicts with "
844 "another field.\n", i, argv[i]);
845 ret = -EINVAL;
846 goto error;
847 }
740 848
741 /* Parse fetch argument */ 849 /* Parse fetch argument */
742 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); 850 ret = parse_probe_arg(arg, tp, &tp->args[i], is_return);
743 if (ret) { 851 if (ret) {
744 pr_info("Parse error at argument%d. (%d)\n", i, ret); 852 pr_info("Parse error at argument%d. (%d)\n", i, ret);
745 kfree(tp->args[i].name); 853 kfree(tp->args[i].name);
@@ -794,11 +902,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
794static int probes_seq_show(struct seq_file *m, void *v) 902static int probes_seq_show(struct seq_file *m, void *v)
795{ 903{
796 struct trace_probe *tp = v; 904 struct trace_probe *tp = v;
797 int i, ret; 905 int i;
798 char buf[MAX_ARGSTR_LEN + 1];
799 906
800 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); 907 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
801 seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); 908 seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name);
802 909
803 if (!tp->symbol) 910 if (!tp->symbol)
804 seq_printf(m, " 0x%p", tp->rp.kp.addr); 911 seq_printf(m, " 0x%p", tp->rp.kp.addr);
@@ -807,15 +914,10 @@ static int probes_seq_show(struct seq_file *m, void *v)
807 else 914 else
808 seq_printf(m, " %s", probe_symbol(tp)); 915 seq_printf(m, " %s", probe_symbol(tp));
809 916
810 for (i = 0; i < tp->nr_args; i++) { 917 for (i = 0; i < tp->nr_args; i++)
811 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); 918 seq_printf(m, " %s=%s", tp->args[i].name, tp->args[i].comm);
812 if (ret < 0) {
813 pr_warning("Argument%d decoding error(%d).\n", i, ret);
814 return ret;
815 }
816 seq_printf(m, " %s=%s", tp->args[i].name, buf);
817 }
818 seq_printf(m, "\n"); 919 seq_printf(m, "\n");
920
819 return 0; 921 return 0;
820} 922}
821 923
@@ -945,9 +1047,10 @@ static const struct file_operations kprobe_profile_ops = {
945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 1047static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
946{ 1048{
947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1049 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
948 struct kprobe_trace_entry *entry; 1050 struct kprobe_trace_entry_head *entry;
949 struct ring_buffer_event *event; 1051 struct ring_buffer_event *event;
950 struct ring_buffer *buffer; 1052 struct ring_buffer *buffer;
1053 u8 *data;
951 int size, i, pc; 1054 int size, i, pc;
952 unsigned long irq_flags; 1055 unsigned long irq_flags;
953 struct ftrace_event_call *call = &tp->call; 1056 struct ftrace_event_call *call = &tp->call;
@@ -957,18 +1060,18 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
957 local_save_flags(irq_flags); 1060 local_save_flags(irq_flags);
958 pc = preempt_count(); 1061 pc = preempt_count();
959 1062
960 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1063 size = sizeof(*entry) + tp->size;
961 1064
962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1065 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
963 irq_flags, pc); 1066 size, irq_flags, pc);
964 if (!event) 1067 if (!event)
965 return; 1068 return;
966 1069
967 entry = ring_buffer_event_data(event); 1070 entry = ring_buffer_event_data(event);
968 entry->nargs = tp->nr_args;
969 entry->ip = (unsigned long)kp->addr; 1071 entry->ip = (unsigned long)kp->addr;
1072 data = (u8 *)&entry[1];
970 for (i = 0; i < tp->nr_args; i++) 1073 for (i = 0; i < tp->nr_args; i++)
971 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1074 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
972 1075
973 if (!filter_current_check_discard(buffer, call, entry, event)) 1076 if (!filter_current_check_discard(buffer, call, entry, event))
974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1077 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -979,9 +1082,10 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
979 struct pt_regs *regs) 1082 struct pt_regs *regs)
980{ 1083{
981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1084 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
982 struct kretprobe_trace_entry *entry; 1085 struct kretprobe_trace_entry_head *entry;
983 struct ring_buffer_event *event; 1086 struct ring_buffer_event *event;
984 struct ring_buffer *buffer; 1087 struct ring_buffer *buffer;
1088 u8 *data;
985 int size, i, pc; 1089 int size, i, pc;
986 unsigned long irq_flags; 1090 unsigned long irq_flags;
987 struct ftrace_event_call *call = &tp->call; 1091 struct ftrace_event_call *call = &tp->call;
@@ -989,19 +1093,19 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
989 local_save_flags(irq_flags); 1093 local_save_flags(irq_flags);
990 pc = preempt_count(); 1094 pc = preempt_count();
991 1095
992 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1096 size = sizeof(*entry) + tp->size;
993 1097
994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 1098 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
995 irq_flags, pc); 1099 size, irq_flags, pc);
996 if (!event) 1100 if (!event)
997 return; 1101 return;
998 1102
999 entry = ring_buffer_event_data(event); 1103 entry = ring_buffer_event_data(event);
1000 entry->nargs = tp->nr_args;
1001 entry->func = (unsigned long)tp->rp.kp.addr; 1104 entry->func = (unsigned long)tp->rp.kp.addr;
1002 entry->ret_ip = (unsigned long)ri->ret_addr; 1105 entry->ret_ip = (unsigned long)ri->ret_addr;
1106 data = (u8 *)&entry[1];
1003 for (i = 0; i < tp->nr_args; i++) 1107 for (i = 0; i < tp->nr_args; i++)
1004 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1108 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1005 1109
1006 if (!filter_current_check_discard(buffer, call, entry, event)) 1110 if (!filter_current_check_discard(buffer, call, entry, event))
1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1111 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
@@ -1009,17 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
1009 1113
1010/* Event entry printers */ 1114/* Event entry printers */
1011enum print_line_t 1115enum print_line_t
1012print_kprobe_event(struct trace_iterator *iter, int flags) 1116print_kprobe_event(struct trace_iterator *iter, int flags,
1117 struct trace_event *event)
1013{ 1118{
1014 struct kprobe_trace_entry *field; 1119 struct kprobe_trace_entry_head *field;
1015 struct trace_seq *s = &iter->seq; 1120 struct trace_seq *s = &iter->seq;
1016 struct trace_event *event;
1017 struct trace_probe *tp; 1121 struct trace_probe *tp;
1122 u8 *data;
1018 int i; 1123 int i;
1019 1124
1020 field = (struct kprobe_trace_entry *)iter->ent; 1125 field = (struct kprobe_trace_entry_head *)iter->ent;
1021 event = ftrace_find_event(field->ent.type); 1126 tp = container_of(event, struct trace_probe, call.event);
1022 tp = container_of(event, struct trace_probe, event);
1023 1127
1024 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1128 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1025 goto partial; 1129 goto partial;
@@ -1030,9 +1134,10 @@ print_kprobe_event(struct trace_iterator *iter, int flags)
1030 if (!trace_seq_puts(s, ")")) 1134 if (!trace_seq_puts(s, ")"))
1031 goto partial; 1135 goto partial;
1032 1136
1033 for (i = 0; i < field->nargs; i++) 1137 data = (u8 *)&field[1];
1034 if (!trace_seq_printf(s, " %s=%lx", 1138 for (i = 0; i < tp->nr_args; i++)
1035 tp->args[i].name, field->args[i])) 1139 if (!tp->args[i].type->print(s, tp->args[i].name,
1140 data + tp->args[i].offset))
1036 goto partial; 1141 goto partial;
1037 1142
1038 if (!trace_seq_puts(s, "\n")) 1143 if (!trace_seq_puts(s, "\n"))
@@ -1044,17 +1149,17 @@ partial:
1044} 1149}
1045 1150
1046enum print_line_t 1151enum print_line_t
1047print_kretprobe_event(struct trace_iterator *iter, int flags) 1152print_kretprobe_event(struct trace_iterator *iter, int flags,
1153 struct trace_event *event)
1048{ 1154{
1049 struct kretprobe_trace_entry *field; 1155 struct kretprobe_trace_entry_head *field;
1050 struct trace_seq *s = &iter->seq; 1156 struct trace_seq *s = &iter->seq;
1051 struct trace_event *event;
1052 struct trace_probe *tp; 1157 struct trace_probe *tp;
1158 u8 *data;
1053 int i; 1159 int i;
1054 1160
1055 field = (struct kretprobe_trace_entry *)iter->ent; 1161 field = (struct kretprobe_trace_entry_head *)iter->ent;
1056 event = ftrace_find_event(field->ent.type); 1162 tp = container_of(event, struct trace_probe, call.event);
1057 tp = container_of(event, struct trace_probe, event);
1058 1163
1059 if (!trace_seq_printf(s, "%s: (", tp->call.name)) 1164 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1060 goto partial; 1165 goto partial;
@@ -1071,9 +1176,10 @@ print_kretprobe_event(struct trace_iterator *iter, int flags)
1071 if (!trace_seq_puts(s, ")")) 1176 if (!trace_seq_puts(s, ")"))
1072 goto partial; 1177 goto partial;
1073 1178
1074 for (i = 0; i < field->nargs; i++) 1179 data = (u8 *)&field[1];
1075 if (!trace_seq_printf(s, " %s=%lx", 1180 for (i = 0; i < tp->nr_args; i++)
1076 tp->args[i].name, field->args[i])) 1181 if (!tp->args[i].type->print(s, tp->args[i].name,
1182 data + tp->args[i].offset))
1077 goto partial; 1183 goto partial;
1078 1184
1079 if (!trace_seq_puts(s, "\n")) 1185 if (!trace_seq_puts(s, "\n"))
@@ -1110,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call)
1110 1216
1111static int probe_event_raw_init(struct ftrace_event_call *event_call) 1217static int probe_event_raw_init(struct ftrace_event_call *event_call)
1112{ 1218{
1113 INIT_LIST_HEAD(&event_call->fields);
1114
1115 return 0; 1219 return 0;
1116} 1220}
1117 1221
@@ -1129,29 +1233,43 @@ static int probe_event_raw_init(struct ftrace_event_call *event_call)
1129static int kprobe_event_define_fields(struct ftrace_event_call *event_call) 1233static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1130{ 1234{
1131 int ret, i; 1235 int ret, i;
1132 struct kprobe_trace_entry field; 1236 struct kprobe_trace_entry_head field;
1133 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1237 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1134 1238
1135 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); 1239 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1136 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1137 /* Set argument names as fields */ 1240 /* Set argument names as fields */
1138 for (i = 0; i < tp->nr_args; i++) 1241 for (i = 0; i < tp->nr_args; i++) {
1139 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1242 ret = trace_define_field(event_call, tp->args[i].type->name,
1243 tp->args[i].name,
1244 sizeof(field) + tp->args[i].offset,
1245 tp->args[i].type->size,
1246 tp->args[i].type->is_signed,
1247 FILTER_OTHER);
1248 if (ret)
1249 return ret;
1250 }
1140 return 0; 1251 return 0;
1141} 1252}
1142 1253
1143static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) 1254static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1144{ 1255{
1145 int ret, i; 1256 int ret, i;
1146 struct kretprobe_trace_entry field; 1257 struct kretprobe_trace_entry_head field;
1147 struct trace_probe *tp = (struct trace_probe *)event_call->data; 1258 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1148 1259
1149 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); 1260 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1150 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); 1261 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1151 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1152 /* Set argument names as fields */ 1262 /* Set argument names as fields */
1153 for (i = 0; i < tp->nr_args; i++) 1263 for (i = 0; i < tp->nr_args; i++) {
1154 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); 1264 ret = trace_define_field(event_call, tp->args[i].type->name,
1265 tp->args[i].name,
1266 sizeof(field) + tp->args[i].offset,
1267 tp->args[i].type->size,
1268 tp->args[i].type->is_signed,
1269 FILTER_OTHER);
1270 if (ret)
1271 return ret;
1272 }
1155 return 0; 1273 return 0;
1156} 1274}
1157 1275
@@ -1176,8 +1294,8 @@ static int __set_print_fmt(struct trace_probe *tp, char *buf, int len)
1176 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); 1294 pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt);
1177 1295
1178 for (i = 0; i < tp->nr_args; i++) { 1296 for (i = 0; i < tp->nr_args; i++) {
1179 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", 1297 pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%s",
1180 tp->args[i].name); 1298 tp->args[i].name, tp->args[i].type->fmt);
1181 } 1299 }
1182 1300
1183 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); 1301 pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg);
@@ -1219,28 +1337,30 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp,
1219{ 1337{
1220 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1338 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1221 struct ftrace_event_call *call = &tp->call; 1339 struct ftrace_event_call *call = &tp->call;
1222 struct kprobe_trace_entry *entry; 1340 struct kprobe_trace_entry_head *entry;
1341 struct hlist_head *head;
1342 u8 *data;
1223 int size, __size, i; 1343 int size, __size, i;
1224 unsigned long irq_flags;
1225 int rctx; 1344 int rctx;
1226 1345
1227 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1346 __size = sizeof(*entry) + tp->size;
1228 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1347 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1229 size -= sizeof(u32); 1348 size -= sizeof(u32);
1230 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1349 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1231 "profile buffer not large enough")) 1350 "profile buffer not large enough"))
1232 return; 1351 return;
1233 1352
1234 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1353 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1235 if (!entry) 1354 if (!entry)
1236 return; 1355 return;
1237 1356
1238 entry->nargs = tp->nr_args;
1239 entry->ip = (unsigned long)kp->addr; 1357 entry->ip = (unsigned long)kp->addr;
1358 data = (u8 *)&entry[1];
1240 for (i = 0; i < tp->nr_args; i++) 1359 for (i = 0; i < tp->nr_args; i++)
1241 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1360 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1242 1361
1243 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); 1362 head = this_cpu_ptr(call->perf_events);
1363 perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head);
1244} 1364}
1245 1365
1246/* Kretprobe profile handler */ 1366/* Kretprobe profile handler */
@@ -1249,30 +1369,31 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri,
1249{ 1369{
1250 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1370 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1251 struct ftrace_event_call *call = &tp->call; 1371 struct ftrace_event_call *call = &tp->call;
1252 struct kretprobe_trace_entry *entry; 1372 struct kretprobe_trace_entry_head *entry;
1373 struct hlist_head *head;
1374 u8 *data;
1253 int size, __size, i; 1375 int size, __size, i;
1254 unsigned long irq_flags;
1255 int rctx; 1376 int rctx;
1256 1377
1257 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1378 __size = sizeof(*entry) + tp->size;
1258 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1379 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1259 size -= sizeof(u32); 1380 size -= sizeof(u32);
1260 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, 1381 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
1261 "profile buffer not large enough")) 1382 "profile buffer not large enough"))
1262 return; 1383 return;
1263 1384
1264 entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); 1385 entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
1265 if (!entry) 1386 if (!entry)
1266 return; 1387 return;
1267 1388
1268 entry->nargs = tp->nr_args;
1269 entry->func = (unsigned long)tp->rp.kp.addr; 1389 entry->func = (unsigned long)tp->rp.kp.addr;
1270 entry->ret_ip = (unsigned long)ri->ret_addr; 1390 entry->ret_ip = (unsigned long)ri->ret_addr;
1391 data = (u8 *)&entry[1];
1271 for (i = 0; i < tp->nr_args; i++) 1392 for (i = 0; i < tp->nr_args; i++)
1272 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1393 call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset);
1273 1394
1274 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, 1395 head = this_cpu_ptr(call->perf_events);
1275 irq_flags, regs); 1396 perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head);
1276} 1397}
1277 1398
1278static int probe_perf_enable(struct ftrace_event_call *call) 1399static int probe_perf_enable(struct ftrace_event_call *call)
@@ -1302,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call)
1302} 1423}
1303#endif /* CONFIG_PERF_EVENTS */ 1424#endif /* CONFIG_PERF_EVENTS */
1304 1425
1426static __kprobes
1427int kprobe_register(struct ftrace_event_call *event, enum trace_reg type)
1428{
1429 switch (type) {
1430 case TRACE_REG_REGISTER:
1431 return probe_event_enable(event);
1432 case TRACE_REG_UNREGISTER:
1433 probe_event_disable(event);
1434 return 0;
1435
1436#ifdef CONFIG_PERF_EVENTS
1437 case TRACE_REG_PERF_REGISTER:
1438 return probe_perf_enable(event);
1439 case TRACE_REG_PERF_UNREGISTER:
1440 probe_perf_disable(event);
1441 return 0;
1442#endif
1443 }
1444 return 0;
1445}
1305 1446
1306static __kprobes 1447static __kprobes
1307int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) 1448int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
@@ -1331,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1331 return 0; /* We don't tweak kernel, so just return 0 */ 1472 return 0; /* We don't tweak kernel, so just return 0 */
1332} 1473}
1333 1474
1475static struct trace_event_functions kretprobe_funcs = {
1476 .trace = print_kretprobe_event
1477};
1478
1479static struct trace_event_functions kprobe_funcs = {
1480 .trace = print_kprobe_event
1481};
1482
1334static int register_probe_event(struct trace_probe *tp) 1483static int register_probe_event(struct trace_probe *tp)
1335{ 1484{
1336 struct ftrace_event_call *call = &tp->call; 1485 struct ftrace_event_call *call = &tp->call;
@@ -1338,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp)
1338 1487
1339 /* Initialize ftrace_event_call */ 1488 /* Initialize ftrace_event_call */
1340 if (probe_is_return(tp)) { 1489 if (probe_is_return(tp)) {
1341 tp->event.trace = print_kretprobe_event; 1490 INIT_LIST_HEAD(&call->class->fields);
1342 call->raw_init = probe_event_raw_init; 1491 call->event.funcs = &kretprobe_funcs;
1343 call->define_fields = kretprobe_event_define_fields; 1492 call->class->raw_init = probe_event_raw_init;
1493 call->class->define_fields = kretprobe_event_define_fields;
1344 } else { 1494 } else {
1345 tp->event.trace = print_kprobe_event; 1495 INIT_LIST_HEAD(&call->class->fields);
1346 call->raw_init = probe_event_raw_init; 1496 call->event.funcs = &kprobe_funcs;
1347 call->define_fields = kprobe_event_define_fields; 1497 call->class->raw_init = probe_event_raw_init;
1498 call->class->define_fields = kprobe_event_define_fields;
1348 } 1499 }
1349 if (set_print_fmt(tp) < 0) 1500 if (set_print_fmt(tp) < 0)
1350 return -ENOMEM; 1501 return -ENOMEM;
1351 call->event = &tp->event; 1502 ret = register_ftrace_event(&call->event);
1352 call->id = register_ftrace_event(&tp->event); 1503 if (!ret) {
1353 if (!call->id) {
1354 kfree(call->print_fmt); 1504 kfree(call->print_fmt);
1355 return -ENODEV; 1505 return -ENODEV;
1356 } 1506 }
1357 call->enabled = 0; 1507 call->flags = 0;
1358 call->regfunc = probe_event_enable; 1508 call->class->reg = kprobe_register;
1359 call->unregfunc = probe_event_disable;
1360
1361#ifdef CONFIG_PERF_EVENTS
1362 call->perf_event_enable = probe_perf_enable;
1363 call->perf_event_disable = probe_perf_disable;
1364#endif
1365 call->data = tp; 1509 call->data = tp;
1366 ret = trace_add_event_call(call); 1510 ret = trace_add_event_call(call);
1367 if (ret) { 1511 if (ret) {
1368 pr_info("Failed to register kprobe event: %s\n", call->name); 1512 pr_info("Failed to register kprobe event: %s\n", call->name);
1369 kfree(call->print_fmt); 1513 kfree(call->print_fmt);
1370 unregister_ftrace_event(&tp->event); 1514 unregister_ftrace_event(&call->event);
1371 } 1515 }
1372 return ret; 1516 return ret;
1373} 1517}
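
The trace_kprobe.c hunks above replace the old fetch_func, which returned an unsigned long, with a fetch_param whose handler writes into a caller-supplied destination, and add a fetch_type table so each probe argument can carry a ":TYPE" suffix (the "FETCHARG:TYPE" syntax documented in the create_trace_probe comment). Below is a minimal userspace sketch of that dispatch pattern; struct and function names echo the diff, but fake_regs, fetch_reg_* and the table contents are invented for illustration and this is not the kernel implementation.

/*
 * Userspace sketch (not kernel code) of the typed fetch-function pattern:
 * fetch functions store into a destination of the requested size, and a
 * type table maps a name such as "u32" to the matching routine.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct fake_regs { unsigned long ax; };		/* stand-in for pt_regs */

typedef void (*fetch_func_t)(struct fake_regs *, void *data, void *dest);

struct fetch_param {
	fetch_func_t	fn;
	void		*data;
};

static void call_fetch(struct fetch_param *fprm, struct fake_regs *regs,
		       void *dest)
{
	fprm->fn(regs, fprm->data, dest);
}

/* Generate one "register" fetch function per type, as the macros above do. */
#define DEFINE_FETCH_reg(type)						\
static void fetch_reg_##type(struct fake_regs *regs, void *offs,	\
			     void *dest)				\
{									\
	(void)offs;							\
	*(type *)dest = (type)regs->ax;					\
}
DEFINE_FETCH_reg(uint8_t)
DEFINE_FETCH_reg(uint32_t)
DEFINE_FETCH_reg(uint64_t)

static const struct fetch_type {
	const char	*name;
	size_t		size;
	fetch_func_t	reg;
} fetch_type_table[] = {
	{ "u8",  sizeof(uint8_t),  fetch_reg_uint8_t  },
	{ "u32", sizeof(uint32_t), fetch_reg_uint32_t },
	{ "u64", sizeof(uint64_t), fetch_reg_uint64_t },
};

static const struct fetch_type *find_fetch_type(const char *name)
{
	size_t i;

	if (!name)
		name = "u64";	/* default: register-sized fetch */
	for (i = 0; i < sizeof(fetch_type_table) / sizeof(fetch_type_table[0]); i++)
		if (strcmp(name, fetch_type_table[i].name) == 0)
			return &fetch_type_table[i];
	return NULL;
}

int main(void)
{
	struct fake_regs regs = { .ax = 0xdeadbeef };
	const struct fetch_type *t = find_fetch_type("u32");
	struct fetch_param p;
	uint32_t buf;

	if (!t)
		return 1;
	p.fn = t->reg;
	p.data = NULL;
	call_fetch(&p, &regs, &buf);	/* stores exactly t->size bytes */
	printf("fetched %s = 0x%x\n", t->name, buf);
	return 0;
}

Under these assumptions, storing through a typed destination is what allows the new trace entry layout to pack arguments back-to-back at per-argument offsets (tp->size, args[i].offset) instead of a fixed array of unsigned long.
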
diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c
index 94103cdcf9d8..8eaf00749b65 100644
--- a/kernel/trace/trace_ksym.c
+++ b/kernel/trace/trace_ksym.c
@@ -23,6 +23,7 @@
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/module.h> 25#include <linux/module.h>
26#include <linux/slab.h>
26#include <linux/fs.h> 27#include <linux/fs.h>
27 28
28#include "trace_output.h" 29#include "trace_output.h"
@@ -33,12 +34,6 @@
33 34
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35 36
36/*
37 * For now, let us restrict the no. of symbols traced simultaneously to number
38 * of available hardware breakpoint registers.
39 */
40#define KSYM_TRACER_MAX HBP_NUM
41
42#define KSYM_TRACER_OP_LEN 3 /* rw- */ 37#define KSYM_TRACER_OP_LEN 3 /* rw- */
43 38
44struct trace_ksym { 39struct trace_ksym {
@@ -52,7 +47,6 @@ struct trace_ksym {
52 47
53static struct trace_array *ksym_trace_array; 48static struct trace_array *ksym_trace_array;
54 49
55static unsigned int ksym_filter_entry_count;
56static unsigned int ksym_tracing_enabled; 50static unsigned int ksym_tracing_enabled;
57 51
58static HLIST_HEAD(ksym_filter_head); 52static HLIST_HEAD(ksym_filter_head);
@@ -180,13 +174,6 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
180 struct trace_ksym *entry; 174 struct trace_ksym *entry;
181 int ret = -ENOMEM; 175 int ret = -ENOMEM;
182 176
183 if (ksym_filter_entry_count >= KSYM_TRACER_MAX) {
184 printk(KERN_ERR "ksym_tracer: Maximum limit:(%d) reached. No"
185 " new requests for tracing can be accepted now.\n",
186 KSYM_TRACER_MAX);
187 return -ENOSPC;
188 }
189
190 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL); 177 entry = kzalloc(sizeof(struct trace_ksym), GFP_KERNEL);
191 if (!entry) 178 if (!entry)
192 return -ENOMEM; 179 return -ENOMEM;
@@ -202,13 +189,17 @@ int process_new_ksym_entry(char *ksymname, int op, unsigned long addr)
202 189
203 if (IS_ERR(entry->ksym_hbp)) { 190 if (IS_ERR(entry->ksym_hbp)) {
204 ret = PTR_ERR(entry->ksym_hbp); 191 ret = PTR_ERR(entry->ksym_hbp);
205 printk(KERN_INFO "ksym_tracer request failed. Try again" 192 if (ret == -ENOSPC) {
206 " later!!\n"); 193 printk(KERN_ERR "ksym_tracer: Maximum limit reached."
194 " No new requests for tracing can be accepted now.\n");
195 } else {
196 printk(KERN_INFO "ksym_tracer request failed. Try again"
197 " later!!\n");
198 }
207 goto err; 199 goto err;
208 } 200 }
209 201
210 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head); 202 hlist_add_head_rcu(&(entry->ksym_hlist), &ksym_filter_head);
211 ksym_filter_entry_count++;
212 203
213 return 0; 204 return 0;
214 205
@@ -264,7 +255,6 @@ static void __ksym_trace_reset(void)
264 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head, 255 hlist_for_each_entry_safe(entry, node, node1, &ksym_filter_head,
265 ksym_hlist) { 256 ksym_hlist) {
266 unregister_wide_hw_breakpoint(entry->ksym_hbp); 257 unregister_wide_hw_breakpoint(entry->ksym_hbp);
267 ksym_filter_entry_count--;
268 hlist_del_rcu(&(entry->ksym_hlist)); 258 hlist_del_rcu(&(entry->ksym_hlist));
269 synchronize_rcu(); 259 synchronize_rcu();
270 kfree(entry); 260 kfree(entry);
@@ -337,7 +327,6 @@ static ssize_t ksym_trace_filter_write(struct file *file,
337 goto out_unlock; 327 goto out_unlock;
338 } 328 }
339 /* Error or "symbol:---" case: drop it */ 329 /* Error or "symbol:---" case: drop it */
340 ksym_filter_entry_count--;
341 hlist_del_rcu(&(entry->ksym_hlist)); 330 hlist_del_rcu(&(entry->ksym_hlist));
342 synchronize_rcu(); 331 synchronize_rcu();
343 kfree(entry); 332 kfree(entry);
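
The trace_ksym.c hunks above drop the tracer's private KSYM_TRACER_MAX counter and instead let register_wide_hw_breakpoint() report exhaustion through an error-encoded pointer, which process_new_ksym_entry() maps back to the old "maximum limit reached" message when PTR_ERR() is -ENOSPC. A small userspace sketch of that ERR_PTR/IS_ERR idiom follows; register_fake_breakpoint() is a made-up stand-in for the breakpoint API, and the helpers are a simplified re-implementation, not the kernel's <linux/err.h>.

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define MAX_ERRNO	4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long  PTR_ERR(const void *p)  { return (long)p; }
static inline int   IS_ERR(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-MAX_ERRNO;
}

/* Pretend allocator: fails with -ENOSPC once the slots are exhausted. */
static void *register_fake_breakpoint(int *slots_left)
{
	static int dummy;

	if (*slots_left <= 0)
		return ERR_PTR(-ENOSPC);
	(*slots_left)--;
	return &dummy;
}

int main(void)
{
	int slots = 1;
	int i;

	for (i = 0; i < 2; i++) {
		void *bp = register_fake_breakpoint(&slots);

		if (IS_ERR(bp)) {
			if (PTR_ERR(bp) == -ENOSPC)
				printf("request %d: maximum limit reached\n", i);
			else
				printf("request %d: failed (%ld)\n", i, PTR_ERR(bp));
			continue;
		}
		printf("request %d: registered\n", i);
	}
	return 0;
}
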
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 0acd834659ed..017fa376505d 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/mmiotrace.h> 10#include <linux/mmiotrace.h>
11#include <linux/pci.h> 11#include <linux/pci.h>
12#include <linux/slab.h>
12#include <linux/time.h> 13#include <linux/time.h>
13 14
14#include <asm/atomic.h> 15#include <asm/atomic.h>
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8e46b3323cdc..57c1b4596470 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -209,6 +209,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
209 209
210 return 1; 210 return 1;
211} 211}
212EXPORT_SYMBOL(trace_seq_putc);
212 213
213int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) 214int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
214{ 215{
@@ -253,7 +254,7 @@ void *trace_seq_reserve(struct trace_seq *s, size_t len)
253 void *ret; 254 void *ret;
254 255
255 if (s->full) 256 if (s->full)
256 return 0; 257 return NULL;
257 258
258 if (len > ((PAGE_SIZE - 1) - s->len)) { 259 if (len > ((PAGE_SIZE - 1) - s->len)) {
259 s->full = 1; 260 s->full = 1;
@@ -355,6 +356,21 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
355} 356}
356EXPORT_SYMBOL(ftrace_print_symbols_seq); 357EXPORT_SYMBOL(ftrace_print_symbols_seq);
357 358
359const char *
360ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
361{
362 int i;
363 const char *ret = p->buffer + p->len;
364
365 for (i = 0; i < buf_len; i++)
366 trace_seq_printf(p, "%s%2.2x", i == 0 ? "" : " ", buf[i]);
367
368 trace_seq_putc(p, 0);
369
370 return ret;
371}
372EXPORT_SYMBOL(ftrace_print_hex_seq);
373
358#ifdef CONFIG_KRETPROBES 374#ifdef CONFIG_KRETPROBES
359static inline const char *kretprobed(const char *name) 375static inline const char *kretprobed(const char *name)
360{ 376{
@@ -726,6 +742,9 @@ int register_ftrace_event(struct trace_event *event)
726 if (WARN_ON(!event)) 742 if (WARN_ON(!event))
727 goto out; 743 goto out;
728 744
745 if (WARN_ON(!event->funcs))
746 goto out;
747
729 INIT_LIST_HEAD(&event->list); 748 INIT_LIST_HEAD(&event->list);
730 749
731 if (!event->type) { 750 if (!event->type) {
@@ -758,14 +777,14 @@ int register_ftrace_event(struct trace_event *event)
758 goto out; 777 goto out;
759 } 778 }
760 779
761 if (event->trace == NULL) 780 if (event->funcs->trace == NULL)
762 event->trace = trace_nop_print; 781 event->funcs->trace = trace_nop_print;
763 if (event->raw == NULL) 782 if (event->funcs->raw == NULL)
764 event->raw = trace_nop_print; 783 event->funcs->raw = trace_nop_print;
765 if (event->hex == NULL) 784 if (event->funcs->hex == NULL)
766 event->hex = trace_nop_print; 785 event->funcs->hex = trace_nop_print;
767 if (event->binary == NULL) 786 if (event->funcs->binary == NULL)
768 event->binary = trace_nop_print; 787 event->funcs->binary = trace_nop_print;
769 788
770 key = event->type & (EVENT_HASHSIZE - 1); 789 key = event->type & (EVENT_HASHSIZE - 1);
771 790
@@ -807,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
807 * Standard events 826 * Standard events
808 */ 827 */
809 828
810enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) 829enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
830 struct trace_event *event)
811{ 831{
812 return TRACE_TYPE_HANDLED; 832 return TRACE_TYPE_HANDLED;
813} 833}
814 834
815/* TRACE_FN */ 835/* TRACE_FN */
816static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) 836static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags,
837 struct trace_event *event)
817{ 838{
818 struct ftrace_entry *field; 839 struct ftrace_entry *field;
819 struct trace_seq *s = &iter->seq; 840 struct trace_seq *s = &iter->seq;
@@ -840,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
840 return TRACE_TYPE_PARTIAL_LINE; 861 return TRACE_TYPE_PARTIAL_LINE;
841} 862}
842 863
843static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) 864static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags,
865 struct trace_event *event)
844{ 866{
845 struct ftrace_entry *field; 867 struct ftrace_entry *field;
846 868
@@ -854,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
854 return TRACE_TYPE_HANDLED; 876 return TRACE_TYPE_HANDLED;
855} 877}
856 878
857static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) 879static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags,
880 struct trace_event *event)
858{ 881{
859 struct ftrace_entry *field; 882 struct ftrace_entry *field;
860 struct trace_seq *s = &iter->seq; 883 struct trace_seq *s = &iter->seq;
@@ -867,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
867 return TRACE_TYPE_HANDLED; 890 return TRACE_TYPE_HANDLED;
868} 891}
869 892
870static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) 893static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags,
894 struct trace_event *event)
871{ 895{
872 struct ftrace_entry *field; 896 struct ftrace_entry *field;
873 struct trace_seq *s = &iter->seq; 897 struct trace_seq *s = &iter->seq;
@@ -880,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
880 return TRACE_TYPE_HANDLED; 904 return TRACE_TYPE_HANDLED;
881} 905}
882 906
883static struct trace_event trace_fn_event = { 907static struct trace_event_functions trace_fn_funcs = {
884 .type = TRACE_FN,
885 .trace = trace_fn_trace, 908 .trace = trace_fn_trace,
886 .raw = trace_fn_raw, 909 .raw = trace_fn_raw,
887 .hex = trace_fn_hex, 910 .hex = trace_fn_hex,
888 .binary = trace_fn_bin, 911 .binary = trace_fn_bin,
889}; 912};
890 913
914static struct trace_event trace_fn_event = {
915 .type = TRACE_FN,
916 .funcs = &trace_fn_funcs,
917};
918
891/* TRACE_CTX and TRACE_WAKE */ 919/* TRACE_CTX and TRACE_WAKE */
892static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, 920static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
893 char *delim) 921 char *delim)
@@ -916,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
916 return TRACE_TYPE_HANDLED; 944 return TRACE_TYPE_HANDLED;
917} 945}
918 946
919static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) 947static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags,
948 struct trace_event *event)
920{ 949{
921 return trace_ctxwake_print(iter, "==>"); 950 return trace_ctxwake_print(iter, "==>");
922} 951}
923 952
924static enum print_line_t trace_wake_print(struct trace_iterator *iter, 953static enum print_line_t trace_wake_print(struct trace_iterator *iter,
925 int flags) 954 int flags, struct trace_event *event)
926{ 955{
927 return trace_ctxwake_print(iter, " +"); 956 return trace_ctxwake_print(iter, " +");
928} 957}
@@ -950,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
950 return TRACE_TYPE_HANDLED; 979 return TRACE_TYPE_HANDLED;
951} 980}
952 981
953static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) 982static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags,
983 struct trace_event *event)
954{ 984{
955 return trace_ctxwake_raw(iter, 0); 985 return trace_ctxwake_raw(iter, 0);
956} 986}
957 987
958static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) 988static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags,
989 struct trace_event *event)
959{ 990{
960 return trace_ctxwake_raw(iter, '+'); 991 return trace_ctxwake_raw(iter, '+');
961} 992}
@@ -984,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
984 return TRACE_TYPE_HANDLED; 1015 return TRACE_TYPE_HANDLED;
985} 1016}
986 1017
987static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) 1018static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags,
1019 struct trace_event *event)
988{ 1020{
989 return trace_ctxwake_hex(iter, 0); 1021 return trace_ctxwake_hex(iter, 0);
990} 1022}
991 1023
992static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) 1024static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags,
1025 struct trace_event *event)
993{ 1026{
994 return trace_ctxwake_hex(iter, '+'); 1027 return trace_ctxwake_hex(iter, '+');
995} 1028}
996 1029
997static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, 1030static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
998 int flags) 1031 int flags, struct trace_event *event)
999{ 1032{
1000 struct ctx_switch_entry *field; 1033 struct ctx_switch_entry *field;
1001 struct trace_seq *s = &iter->seq; 1034 struct trace_seq *s = &iter->seq;
@@ -1012,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
1012 return TRACE_TYPE_HANDLED; 1045 return TRACE_TYPE_HANDLED;
1013} 1046}
1014 1047
1015static struct trace_event trace_ctx_event = { 1048static struct trace_event_functions trace_ctx_funcs = {
1016 .type = TRACE_CTX,
1017 .trace = trace_ctx_print, 1049 .trace = trace_ctx_print,
1018 .raw = trace_ctx_raw, 1050 .raw = trace_ctx_raw,
1019 .hex = trace_ctx_hex, 1051 .hex = trace_ctx_hex,
1020 .binary = trace_ctxwake_bin, 1052 .binary = trace_ctxwake_bin,
1021}; 1053};
1022 1054
1023static struct trace_event trace_wake_event = { 1055static struct trace_event trace_ctx_event = {
1024 .type = TRACE_WAKE, 1056 .type = TRACE_CTX,
1057 .funcs = &trace_ctx_funcs,
1058};
1059
1060static struct trace_event_functions trace_wake_funcs = {
1025 .trace = trace_wake_print, 1061 .trace = trace_wake_print,
1026 .raw = trace_wake_raw, 1062 .raw = trace_wake_raw,
1027 .hex = trace_wake_hex, 1063 .hex = trace_wake_hex,
1028 .binary = trace_ctxwake_bin, 1064 .binary = trace_ctxwake_bin,
1029}; 1065};
1030 1066
1067static struct trace_event trace_wake_event = {
1068 .type = TRACE_WAKE,
1069 .funcs = &trace_wake_funcs,
1070};
1071
1031/* TRACE_SPECIAL */ 1072/* TRACE_SPECIAL */
1032static enum print_line_t trace_special_print(struct trace_iterator *iter, 1073static enum print_line_t trace_special_print(struct trace_iterator *iter,
1033 int flags) 1074 int flags, struct trace_event *event)
1034{ 1075{
1035 struct special_entry *field; 1076 struct special_entry *field;
1036 1077
@@ -1046,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter,
1046} 1087}
1047 1088
1048static enum print_line_t trace_special_hex(struct trace_iterator *iter, 1089static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1049 int flags) 1090 int flags, struct trace_event *event)
1050{ 1091{
1051 struct special_entry *field; 1092 struct special_entry *field;
1052 struct trace_seq *s = &iter->seq; 1093 struct trace_seq *s = &iter->seq;
@@ -1061,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter,
1061} 1102}
1062 1103
1063static enum print_line_t trace_special_bin(struct trace_iterator *iter, 1104static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1064 int flags) 1105 int flags, struct trace_event *event)
1065{ 1106{
1066 struct special_entry *field; 1107 struct special_entry *field;
1067 struct trace_seq *s = &iter->seq; 1108 struct trace_seq *s = &iter->seq;
@@ -1075,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
1075 return TRACE_TYPE_HANDLED; 1116 return TRACE_TYPE_HANDLED;
1076} 1117}
1077 1118
1078static struct trace_event trace_special_event = { 1119static struct trace_event_functions trace_special_funcs = {
1079 .type = TRACE_SPECIAL,
1080 .trace = trace_special_print, 1120 .trace = trace_special_print,
1081 .raw = trace_special_print, 1121 .raw = trace_special_print,
1082 .hex = trace_special_hex, 1122 .hex = trace_special_hex,
1083 .binary = trace_special_bin, 1123 .binary = trace_special_bin,
1084}; 1124};
1085 1125
1126static struct trace_event trace_special_event = {
1127 .type = TRACE_SPECIAL,
1128 .funcs = &trace_special_funcs,
1129};
1130
1086/* TRACE_STACK */ 1131/* TRACE_STACK */
1087 1132
1088static enum print_line_t trace_stack_print(struct trace_iterator *iter, 1133static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1089 int flags) 1134 int flags, struct trace_event *event)
1090{ 1135{
1091 struct stack_entry *field; 1136 struct stack_entry *field;
1092 struct trace_seq *s = &iter->seq; 1137 struct trace_seq *s = &iter->seq;
@@ -1114,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
1114 return TRACE_TYPE_PARTIAL_LINE; 1159 return TRACE_TYPE_PARTIAL_LINE;
1115} 1160}
1116 1161
1117static struct trace_event trace_stack_event = { 1162static struct trace_event_functions trace_stack_funcs = {
1118 .type = TRACE_STACK,
1119 .trace = trace_stack_print, 1163 .trace = trace_stack_print,
1120 .raw = trace_special_print, 1164 .raw = trace_special_print,
1121 .hex = trace_special_hex, 1165 .hex = trace_special_hex,
1122 .binary = trace_special_bin, 1166 .binary = trace_special_bin,
1123}; 1167};
1124 1168
1169static struct trace_event trace_stack_event = {
1170 .type = TRACE_STACK,
1171 .funcs = &trace_stack_funcs,
1172};
1173
1125/* TRACE_USER_STACK */ 1174/* TRACE_USER_STACK */
1126static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, 1175static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1127 int flags) 1176 int flags, struct trace_event *event)
1128{ 1177{
1129 struct userstack_entry *field; 1178 struct userstack_entry *field;
1130 struct trace_seq *s = &iter->seq; 1179 struct trace_seq *s = &iter->seq;
@@ -1143,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
1143 return TRACE_TYPE_PARTIAL_LINE; 1192 return TRACE_TYPE_PARTIAL_LINE;
1144} 1193}
1145 1194
1146static struct trace_event trace_user_stack_event = { 1195static struct trace_event_functions trace_user_stack_funcs = {
1147 .type = TRACE_USER_STACK,
1148 .trace = trace_user_stack_print, 1196 .trace = trace_user_stack_print,
1149 .raw = trace_special_print, 1197 .raw = trace_special_print,
1150 .hex = trace_special_hex, 1198 .hex = trace_special_hex,
1151 .binary = trace_special_bin, 1199 .binary = trace_special_bin,
1152}; 1200};
1153 1201
1202static struct trace_event trace_user_stack_event = {
1203 .type = TRACE_USER_STACK,
1204 .funcs = &trace_user_stack_funcs,
1205};
1206
1154/* TRACE_BPRINT */ 1207/* TRACE_BPRINT */
1155static enum print_line_t 1208static enum print_line_t
1156trace_bprint_print(struct trace_iterator *iter, int flags) 1209trace_bprint_print(struct trace_iterator *iter, int flags,
1210 struct trace_event *event)
1157{ 1211{
1158 struct trace_entry *entry = iter->ent; 1212 struct trace_entry *entry = iter->ent;
1159 struct trace_seq *s = &iter->seq; 1213 struct trace_seq *s = &iter->seq;
@@ -1178,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags)
1178 1232
1179 1233
1180static enum print_line_t 1234static enum print_line_t
1181trace_bprint_raw(struct trace_iterator *iter, int flags) 1235trace_bprint_raw(struct trace_iterator *iter, int flags,
1236 struct trace_event *event)
1182{ 1237{
1183 struct bprint_entry *field; 1238 struct bprint_entry *field;
1184 struct trace_seq *s = &iter->seq; 1239 struct trace_seq *s = &iter->seq;
@@ -1197,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags)
1197 return TRACE_TYPE_PARTIAL_LINE; 1252 return TRACE_TYPE_PARTIAL_LINE;
1198} 1253}
1199 1254
1255static struct trace_event_functions trace_bprint_funcs = {
1256 .trace = trace_bprint_print,
1257 .raw = trace_bprint_raw,
1258};
1200 1259
1201static struct trace_event trace_bprint_event = { 1260static struct trace_event trace_bprint_event = {
1202 .type = TRACE_BPRINT, 1261 .type = TRACE_BPRINT,
1203 .trace = trace_bprint_print, 1262 .funcs = &trace_bprint_funcs,
1204 .raw = trace_bprint_raw,
1205}; 1263};
1206 1264
1207/* TRACE_PRINT */ 1265/* TRACE_PRINT */
1208static enum print_line_t trace_print_print(struct trace_iterator *iter, 1266static enum print_line_t trace_print_print(struct trace_iterator *iter,
1209 int flags) 1267 int flags, struct trace_event *event)
1210{ 1268{
1211 struct print_entry *field; 1269 struct print_entry *field;
1212 struct trace_seq *s = &iter->seq; 1270 struct trace_seq *s = &iter->seq;
@@ -1225,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter,
1225 return TRACE_TYPE_PARTIAL_LINE; 1283 return TRACE_TYPE_PARTIAL_LINE;
1226} 1284}
1227 1285
1228static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) 1286static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags,
1287 struct trace_event *event)
1229{ 1288{
1230 struct print_entry *field; 1289 struct print_entry *field;
1231 1290
@@ -1240,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
1240 return TRACE_TYPE_PARTIAL_LINE; 1299 return TRACE_TYPE_PARTIAL_LINE;
1241} 1300}
1242 1301
1243static struct trace_event trace_print_event = { 1302static struct trace_event_functions trace_print_funcs = {
1244 .type = TRACE_PRINT,
1245 .trace = trace_print_print, 1303 .trace = trace_print_print,
1246 .raw = trace_print_raw, 1304 .raw = trace_print_raw,
1247}; 1305};
1248 1306
1307static struct trace_event trace_print_event = {
1308 .type = TRACE_PRINT,
1309 .funcs = &trace_print_funcs,
1310};
1311
1249 1312
1250static struct trace_event *events[] __initdata = { 1313static struct trace_event *events[] __initdata = {
1251 &trace_fn_event, 1314 &trace_fn_event,
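The hunks above repeat one pattern for every built-in output type: the .trace/.raw/.hex/.binary callbacks move out of struct trace_event into a shared struct trace_event_functions, each callback gains the owning struct trace_event as a third argument, and the event itself keeps only .type plus a .funcs pointer. A minimal sketch of the call site this layout implies; only the two struct shapes come from this diff, the helper name is an assumption:

    /* Sketch: dispatch one entry through the new ->funcs indirection.
     * 'iter' and 'flags' come from the normal output path; 'event' is the
     * descriptor previously found by type lookup. */
    static enum print_line_t
    print_via_funcs(struct trace_iterator *iter, int flags,
                    struct trace_event *event)
    {
            if (event->funcs && event->funcs->trace)
                    return event->funcs->trace(iter, flags, event);

            return TRACE_TYPE_UNHANDLED;
    }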
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 9d91c72ba38b..c038eba0492b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void);
25extern struct trace_event *ftrace_find_event(int type); 25extern struct trace_event *ftrace_find_event(int type);
26 26
27extern enum print_line_t trace_nop_print(struct trace_iterator *iter, 27extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
28 int flags); 28 int flags, struct trace_event *event);
29extern int 29extern int
30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); 30trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
31 31
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 5fca0f51fde4..8f758d070c43 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -50,8 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
50} 50}
51 51
52static void 52static void
53probe_sched_switch(struct rq *__rq, struct task_struct *prev, 53probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next)
54 struct task_struct *next)
55{ 54{
56 struct trace_array_cpu *data; 55 struct trace_array_cpu *data;
57 unsigned long flags; 56 unsigned long flags;
@@ -109,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
109} 108}
110 109
111static void 110static void
112probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) 111probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
113{ 112{
114 struct trace_array_cpu *data; 113 struct trace_array_cpu *data;
115 unsigned long flags; 114 unsigned long flags;
@@ -139,21 +138,21 @@ static int tracing_sched_register(void)
139{ 138{
140 int ret; 139 int ret;
141 140
142 ret = register_trace_sched_wakeup(probe_sched_wakeup); 141 ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL);
143 if (ret) { 142 if (ret) {
144 pr_info("wakeup trace: Couldn't activate tracepoint" 143 pr_info("wakeup trace: Couldn't activate tracepoint"
145 " probe to kernel_sched_wakeup\n"); 144 " probe to kernel_sched_wakeup\n");
146 return ret; 145 return ret;
147 } 146 }
148 147
149 ret = register_trace_sched_wakeup_new(probe_sched_wakeup); 148 ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
150 if (ret) { 149 if (ret) {
151 pr_info("wakeup trace: Couldn't activate tracepoint" 150 pr_info("wakeup trace: Couldn't activate tracepoint"
152 " probe to kernel_sched_wakeup_new\n"); 151 " probe to kernel_sched_wakeup_new\n");
153 goto fail_deprobe; 152 goto fail_deprobe;
154 } 153 }
155 154
156 ret = register_trace_sched_switch(probe_sched_switch); 155 ret = register_trace_sched_switch(probe_sched_switch, NULL);
157 if (ret) { 156 if (ret) {
158 pr_info("sched trace: Couldn't activate tracepoint" 157 pr_info("sched trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_switch\n"); 158 " probe to kernel_sched_switch\n");
@@ -162,17 +161,17 @@ static int tracing_sched_register(void)
162 161
163 return ret; 162 return ret;
164fail_deprobe_wake_new: 163fail_deprobe_wake_new:
165 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 164 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
166fail_deprobe: 165fail_deprobe:
167 unregister_trace_sched_wakeup(probe_sched_wakeup); 166 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
168 return ret; 167 return ret;
169} 168}
170 169
171static void tracing_sched_unregister(void) 170static void tracing_sched_unregister(void)
172{ 171{
173 unregister_trace_sched_switch(probe_sched_switch); 172 unregister_trace_sched_switch(probe_sched_switch, NULL);
174 unregister_trace_sched_wakeup_new(probe_sched_wakeup); 173 unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL);
175 unregister_trace_sched_wakeup(probe_sched_wakeup); 174 unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
176} 175}
177 176
178static void tracing_start_sched_switch(void) 177static void tracing_start_sched_switch(void)
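As elsewhere in this series, every sched probe now takes a leading private-data pointer, and the register/unregister calls pass a matching data argument (NULL here, since these probes keep no per-registration state). A small sketch of a probe that actually uses the new slot; the probe name and counter are illustrative, not part of the patch:

    /* Hypothetical probe: counts context switches into a counter handed
     * over at registration time via the new data argument. */
    static void count_sched_switch(void *data, struct task_struct *prev,
                                   struct task_struct *next)
    {
            atomic_t *nr_switches = data;

            atomic_inc(nr_switches);
    }

    /* The same (probe, data) pair is used on both sides, mirroring the
     * calls above:
     *   register_trace_sched_switch(count_sched_switch, &my_counter);
     *   unregister_trace_sched_switch(count_sched_switch, &my_counter);
     */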
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 0271742abb8d..0e73bc2ef8c5 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -98,7 +98,8 @@ static int report_latency(cycle_t delta)
98 return 1; 98 return 1;
99} 99}
100 100
101static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) 101static void
102probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
102{ 103{
103 if (task != wakeup_task) 104 if (task != wakeup_task)
104 return; 105 return;
@@ -107,8 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
107} 108}
108 109
109static void notrace 110static void notrace
110probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, 111probe_wakeup_sched_switch(void *ignore,
111 struct task_struct *next) 112 struct task_struct *prev, struct task_struct *next)
112{ 113{
113 struct trace_array_cpu *data; 114 struct trace_array_cpu *data;
114 cycle_t T0, T1, delta; 115 cycle_t T0, T1, delta;
@@ -200,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr)
200} 201}
201 202
202static void 203static void
203probe_wakeup(struct rq *rq, struct task_struct *p, int success) 204probe_wakeup(void *ignore, struct task_struct *p, int success)
204{ 205{
205 struct trace_array_cpu *data; 206 struct trace_array_cpu *data;
206 int cpu = smp_processor_id(); 207 int cpu = smp_processor_id();
@@ -264,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr)
264{ 265{
265 int ret; 266 int ret;
266 267
267 ret = register_trace_sched_wakeup(probe_wakeup); 268 ret = register_trace_sched_wakeup(probe_wakeup, NULL);
268 if (ret) { 269 if (ret) {
269 pr_info("wakeup trace: Couldn't activate tracepoint" 270 pr_info("wakeup trace: Couldn't activate tracepoint"
270 " probe to kernel_sched_wakeup\n"); 271 " probe to kernel_sched_wakeup\n");
271 return; 272 return;
272 } 273 }
273 274
274 ret = register_trace_sched_wakeup_new(probe_wakeup); 275 ret = register_trace_sched_wakeup_new(probe_wakeup, NULL);
275 if (ret) { 276 if (ret) {
276 pr_info("wakeup trace: Couldn't activate tracepoint" 277 pr_info("wakeup trace: Couldn't activate tracepoint"
277 " probe to kernel_sched_wakeup_new\n"); 278 " probe to kernel_sched_wakeup_new\n");
278 goto fail_deprobe; 279 goto fail_deprobe;
279 } 280 }
280 281
281 ret = register_trace_sched_switch(probe_wakeup_sched_switch); 282 ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL);
282 if (ret) { 283 if (ret) {
283 pr_info("sched trace: Couldn't activate tracepoint" 284 pr_info("sched trace: Couldn't activate tracepoint"
284 " probe to kernel_sched_switch\n"); 285 " probe to kernel_sched_switch\n");
285 goto fail_deprobe_wake_new; 286 goto fail_deprobe_wake_new;
286 } 287 }
287 288
288 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); 289 ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
289 if (ret) { 290 if (ret) {
290 pr_info("wakeup trace: Couldn't activate tracepoint" 291 pr_info("wakeup trace: Couldn't activate tracepoint"
291 " probe to kernel_sched_migrate_task\n"); 292 " probe to kernel_sched_migrate_task\n");
@@ -312,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr)
312 313
313 return; 314 return;
314fail_deprobe_wake_new: 315fail_deprobe_wake_new:
315 unregister_trace_sched_wakeup_new(probe_wakeup); 316 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
316fail_deprobe: 317fail_deprobe:
317 unregister_trace_sched_wakeup(probe_wakeup); 318 unregister_trace_sched_wakeup(probe_wakeup, NULL);
318} 319}
319 320
320static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
321{ 322{
322 tracer_enabled = 0; 323 tracer_enabled = 0;
323 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
324 unregister_trace_sched_switch(probe_wakeup_sched_switch); 325 unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL);
325 unregister_trace_sched_wakeup_new(probe_wakeup); 326 unregister_trace_sched_wakeup_new(probe_wakeup, NULL);
326 unregister_trace_sched_wakeup(probe_wakeup); 327 unregister_trace_sched_wakeup(probe_wakeup, NULL);
327 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); 328 unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL);
328} 329}
329 330
330static int __wakeup_tracer_init(struct trace_array *tr) 331static int __wakeup_tracer_init(struct trace_array *tr)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 280fea470d67..250e7f9bd2f0 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -3,6 +3,7 @@
3#include <linux/stringify.h> 3#include <linux/stringify.h>
4#include <linux/kthread.h> 4#include <linux/kthread.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/slab.h>
6 7
7static inline int trace_valid_entry(struct trace_entry *entry) 8static inline int trace_valid_entry(struct trace_entry *entry)
8{ 9{
@@ -16,7 +17,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
16 case TRACE_BRANCH: 17 case TRACE_BRANCH:
17 case TRACE_GRAPH_ENT: 18 case TRACE_GRAPH_ENT:
18 case TRACE_GRAPH_RET: 19 case TRACE_GRAPH_RET:
19 case TRACE_HW_BRANCHES:
20 case TRACE_KSYM: 20 case TRACE_KSYM:
21 return 1; 21 return 1;
22 } 22 }
@@ -29,7 +29,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
29 struct trace_entry *entry; 29 struct trace_entry *entry;
30 unsigned int loops = 0; 30 unsigned int loops = 0;
31 31
32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) { 32 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
33 entry = ring_buffer_event_data(event); 33 entry = ring_buffer_event_data(event);
34 34
35 /* 35 /*
@@ -255,7 +255,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
255/* Maximum number of functions to trace before diagnosing a hang */ 255/* Maximum number of functions to trace before diagnosing a hang */
256#define GRAPH_MAX_FUNC_TEST 100000000 256#define GRAPH_MAX_FUNC_TEST 100000000
257 257
258static void __ftrace_dump(bool disable_tracing); 258static void
259__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
259static unsigned int graph_hang_thresh; 260static unsigned int graph_hang_thresh;
260 261
261/* Wrap the real function entry probe to avoid possible hanging */ 262/* Wrap the real function entry probe to avoid possible hanging */
@@ -266,7 +267,7 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
266 ftrace_graph_stop(); 267 ftrace_graph_stop();
267 printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); 268 printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
268 if (ftrace_dump_on_oops) 269 if (ftrace_dump_on_oops)
269 __ftrace_dump(false); 270 __ftrace_dump(false, DUMP_ALL);
270 return 0; 271 return 0;
271 } 272 }
272 273
@@ -754,62 +755,6 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
754} 755}
755#endif /* CONFIG_BRANCH_TRACER */ 756#endif /* CONFIG_BRANCH_TRACER */
756 757
757#ifdef CONFIG_HW_BRANCH_TRACER
758int
759trace_selftest_startup_hw_branches(struct tracer *trace,
760 struct trace_array *tr)
761{
762 struct trace_iterator *iter;
763 struct tracer tracer;
764 unsigned long count;
765 int ret;
766
767 if (!trace->open) {
768 printk(KERN_CONT "missing open function...");
769 return -1;
770 }
771
772 ret = tracer_init(trace, tr);
773 if (ret) {
774 warn_failed_init_tracer(trace, ret);
775 return ret;
776 }
777
778 /*
779 * The hw-branch tracer needs to collect the trace from the various
780 * cpu trace buffers - before tracing is stopped.
781 */
782 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
783 if (!iter)
784 return -ENOMEM;
785
786 memcpy(&tracer, trace, sizeof(tracer));
787
788 iter->trace = &tracer;
789 iter->tr = tr;
790 iter->pos = -1;
791 mutex_init(&iter->mutex);
792
793 trace->open(iter);
794
795 mutex_destroy(&iter->mutex);
796 kfree(iter);
797
798 tracing_stop();
799
800 ret = trace_test_buffer(tr, &count);
801 trace->reset(tr);
802 tracing_start();
803
804 if (!ret && !count) {
805 printk(KERN_CONT "no entries found..");
806 ret = -1;
807 }
808
809 return ret;
810}
811#endif /* CONFIG_HW_BRANCH_TRACER */
812
813#ifdef CONFIG_KSYM_TRACER 758#ifdef CONFIG_KSYM_TRACER
814static int ksym_selftest_dummy; 759static int ksym_selftest_dummy;
815 760
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index a4bb239eb987..96cffb269e73 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,6 +10,7 @@
10 10
11 11
12#include <linux/list.h> 12#include <linux/list.h>
13#include <linux/slab.h>
13#include <linux/rbtree.h> 14#include <linux/rbtree.h>
14#include <linux/debugfs.h> 15#include <linux/debugfs.h>
15#include "trace_stat.h" 16#include "trace_stat.h"
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 33c2a5b769dc..34e35804304b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,6 @@
1#include <trace/syscall.h> 1#include <trace/syscall.h>
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h>
3#include <linux/kernel.h> 4#include <linux/kernel.h>
4#include <linux/ftrace.h> 5#include <linux/ftrace.h>
5#include <linux/perf_event.h> 6#include <linux/perf_event.h>
@@ -14,6 +15,54 @@ static int sys_refcount_exit;
14static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); 15static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
15static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); 16static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
16 17
18static int syscall_enter_register(struct ftrace_event_call *event,
19 enum trace_reg type);
20static int syscall_exit_register(struct ftrace_event_call *event,
21 enum trace_reg type);
22
23static int syscall_enter_define_fields(struct ftrace_event_call *call);
24static int syscall_exit_define_fields(struct ftrace_event_call *call);
25
26static struct list_head *
27syscall_get_enter_fields(struct ftrace_event_call *call)
28{
29 struct syscall_metadata *entry = call->data;
30
31 return &entry->enter_fields;
32}
33
34static struct list_head *
35syscall_get_exit_fields(struct ftrace_event_call *call)
36{
37 struct syscall_metadata *entry = call->data;
38
39 return &entry->exit_fields;
40}
41
42struct trace_event_functions enter_syscall_print_funcs = {
43 .trace = print_syscall_enter,
44};
45
46struct trace_event_functions exit_syscall_print_funcs = {
47 .trace = print_syscall_exit,
48};
49
50struct ftrace_event_class event_class_syscall_enter = {
51 .system = "syscalls",
52 .reg = syscall_enter_register,
53 .define_fields = syscall_enter_define_fields,
54 .get_fields = syscall_get_enter_fields,
55 .raw_init = init_syscall_trace,
56};
57
58struct ftrace_event_class event_class_syscall_exit = {
59 .system = "syscalls",
60 .reg = syscall_exit_register,
61 .define_fields = syscall_exit_define_fields,
62 .get_fields = syscall_get_exit_fields,
63 .raw_init = init_syscall_trace,
64};
65
17extern unsigned long __start_syscalls_metadata[]; 66extern unsigned long __start_syscalls_metadata[];
18extern unsigned long __stop_syscalls_metadata[]; 67extern unsigned long __stop_syscalls_metadata[];
19 68
@@ -52,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
52} 101}
53 102
54enum print_line_t 103enum print_line_t
55print_syscall_enter(struct trace_iterator *iter, int flags) 104print_syscall_enter(struct trace_iterator *iter, int flags,
105 struct trace_event *event)
56{ 106{
57 struct trace_seq *s = &iter->seq; 107 struct trace_seq *s = &iter->seq;
58 struct trace_entry *ent = iter->ent; 108 struct trace_entry *ent = iter->ent;
@@ -67,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags)
67 if (!entry) 117 if (!entry)
68 goto end; 118 goto end;
69 119
70 if (entry->enter_event->id != ent->type) { 120 if (entry->enter_event->event.type != ent->type) {
71 WARN_ON_ONCE(1); 121 WARN_ON_ONCE(1);
72 goto end; 122 goto end;
73 } 123 }
@@ -104,7 +154,8 @@ end:
104} 154}
105 155
106enum print_line_t 156enum print_line_t
107print_syscall_exit(struct trace_iterator *iter, int flags) 157print_syscall_exit(struct trace_iterator *iter, int flags,
158 struct trace_event *event)
108{ 159{
109 struct trace_seq *s = &iter->seq; 160 struct trace_seq *s = &iter->seq;
110 struct trace_entry *ent = iter->ent; 161 struct trace_entry *ent = iter->ent;
@@ -122,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags)
122 return TRACE_TYPE_HANDLED; 173 return TRACE_TYPE_HANDLED;
123 } 174 }
124 175
125 if (entry->exit_event->id != ent->type) { 176 if (entry->exit_event->event.type != ent->type) {
126 WARN_ON_ONCE(1); 177 WARN_ON_ONCE(1);
127 return TRACE_TYPE_UNHANDLED; 178 return TRACE_TYPE_UNHANDLED;
128 } 179 }
@@ -204,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
204 kfree(call->print_fmt); 255 kfree(call->print_fmt);
205} 256}
206 257
207int syscall_enter_define_fields(struct ftrace_event_call *call) 258static int syscall_enter_define_fields(struct ftrace_event_call *call)
208{ 259{
209 struct syscall_trace_enter trace; 260 struct syscall_trace_enter trace;
210 struct syscall_metadata *meta = call->data; 261 struct syscall_metadata *meta = call->data;
@@ -227,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call)
227 return ret; 278 return ret;
228} 279}
229 280
230int syscall_exit_define_fields(struct ftrace_event_call *call) 281static int syscall_exit_define_fields(struct ftrace_event_call *call)
231{ 282{
232 struct syscall_trace_exit trace; 283 struct syscall_trace_exit trace;
233 int ret; 284 int ret;
@@ -242,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
242 return ret; 293 return ret;
243} 294}
244 295
245void ftrace_syscall_enter(struct pt_regs *regs, long id) 296void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
246{ 297{
247 struct syscall_trace_enter *entry; 298 struct syscall_trace_enter *entry;
248 struct syscall_metadata *sys_data; 299 struct syscall_metadata *sys_data;
@@ -264,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
264 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; 315 size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
265 316
266 event = trace_current_buffer_lock_reserve(&buffer, 317 event = trace_current_buffer_lock_reserve(&buffer,
267 sys_data->enter_event->id, size, 0, 0); 318 sys_data->enter_event->event.type, size, 0, 0);
268 if (!event) 319 if (!event)
269 return; 320 return;
270 321
@@ -277,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id)
277 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 328 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
278} 329}
279 330
280void ftrace_syscall_exit(struct pt_regs *regs, long ret) 331void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
281{ 332{
282 struct syscall_trace_exit *entry; 333 struct syscall_trace_exit *entry;
283 struct syscall_metadata *sys_data; 334 struct syscall_metadata *sys_data;
@@ -296,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
296 return; 347 return;
297 348
298 event = trace_current_buffer_lock_reserve(&buffer, 349 event = trace_current_buffer_lock_reserve(&buffer,
299 sys_data->exit_event->id, sizeof(*entry), 0, 0); 350 sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
300 if (!event) 351 if (!event)
301 return; 352 return;
302 353
@@ -319,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
319 return -ENOSYS; 370 return -ENOSYS;
320 mutex_lock(&syscall_trace_lock); 371 mutex_lock(&syscall_trace_lock);
321 if (!sys_refcount_enter) 372 if (!sys_refcount_enter)
322 ret = register_trace_sys_enter(ftrace_syscall_enter); 373 ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
323 if (!ret) { 374 if (!ret) {
324 set_bit(num, enabled_enter_syscalls); 375 set_bit(num, enabled_enter_syscalls);
325 sys_refcount_enter++; 376 sys_refcount_enter++;
@@ -339,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
339 sys_refcount_enter--; 390 sys_refcount_enter--;
340 clear_bit(num, enabled_enter_syscalls); 391 clear_bit(num, enabled_enter_syscalls);
341 if (!sys_refcount_enter) 392 if (!sys_refcount_enter)
342 unregister_trace_sys_enter(ftrace_syscall_enter); 393 unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
343 mutex_unlock(&syscall_trace_lock); 394 mutex_unlock(&syscall_trace_lock);
344} 395}
345 396
@@ -353,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
353 return -ENOSYS; 404 return -ENOSYS;
354 mutex_lock(&syscall_trace_lock); 405 mutex_lock(&syscall_trace_lock);
355 if (!sys_refcount_exit) 406 if (!sys_refcount_exit)
356 ret = register_trace_sys_exit(ftrace_syscall_exit); 407 ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
357 if (!ret) { 408 if (!ret) {
358 set_bit(num, enabled_exit_syscalls); 409 set_bit(num, enabled_exit_syscalls);
359 sys_refcount_exit++; 410 sys_refcount_exit++;
@@ -373,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
373 sys_refcount_exit--; 424 sys_refcount_exit--;
374 clear_bit(num, enabled_exit_syscalls); 425 clear_bit(num, enabled_exit_syscalls);
375 if (!sys_refcount_exit) 426 if (!sys_refcount_exit)
376 unregister_trace_sys_exit(ftrace_syscall_exit); 427 unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
377 mutex_unlock(&syscall_trace_lock); 428 mutex_unlock(&syscall_trace_lock);
378} 429}
379 430
@@ -433,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls);
433static int sys_perf_refcount_enter; 484static int sys_perf_refcount_enter;
434static int sys_perf_refcount_exit; 485static int sys_perf_refcount_exit;
435 486
436static void perf_syscall_enter(struct pt_regs *regs, long id) 487static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
437{ 488{
438 struct syscall_metadata *sys_data; 489 struct syscall_metadata *sys_data;
439 struct syscall_trace_enter *rec; 490 struct syscall_trace_enter *rec;
440 unsigned long flags; 491 struct hlist_head *head;
441 int syscall_nr; 492 int syscall_nr;
442 int rctx; 493 int rctx;
443 int size; 494 int size;
@@ -460,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id)
460 return; 511 return;
461 512
462 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, 513 rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size,
463 sys_data->enter_event->id, &rctx, &flags); 514 sys_data->enter_event->event.type, regs, &rctx);
464 if (!rec) 515 if (!rec)
465 return; 516 return;
466 517
467 rec->nr = syscall_nr; 518 rec->nr = syscall_nr;
468 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 519 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
469 (unsigned long *)&rec->args); 520 (unsigned long *)&rec->args);
470 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 521
522 head = this_cpu_ptr(sys_data->enter_event->perf_events);
523 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
471} 524}
472 525
473int perf_sysenter_enable(struct ftrace_event_call *call) 526int perf_sysenter_enable(struct ftrace_event_call *call)
@@ -479,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call)
479 532
480 mutex_lock(&syscall_trace_lock); 533 mutex_lock(&syscall_trace_lock);
481 if (!sys_perf_refcount_enter) 534 if (!sys_perf_refcount_enter)
482 ret = register_trace_sys_enter(perf_syscall_enter); 535 ret = register_trace_sys_enter(perf_syscall_enter, NULL);
483 if (ret) { 536 if (ret) {
484 pr_info("event trace: Could not activate" 537 pr_info("event trace: Could not activate"
485 "syscall entry trace point"); 538 "syscall entry trace point");
@@ -501,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call)
501 sys_perf_refcount_enter--; 554 sys_perf_refcount_enter--;
502 clear_bit(num, enabled_perf_enter_syscalls); 555 clear_bit(num, enabled_perf_enter_syscalls);
503 if (!sys_perf_refcount_enter) 556 if (!sys_perf_refcount_enter)
504 unregister_trace_sys_enter(perf_syscall_enter); 557 unregister_trace_sys_enter(perf_syscall_enter, NULL);
505 mutex_unlock(&syscall_trace_lock); 558 mutex_unlock(&syscall_trace_lock);
506} 559}
507 560
508static void perf_syscall_exit(struct pt_regs *regs, long ret) 561static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
509{ 562{
510 struct syscall_metadata *sys_data; 563 struct syscall_metadata *sys_data;
511 struct syscall_trace_exit *rec; 564 struct syscall_trace_exit *rec;
512 unsigned long flags; 565 struct hlist_head *head;
513 int syscall_nr; 566 int syscall_nr;
514 int rctx; 567 int rctx;
515 int size; 568 int size;
@@ -535,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret)
535 return; 588 return;
536 589
537 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, 590 rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size,
538 sys_data->exit_event->id, &rctx, &flags); 591 sys_data->exit_event->event.type, regs, &rctx);
539 if (!rec) 592 if (!rec)
540 return; 593 return;
541 594
542 rec->nr = syscall_nr; 595 rec->nr = syscall_nr;
543 rec->ret = syscall_get_return_value(current, regs); 596 rec->ret = syscall_get_return_value(current, regs);
544 597
545 perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); 598 head = this_cpu_ptr(sys_data->exit_event->perf_events);
599 perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head);
546} 600}
547 601
548int perf_sysexit_enable(struct ftrace_event_call *call) 602int perf_sysexit_enable(struct ftrace_event_call *call)
@@ -554,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call)
554 608
555 mutex_lock(&syscall_trace_lock); 609 mutex_lock(&syscall_trace_lock);
556 if (!sys_perf_refcount_exit) 610 if (!sys_perf_refcount_exit)
557 ret = register_trace_sys_exit(perf_syscall_exit); 611 ret = register_trace_sys_exit(perf_syscall_exit, NULL);
558 if (ret) { 612 if (ret) {
559 pr_info("event trace: Could not activate" 613 pr_info("event trace: Could not activate"
560 "syscall exit trace point"); 614 "syscall exit trace point");
@@ -576,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call)
576 sys_perf_refcount_exit--; 630 sys_perf_refcount_exit--;
577 clear_bit(num, enabled_perf_exit_syscalls); 631 clear_bit(num, enabled_perf_exit_syscalls);
578 if (!sys_perf_refcount_exit) 632 if (!sys_perf_refcount_exit)
579 unregister_trace_sys_exit(perf_syscall_exit); 633 unregister_trace_sys_exit(perf_syscall_exit, NULL);
580 mutex_unlock(&syscall_trace_lock); 634 mutex_unlock(&syscall_trace_lock);
581} 635}
582 636
583#endif /* CONFIG_PERF_EVENTS */ 637#endif /* CONFIG_PERF_EVENTS */
584 638
639static int syscall_enter_register(struct ftrace_event_call *event,
640 enum trace_reg type)
641{
642 switch (type) {
643 case TRACE_REG_REGISTER:
644 return reg_event_syscall_enter(event);
645 case TRACE_REG_UNREGISTER:
646 unreg_event_syscall_enter(event);
647 return 0;
648
649#ifdef CONFIG_PERF_EVENTS
650 case TRACE_REG_PERF_REGISTER:
651 return perf_sysenter_enable(event);
652 case TRACE_REG_PERF_UNREGISTER:
653 perf_sysenter_disable(event);
654 return 0;
655#endif
656 }
657 return 0;
658}
659
660static int syscall_exit_register(struct ftrace_event_call *event,
661 enum trace_reg type)
662{
663 switch (type) {
664 case TRACE_REG_REGISTER:
665 return reg_event_syscall_exit(event);
666 case TRACE_REG_UNREGISTER:
667 unreg_event_syscall_exit(event);
668 return 0;
669
670#ifdef CONFIG_PERF_EVENTS
671 case TRACE_REG_PERF_REGISTER:
672 return perf_sysexit_enable(event);
673 case TRACE_REG_PERF_UNREGISTER:
674 perf_sysexit_disable(event);
675 return 0;
676#endif
677 }
678 return 0;
679}
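With the two event classes defined above, enabling or disabling a syscall event no longer goes through per-event function pointers; the core calls the class ->reg hook with the desired trace_reg type, and syscall_enter_register()/syscall_exit_register() fan out to the tracing or perf paths. A hedged sketch of such a call site, assuming ftrace_event_call carries a pointer to its class as introduced elsewhere in this series:

    /* Sketch only: drive a syscall event through its class ->reg hook. */
    static int enable_syscall_event(struct ftrace_event_call *call)
    {
            if (call->class && call->class->reg)
                    return call->class->reg(call, TRACE_REG_REGISTER);

            return -ENOSYS;
    }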
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 40cafb07dffd..a7cc3793baf6 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -9,6 +9,7 @@
9#include <trace/events/workqueue.h> 9#include <trace/events/workqueue.h>
10#include <linux/list.h> 10#include <linux/list.h>
11#include <linux/percpu.h> 11#include <linux/percpu.h>
12#include <linux/slab.h>
12#include <linux/kref.h> 13#include <linux/kref.h>
13#include "trace_stat.h" 14#include "trace_stat.h"
14#include "trace.h" 15#include "trace.h"
@@ -48,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref)
48 49
49/* Insertion of a work */ 50/* Insertion of a work */
50static void 51static void
51probe_workqueue_insertion(struct task_struct *wq_thread, 52probe_workqueue_insertion(void *ignore,
53 struct task_struct *wq_thread,
52 struct work_struct *work) 54 struct work_struct *work)
53{ 55{
54 int cpu = cpumask_first(&wq_thread->cpus_allowed); 56 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -69,7 +71,8 @@ found:
69 71
70/* Execution of a work */ 72/* Execution of a work */
71static void 73static void
72probe_workqueue_execution(struct task_struct *wq_thread, 74probe_workqueue_execution(void *ignore,
75 struct task_struct *wq_thread,
73 struct work_struct *work) 76 struct work_struct *work)
74{ 77{
75 int cpu = cpumask_first(&wq_thread->cpus_allowed); 78 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -89,7 +92,8 @@ found:
89} 92}
90 93
91/* Creation of a cpu workqueue thread */ 94/* Creation of a cpu workqueue thread */
92static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) 95static void probe_workqueue_creation(void *ignore,
96 struct task_struct *wq_thread, int cpu)
93{ 97{
94 struct cpu_workqueue_stats *cws; 98 struct cpu_workqueue_stats *cws;
95 unsigned long flags; 99 unsigned long flags;
@@ -113,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
113} 117}
114 118
115/* Destruction of a cpu workqueue thread */ 119/* Destruction of a cpu workqueue thread */
116static void probe_workqueue_destruction(struct task_struct *wq_thread) 120static void
121probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread)
117{ 122{
118 /* Workqueue only execute on one cpu */ 123 /* Workqueue only execute on one cpu */
119 int cpu = cpumask_first(&wq_thread->cpus_allowed); 124 int cpu = cpumask_first(&wq_thread->cpus_allowed);
@@ -258,19 +263,19 @@ int __init trace_workqueue_early_init(void)
258{ 263{
259 int ret, cpu; 264 int ret, cpu;
260 265
261 ret = register_trace_workqueue_insertion(probe_workqueue_insertion); 266 ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
262 if (ret) 267 if (ret)
263 goto out; 268 goto out;
264 269
265 ret = register_trace_workqueue_execution(probe_workqueue_execution); 270 ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL);
266 if (ret) 271 if (ret)
267 goto no_insertion; 272 goto no_insertion;
268 273
269 ret = register_trace_workqueue_creation(probe_workqueue_creation); 274 ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL);
270 if (ret) 275 if (ret)
271 goto no_execution; 276 goto no_execution;
272 277
273 ret = register_trace_workqueue_destruction(probe_workqueue_destruction); 278 ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL);
274 if (ret) 279 if (ret)
275 goto no_creation; 280 goto no_creation;
276 281
@@ -282,11 +287,11 @@ int __init trace_workqueue_early_init(void)
282 return 0; 287 return 0;
283 288
284no_creation: 289no_creation:
285 unregister_trace_workqueue_creation(probe_workqueue_creation); 290 unregister_trace_workqueue_creation(probe_workqueue_creation, NULL);
286no_execution: 291no_execution:
287 unregister_trace_workqueue_execution(probe_workqueue_execution); 292 unregister_trace_workqueue_execution(probe_workqueue_execution, NULL);
288no_insertion: 293no_insertion:
289 unregister_trace_workqueue_insertion(probe_workqueue_insertion); 294 unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL);
290out: 295out:
291 pr_warning("trace_workqueue: unable to trace workqueues\n"); 296 pr_warning("trace_workqueue: unable to trace workqueues\n");
292 297
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index cc89be5bc0f8..c77f3eceea25 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
54 */ 54 */
55struct tracepoint_entry { 55struct tracepoint_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 void **funcs; 57 struct tracepoint_func *funcs;
58 int refcount; /* Number of times armed. 0 if disarmed. */ 58 int refcount; /* Number of times armed. 0 if disarmed. */
59 char name[0]; 59 char name[0];
60}; 60};
@@ -64,12 +64,12 @@ struct tp_probes {
64 struct rcu_head rcu; 64 struct rcu_head rcu;
65 struct list_head list; 65 struct list_head list;
66 } u; 66 } u;
67 void *probes[0]; 67 struct tracepoint_func probes[0];
68}; 68};
69 69
70static inline void *allocate_probes(int count) 70static inline void *allocate_probes(int count)
71{ 71{
72 struct tp_probes *p = kmalloc(count * sizeof(void *) 72 struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
73 + sizeof(struct tp_probes), GFP_KERNEL); 73 + sizeof(struct tp_probes), GFP_KERNEL);
74 return p == NULL ? NULL : p->probes; 74 return p == NULL ? NULL : p->probes;
75} 75}
@@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head)
79 kfree(container_of(head, struct tp_probes, u.rcu)); 79 kfree(container_of(head, struct tp_probes, u.rcu));
80} 80}
81 81
82static inline void release_probes(void *old) 82static inline void release_probes(struct tracepoint_func *old)
83{ 83{
84 if (old) { 84 if (old) {
85 struct tp_probes *tp_probes = container_of(old, 85 struct tp_probes *tp_probes = container_of(old,
@@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry)
95 if (!tracepoint_debug || !entry->funcs) 95 if (!tracepoint_debug || !entry->funcs)
96 return; 96 return;
97 97
98 for (i = 0; entry->funcs[i]; i++) 98 for (i = 0; entry->funcs[i].func; i++)
99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); 99 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func);
100} 100}
101 101
102static void * 102static struct tracepoint_func *
103tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) 103tracepoint_entry_add_probe(struct tracepoint_entry *entry,
104 void *probe, void *data)
104{ 105{
105 int nr_probes = 0; 106 int nr_probes = 0;
106 void **old, **new; 107 struct tracepoint_func *old, *new;
107 108
108 WARN_ON(!probe); 109 WARN_ON(!probe);
109 110
@@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
111 old = entry->funcs; 112 old = entry->funcs;
112 if (old) { 113 if (old) {
113 /* (N -> N+1), (N != 0, 1) probes */ 114 /* (N -> N+1), (N != 0, 1) probes */
114 for (nr_probes = 0; old[nr_probes]; nr_probes++) 115 for (nr_probes = 0; old[nr_probes].func; nr_probes++)
115 if (old[nr_probes] == probe) 116 if (old[nr_probes].func == probe &&
117 old[nr_probes].data == data)
116 return ERR_PTR(-EEXIST); 118 return ERR_PTR(-EEXIST);
117 } 119 }
118 /* + 2 : one for new probe, one for NULL func */ 120 /* + 2 : one for new probe, one for NULL func */
@@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
120 if (new == NULL) 122 if (new == NULL)
121 return ERR_PTR(-ENOMEM); 123 return ERR_PTR(-ENOMEM);
122 if (old) 124 if (old)
123 memcpy(new, old, nr_probes * sizeof(void *)); 125 memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
124 new[nr_probes] = probe; 126 new[nr_probes].func = probe;
125 new[nr_probes + 1] = NULL; 127 new[nr_probes].data = data;
128 new[nr_probes + 1].func = NULL;
126 entry->refcount = nr_probes + 1; 129 entry->refcount = nr_probes + 1;
127 entry->funcs = new; 130 entry->funcs = new;
128 debug_print_probes(entry); 131 debug_print_probes(entry);
@@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
130} 133}
131 134
132static void * 135static void *
133tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) 136tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
137 void *probe, void *data)
134{ 138{
135 int nr_probes = 0, nr_del = 0, i; 139 int nr_probes = 0, nr_del = 0, i;
136 void **old, **new; 140 struct tracepoint_func *old, *new;
137 141
138 old = entry->funcs; 142 old = entry->funcs;
139 143
@@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
142 146
143 debug_print_probes(entry); 147 debug_print_probes(entry);
144 /* (N -> M), (N > 1, M >= 0) probes */ 148 /* (N -> M), (N > 1, M >= 0) probes */
145 for (nr_probes = 0; old[nr_probes]; nr_probes++) { 149 for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
146 if ((!probe || old[nr_probes] == probe)) 150 if (!probe ||
151 (old[nr_probes].func == probe &&
152 old[nr_probes].data == data))
147 nr_del++; 153 nr_del++;
148 } 154 }
149 155
@@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
160 new = allocate_probes(nr_probes - nr_del + 1); 166 new = allocate_probes(nr_probes - nr_del + 1);
161 if (new == NULL) 167 if (new == NULL)
162 return ERR_PTR(-ENOMEM); 168 return ERR_PTR(-ENOMEM);
163 for (i = 0; old[i]; i++) 169 for (i = 0; old[i].func; i++)
164 if ((probe && old[i] != probe)) 170 if (probe &&
171 (old[i].func != probe || old[i].data != data))
165 new[j++] = old[i]; 172 new[j++] = old[i];
166 new[nr_probes - nr_del] = NULL; 173 new[nr_probes - nr_del].func = NULL;
167 entry->refcount = nr_probes - nr_del; 174 entry->refcount = nr_probes - nr_del;
168 entry->funcs = new; 175 entry->funcs = new;
169 } 176 }
@@ -315,18 +322,19 @@ static void tracepoint_update_probes(void)
315 module_update_tracepoints(); 322 module_update_tracepoints();
316} 323}
317 324
318static void *tracepoint_add_probe(const char *name, void *probe) 325static struct tracepoint_func *
326tracepoint_add_probe(const char *name, void *probe, void *data)
319{ 327{
320 struct tracepoint_entry *entry; 328 struct tracepoint_entry *entry;
321 void *old; 329 struct tracepoint_func *old;
322 330
323 entry = get_tracepoint(name); 331 entry = get_tracepoint(name);
324 if (!entry) { 332 if (!entry) {
325 entry = add_tracepoint(name); 333 entry = add_tracepoint(name);
326 if (IS_ERR(entry)) 334 if (IS_ERR(entry))
327 return entry; 335 return (struct tracepoint_func *)entry;
328 } 336 }
329 old = tracepoint_entry_add_probe(entry, probe); 337 old = tracepoint_entry_add_probe(entry, probe, data);
330 if (IS_ERR(old) && !entry->refcount) 338 if (IS_ERR(old) && !entry->refcount)
331 remove_tracepoint(entry); 339 remove_tracepoint(entry);
332 return old; 340 return old;
@@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe)
340 * Returns 0 if ok, error value on error. 348 * Returns 0 if ok, error value on error.
341 * The probe address must at least be aligned on the architecture pointer size. 349 * The probe address must at least be aligned on the architecture pointer size.
342 */ 350 */
343int tracepoint_probe_register(const char *name, void *probe) 351int tracepoint_probe_register(const char *name, void *probe, void *data)
344{ 352{
345 void *old; 353 struct tracepoint_func *old;
346 354
347 mutex_lock(&tracepoints_mutex); 355 mutex_lock(&tracepoints_mutex);
348 old = tracepoint_add_probe(name, probe); 356 old = tracepoint_add_probe(name, probe, data);
349 mutex_unlock(&tracepoints_mutex); 357 mutex_unlock(&tracepoints_mutex);
350 if (IS_ERR(old)) 358 if (IS_ERR(old))
351 return PTR_ERR(old); 359 return PTR_ERR(old);
@@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe)
356} 364}
357EXPORT_SYMBOL_GPL(tracepoint_probe_register); 365EXPORT_SYMBOL_GPL(tracepoint_probe_register);
358 366
359static void *tracepoint_remove_probe(const char *name, void *probe) 367static struct tracepoint_func *
368tracepoint_remove_probe(const char *name, void *probe, void *data)
360{ 369{
361 struct tracepoint_entry *entry; 370 struct tracepoint_entry *entry;
362 void *old; 371 struct tracepoint_func *old;
363 372
364 entry = get_tracepoint(name); 373 entry = get_tracepoint(name);
365 if (!entry) 374 if (!entry)
366 return ERR_PTR(-ENOENT); 375 return ERR_PTR(-ENOENT);
367 old = tracepoint_entry_remove_probe(entry, probe); 376 old = tracepoint_entry_remove_probe(entry, probe, data);
368 if (IS_ERR(old)) 377 if (IS_ERR(old))
369 return old; 378 return old;
370 if (!entry->refcount) 379 if (!entry->refcount)
@@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe)
382 * itself uses stop_machine(), which insures that every preempt disabled section 391 * itself uses stop_machine(), which insures that every preempt disabled section
383 * have finished. 392 * have finished.
384 */ 393 */
385int tracepoint_probe_unregister(const char *name, void *probe) 394int tracepoint_probe_unregister(const char *name, void *probe, void *data)
386{ 395{
387 void *old; 396 struct tracepoint_func *old;
388 397
389 mutex_lock(&tracepoints_mutex); 398 mutex_lock(&tracepoints_mutex);
390 old = tracepoint_remove_probe(name, probe); 399 old = tracepoint_remove_probe(name, probe, data);
391 mutex_unlock(&tracepoints_mutex); 400 mutex_unlock(&tracepoints_mutex);
392 if (IS_ERR(old)) 401 if (IS_ERR(old))
393 return PTR_ERR(old); 402 return PTR_ERR(old);
@@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old)
418 * 427 *
419 * caller must call tracepoint_probe_update_all() 428 * caller must call tracepoint_probe_update_all()
420 */ 429 */
421int tracepoint_probe_register_noupdate(const char *name, void *probe) 430int tracepoint_probe_register_noupdate(const char *name, void *probe,
431 void *data)
422{ 432{
423 void *old; 433 struct tracepoint_func *old;
424 434
425 mutex_lock(&tracepoints_mutex); 435 mutex_lock(&tracepoints_mutex);
426 old = tracepoint_add_probe(name, probe); 436 old = tracepoint_add_probe(name, probe, data);
427 if (IS_ERR(old)) { 437 if (IS_ERR(old)) {
428 mutex_unlock(&tracepoints_mutex); 438 mutex_unlock(&tracepoints_mutex);
429 return PTR_ERR(old); 439 return PTR_ERR(old);
@@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate);
441 * 451 *
442 * caller must call tracepoint_probe_update_all() 452 * caller must call tracepoint_probe_update_all()
443 */ 453 */
444int tracepoint_probe_unregister_noupdate(const char *name, void *probe) 454int tracepoint_probe_unregister_noupdate(const char *name, void *probe,
455 void *data)
445{ 456{
446 void *old; 457 struct tracepoint_func *old;
447 458
448 mutex_lock(&tracepoints_mutex); 459 mutex_lock(&tracepoints_mutex);
449 old = tracepoint_remove_probe(name, probe); 460 old = tracepoint_remove_probe(name, probe, data);
450 if (IS_ERR(old)) { 461 if (IS_ERR(old)) {
451 mutex_unlock(&tracepoints_mutex); 462 mutex_unlock(&tracepoints_mutex);
452 return PTR_ERR(old); 463 return PTR_ERR(old);
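The tracepoint core now stores (func, data) pairs in struct tracepoint_func instead of bare function pointers, so registration and removal are keyed on the pair: tracepoint_entry_remove_probe() above only drops an entry when both func and data match. A short sketch against the updated generic API; the tracepoint name, probe signature, and cookie are illustrative only:

    /* Hypothetical probe for a tracepoint named "my_tracepoint"; its first
     * argument is the private pointer supplied at registration time. */
    static void my_probe(void *data, int value)
    {
            long *cookie = data;

            pr_debug("my_tracepoint: value=%d cookie=%ld\n", value, *cookie);
    }

    static long my_cookie = 42;

    /* Unregistration must pass the same (probe, data) pair:
     *   tracepoint_probe_register("my_tracepoint", my_probe, &my_cookie);
     *   tracepoint_probe_unregister("my_tracepoint", my_probe, &my_cookie);
     */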
diff --git a/kernel/user.c b/kernel/user.c
index 766467b3bcb7..7e72614b736d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,7 +16,6 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19#include "cred-internals.h"
20 19
21struct user_namespace init_user_ns = { 20struct user_namespace init_user_ns = {
22 .kref = { 21 .kref = {
@@ -137,9 +136,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
137 struct hlist_head *hashent = uidhashentry(ns, uid); 136 struct hlist_head *hashent = uidhashentry(ns, uid);
138 struct user_struct *up, *new; 137 struct user_struct *up, *new;
139 138
140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
141 * atomic.
142 */
143 spin_lock_irq(&uidhash_lock); 139 spin_lock_irq(&uidhash_lock);
144 up = uid_hash_find(uid, hashent); 140 up = uid_hash_find(uid, hashent);
145 spin_unlock_irq(&uidhash_lock); 141 spin_unlock_irq(&uidhash_lock);
@@ -161,11 +157,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
161 spin_lock_irq(&uidhash_lock); 157 spin_lock_irq(&uidhash_lock);
162 up = uid_hash_find(uid, hashent); 158 up = uid_hash_find(uid, hashent);
163 if (up) { 159 if (up) {
164 /* This case is not possible when CONFIG_USER_SCHED
165 * is defined, since we serialize alloc_uid() using
166 * uids_mutex. Hence no need to call
167 * sched_destroy_user() or remove_user_sysfs_dir().
168 */
169 key_put(new->uid_keyring); 160 key_put(new->uid_keyring);
170 key_put(new->session_keyring); 161 key_put(new->session_keyring);
171 kmem_cache_free(uid_cachep, new); 162 kmem_cache_free(uid_cachep, new);
@@ -178,8 +169,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
178 169
179 return up; 170 return up;
180 171
181 put_user_ns(new->user_ns);
182 kmem_cache_free(uid_cachep, new);
183out_unlock: 172out_unlock:
184 return NULL; 173 return NULL;
185} 174}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 076c7c8215b0..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
12#include <linux/highuid.h>
12#include <linux/cred.h> 13#include <linux/cred.h>
13 14
14/* 15/*
@@ -54,8 +55,8 @@ int create_user_ns(struct cred *new)
54#endif 55#endif
55 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */ 56 /* tgcred will be cleared in our caller bc CLONE_THREAD won't be set */
56 57
57 /* alloc_uid() incremented the userns refcount. Just set it to 1 */ 58 /* root_user holds a reference to ns, our reference can be dropped */
58 kref_set(&ns->kref, 1); 59 put_user_ns(ns);
59 60
60 return 0; 61 return 0;
61} 62}
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
82 schedule_work(&ns->destroyer); 83 schedule_work(&ns->destroyer);
83} 84}
84EXPORT_SYMBOL(free_user_ns); 85EXPORT_SYMBOL(free_user_ns);
86
87uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
88{
89 struct user_namespace *tmp;
90
91 if (likely(to == cred->user->user_ns))
92 return uid;
93
94
95 /* Is cred->user the creator of the target user_ns
 96 * or the creator of one of its parents?
97 */
98 for ( tmp = to; tmp != &init_user_ns;
99 tmp = tmp->creator->user_ns ) {
100 if (cred->user == tmp->creator) {
101 return (uid_t)0;
102 }
103 }
104
105 /* No useful relationship so no mapping */
106 return overflowuid;
107}
108
109gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
110{
111 struct user_namespace *tmp;
112
113 if (likely(to == cred->user->user_ns))
114 return gid;
115
116 /* Is cred->user the creator of the target user_ns
 117 * or the creator of one of its parents?
118 */
119 for ( tmp = to; tmp != &init_user_ns;
120 tmp = tmp->creator->user_ns ) {
121 if (cred->user == tmp->creator) {
122 return (gid_t)0;
123 }
124 }
125
126 /* No useful relationship so no mapping */
127 return overflowgid;
128}
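The two new helpers apply the same rule to uids and gids: inside the owner's namespace the id is returned unchanged, the id maps to root (0) when cred->user created the target namespace or one of its ancestors, and otherwise the overflow id is returned. A sketch of how a consumer might use the uid variant; the wrapper name and parameters are assumptions, only user_ns_map_uid() comes from this diff:

    /* Sketch: present an object's owner uid to a viewer in another user
     * namespace, falling back to overflowuid when there is no useful
     * relationship between the two. */
    static uid_t owner_uid_for_viewer(struct user_namespace *viewer_ns,
                                      const struct cred *owner_cred, uid_t uid)
    {
            return user_ns_map_uid(viewer_ns, owner_cred, uid);
    }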
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dee48658805c..327d2deb4451 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -229,6 +229,16 @@ static inline void set_wq_data(struct work_struct *work,
229 atomic_long_set(&work->data, new); 229 atomic_long_set(&work->data, new);
230} 230}
231 231
232/*
233 * Clear WORK_STRUCT_PENDING and the workqueue on which it was queued.
234 */
235static inline void clear_wq_data(struct work_struct *work)
236{
237 unsigned long flags = *work_data_bits(work) &
238 (1UL << WORK_STRUCT_STATIC);
239 atomic_long_set(&work->data, flags);
240}
241
232static inline 242static inline
233struct cpu_workqueue_struct *get_wq_data(struct work_struct *work) 243struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
234{ 244{
@@ -671,7 +681,7 @@ static int __cancel_work_timer(struct work_struct *work,
671 wait_on_work(work); 681 wait_on_work(work);
672 } while (unlikely(ret < 0)); 682 } while (unlikely(ret < 0));
673 683
674 work_clear_pending(work); 684 clear_wq_data(work);
675 return ret; 685 return ret;
676} 686}
677 687
@@ -774,7 +784,7 @@ void flush_delayed_work(struct delayed_work *dwork)
774{ 784{
775 if (del_timer_sync(&dwork->timer)) { 785 if (del_timer_sync(&dwork->timer)) {
776 struct cpu_workqueue_struct *cwq; 786 struct cpu_workqueue_struct *cwq;
777 cwq = wq_per_cpu(keventd_wq, get_cpu()); 787 cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu());
778 __queue_work(cwq, &dwork->work); 788 __queue_work(cwq, &dwork->work);
779 put_cpu(); 789 put_cpu();
780 } 790 }
@@ -845,6 +855,30 @@ int schedule_on_each_cpu(work_func_t func)
845 return 0; 855 return 0;
846} 856}
847 857
858/**
859 * flush_scheduled_work - ensure that any scheduled work has run to completion.
860 *
861 * Forces execution of the kernel-global workqueue and blocks until its
862 * completion.
863 *
864 * Think twice before calling this function! It's very easy to get into
865 * trouble if you don't take great care. Either of the following situations
866 * will lead to deadlock:
867 *
868 * One of the work items currently on the workqueue needs to acquire
869 * a lock held by your code or its caller.
870 *
871 * Your code is running in the context of a work routine.
872 *
873 * They will be detected by lockdep when they occur, but the first might not
874 * occur very often. It depends on what work items are on the workqueue and
875 * what locks they need, which you have no control over.
876 *
877 * In most situations flushing the entire workqueue is overkill; you merely
878 * need to know that a particular work item isn't queued and isn't running.
879 * In such cases you should use cancel_delayed_work_sync() or
880 * cancel_work_sync() instead.
881 */
848void flush_scheduled_work(void) 882void flush_scheduled_work(void)
849{ 883{
850 flush_workqueue(keventd_wq); 884 flush_workqueue(keventd_wq);
@@ -1076,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
1076 unsigned int cpu = (unsigned long)hcpu; 1110 unsigned int cpu = (unsigned long)hcpu;
1077 struct cpu_workqueue_struct *cwq; 1111 struct cpu_workqueue_struct *cwq;
1078 struct workqueue_struct *wq; 1112 struct workqueue_struct *wq;
1079 int ret = NOTIFY_OK; 1113 int err = 0;
1080 1114
1081 action &= ~CPU_TASKS_FROZEN; 1115 action &= ~CPU_TASKS_FROZEN;
1082 1116
@@ -1090,12 +1124,13 @@ undo:
1090 1124
1091 switch (action) { 1125 switch (action) {
1092 case CPU_UP_PREPARE: 1126 case CPU_UP_PREPARE:
1093 if (!create_workqueue_thread(cwq, cpu)) 1127 err = create_workqueue_thread(cwq, cpu);
1128 if (!err)
1094 break; 1129 break;
1095 printk(KERN_ERR "workqueue [%s] for %i failed\n", 1130 printk(KERN_ERR "workqueue [%s] for %i failed\n",
1096 wq->name, cpu); 1131 wq->name, cpu);
1097 action = CPU_UP_CANCELED; 1132 action = CPU_UP_CANCELED;
1098 ret = NOTIFY_BAD; 1133 err = -ENOMEM;
1099 goto undo; 1134 goto undo;
1100 1135
1101 case CPU_ONLINE: 1136 case CPU_ONLINE:
@@ -1116,7 +1151,7 @@ undo:
1116 cpumask_clear_cpu(cpu, cpu_populated_map); 1151 cpumask_clear_cpu(cpu, cpu_populated_map);
1117 } 1152 }
1118 1153
1119 return ret; 1154 return notifier_from_errno(err);
1120} 1155}
1121 1156
1122#ifdef CONFIG_SMP 1157#ifdef CONFIG_SMP
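The new kernel-doc on flush_scheduled_work() discourages flushing the shared workqueue and points to cancel_work_sync()/cancel_delayed_work_sync() for the common case of tearing down a single item. A brief sketch of that recommended pattern; the work item and teardown function are illustrative, not part of the patch:

    static struct delayed_work my_poll_work;        /* illustrative item */

    static void my_driver_teardown(void)
    {
            /* Removes a pending instance and waits for a running one,
             * without forcing every other item on keventd_wq to finish. */
            cancel_delayed_work_sync(&my_poll_work);
    }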