Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                        |   19
-rw-r--r--  kernel/audit.c                         |    6
-rw-r--r--  kernel/auditfilter.c                   |    3
-rw-r--r--  kernel/backtracetest.c                 |   65
-rw-r--r--  kernel/capability.c                    |  132
-rw-r--r--  kernel/cgroup.c                        |    2
-rw-r--r--  kernel/cpu.c                           |   25
-rw-r--r--  kernel/cpuset.c                        |   56
-rw-r--r--  kernel/exit.c                          |  459
-rw-r--r--  kernel/fork.c                          |    9
-rw-r--r--  kernel/futex.c                         |   93
-rw-r--r--  kernel/hrtimer.c                       |   21
-rw-r--r--  kernel/irq/manage.c                    |   33
-rw-r--r--  kernel/irq/proc.c                      |   59
-rw-r--r--  kernel/kgdb.c                          |   19
-rw-r--r--  kernel/kprobes.c                       |   17
-rw-r--r--  kernel/kthread.c                       |    3
-rw-r--r--  kernel/lockdep.c                       |   80
-rw-r--r--  kernel/lockdep_internals.h             |    6
-rw-r--r--  kernel/lockdep_proc.c                  |   97
-rw-r--r--  kernel/marker.c                        |   30
-rw-r--r--  kernel/module.c                        |   18
-rw-r--r--  kernel/mutex-debug.c                   |    2
-rw-r--r--  kernel/mutex.c                         |    5
-rw-r--r--  kernel/pid.c                           |    1
-rw-r--r--  kernel/pm_qos_params.c                 |    7
-rw-r--r--  kernel/power/disk.c                    |   50
-rw-r--r--  kernel/power/main.c                    |   16
-rw-r--r--  kernel/power/process.c                 |   97
-rw-r--r--  kernel/power/user.c                    |   71
-rw-r--r--  kernel/printk.c                        |  112
-rw-r--r--  kernel/profile.c                       |    6
-rw-r--r--  kernel/ptrace.c                        |   52
-rw-r--r--  kernel/rcuclassic.c                    |   52
-rw-r--r--  kernel/rcupdate.c                      |   71
-rw-r--r--  kernel/rcupreempt.c                    |  440
-rw-r--r--  kernel/rcupreempt_trace.c              |    1
-rw-r--r--  kernel/rcutorture.c                    |  174
-rw-r--r--  kernel/relay.c                         |    2
-rw-r--r--  kernel/sched.c                         |  972
-rw-r--r--  kernel/sched_clock.c                   |  153
-rw-r--r--  kernel/sched_cpupri.c                  |  174
-rw-r--r--  kernel/sched_cpupri.h                  |   36
-rw-r--r--  kernel/sched_debug.c                   |   59
-rw-r--r--  kernel/sched_fair.c                    |  247
-rw-r--r--  kernel/sched_features.h                |    7
-rw-r--r--  kernel/sched_rt.c                      |  457
-rw-r--r--  kernel/sched_stats.h                   |   49
-rw-r--r--  kernel/semaphore.c                     |    1
-rw-r--r--  kernel/signal.c                        |   51
-rw-r--r--  kernel/smp.c                           |  383
-rw-r--r--  kernel/softirq.c                       |   69
-rw-r--r--  kernel/softlockup.c                    |   16
-rw-r--r--  kernel/spinlock.c                      |    2
-rw-r--r--  kernel/stacktrace.c                    |   14
-rw-r--r--  kernel/stop_machine.c                  |    9
-rw-r--r--  kernel/sys.c                           |    6
-rw-r--r--  kernel/sysctl.c                        |   44
-rw-r--r--  kernel/time/tick-broadcast.c           |    8
-rw-r--r--  kernel/time/tick-sched.c               |   22
-rw-r--r--  kernel/timer.c                         |   12
-rw-r--r--  kernel/trace/Kconfig                   |  135
-rw-r--r--  kernel/trace/Makefile                  |   24
-rw-r--r--  kernel/trace/ftrace.c                  | 1727
-rw-r--r--  kernel/trace/trace.c                   | 3161
-rw-r--r--  kernel/trace/trace.h                   |  339
-rw-r--r--  kernel/trace/trace_functions.c         |   81
-rw-r--r--  kernel/trace/trace_irqsoff.c           |  486
-rw-r--r--  kernel/trace/trace_mmiotrace.c         |  295
-rw-r--r--  kernel/trace/trace_sched_switch.c      |  286
-rw-r--r--  kernel/trace/trace_sched_wakeup.c      |  448
-rw-r--r--  kernel/trace/trace_selftest.c          |  563
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c  |    7
-rw-r--r--  kernel/trace/trace_sysprof.c           |  363
-rw-r--r--  kernel/workqueue.c                     |    2
 75 files changed, 11636 insertions(+), 1483 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..985ddb7da4d0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
-	    exit.o itimer.o time.o softirq.o resource.o \
+	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
@@ -11,6 +11,17 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -mno-spe -pg
+endif
+
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -27,7 +38,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +81,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e8692a5748c2..e092f1c0ce30 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -738,7 +738,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 		return 0;
 
-	err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+	err = audit_filter_user(&NETLINK_CB(skb));
 	if (err == 1) {
 		err = 0;
 		if (msg_type == AUDIT_USER_TTY) {
@@ -779,7 +779,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
@@ -798,7 +798,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST_RULES:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0e0bd27e6512..98c50cc671bb 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1544,6 +1544,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
  * @data: payload data
  * @datasz: size of payload data
  * @loginuid: loginuid of sender
+ * @sessionid: sessionid for netlink audit message
  * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
@@ -1720,7 +1721,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 	return 1;
 }
 
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
+int audit_filter_user(struct netlink_skb_parms *cb)
 {
 	enum audit_state state = AUDIT_DISABLED;
 	struct audit_entry *e;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index d1a7605c5b8f..a5e026bc45c4 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,30 +10,73 @@
  * of the License.
  */
 
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/stacktrace.h>
+
+static void backtrace_test_normal(void)
+{
+	printk("Testing a backtrace from process context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
 
-static struct timer_list backtrace_timer;
+	dump_stack();
+}
 
-static void backtrace_test_timer(unsigned long data)
+static DECLARE_COMPLETION(backtrace_work);
+
+static void backtrace_test_irq_callback(unsigned long data)
+{
+	dump_stack();
+	complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+
+static void backtrace_test_irq(void)
 {
 	printk("Testing a backtrace from irq context.\n");
 	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
+
+	init_completion(&backtrace_work);
+	tasklet_schedule(&backtrace_tasklet);
+	wait_for_completion(&backtrace_work);
+}
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+	struct stack_trace trace;
+	unsigned long entries[8];
+
+	printk("Testing a saved backtrace.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	trace.nr_entries = 0;
+	trace.max_entries = ARRAY_SIZE(entries);
+	trace.entries = entries;
+	trace.skip = 0;
+
+	save_stack_trace(&trace);
+	print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+	printk("Saved backtrace test skipped.\n");
 }
+#endif
+
 static int backtrace_regression_test(void)
 {
 	printk("====[ backtrace testing ]===========\n");
-	printk("Testing a backtrace from process context.\n");
-	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
 
-	init_timer(&backtrace_timer);
-	backtrace_timer.function = backtrace_test_timer;
-	mod_timer(&backtrace_timer, jiffies + 10);
+	backtrace_test_normal();
+	backtrace_test_irq();
+	backtrace_test_saved();
 
-	msleep(10);
 	printk("====[ end of backtrace testing ]====\n");
 	return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index 39e8193b41ea..901e0fdc3fff 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -53,11 +53,95 @@ static void warn_legacy_capability_use(void)
 }
 
 /*
+ * Version 2 capabilities worked fine, but the linux/capability.h file
+ * that accompanied their introduction encouraged their use without
+ * the necessary user-space source code changes. As such, we have
+ * created a version 3 with equivalent functionality to version 2, but
+ * with a header change to protect legacy source code from using
+ * version 2 when it wanted to use version 1. If your system has code
+ * that trips the following warning, it is using version 2 specific
+ * capabilities and may be doing so insecurely.
+ *
+ * The remedy is to either upgrade your version of libcap (to 2.10+,
+ * if the application is linked against it), or recompile your
+ * application with modern kernel headers and this warning will go
+ * away.
+ */
+
+static void warn_deprecated_v2(void)
+{
+	static int warned;
+
+	if (!warned) {
+		char name[sizeof(current->comm)];
+
+		printk(KERN_INFO "warning: `%s' uses deprecated v2"
+		       " capabilities in a way that may be insecure.\n",
+		       get_task_comm(name, current));
+		warned = 1;
+	}
+}
+
+/*
+ * Version check. Return the number of u32s in each capability flag
+ * array, or a negative value on error.
+ */
+static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
+{
+	__u32 version;
+
+	if (get_user(version, &header->version))
+		return -EFAULT;
+
+	switch (version) {
+	case _LINUX_CAPABILITY_VERSION_1:
+		warn_legacy_capability_use();
+		*tocopy = _LINUX_CAPABILITY_U32S_1;
+		break;
+	case _LINUX_CAPABILITY_VERSION_2:
+		warn_deprecated_v2();
+		/*
+		 * fall through - v3 is otherwise equivalent to v2.
+		 */
+	case _LINUX_CAPABILITY_VERSION_3:
+		*tocopy = _LINUX_CAPABILITY_U32S_3;
+		break;
+	default:
+		if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version))
+			return -EFAULT;
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/*
  * For sys_getproccap() and sys_setproccap(), any of the three
  * capability set pointers may be NULL -- indicating that that set is
  * uninteresting and/or not to be changed.
  */
 
+/*
+ * Atomically modify the effective capabilities returning the original
+ * value. No permission check is performed here - it is assumed that the
+ * caller is permitted to set the desired effective capabilities.
+ */
+kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
+{
+	kernel_cap_t pE_old;
+
+	spin_lock(&task_capability_lock);
+
+	pE_old = current->cap_effective;
+	current->cap_effective = pE_new;
+
+	spin_unlock(&task_capability_lock);
+
+	return pE_old;
+}
+
+EXPORT_SYMBOL(cap_set_effective);
+
 /**
  * sys_capget - get the capabilities of a given process.
  * @header: pointer to struct that contains capability version and
@@ -71,27 +155,13 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
 {
 	int ret = 0;
 	pid_t pid;
-	__u32 version;
 	struct task_struct *target;
 	unsigned tocopy;
 	kernel_cap_t pE, pI, pP;
 
-	if (get_user(version, &header->version))
-		return -EFAULT;
-
-	switch (version) {
-	case _LINUX_CAPABILITY_VERSION_1:
-		warn_legacy_capability_use();
-		tocopy = _LINUX_CAPABILITY_U32S_1;
-		break;
-	case _LINUX_CAPABILITY_VERSION_2:
-		tocopy = _LINUX_CAPABILITY_U32S_2;
-		break;
-	default:
-		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
-			return -EFAULT;
-		return -EINVAL;
-	}
+	ret = cap_validate_magic(header, &tocopy);
+	if (ret != 0)
+		return ret;
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
@@ -118,7 +188,7 @@ out:
 	spin_unlock(&task_capability_lock);
 
 	if (!ret) {
-		struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+		struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 		unsigned i;
 
 		for (i = 0; i < tocopy; i++) {
@@ -128,7 +198,7 @@ out:
 		}
 
 		/*
-		 * Note, in the case, tocopy < _LINUX_CAPABILITY_U32S,
+		 * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S,
 		 * we silently drop the upper capabilities here. This
 		 * has the effect of making older libcap
 		 * implementations implicitly drop upper capability
@@ -240,30 +310,16 @@ static inline int cap_set_all(kernel_cap_t *effective,
  */
 asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 {
-	struct __user_cap_data_struct kdata[_LINUX_CAPABILITY_U32S];
+	struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
 	unsigned i, tocopy;
 	kernel_cap_t inheritable, permitted, effective;
-	__u32 version;
 	struct task_struct *target;
 	int ret;
 	pid_t pid;
 
-	if (get_user(version, &header->version))
-		return -EFAULT;
-
-	switch (version) {
-	case _LINUX_CAPABILITY_VERSION_1:
-		warn_legacy_capability_use();
-		tocopy = _LINUX_CAPABILITY_U32S_1;
-		break;
-	case _LINUX_CAPABILITY_VERSION_2:
-		tocopy = _LINUX_CAPABILITY_U32S_2;
-		break;
-	default:
-		if (put_user(_LINUX_CAPABILITY_VERSION, &header->version))
-			return -EFAULT;
-		return -EINVAL;
-	}
+	ret = cap_validate_magic(header, &tocopy);
+	if (ret != 0)
+		return ret;
 
 	if (get_user(pid, &header->pid))
 		return -EFAULT;
@@ -281,7 +337,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
 		permitted.cap[i] = kdata[i].permitted;
 		inheritable.cap[i] = kdata[i].inheritable;
 	}
-	while (i < _LINUX_CAPABILITY_U32S) {
+	while (i < _KERNEL_CAPABILITY_U32S) {
 		effective.cap[i] = 0;
 		permitted.cap[i] = 0;
 		inheritable.cap[i] = 0;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fbc6fc8949b4..15ac0e1e4f4d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2903,7 +2903,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 	cg = tsk->cgroups;
 	parent = task_cgroup(tsk, subsys->subsys_id);
 
-	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "node_%d", tsk->pid);
+	snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
 
 	/* Pin the hierarchy */
 	atomic_inc(&parent->root->sb->s_active);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..cfb1d43ab801 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 
+/*
+ * Represents all cpu's present in the system
+ * In systems capable of hotplug, this map could dynamically grow
+ * as new cpu's are detected in the system via any platform specific
+ * method, such as ACPI for e.g.
+ */
+cpumask_t cpu_present_map __read_mostly;
+EXPORT_SYMBOL(cpu_present_map);
+
+#ifndef CONFIG_SMP
+
+/*
+ * Represents all cpu's that are currently online.
+ */
+cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_online_map);
+
+cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
+EXPORT_SYMBOL(cpu_possible_map);
+
+#else /* CONFIG_SMP */
+
 /* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 
@@ -277,6 +299,7 @@ int __ref cpu_down(unsigned int cpu)
 	cpu_maps_update_done();
 	return err;
 }
+EXPORT_SYMBOL(cpu_down);
 #endif /*CONFIG_HOTPLUG_CPU*/
 
 /* Requires cpu_add_remove_lock to be held */
@@ -403,3 +426,5 @@ out:
 	cpu_maps_update_done();
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
+
+#endif /* CONFIG_SMP */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 86ea9e34e326..459d601947a8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -797,8 +797,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 		retval = cpulist_parse(buf, trialcs.cpus_allowed);
 		if (retval < 0)
 			return retval;
+
+		if (!cpus_subset(trialcs.cpus_allowed, cpu_online_map))
+			return -EINVAL;
 	}
-	cpus_and(trialcs.cpus_allowed, trialcs.cpus_allowed, cpu_online_map);
 	retval = validate_change(cs, &trialcs);
 	if (retval < 0)
 		return retval;
@@ -932,9 +934,11 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 		retval = nodelist_parse(buf, trialcs.mems_allowed);
 		if (retval < 0)
 			goto done;
+
+		if (!nodes_subset(trialcs.mems_allowed,
+				node_states[N_HIGH_MEMORY]))
+			return -EINVAL;
 	}
-	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed,
-						node_states[N_HIGH_MEMORY]);
 	oldmem = cs->mems_allowed;
 	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
@@ -1033,8 +1037,8 @@ int current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
-	if ((int)val < 0)
-		val = -1;
+	if (val < -1 || val >= SD_LV_MAX)
+		return -EINVAL;
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
@@ -1190,6 +1194,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 
 	if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
+	if (tsk->flags & PF_THREAD_BOUND) {
+		cpumask_t mask;
+
+		mutex_lock(&callback_mutex);
+		mask = cs->cpus_allowed;
+		mutex_unlock(&callback_mutex);
+		if (!cpus_equal(tsk->cpus_allowed, mask))
+			return -EINVAL;
+	}
 
 	return security_task_setscheduler(tsk, 0, NULL);
 }
@@ -1203,11 +1216,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
 	struct mm_struct *mm;
 	struct cpuset *cs = cgroup_cs(cont);
 	struct cpuset *oldcs = cgroup_cs(oldcont);
+	int err;
 
 	mutex_lock(&callback_mutex);
 	guarantee_online_cpus(cs, &cpus);
-	set_cpus_allowed_ptr(tsk, &cpus);
+	err = set_cpus_allowed_ptr(tsk, &cpus);
 	mutex_unlock(&callback_mutex);
+	if (err)
+		return;
 
 	from = oldcs->mems_allowed;
 	to = cs->mems_allowed;
@@ -1878,7 +1894,7 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 * in order to minimize text size.
 */
 
-static void common_cpu_mem_hotplug_unplug(void)
+static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
 {
 	cgroup_lock();
 
@@ -1886,6 +1902,13 @@ static void common_cpu_mem_hotplug_unplug(void)
 	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
 	scan_for_empty_cpusets(&top_cpuset);
 
+	/*
+	 * Scheduler destroys domains on hotplug events.
+	 * Rebuild them based on the current settings.
+	 */
+	if (rebuild_sd)
+		rebuild_sched_domains();
+
 	cgroup_unlock();
 }
 
@@ -1902,11 +1925,22 @@ static void common_cpu_mem_hotplug_unplug(void)
 static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
 				unsigned long phase, void *unused_cpu)
 {
-	if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
+	switch (phase) {
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		common_cpu_mem_hotplug_unplug(1);
+		break;
+	default:
 		return NOTIFY_DONE;
+	}
 
-	common_cpu_mem_hotplug_unplug();
-	return 0;
+	return NOTIFY_OK;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -1919,7 +1953,7 @@ static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
 
 void cpuset_track_online_nodes(void)
 {
-	common_cpu_mem_hotplug_unplug();
+	common_cpu_mem_hotplug_unplug(0);
 }
 #endif
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 1510f78a0ffa..93d2711b9381 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,6 +13,7 @@
 #include <linux/personality.h>
 #include <linux/tty.h>
 #include <linux/mnt_namespace.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/security.h>
 #include <linux/cpu.h>
@@ -70,7 +71,7 @@ static void __unhash_process(struct task_struct *p)
 		__get_cpu_var(process_counts)--;
 	}
 	list_del_rcu(&p->thread_group);
-	remove_parent(p);
+	list_del_init(&p->sibling);
 }
 
 /*
@@ -126,6 +127,12 @@ static void __exit_signal(struct task_struct *tsk)
 
 	__unhash_process(tsk);
 
+	/*
+	 * Do this under ->siglock, we can race with another thread
+	 * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals.
+	 */
+	flush_sigqueue(&tsk->pending);
+
 	tsk->signal = NULL;
 	tsk->sighand = NULL;
 	spin_unlock(&sighand->siglock);
@@ -133,7 +140,6 @@ static void __exit_signal(struct task_struct *tsk)
 
 	__cleanup_sighand(sighand);
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
-	flush_sigqueue(&tsk->pending);
 	if (sig) {
 		flush_sigqueue(&sig->shared_pending);
 		taskstats_tgid_free(sig);
@@ -146,6 +152,18 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	put_task_struct(container_of(rhp, struct task_struct, rcu));
 }
 
+/*
+ * Do final ptrace-related cleanup of a zombie being reaped.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_release_task(struct task_struct *p)
+{
+	BUG_ON(!list_empty(&p->ptraced));
+	ptrace_unlink(p);
+	BUG_ON(!list_empty(&p->ptrace_entry));
+}
+
 void release_task(struct task_struct * p)
 {
 	struct task_struct *leader;
@@ -154,8 +172,7 @@ repeat:
 	atomic_dec(&p->user->processes);
 	proc_flush_task(p);
 	write_lock_irq(&tasklist_lock);
-	ptrace_unlink(p);
-	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
+	ptrace_release_task(p);
 	__exit_signal(p);
 
 	/*
@@ -309,9 +326,8 @@ static void reparent_to_kthreadd(void)
 
 	ptrace_unlink(current);
 	/* Reparent to init */
-	remove_parent(current);
 	current->real_parent = current->parent = kthreadd_task;
-	add_parent(current);
+	list_move_tail(&current->sibling, &current->real_parent->children);
 
 	/* Set the exit signal to SIGCHLD so we signal init on exit */
 	current->exit_signal = SIGCHLD;
@@ -686,37 +702,97 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-static void
-reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
+/*
+ * Return nonzero if @parent's children should reap themselves.
+ *
+ * Called with write_lock_irq(&tasklist_lock) held.
+ */
+static int ignoring_children(struct task_struct *parent)
 {
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here. */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+	int ret;
+	struct sighand_struct *psig = parent->sighand;
+	unsigned long flags;
+	spin_lock_irqsave(&psig->siglock, flags);
+	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
+	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
+	spin_unlock_irqrestore(&psig->siglock, flags);
+	return ret;
+}
 
-	/* Move the child from its dying parent to the new one. */
-	if (unlikely(traced)) {
-		/* Preserve ptrace links if someone else is tracing this child. */
-		list_del_init(&p->ptrace_list);
-		if (ptrace_reparented(p))
-			list_add(&p->ptrace_list, &p->real_parent->ptrace_children);
-	} else {
-		/* If this child is being traced, then we're the one tracing it
-		 * anyway, so let go of it.
+/*
+ * Detach all tasks we were using ptrace on.
+ * Any that need to be release_task'd are put on the @dead list.
+ *
+ * Called with write_lock(&tasklist_lock) held.
+ */
+static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
+{
+	struct task_struct *p, *n;
+	int ign = -1;
+
+	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
+		__ptrace_unlink(p);
+
+		if (p->exit_state != EXIT_ZOMBIE)
+			continue;
+
+		/*
+		 * If it's a zombie, our attachedness prevented normal
+		 * parent notification or self-reaping. Do notification
+		 * now if it would have happened earlier. If it should
+		 * reap itself, add it to the @dead list. We can't call
+		 * release_task() here because we already hold tasklist_lock.
+		 *
+		 * If it's our own child, there is no notification to do.
+		 * But if our normal children self-reap, then this child
+		 * was prevented by ptrace and we must reap it now.
		 */
-		p->ptrace = 0;
-		remove_parent(p);
-		p->parent = p->real_parent;
-		add_parent(p);
+		if (!task_detached(p) && thread_group_empty(p)) {
+			if (!same_thread_group(p->real_parent, parent))
+				do_notify_parent(p, p->exit_signal);
+			else {
+				if (ign < 0)
+					ign = ignoring_children(parent);
+				if (ign)
+					p->exit_signal = -1;
+			}
+		}
 
-		if (task_is_traced(p)) {
+		if (task_detached(p)) {
 			/*
-			 * If it was at a trace stop, turn it into
-			 * a normal stop since it's no longer being
-			 * traced.
+			 * Mark it as in the process of being reaped.
			 */
-			ptrace_untrace(p);
+			p->exit_state = EXIT_DEAD;
+			list_add(&p->ptrace_entry, dead);
 		}
 	}
+}
+
+/*
+ * Finish up exit-time ptrace cleanup.
+ *
+ * Called without locks.
+ */
+static void ptrace_exit_finish(struct task_struct *parent,
+			       struct list_head *dead)
+{
+	struct task_struct *p, *n;
+
+	BUG_ON(!list_empty(&parent->ptraced));
+
+	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+}
+
+static void reparent_thread(struct task_struct *p, struct task_struct *father)
+{
+	if (p->pdeath_signal)
+		/* We already hold the tasklist_lock here. */
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
@@ -731,7 +807,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!traced && p->exit_state == EXIT_ZOMBIE &&
+	if (!ptrace_reparented(p) &&
+	    p->exit_state == EXIT_ZOMBIE &&
 	    !task_detached(p) && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
@@ -748,12 +825,15 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper = father;
-	struct list_head ptrace_dead;
-
-	INIT_LIST_HEAD(&ptrace_dead);
+	LIST_HEAD(ptrace_dead);
 
 	write_lock_irq(&tasklist_lock);
 
+	/*
+	 * First clean up ptrace if we were using it.
+	 */
+	ptrace_exit(father, &ptrace_dead);
+
 	do {
 		reaper = next_thread(reaper);
 		if (reaper == father) {
@@ -762,58 +842,19 @@ static void forget_original_parent(struct task_struct *father)
 		}
 	} while (reaper->flags & PF_EXITING);
 
-	/*
-	 * There are only two places where our children can be:
-	 *
-	 * - in our child list
-	 * - in our ptraced child list
-	 *
-	 * Search them and reparent children.
-	 */
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
-		int ptrace;
-
-		ptrace = p->ptrace;
-
-		/* if father isn't the real parent, then ptrace must be enabled */
-		BUG_ON(father != p->real_parent && !ptrace);
-
-		if (father == p->real_parent) {
-			/* reparent with a reaper, real father it's us */
-			p->real_parent = reaper;
-			reparent_thread(p, father, 0);
-		} else {
-			/* reparent ptraced task to its real parent */
-			__ptrace_unlink (p);
-			if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
-						thread_group_empty(p))
-				do_notify_parent(p, p->exit_signal);
-		}
-
-		/*
-		 * if the ptraced child is a detached zombie we must collect
-		 * it before we exit, or it will remain zombie forever since
-		 * we prevented it from self-reap itself while it was being
-		 * traced by us, to be able to see it in wait4.
-		 */
-		if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p)))
-			list_add(&p->ptrace_list, &ptrace_dead);
-	}
-
-	list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) {
 		p->real_parent = reaper;
-		reparent_thread(p, father, 1);
+		if (p->parent == father) {
+			BUG_ON(p->ptrace);
+			p->parent = p->real_parent;
+		}
+		reparent_thread(p, father);
 	}
 
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
-	BUG_ON(!list_empty(&father->ptrace_children));
-
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
-		list_del_init(&p->ptrace_list);
-		release_task(p);
-	}
 
+	ptrace_exit_finish(father, &ptrace_dead);
 }
 
 /*
@@ -1174,13 +1215,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 	}
 
-	/*
-	 * Do not consider detached threads that are
-	 * not ptraced:
-	 */
-	if (task_detached(p) && !p->ptrace)
-		return 0;
-
 	/* Wait for all children (clone and not) if __WALL is set;
 	 * otherwise, wait for clone children *only* if __WCLONE is
 	 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1191,14 +1225,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
 		return 0;
 
 	err = security_task_wait(p);
-	if (likely(!err))
-		return 1;
+	if (err)
+		return err;
 
-	if (type != PIDTYPE_PID)
-		return 0;
-	/* This child was explicitly requested, abort */
-	read_unlock(&tasklist_lock);
-	return err;
+	return 1;
 }
 
 static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1232,7 +1262,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
 * the lock and this task is uninteresting. If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_zombie(struct task_struct *p, int noreap,
+static int wait_task_zombie(struct task_struct *p, int options,
			    struct siginfo __user *infop,
			    int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1240,7 +1270,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 	int retval, status, traced;
 	pid_t pid = task_pid_vnr(p);
 
-	if (unlikely(noreap)) {
+	if (!likely(options & WEXITED))
+		return 0;
+
+	if (unlikely(options & WNOWAIT)) {
 		uid_t uid = p->uid;
 		int exit_code = p->exit_code;
 		int why, status;
@@ -1390,21 +1423,24 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
 * the lock and this task is uninteresting. If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_stopped(struct task_struct *p,
-			     int noreap, struct siginfo __user *infop,
+static int wait_task_stopped(int ptrace, struct task_struct *p,
+			     int options, struct siginfo __user *infop,
			     int __user *stat_addr, struct rusage __user *ru)
 {
 	int retval, exit_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
+	if (!(options & WUNTRACED))
+		return 0;
+
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
 	if (unlikely(!task_is_stopped_or_traced(p)))
 		goto unlock_sig;
 
-	if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0)
+	if (!ptrace && p->signal->group_stop_count > 0)
 		/*
 		 * A group stop is in progress and this is the group leader.
 		 * We won't report until all threads have stopped.
@@ -1415,7 +1451,7 @@ static int wait_task_stopped(struct task_struct *p,
 	if (!exit_code)
 		goto unlock_sig;
 
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->exit_code = 0;
 
 	uid = p->uid;
@@ -1433,10 +1469,10 @@ unlock_sig:
 	 */
 	get_task_struct(p);
 	pid = task_pid_vnr(p);
-	why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
+	why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
 	read_unlock(&tasklist_lock);
 
-	if (unlikely(noreap))
+	if (unlikely(options & WNOWAIT))
 		return wait_noreap_copyout(p, pid, uid,
 					   why, exit_code,
 					   infop, ru);
@@ -1470,7 +1506,7 @@ unlock_sig:
 * the lock and this task is uninteresting. If we return nonzero, we have
 * released the lock and the system call should return.
 */
-static int wait_task_continued(struct task_struct *p, int noreap,
+static int wait_task_continued(struct task_struct *p, int options,
			       struct siginfo __user *infop,
			       int __user *stat_addr, struct rusage __user *ru)
 {
@@ -1478,6 +1514,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	pid_t pid;
 	uid_t uid;
 
+	if (!unlikely(options & WCONTINUED))
+		return 0;
+
 	if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
 		return 0;
 
@@ -1487,7 +1526,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 		spin_unlock_irq(&p->sighand->siglock);
 		return 0;
 	}
-	if (!noreap)
+	if (!unlikely(options & WNOWAIT))
 		p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
 	spin_unlock_irq(&p->sighand->siglock);
 
@@ -1513,89 +1552,161 @@ static int wait_task_continued(struct task_struct *p, int noreap,
 	return retval;
 }
 
+/*
+ * Consider @p for a wait by @parent.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue;
+ * then *@notask_error is 0 if @p is an eligible child,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int wait_consider_task(struct task_struct *parent, int ptrace,
+			      struct task_struct *p, int *notask_error,
+			      enum pid_type type, struct pid *pid, int options,
+			      struct siginfo __user *infop,
+			      int __user *stat_addr, struct rusage __user *ru)
+{
+	int ret = eligible_child(type, pid, options, p);
+	if (!ret)
+		return ret;
+
+	if (unlikely(ret < 0)) {
+		/*
+		 * If we have not yet seen any eligible child,
+		 * then let this error code replace -ECHILD.
+		 * A permission error will give the user a clue
+		 * to look for security policy problems, rather
+		 * than for mysterious wait bugs.
+		 */
+		if (*notask_error)
+			*notask_error = ret;
+	}
+
+	if (likely(!ptrace) && unlikely(p->ptrace)) {
+		/*
+		 * This child is hidden by ptrace.
+		 * We aren't allowed to see it now, but eventually we will.
+		 */
+		*notask_error = 0;
+		return 0;
+	}
+
+	if (p->exit_state == EXIT_DEAD)
+		return 0;
+
+	/*
+	 * We don't reap group leaders with subthreads.
+	 */
+	if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
+		return wait_task_zombie(p, options, infop, stat_addr, ru);
+
+	/*
+	 * It's stopped or running now, so it might
+	 * later continue, exit, or stop again.
+	 */
+	*notask_error = 0;
+
+	if (task_is_stopped_or_traced(p))
+		return wait_task_stopped(ptrace, p, options,
+					 infop, stat_addr, ru);
+
+	return wait_task_continued(p, options, infop, stat_addr, ru);
+}
+
+/*
+ * Do the work of do_wait() for one thread in the group, @tsk.
+ *
+ * -ECHILD should be in *@notask_error before the first call.
+ * Returns nonzero for a final return, when we have unlocked tasklist_lock.
+ * Returns zero if the search for a child should continue; then
+ * *@notask_error is 0 if there were any eligible children,
+ * or another error from security_task_wait(), or still -ECHILD.
+ */
+static int do_wait_thread(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	list_for_each_entry(p, &tsk->children, sibling) {
+		/*
+		 * Do not consider detached threads.
+		 */
+		if (!task_detached(p)) {
+			int ret = wait_consider_task(tsk, 0, p, notask_error,
+						     type, pid, options,
+						     infop, stat_addr, ru);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
+			  enum pid_type type, struct pid *pid, int options,
+			  struct siginfo __user *infop, int __user *stat_addr,
+			  struct rusage __user *ru)
+{
+	struct task_struct *p;
+
+	/*
+	 * Traditionally we see ptrace'd stopped tasks regardless of options.
+	 */
+	options |= WUNTRACED;
+
+	list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
+		int ret = wait_consider_task(tsk, 1, p, notask_error,
+					     type, pid, options,
+					     infop, stat_addr, ru);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static long do_wait(enum pid_type type, struct pid *pid, int options,
 		    struct siginfo __user *infop, int __user *stat_addr,
 		    struct rusage __user *ru)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	struct task_struct *tsk;
-	int flag, retval;
+	int retval;
 
 	add_wait_queue(&current->signal->wait_chldexit,&wait);
 repeat:
-	/* If there is nothing that can match our critier just get out */
+	/*
+	 * If there is nothing that can match our critiera just get out.
+	 * We will clear @retval to zero if we see any child that might later
+	 * match our criteria, even if we are not able to reap it yet.
+	 */
 	retval = -ECHILD;
 	if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
 		goto end;
 
-	/*
-	 * We will set this flag if we see any child that might later
-	 * match our criteria, even if we are not able to reap it yet.
-	 */
-	flag = retval = 0;
 	current->state = TASK_INTERRUPTIBLE;
 	read_lock(&tasklist_lock);
 	tsk = current;
 	do {
-		struct task_struct *p;
-
-		list_for_each_entry(p, &tsk->children, sibling) {
-			int ret = eligible_child(type, pid, options, p);
-			if (!ret)
-				continue;
-
-			if (unlikely(ret < 0)) {
-				retval = ret;
-			} else if (task_is_stopped_or_traced(p)) {
-				/*
-				 * It's stopped now, so it might later
-				 * continue, exit, or stop again.
-				 */
-				flag = 1;
-				if (!(p->ptrace & PT_PTRACED) &&
-				    !(options & WUNTRACED))
-					continue;
-
-				retval = wait_task_stopped(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state == EXIT_ZOMBIE &&
-					!delay_group_leader(p)) {
-				/*
-				 * We don't reap group leaders with subthreads.
-				 */
-				if (!likely(options & WEXITED))
-					continue;
-				retval = wait_task_zombie(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			} else if (p->exit_state != EXIT_DEAD) {
-				/*
-				 * It's running now, so it might later
-				 * exit, stop, or stop and then continue.
-				 */
-				flag = 1;
-				if (!unlikely(options & WCONTINUED))
-					continue;
-				retval = wait_task_continued(p,
-						(options & WNOWAIT), infop,
-						stat_addr, ru);
-			}
-			if (retval != 0) /* tasklist_lock released */
-				goto end;
-		}
-		if (!flag) {
-			list_for_each_entry(p, &tsk->ptrace_children,
-					    ptrace_list) {
-				flag = eligible_child(type, pid, options, p);
-				if (!flag)
-					continue;
-				if (likely(flag > 0))
-					break;
-				retval = flag;
-				goto end;
-			}
+		int tsk_result = do_wait_thread(tsk, &retval,
+						type, pid, options,
						infop, stat_addr, ru);
+		if (!tsk_result)
+			tsk_result = ptrace_do_wait(tsk, &retval,
+						    type, pid, options,
+						    infop, stat_addr, ru);
+		if (tsk_result) {
+			/*
+			 * tasklist_lock is unlocked and we have a final result.
+			 */
+			retval = tsk_result;
+			goto end;
 		}
+
 		if (options & __WNOTHREAD)
 			break;
 		tsk = next_thread(tsk);
@@ -1603,16 +1714,14 @@ repeat:
 	} while (tsk != current);
 	read_unlock(&tasklist_lock);
 
-	if (flag) {
-		if (options & WNOHANG)
-			goto end;
+	if (!retval && !(options & WNOHANG)) {
 		retval = -ERESTARTSYS;
-		if (signal_pending(current))
-			goto end;
-		schedule();
-		goto repeat;
+		if (!signal_pending(current)) {
+			schedule();
+			goto repeat;
+		}
 	}
-	retval = -ECHILD;
+
 end:
 	current->state = TASK_RUNNING;
 	remove_wait_queue(&current->signal->wait_chldexit,&wait);
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..adefc1131f27 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
 #include <linux/sem.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/iocontext.h>
 #include <linux/key.h>
 #include <linux/binfmts.h>
 #include <linux/mman.h>
@@ -909,7 +910,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	rt_mutex_init_task(p);
 
-#ifdef CONFIG_TRACE_IRQFLAGS
+#ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
 	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
@@ -1124,8 +1125,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 */
 	p->group_leader = p;
 	INIT_LIST_HEAD(&p->thread_group);
-	INIT_LIST_HEAD(&p->ptrace_children);
-	INIT_LIST_HEAD(&p->ptrace_list);
+	INIT_LIST_HEAD(&p->ptrace_entry);
+	INIT_LIST_HEAD(&p->ptraced);
 
 	/* Now that the task is set up, run cgroup callbacks if
 	 * necessary. We need to run them before the task is visible
@@ -1197,7 +1198,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	}
 
 	if (likely(p->pid)) {
-		add_parent(p);
+		list_add_tail(&p->sibling, &p->real_parent->children);
 		if (unlikely(p->ptrace & PT_PTRACED))
 			__ptrace_link(p, current->parent);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 449def8074fe..7d1136e97c14 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1096,21 +1096,64 @@ static void unqueue_me_pi(struct futex_q *q)
1096 * private futexes. 1096 * private futexes.
1097 */ 1097 */
1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1098static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1099 struct task_struct *newowner) 1099 struct task_struct *newowner,
1100 struct rw_semaphore *fshared)
1100{ 1101{
1101 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1102 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1102 struct futex_pi_state *pi_state = q->pi_state; 1103 struct futex_pi_state *pi_state = q->pi_state;
1104 struct task_struct *oldowner = pi_state->owner;
1103 u32 uval, curval, newval; 1105 u32 uval, curval, newval;
1104 int ret; 1106 int ret, attempt = 0;
1105 1107
1106 /* Owner died? */ 1108 /* Owner died? */
1109 if (!pi_state->owner)
1110 newtid |= FUTEX_OWNER_DIED;
1111
1112 /*
1113 * We are here either because we stole the rtmutex from the
1114 * pending owner or we are the pending owner which failed to
1115 * get the rtmutex. We have to replace the pending owner TID
1116 * in the user space variable. This must be atomic as we have
1117 * to preserve the owner died bit here.
1118 *
1119 * Note: We write the user space value _before_ changing the
1120 * pi_state because we can fault here. Imagine swapped out
1121 * pages or a fork, which was running right before we acquired
1122 * mmap_sem, that marked all the anonymous memory readonly for
1123 * cow.
1124 *
1125 * Modifying pi_state _before_ the user space value would
1126 * leave the pi_state in an inconsistent state when we fault
1127 * here, because we need to drop the hash bucket lock to
1128 * handle the fault. This might be observed in the PID check
1129 * in lookup_pi_state.
1130 */
1131retry:
1132 if (get_futex_value_locked(&uval, uaddr))
1133 goto handle_fault;
1134
1135 while (1) {
1136 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1137
1138 curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
1139
1140 if (curval == -EFAULT)
1141 goto handle_fault;
1142 if (curval == uval)
1143 break;
1144 uval = curval;
1145 }
1146
1147 /*
1148 * We fixed up user space. Now we need to fix the pi_state
1149 * itself.
1150 */
1107 if (pi_state->owner != NULL) { 1151 if (pi_state->owner != NULL) {
1108 spin_lock_irq(&pi_state->owner->pi_lock); 1152 spin_lock_irq(&pi_state->owner->pi_lock);
1109 WARN_ON(list_empty(&pi_state->list)); 1153 WARN_ON(list_empty(&pi_state->list));
1110 list_del_init(&pi_state->list); 1154 list_del_init(&pi_state->list);
1111 spin_unlock_irq(&pi_state->owner->pi_lock); 1155 spin_unlock_irq(&pi_state->owner->pi_lock);
1112 } else 1156 }
1113 newtid |= FUTEX_OWNER_DIED;
1114 1157
1115 pi_state->owner = newowner; 1158 pi_state->owner = newowner;
1116 1159
@@ -1118,26 +1161,35 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1118 WARN_ON(!list_empty(&pi_state->list)); 1161 WARN_ON(!list_empty(&pi_state->list));
1119 list_add(&pi_state->list, &newowner->pi_state_list); 1162 list_add(&pi_state->list, &newowner->pi_state_list);
1120 spin_unlock_irq(&newowner->pi_lock); 1163 spin_unlock_irq(&newowner->pi_lock);
1164 return 0;
1121 1165
1122 /* 1166 /*
1123 * We own it, so we have to replace the pending owner 1167 * To handle the page fault we need to drop the hash bucket
1124 * TID. This must be atomic as we have preserve the 1168 * lock here. That gives the other task (either the pending
1125 * owner died bit here. 1169 * owner itself or the task which stole the rtmutex) the
1170 * chance to try the fixup of the pi_state. So once we are
1171 * back from handling the fault we need to check the pi_state
1172 * after reacquiring the hash bucket lock and before trying to
1173 * do another fixup. When the fixup has been done already we
1174 * simply return.
1126 */ 1175 */
1127 ret = get_futex_value_locked(&uval, uaddr); 1176handle_fault:
1177 spin_unlock(q->lock_ptr);
1128 1178
1129 while (!ret) { 1179 ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
1130 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1131 1180
1132 curval = cmpxchg_futex_value_locked(uaddr, uval, newval); 1181 spin_lock(q->lock_ptr);
1133 1182
1134 if (curval == -EFAULT) 1183 /*
1135 ret = -EFAULT; 1184 * Check if someone else fixed it for us:
1136 if (curval == uval) 1185 */
1137 break; 1186 if (pi_state->owner != oldowner)
1138 uval = curval; 1187 return 0;
1139 } 1188
1140 return ret; 1189 if (ret)
1190 return ret;
1191
1192 goto retry;
1141} 1193}
1142 1194
1143/* 1195/*
@@ -1507,7 +1559,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1507 * that case: 1559 * that case:
1508 */ 1560 */
1509 if (q.pi_state->owner != curr) 1561 if (q.pi_state->owner != curr)
1510 ret = fixup_pi_state_owner(uaddr, &q, curr); 1562 ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
1511 } else { 1563 } else {
1512 /* 1564 /*
1513 * Catch the rare case, where the lock was released 1565 * Catch the rare case, where the lock was released
@@ -1539,7 +1591,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1539 int res; 1591 int res;
1540 1592
1541 owner = rt_mutex_owner(&q.pi_state->pi_mutex); 1593 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1542 res = fixup_pi_state_owner(uaddr, &q, owner); 1594 res = fixup_pi_state_owner(uaddr, &q, owner,
1595 fshared);
1543 1596
1544 /* propagate -EFAULT, if the fixup failed */ 1597 /* propagate -EFAULT, if the fixup failed */
1545 if (res) 1598 if (res)
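
The new comment in fixup_pi_state_owner() explains why the user space futex word must be rewritten before the kernel-side pi_state, and why the TID update has to be an atomic compare-and-swap that preserves the FUTEX_OWNER_DIED bit across retries. Below is a rough userspace sketch of just that retry loop, using GCC's atomic builtins and made-up constants; the fault handling (handle_fault/futex_handle_fault) that the kernel needs when the page is unmapped has no userspace counterpart and is omitted.

#include <stdint.h>
#include <stdio.h>

#define OWNER_DIED 0x40000000u   /* stand-in for FUTEX_OWNER_DIED */
#define WAITERS    0x80000000u   /* stand-in for FUTEX_WAITERS */

/* Replace the owner TID stored in *uaddr while preserving the
 * OWNER_DIED bit, no matter how often other updates race with us. */
static uint32_t set_new_owner(uint32_t *uaddr, uint32_t newtid)
{
        uint32_t uval = __atomic_load_n(uaddr, __ATOMIC_RELAXED);

        for (;;) {
                uint32_t newval = (uval & OWNER_DIED) | newtid | WAITERS;

                /* On failure, uval is refreshed with the current value
                 * and we go around again - the same loop shape as the
                 * cmpxchg_futex_value_locked() retry above. */
                if (__atomic_compare_exchange_n(uaddr, &uval, newval, 0,
                                                __ATOMIC_SEQ_CST,
                                                __ATOMIC_SEQ_CST))
                        return newval;
        }
}

int main(void)
{
        uint32_t word = 1234 | OWNER_DIED;      /* old owner died */
        uint32_t res = set_new_owner(&word, 5678);

        printf("futex word is now %#x\n", (unsigned)res);
        return 0;
}
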
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..b8e4dce80a74 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -300,11 +300,10 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
300 */ 300 */
301u64 ktime_divns(const ktime_t kt, s64 div) 301u64 ktime_divns(const ktime_t kt, s64 div)
302{ 302{
303 u64 dclc, inc, dns; 303 u64 dclc;
304 int sft = 0; 304 int sft = 0;
305 305
306 dclc = dns = ktime_to_ns(kt); 306 dclc = ktime_to_ns(kt);
307 inc = div;
308 /* Make sure the divisor is less than 2^32: */ 307 /* Make sure the divisor is less than 2^32: */
309 while (div >> 32) { 308 while (div >> 32) {
310 sft++; 309 sft++;
@@ -623,7 +622,7 @@ static void retrigger_next_event(void *arg)
623void clock_was_set(void) 622void clock_was_set(void)
624{ 623{
625 /* Retrigger the CPU local events everywhere */ 624 /* Retrigger the CPU local events everywhere */
626 on_each_cpu(retrigger_next_event, NULL, 0, 1); 625 on_each_cpu(retrigger_next_event, NULL, 1);
627} 626}
628 627
629/* 628/*
@@ -632,8 +631,6 @@ void clock_was_set(void)
632 */ 631 */
633void hres_timers_resume(void) 632void hres_timers_resume(void)
634{ 633{
635 WARN_ON_ONCE(num_online_cpus() > 1);
636
637 /* Retrigger the CPU local events: */ 634 /* Retrigger the CPU local events: */
638 retrigger_next_event(NULL); 635 retrigger_next_event(NULL);
639} 636}
@@ -1003,10 +1000,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1003 */ 1000 */
1004 raise = timer->state == HRTIMER_STATE_PENDING; 1001 raise = timer->state == HRTIMER_STATE_PENDING;
1005 1002
1003 /*
1004 * We use preempt_disable to prevent this task from migrating after
 1005 * setting up the softirq and raising it. Otherwise, if we migrate
1006 * we will raise the softirq on the wrong CPU.
1007 */
1008 preempt_disable();
1009
1006 unlock_hrtimer_base(timer, &flags); 1010 unlock_hrtimer_base(timer, &flags);
1007 1011
1008 if (raise) 1012 if (raise)
1009 hrtimer_raise_softirq(); 1013 hrtimer_raise_softirq();
1014 preempt_enable();
1010 1015
1011 return ret; 1016 return ret;
1012} 1017}
@@ -1078,7 +1083,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1078} 1083}
1079EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1084EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1080 1085
1081#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) 1086#ifdef CONFIG_NO_HZ
1082/** 1087/**
1083 * hrtimer_get_next_event - get the time until next expiry event 1088 * hrtimer_get_next_event - get the time until next expiry event
1084 * 1089 *
@@ -1669,7 +1674,7 @@ void __init hrtimers_init(void)
1669 (void *)(long)smp_processor_id()); 1674 (void *)(long)smp_processor_id());
1670 register_cpu_notifier(&hrtimers_nb); 1675 register_cpu_notifier(&hrtimers_nb);
1671#ifdef CONFIG_HIGH_RES_TIMERS 1676#ifdef CONFIG_HIGH_RES_TIMERS
1672 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); 1677 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1673#endif 1678#endif
1674} 1679}
1675 1680
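
The ktime_divns() cleanup above removes two unused locals; the function's real trick is unchanged: it shifts the divisor (and the dividend by the same amount) until the divisor fits in 32 bits, so the following do_div() only needs a 64-by-32 division. Here is a standalone approximation of that reduction, using the native / operator in place of do_div(); this is my own sketch and, like the original, it trades a little precision for a cheaper division.

#include <stdint.h>
#include <stdio.h>

/* Divide a 64-bit nanosecond value by a 64-bit divisor by first
 * shifting the divisor (and the dividend) until the divisor fits
 * in 32 bits - the same reduction ktime_divns() performs. */
static uint64_t div_ns(uint64_t ns, uint64_t div)
{
        int sft = 0;

        /* Make sure the divisor is less than 2^32: */
        while (div >> 32) {
                sft++;
                div >>= 1;
        }
        ns >>= sft;
        return ns / (uint32_t)div;
}

int main(void)
{
        /* roughly 12.5 days in ns divided by 8.6 seconds in ns */
        printf("%llu\n", (unsigned long long)div_ns(1080000000000000ULL,
                                                    8600000000ULL));
        return 0;
}
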
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..77a51be36010 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19 19
20cpumask_t irq_default_affinity = CPU_MASK_ALL;
21
20/** 22/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 24 * @irq: interrupt number to wait for
@@ -95,6 +97,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
95 return 0; 97 return 0;
96} 98}
97 99
100#ifndef CONFIG_AUTO_IRQ_AFFINITY
101/*
102 * Generic version of the affinity autoselector.
103 */
104int irq_select_affinity(unsigned int irq)
105{
106 cpumask_t mask;
107
108 if (!irq_can_set_affinity(irq))
109 return 0;
110
111 cpus_and(mask, cpu_online_map, irq_default_affinity);
112
113 irq_desc[irq].affinity = mask;
114 irq_desc[irq].chip->set_affinity(irq, mask);
115
116 set_balance_irq_affinity(irq, mask);
117 return 0;
118}
119#endif
120
98#endif 121#endif
99 122
100/** 123/**
@@ -354,7 +377,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
354 377
355 /* Setup the type (level, edge polarity) if configured: */ 378 /* Setup the type (level, edge polarity) if configured: */
356 if (new->flags & IRQF_TRIGGER_MASK) { 379 if (new->flags & IRQF_TRIGGER_MASK) {
357 if (desc->chip && desc->chip->set_type) 380 if (desc->chip->set_type)
358 desc->chip->set_type(irq, 381 desc->chip->set_type(irq,
359 new->flags & IRQF_TRIGGER_MASK); 382 new->flags & IRQF_TRIGGER_MASK);
360 else 383 else
@@ -364,8 +387,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
364 */ 387 */
365 printk(KERN_WARNING "No IRQF_TRIGGER set_type " 388 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
366 "function for IRQ %d (%s)\n", irq, 389 "function for IRQ %d (%s)\n", irq,
367 desc->chip ? desc->chip->name : 390 desc->chip->name);
368 "unknown");
369 } else 391 } else
370 compat_irq_chip_set_default_handler(desc); 392 compat_irq_chip_set_default_handler(desc);
371 393
@@ -382,6 +404,9 @@ int setup_irq(unsigned int irq, struct irqaction *new)
382 } else 404 } else
383 /* Undo nested disables: */ 405 /* Undo nested disables: */
384 desc->depth = 1; 406 desc->depth = 1;
407
408 /* Set default affinity mask once everything is setup */
409 irq_select_affinity(irq);
385 } 410 }
386 /* Reset broken irq detection when installing new handler */ 411 /* Reset broken irq detection when installing new handler */
387 desc->irq_count = 0; 412 desc->irq_count = 0;
@@ -571,8 +596,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
571 action->next = NULL; 596 action->next = NULL;
572 action->dev_id = dev_id; 597 action->dev_id = dev_id;
573 598
574 select_smp_affinity(irq);
575
576#ifdef CONFIG_DEBUG_SHIRQ 599#ifdef CONFIG_DEBUG_SHIRQ
577 if (irqflags & IRQF_SHARED) { 600 if (irqflags & IRQF_SHARED) {
578 /* 601 /*
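
The new generic irq_select_affinity() simply intersects irq_default_affinity with cpu_online_map and programs the result through the chip's ->set_affinity(). The cpumask operation itself is an ordinary set intersection; a userspace analogue using the glibc cpu_set_t macros is shown below, purely as an illustration (the kernel's cpumask_t is a different type with its own helpers).

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t def, online, eff;
        int cpu;

        CPU_ZERO(&def);
        CPU_ZERO(&online);

        /* Pretend the default affinity allows CPUs 0-3 ... */
        for (cpu = 0; cpu < 4; cpu++)
                CPU_SET(cpu, &def);
        /* ... but only CPUs 0 and 2 are online. */
        CPU_SET(0, &online);
        CPU_SET(2, &online);

        /* Effective affinity = default & online, like cpus_and(). */
        CPU_AND(&eff, &def, &online);

        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
                if (CPU_ISSET(cpu, &eff))
                        printf("IRQ would be allowed on CPU %d\n", cpu);
        return 0;
}
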
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f2ccb0549a..6c6d35d68ee9 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -44,7 +44,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
44 unsigned long count, void *data) 44 unsigned long count, void *data)
45{ 45{
46 unsigned int irq = (int)(long)data, full_count = count, err; 46 unsigned int irq = (int)(long)data, full_count = count, err;
47 cpumask_t new_value, tmp; 47 cpumask_t new_value;
48 48
49 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 49 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
50 irq_balancing_disabled(irq)) 50 irq_balancing_disabled(irq))
@@ -62,17 +62,51 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
62 * way to make the system unusable accidentally :-) At least 62 * way to make the system unusable accidentally :-) At least
63 * one online CPU still has to be targeted. 63 * one online CPU still has to be targeted.
64 */ 64 */
65 cpus_and(tmp, new_value, cpu_online_map); 65 if (!cpus_intersects(new_value, cpu_online_map))
66 if (cpus_empty(tmp))
67 /* Special case for empty set - allow the architecture 66 /* Special case for empty set - allow the architecture
68 code to set default SMP affinity. */ 67 code to set default SMP affinity. */
69 return select_smp_affinity(irq) ? -EINVAL : full_count; 68 return irq_select_affinity(irq) ? -EINVAL : full_count;
70 69
71 irq_set_affinity(irq, new_value); 70 irq_set_affinity(irq, new_value);
72 71
73 return full_count; 72 return full_count;
74} 73}
75 74
75static int default_affinity_read(char *page, char **start, off_t off,
76 int count, int *eof, void *data)
77{
78 int len = cpumask_scnprintf(page, count, irq_default_affinity);
79 if (count - len < 2)
80 return -EINVAL;
81 len += sprintf(page + len, "\n");
82 return len;
83}
84
85static int default_affinity_write(struct file *file, const char __user *buffer,
86 unsigned long count, void *data)
87{
88 unsigned int full_count = count, err;
89 cpumask_t new_value;
90
91 err = cpumask_parse_user(buffer, count, new_value);
92 if (err)
93 return err;
94
95 if (!is_affinity_mask_valid(new_value))
96 return -EINVAL;
97
98 /*
99 * Do not allow disabling IRQs completely - it's a too easy
100 * way to make the system unusable accidentally :-) At least
101 * one online CPU still has to be targeted.
102 */
103 if (!cpus_intersects(new_value, cpu_online_map))
104 return -EINVAL;
105
106 irq_default_affinity = new_value;
107
108 return full_count;
109}
76#endif 110#endif
77 111
78static int irq_spurious_read(char *page, char **start, off_t off, 112static int irq_spurious_read(char *page, char **start, off_t off,
@@ -171,6 +205,21 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
171 remove_proc_entry(action->dir->name, irq_desc[irq].dir); 205 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
172} 206}
173 207
208void register_default_affinity_proc(void)
209{
210#ifdef CONFIG_SMP
211 struct proc_dir_entry *entry;
212
213 /* create /proc/irq/default_smp_affinity */
214 entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
215 if (entry) {
216 entry->data = NULL;
217 entry->read_proc = default_affinity_read;
218 entry->write_proc = default_affinity_write;
219 }
220#endif
221}
222
174void init_irq_proc(void) 223void init_irq_proc(void)
175{ 224{
176 int i; 225 int i;
@@ -180,6 +229,8 @@ void init_irq_proc(void)
180 if (!root_irq_dir) 229 if (!root_irq_dir)
181 return; 230 return;
182 231
232 register_default_affinity_proc();
233
183 /* 234 /*
184 * Create entries for all existing IRQs. 235 * Create entries for all existing IRQs.
185 */ 236 */
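
The proc.c changes add /proc/irq/default_smp_affinity, a root-owned (mode 0600) file holding the hex cpumask that newly set up IRQs will inherit. Assuming a kernel with this patch and sufficient privileges, it can be read like any other procfs file; a small example reader follows (the write path works the same way with "w" mode and is rejected with -EINVAL when the mask contains no online CPU).

#include <stdio.h>

int main(void)
{
        char mask[256];
        FILE *f = fopen("/proc/irq/default_smp_affinity", "r");

        if (!f) {
                perror("open default_smp_affinity");
                return 1;
        }
        if (fgets(mask, sizeof(mask), f))
                printf("default IRQ affinity mask: %s", mask);
        fclose(f);

        /* Writing a new mask (e.g. "3" for CPUs 0-1) works the same
         * way with fopen(..., "w") + fputs(), but needs root. */
        return 0;
}
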
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 14787de568b3..3ec23c3ec97f 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -52,6 +52,7 @@
52#include <asm/byteorder.h> 52#include <asm/byteorder.h>
53#include <asm/atomic.h> 53#include <asm/atomic.h>
54#include <asm/system.h> 54#include <asm/system.h>
55#include <asm/unaligned.h>
55 56
56static int kgdb_break_asap; 57static int kgdb_break_asap;
57 58
@@ -227,8 +228,6 @@ void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
227 * GDB remote protocol parser: 228 * GDB remote protocol parser:
228 */ 229 */
229 230
230static const char hexchars[] = "0123456789abcdef";
231
232static int hex(char ch) 231static int hex(char ch)
233{ 232{
234 if ((ch >= 'a') && (ch <= 'f')) 233 if ((ch >= 'a') && (ch <= 'f'))
@@ -316,8 +315,8 @@ static void put_packet(char *buffer)
316 } 315 }
317 316
318 kgdb_io_ops->write_char('#'); 317 kgdb_io_ops->write_char('#');
319 kgdb_io_ops->write_char(hexchars[checksum >> 4]); 318 kgdb_io_ops->write_char(hex_asc_hi(checksum));
320 kgdb_io_ops->write_char(hexchars[checksum & 0xf]); 319 kgdb_io_ops->write_char(hex_asc_lo(checksum));
321 if (kgdb_io_ops->flush) 320 if (kgdb_io_ops->flush)
322 kgdb_io_ops->flush(); 321 kgdb_io_ops->flush();
323 322
@@ -478,8 +477,8 @@ static void error_packet(char *pkt, int error)
478{ 477{
479 error = -error; 478 error = -error;
480 pkt[0] = 'E'; 479 pkt[0] = 'E';
481 pkt[1] = hexchars[(error / 10)]; 480 pkt[1] = hex_asc[(error / 10)];
482 pkt[2] = hexchars[(error % 10)]; 481 pkt[2] = hex_asc[(error % 10)];
483 pkt[3] = '\0'; 482 pkt[3] = '\0';
484} 483}
485 484
@@ -510,10 +509,7 @@ static void int_to_threadref(unsigned char *id, int value)
510 scan = (unsigned char *)id; 509 scan = (unsigned char *)id;
511 while (i--) 510 while (i--)
512 *scan++ = 0; 511 *scan++ = 0;
513 *scan++ = (value >> 24) & 0xff; 512 put_unaligned_be32(value, scan);
514 *scan++ = (value >> 16) & 0xff;
515 *scan++ = (value >> 8) & 0xff;
516 *scan++ = (value & 0xff);
517} 513}
518 514
519static struct task_struct *getthread(struct pt_regs *regs, int tid) 515static struct task_struct *getthread(struct pt_regs *regs, int tid)
@@ -1503,7 +1499,8 @@ int kgdb_nmicallback(int cpu, void *regs)
1503 return 1; 1499 return 1;
1504} 1500}
1505 1501
1506void kgdb_console_write(struct console *co, const char *s, unsigned count) 1502static void kgdb_console_write(struct console *co, const char *s,
1503 unsigned count)
1507{ 1504{
1508 unsigned long flags; 1505 unsigned long flags;
1509 1506
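
The kgdb.c hunks replace the private hexchars[] table with the generic hex_asc_hi()/hex_asc_lo() helpers and the open-coded byte stores with put_unaligned_be32(). The helpers amount to very little code; the sketch below re-implements equivalents in plain C for illustration (these are my own definitions, not the ones from the kernel headers).

#include <stdint.h>
#include <stdio.h>

static const char hex_asc[] = "0123456789abcdef";

static char hex_asc_hi(uint8_t x) { return hex_asc[(x >> 4) & 0x0f]; }
static char hex_asc_lo(uint8_t x) { return hex_asc[x & 0x0f]; }

/* Store a 32-bit value big-endian at an arbitrary (possibly
 * unaligned) address, byte by byte. */
static void put_be32(uint32_t value, uint8_t *p)
{
        p[0] = value >> 24;
        p[1] = value >> 16;
        p[2] = value >> 8;
        p[3] = value;
}

int main(void)
{
        uint8_t buf[4];
        uint8_t checksum = 0xa7;

        /* packet trailer: '#' followed by the checksum in hex */
        printf("#%c%c\n", hex_asc_hi(checksum), hex_asc_lo(checksum));

        put_be32(0x12345678, buf);
        printf("%02x %02x %02x %02x\n", buf[0], buf[1], buf[2], buf[3]);
        return 0;
}
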
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1e0250cb9486..1485ca8d0e00 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -79,7 +79,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79 * 79 *
80 * For such cases, we now have a blacklist 80 * For such cases, we now have a blacklist
81 */ 81 */
82struct kprobe_blackpoint kprobe_blacklist[] = { 82static struct kprobe_blackpoint kprobe_blacklist[] = {
83 {"preempt_schedule",}, 83 {"preempt_schedule",},
84 {NULL} /* Terminator */ 84 {NULL} /* Terminator */
85}; 85};
@@ -699,8 +699,9 @@ static int __register_kprobes(struct kprobe **kps, int num,
699 return -EINVAL; 699 return -EINVAL;
700 for (i = 0; i < num; i++) { 700 for (i = 0; i < num; i++) {
701 ret = __register_kprobe(kps[i], called_from); 701 ret = __register_kprobe(kps[i], called_from);
702 if (ret < 0 && i > 0) { 702 if (ret < 0) {
703 unregister_kprobes(kps, i); 703 if (i > 0)
704 unregister_kprobes(kps, i);
704 break; 705 break;
705 } 706 }
706 } 707 }
@@ -776,8 +777,9 @@ static int __register_jprobes(struct jprobe **jps, int num,
776 jp->kp.break_handler = longjmp_break_handler; 777 jp->kp.break_handler = longjmp_break_handler;
777 ret = __register_kprobe(&jp->kp, called_from); 778 ret = __register_kprobe(&jp->kp, called_from);
778 } 779 }
779 if (ret < 0 && i > 0) { 780 if (ret < 0) {
780 unregister_jprobes(jps, i); 781 if (i > 0)
782 unregister_jprobes(jps, i);
781 break; 783 break;
782 } 784 }
783 } 785 }
@@ -920,8 +922,9 @@ static int __register_kretprobes(struct kretprobe **rps, int num,
920 return -EINVAL; 922 return -EINVAL;
921 for (i = 0; i < num; i++) { 923 for (i = 0; i < num; i++) {
922 ret = __register_kretprobe(rps[i], called_from); 924 ret = __register_kretprobe(rps[i], called_from);
923 if (ret < 0 && i > 0) { 925 if (ret < 0) {
924 unregister_kretprobes(rps, i); 926 if (i > 0)
927 unregister_kretprobes(rps, i);
925 break; 928 break;
926 } 929 }
927 } 930 }
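
All three kprobes hunks fix the same bug: when registration of entry i fails, only the i entries that already succeeded may be rolled back, and the rollback must be skipped when i == 0; previously a failure on the very first entry did not break out of the loop at all. The pattern in isolation, with hypothetical register_one()/unregister_many() helpers:

#include <stdio.h>

/* Hypothetical per-object registration that can fail. */
static int register_one(int id)
{
        return (id == 3) ? -1 : 0;      /* pretend object 3 fails */
}

static void unregister_many(const int *ids, int num)
{
        for (int i = 0; i < num; i++)
                printf("rolled back %d\n", ids[i]);
}

/* Register num objects; on failure, undo only the ones that succeeded. */
static int register_many(const int *ids, int num)
{
        int ret = 0;

        for (int i = 0; i < num; i++) {
                ret = register_one(ids[i]);
                if (ret < 0) {
                        if (i > 0)      /* nothing to undo for i == 0 */
                                unregister_many(ids, i);
                        break;
                }
        }
        return ret;
}

int main(void)
{
        int ids[] = { 1, 2, 3, 4 };

        return register_many(ids, 4) ? 1 : 0;
}
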
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..ac3fb7326641 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -180,6 +180,7 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
@@ -234,7 +235,7 @@ int kthreadd(void *unused)
234 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 235 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
235 set_cpus_allowed(tsk, CPU_MASK_ALL); 236 set_cpus_allowed(tsk, CPU_MASK_ALL);
236 237
237 current->flags |= PF_NOFREEZE; 238 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
238 239
239 for (;;) { 240 for (;;) {
240 set_current_state(TASK_INTERRUPTIBLE); 241 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..d38a64362973 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
39#include <linux/irqflags.h> 39#include <linux/irqflags.h>
40#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/sections.h> 44#include <asm/sections.h>
44 45
@@ -81,6 +82,8 @@ static int graph_lock(void)
81 __raw_spin_unlock(&lockdep_lock); 82 __raw_spin_unlock(&lockdep_lock);
82 return 0; 83 return 0;
83 } 84 }
85 /* prevent any recursions within lockdep from causing deadlocks */
86 current->lockdep_recursion++;
84 return 1; 87 return 1;
85} 88}
86 89
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
89 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 92 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
90 return DEBUG_LOCKS_WARN_ON(1); 93 return DEBUG_LOCKS_WARN_ON(1);
91 94
95 current->lockdep_recursion--;
92 __raw_spin_unlock(&lockdep_lock); 96 __raw_spin_unlock(&lockdep_lock);
93 return 0; 97 return 0;
94} 98}
@@ -982,7 +986,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
982 return 1; 986 return 1;
983} 987}
984 988
985#ifdef CONFIG_TRACE_IRQFLAGS 989#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
986/* 990/*
987 * Forwards and backwards subgraph searching, for the purposes of 991 * Forwards and backwards subgraph searching, for the purposes of
988 * proving that two subgraphs can be connected by a new dependency 992 * proving that two subgraphs can be connected by a new dependency
@@ -1458,7 +1462,14 @@ out_bug:
1458} 1462}
1459 1463
1460unsigned long nr_lock_chains; 1464unsigned long nr_lock_chains;
1461static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; 1465struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1466int nr_chain_hlocks;
1467static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
1468
1469struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
1470{
1471 return lock_classes + chain_hlocks[chain->base + i];
1472}
1462 1473
1463/* 1474/*
1464 * Look up a dependency chain. If the key is not present yet then 1475 * Look up a dependency chain. If the key is not present yet then
@@ -1466,10 +1477,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1466 * validated. If the key is already hashed, return 0. 1477 * validated. If the key is already hashed, return 0.
1467 * (On return with 1 graph_lock is held.) 1478 * (On return with 1 graph_lock is held.)
1468 */ 1479 */
1469static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) 1480static inline int lookup_chain_cache(struct task_struct *curr,
1481 struct held_lock *hlock,
1482 u64 chain_key)
1470{ 1483{
1484 struct lock_class *class = hlock->class;
1471 struct list_head *hash_head = chainhashentry(chain_key); 1485 struct list_head *hash_head = chainhashentry(chain_key);
1472 struct lock_chain *chain; 1486 struct lock_chain *chain;
1487 struct held_lock *hlock_curr, *hlock_next;
1488 int i, j, n, cn;
1473 1489
1474 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1490 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1475 return 0; 1491 return 0;
@@ -1517,6 +1533,32 @@ cache_hit:
1517 } 1533 }
1518 chain = lock_chains + nr_lock_chains++; 1534 chain = lock_chains + nr_lock_chains++;
1519 chain->chain_key = chain_key; 1535 chain->chain_key = chain_key;
1536 chain->irq_context = hlock->irq_context;
1537 /* Find the first held_lock of current chain */
1538 hlock_next = hlock;
1539 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
1540 hlock_curr = curr->held_locks + i;
1541 if (hlock_curr->irq_context != hlock_next->irq_context)
1542 break;
1543 hlock_next = hlock;
1544 }
1545 i++;
1546 chain->depth = curr->lockdep_depth + 1 - i;
1547 cn = nr_chain_hlocks;
1548 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
1549 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
1550 if (n == cn)
1551 break;
1552 cn = n;
1553 }
1554 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1555 chain->base = cn;
1556 for (j = 0; j < chain->depth - 1; j++, i++) {
1557 int lock_id = curr->held_locks[i].class - lock_classes;
1558 chain_hlocks[chain->base + j] = lock_id;
1559 }
1560 chain_hlocks[chain->base + j] = class - lock_classes;
1561 }
1520 list_add_tail_rcu(&chain->entry, hash_head); 1562 list_add_tail_rcu(&chain->entry, hash_head);
1521 debug_atomic_inc(&chain_lookup_misses); 1563 debug_atomic_inc(&chain_lookup_misses);
1522 inc_chains(); 1564 inc_chains();
@@ -1538,7 +1580,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1538 * graph_lock for us) 1580 * graph_lock for us)
1539 */ 1581 */
1540 if (!hlock->trylock && (hlock->check == 2) && 1582 if (!hlock->trylock && (hlock->check == 2) &&
1541 lookup_chain_cache(chain_key, hlock->class)) { 1583 lookup_chain_cache(curr, hlock, chain_key)) {
1542 /* 1584 /*
1543 * Check whether last held lock: 1585 * Check whether last held lock:
1544 * 1586 *
@@ -1680,7 +1722,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
1680static int mark_lock(struct task_struct *curr, struct held_lock *this, 1722static int mark_lock(struct task_struct *curr, struct held_lock *this,
1681 enum lock_usage_bit new_bit); 1723 enum lock_usage_bit new_bit);
1682 1724
1683#ifdef CONFIG_TRACE_IRQFLAGS 1725#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1684 1726
1685/* 1727/*
1686 * print irq inversion bug: 1728 * print irq inversion bug:
@@ -2013,11 +2055,13 @@ void early_boot_irqs_on(void)
2013/* 2055/*
2014 * Hardirqs will be enabled: 2056 * Hardirqs will be enabled:
2015 */ 2057 */
2016void trace_hardirqs_on(void) 2058void trace_hardirqs_on_caller(unsigned long a0)
2017{ 2059{
2018 struct task_struct *curr = current; 2060 struct task_struct *curr = current;
2019 unsigned long ip; 2061 unsigned long ip;
2020 2062
2063 time_hardirqs_on(CALLER_ADDR0, a0);
2064
2021 if (unlikely(!debug_locks || current->lockdep_recursion)) 2065 if (unlikely(!debug_locks || current->lockdep_recursion))
2022 return; 2066 return;
2023 2067
@@ -2055,16 +2099,23 @@ void trace_hardirqs_on(void)
2055 curr->hardirq_enable_event = ++curr->irq_events; 2099 curr->hardirq_enable_event = ++curr->irq_events;
2056 debug_atomic_inc(&hardirqs_on_events); 2100 debug_atomic_inc(&hardirqs_on_events);
2057} 2101}
2102EXPORT_SYMBOL(trace_hardirqs_on_caller);
2058 2103
2104void trace_hardirqs_on(void)
2105{
2106 trace_hardirqs_on_caller(CALLER_ADDR0);
2107}
2059EXPORT_SYMBOL(trace_hardirqs_on); 2108EXPORT_SYMBOL(trace_hardirqs_on);
2060 2109
2061/* 2110/*
2062 * Hardirqs were disabled: 2111 * Hardirqs were disabled:
2063 */ 2112 */
2064void trace_hardirqs_off(void) 2113void trace_hardirqs_off_caller(unsigned long a0)
2065{ 2114{
2066 struct task_struct *curr = current; 2115 struct task_struct *curr = current;
2067 2116
2117 time_hardirqs_off(CALLER_ADDR0, a0);
2118
2068 if (unlikely(!debug_locks || current->lockdep_recursion)) 2119 if (unlikely(!debug_locks || current->lockdep_recursion))
2069 return; 2120 return;
2070 2121
@@ -2082,7 +2133,12 @@ void trace_hardirqs_off(void)
2082 } else 2133 } else
2083 debug_atomic_inc(&redundant_hardirqs_off); 2134 debug_atomic_inc(&redundant_hardirqs_off);
2084} 2135}
2136EXPORT_SYMBOL(trace_hardirqs_off_caller);
2085 2137
2138void trace_hardirqs_off(void)
2139{
2140 trace_hardirqs_off_caller(CALLER_ADDR0);
2141}
2086EXPORT_SYMBOL(trace_hardirqs_off); 2142EXPORT_SYMBOL(trace_hardirqs_off);
2087 2143
2088/* 2144/*
@@ -2246,7 +2302,7 @@ static inline int separate_irq_context(struct task_struct *curr,
2246 * Mark a lock with a usage bit, and validate the state transition: 2302 * Mark a lock with a usage bit, and validate the state transition:
2247 */ 2303 */
2248static int mark_lock(struct task_struct *curr, struct held_lock *this, 2304static int mark_lock(struct task_struct *curr, struct held_lock *this,
2249 enum lock_usage_bit new_bit) 2305 enum lock_usage_bit new_bit)
2250{ 2306{
2251 unsigned int new_mask = 1 << new_bit, ret = 1; 2307 unsigned int new_mask = 1 << new_bit, ret = 1;
2252 2308
@@ -2650,7 +2706,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2650 */ 2706 */
2651static void check_flags(unsigned long flags) 2707static void check_flags(unsigned long flags)
2652{ 2708{
2653#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) 2709#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
2710 defined(CONFIG_TRACE_IRQFLAGS)
2654 if (!debug_locks) 2711 if (!debug_locks)
2655 return; 2712 return;
2656 2713
@@ -2686,7 +2743,7 @@ static void check_flags(unsigned long flags)
2686 * and also avoid lockdep recursion: 2743 * and also avoid lockdep recursion:
2687 */ 2744 */
2688void lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2745void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2689 int trylock, int read, int check, unsigned long ip) 2746 int trylock, int read, int check, unsigned long ip)
2690{ 2747{
2691 unsigned long flags; 2748 unsigned long flags;
2692 2749
@@ -2708,7 +2765,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2708 2765
2709EXPORT_SYMBOL_GPL(lock_acquire); 2766EXPORT_SYMBOL_GPL(lock_acquire);
2710 2767
2711void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 2768void lock_release(struct lockdep_map *lock, int nested,
2769 unsigned long ip)
2712{ 2770{
2713 unsigned long flags; 2771 unsigned long flags;
2714 2772
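
Besides the tracing hooks, the most interesting lockdep.c change is in lookup_chain_cache(): each new lock chain records its held locks in the global chain_hlocks[] array, reserving chain->depth consecutive slots by advancing nr_chain_hlocks with a cmpxchg() loop so that concurrent chains never hand out overlapping regions. A standalone sketch of that lock-free bump allocation, using GCC builtins and toy sizes:

#include <stdio.h>

#define MAX_SLOTS 64

static unsigned short slots[MAX_SLOTS];
static int nr_slots;            /* next free index, shared */

/* Reserve 'depth' consecutive entries in slots[]; return the base
 * index, or -1 if the array is exhausted.  This mirrors the
 * cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth) loop above. */
static int reserve(int depth)
{
        int cn = __atomic_load_n(&nr_slots, __ATOMIC_RELAXED);

        while (cn + depth <= MAX_SLOTS) {
                /* On success we own [cn, cn + depth); on failure cn is
                 * reloaded with the value another CPU installed. */
                if (__atomic_compare_exchange_n(&nr_slots, &cn, cn + depth,
                                                0, __ATOMIC_SEQ_CST,
                                                __ATOMIC_SEQ_CST))
                        return cn;
        }
        return -1;
}

int main(void)
{
        int base = reserve(5);

        if (base >= 0) {
                for (int i = 0; i < 5; i++)
                        slots[base + i] = 100 + i;   /* fill our region */
                printf("reserved slots %d..%d\n", base, base + 4);
        }
        return 0;
}
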
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8ce09bc4613d..c3600a091a28 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -23,6 +23,8 @@
23#define MAX_LOCKDEP_CHAINS_BITS 14 23#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 25
26#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
27
26/* 28/*
27 * Stack-trace: tightly packed array of stack backtrace 29 * Stack-trace: tightly packed array of stack backtrace
28 * addresses. Protected by the hash_lock. 30 * addresses. Protected by the hash_lock.
@@ -30,15 +32,19 @@
30#define MAX_STACK_TRACE_ENTRIES 262144UL 32#define MAX_STACK_TRACE_ENTRIES 262144UL
31 33
32extern struct list_head all_lock_classes; 34extern struct list_head all_lock_classes;
35extern struct lock_chain lock_chains[];
33 36
34extern void 37extern void
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); 38get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
36 39
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); 40extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38 41
42struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
43
39extern unsigned long nr_lock_classes; 44extern unsigned long nr_lock_classes;
40extern unsigned long nr_list_entries; 45extern unsigned long nr_list_entries;
41extern unsigned long nr_lock_chains; 46extern unsigned long nr_lock_chains;
47extern int nr_chain_hlocks;
42extern unsigned long nr_stack_trace_entries; 48extern unsigned long nr_stack_trace_entries;
43 49
44extern unsigned int nr_hardirq_chains; 50extern unsigned int nr_hardirq_chains;
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index dc5d29648d85..9b0e940e2545 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -139,7 +139,7 @@ static int l_show(struct seq_file *m, void *v)
139 139
140 list_for_each_entry(entry, &class->locks_after, entry) { 140 list_for_each_entry(entry, &class->locks_after, entry) {
141 if (entry->distance == 1) { 141 if (entry->distance == 1) {
142 seq_printf(m, " -> [%p] ", entry->class); 142 seq_printf(m, " -> [%p] ", entry->class->key);
143 print_name(m, entry->class); 143 print_name(m, entry->class);
144 seq_puts(m, "\n"); 144 seq_puts(m, "\n");
145 } 145 }
@@ -178,6 +178,95 @@ static const struct file_operations proc_lockdep_operations = {
178 .release = seq_release, 178 .release = seq_release,
179}; 179};
180 180
181#ifdef CONFIG_PROVE_LOCKING
182static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
183{
184 struct lock_chain *chain;
185
186 (*pos)++;
187
188 if (v == SEQ_START_TOKEN)
189 chain = m->private;
190 else {
191 chain = v;
192
193 if (*pos < nr_lock_chains)
194 chain = lock_chains + *pos;
195 else
196 chain = NULL;
197 }
198
199 return chain;
200}
201
202static void *lc_start(struct seq_file *m, loff_t *pos)
203{
204 if (*pos == 0)
205 return SEQ_START_TOKEN;
206
207 if (*pos < nr_lock_chains)
208 return lock_chains + *pos;
209
210 return NULL;
211}
212
213static void lc_stop(struct seq_file *m, void *v)
214{
215}
216
217static int lc_show(struct seq_file *m, void *v)
218{
219 struct lock_chain *chain = v;
220 struct lock_class *class;
221 int i;
222
223 if (v == SEQ_START_TOKEN) {
224 seq_printf(m, "all lock chains:\n");
225 return 0;
226 }
227
228 seq_printf(m, "irq_context: %d\n", chain->irq_context);
229
230 for (i = 0; i < chain->depth; i++) {
231 class = lock_chain_get_class(chain, i);
232 seq_printf(m, "[%p] ", class->key);
233 print_name(m, class);
234 seq_puts(m, "\n");
235 }
236 seq_puts(m, "\n");
237
238 return 0;
239}
240
241static const struct seq_operations lockdep_chains_ops = {
242 .start = lc_start,
243 .next = lc_next,
244 .stop = lc_stop,
245 .show = lc_show,
246};
247
248static int lockdep_chains_open(struct inode *inode, struct file *file)
249{
250 int res = seq_open(file, &lockdep_chains_ops);
251 if (!res) {
252 struct seq_file *m = file->private_data;
253
254 if (nr_lock_chains)
255 m->private = lock_chains;
256 else
257 m->private = NULL;
258 }
259 return res;
260}
261
262static const struct file_operations proc_lockdep_chains_operations = {
263 .open = lockdep_chains_open,
264 .read = seq_read,
265 .llseek = seq_lseek,
266 .release = seq_release,
267};
268#endif /* CONFIG_PROVE_LOCKING */
269
181static void lockdep_stats_debug_show(struct seq_file *m) 270static void lockdep_stats_debug_show(struct seq_file *m)
182{ 271{
183#ifdef CONFIG_DEBUG_LOCKDEP 272#ifdef CONFIG_DEBUG_LOCKDEP
@@ -294,6 +383,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
294#ifdef CONFIG_PROVE_LOCKING 383#ifdef CONFIG_PROVE_LOCKING
295 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 384 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
296 nr_lock_chains, MAX_LOCKDEP_CHAINS); 385 nr_lock_chains, MAX_LOCKDEP_CHAINS);
386 seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
387 nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
297#endif 388#endif
298 389
299#ifdef CONFIG_TRACE_IRQFLAGS 390#ifdef CONFIG_TRACE_IRQFLAGS
@@ -661,6 +752,10 @@ static const struct file_operations proc_lock_stat_operations = {
661static int __init lockdep_proc_init(void) 752static int __init lockdep_proc_init(void)
662{ 753{
663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); 754 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
755#ifdef CONFIG_PROVE_LOCKING
756 proc_create("lockdep_chains", S_IRUSR, NULL,
757 &proc_lockdep_chains_operations);
758#endif
664 proc_create("lockdep_stats", S_IRUSR, NULL, 759 proc_create("lockdep_stats", S_IRUSR, NULL,
665 &proc_lockdep_stats_operations); 760 &proc_lockdep_stats_operations);
666 761
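
The new /proc/lockdep_chains file is a standard seq_file: lc_start()/lc_next() walk the lock_chains[] array, SEQ_START_TOKEN makes lc_show() emit a header line first, and lc_stop() has nothing to clean up. The seq_file machinery itself is kernel-only, but the cursor protocol it imposes can be modelled in a few lines of ordinary C; the names for the sentinel and the data below are mine, and the indexing is deliberately simplified.

#include <stdio.h>

#define START_TOKEN ((void *)1)         /* stands in for SEQ_START_TOKEN */

static int chains[] = { 11, 22, 33 };
static const long nr_chains = 3;

static void *lc_start(long *pos)
{
        if (*pos == 0)
                return START_TOKEN;     /* emit the header first */
        if (*pos < nr_chains + 1)
                return &chains[*pos - 1];
        return NULL;
}

static void *lc_next(void *v, long *pos)
{
        (void)v;
        (*pos)++;
        return lc_start(pos);
}

static void lc_show(void *v)
{
        if (v == START_TOKEN)
                printf("all lock chains:\n");
        else
                printf("chain %d\n", *(int *)v);
}

int main(void)
{
        long pos = 0;
        void *v;

        for (v = lc_start(&pos); v; v = lc_next(v, &pos))
                lc_show(v);
        return 0;
}
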
diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..1abfb923b761 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
55struct marker_entry { 55struct marker_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 char *format; 57 char *format;
58 void (*call)(const struct marker *mdata, /* Probe wrapper */ 58 /* Probe wrapper */
59 void *call_private, const char *fmt, ...); 59 void (*call)(const struct marker *mdata, void *call_private, ...);
60 struct marker_probe_closure single; 60 struct marker_probe_closure single;
61 struct marker_probe_closure *multi; 61 struct marker_probe_closure *multi;
62 int refcount; /* Number of times armed. 0 if disarmed. */ 62 int refcount; /* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
91 * marker_probe_cb Callback that prepares the variable argument list for probes. 91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker 92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data 93 * @call_private: caller site private data
94 * @fmt: format string
95 * @...: Variable argument list. 94 * @...: Variable argument list.
96 * 95 *
97 * Since we do not use "typical" pointer based RCU in the 1 argument case, we 96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
98 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
99 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
100 */ 99 */
101void marker_probe_cb(const struct marker *mdata, void *call_private, 100void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
102 const char *fmt, ...)
103{ 101{
104 va_list args; 102 va_list args;
105 char ptype; 103 char ptype;
@@ -120,8 +118,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
120 /* Must read the ptr before private data. They are not data 118 /* Must read the ptr before private data. They are not data
 121 * dependent, so we put an explicit smp_rmb() here. */ 119 * dependent, so we put an explicit smp_rmb() here. */
122 smp_rmb(); 120 smp_rmb();
123 va_start(args, fmt); 121 va_start(args, call_private);
124 func(mdata->single.probe_private, call_private, fmt, &args); 122 func(mdata->single.probe_private, call_private, mdata->format,
123 &args);
125 va_end(args); 124 va_end(args);
126 } else { 125 } else {
127 struct marker_probe_closure *multi; 126 struct marker_probe_closure *multi;
@@ -136,9 +135,9 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
136 smp_read_barrier_depends(); 135 smp_read_barrier_depends();
137 multi = mdata->multi; 136 multi = mdata->multi;
138 for (i = 0; multi[i].func; i++) { 137 for (i = 0; multi[i].func; i++) {
139 va_start(args, fmt); 138 va_start(args, call_private);
140 multi[i].func(multi[i].probe_private, call_private, fmt, 139 multi[i].func(multi[i].probe_private, call_private,
141 &args); 140 mdata->format, &args);
142 va_end(args); 141 va_end(args);
143 } 142 }
144 } 143 }
@@ -150,13 +149,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
150 * marker_probe_cb Callback that does not prepare the variable argument list. 149 * marker_probe_cb Callback that does not prepare the variable argument list.
151 * @mdata: pointer of type struct marker 150 * @mdata: pointer of type struct marker
152 * @call_private: caller site private data 151 * @call_private: caller site private data
153 * @fmt: format string
154 * @...: Variable argument list. 152 * @...: Variable argument list.
155 * 153 *
156 * Should be connected to markers "MARK_NOARGS". 154 * Should be connected to markers "MARK_NOARGS".
157 */ 155 */
158void marker_probe_cb_noarg(const struct marker *mdata, 156void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
159 void *call_private, const char *fmt, ...)
160{ 157{
161 va_list args; /* not initialized */ 158 va_list args; /* not initialized */
162 char ptype; 159 char ptype;
@@ -172,7 +169,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
172 /* Must read the ptr before private data. They are not data 169 /* Must read the ptr before private data. They are not data
 173 * dependent, so we put an explicit smp_rmb() here. */ 170 * dependent, so we put an explicit smp_rmb() here. */
174 smp_rmb(); 171 smp_rmb();
175 func(mdata->single.probe_private, call_private, fmt, &args); 172 func(mdata->single.probe_private, call_private, mdata->format,
173 &args);
176 } else { 174 } else {
177 struct marker_probe_closure *multi; 175 struct marker_probe_closure *multi;
178 int i; 176 int i;
@@ -186,8 +184,8 @@ void marker_probe_cb_noarg(const struct marker *mdata,
186 smp_read_barrier_depends(); 184 smp_read_barrier_depends();
187 multi = mdata->multi; 185 multi = mdata->multi;
188 for (i = 0; multi[i].func; i++) 186 for (i = 0; multi[i].func; i++)
189 multi[i].func(multi[i].probe_private, call_private, fmt, 187 multi[i].func(multi[i].probe_private, call_private,
190 &args); 188 mdata->format, &args);
191 } 189 }
192 preempt_enable(); 190 preempt_enable();
193} 191}
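
The marker.c rework removes the fmt parameter from the probe wrappers, so va_start() must now be given call_private, the last remaining named parameter, and the format string is taken from mdata->format instead. That is the standard C rule: va_start() always names the final fixed argument. A tiny self-contained example of the same forwarding shape (struct and function names here are mine):

#include <stdarg.h>
#include <stdio.h>

struct marker_like {
        const char *format;     /* format stored with the call site */
};

/* Variadic wrapper: the format is not passed as a parameter, so
 * va_start() must name the last fixed argument (call_private),
 * just as the reworked marker_probe_cb() does. */
static void probe_cb(const struct marker_like *mdata, void *call_private, ...)
{
        va_list args;

        va_start(args, call_private);
        vprintf(mdata->format, args);   /* forward the whole va_list */
        va_end(args);
}

int main(void)
{
        struct marker_like m = { .format = "event %s: value %d\n" };

        probe_cb(&m, NULL, "sched_switch", 42);
        return 0;
}
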
diff --git a/kernel/module.c b/kernel/module.c
index f5e9491ef7ac..5f80478b746d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1337,7 +1337,19 @@ out_unreg:
1337 kobject_put(&mod->mkobj.kobj); 1337 kobject_put(&mod->mkobj.kobj);
1338 return err; 1338 return err;
1339} 1339}
1340#endif 1340
1341static void mod_sysfs_fini(struct module *mod)
1342{
1343 kobject_put(&mod->mkobj.kobj);
1344}
1345
1346#else /* CONFIG_SYSFS */
1347
1348static void mod_sysfs_fini(struct module *mod)
1349{
1350}
1351
1352#endif /* CONFIG_SYSFS */
1341 1353
1342static void mod_kobject_remove(struct module *mod) 1354static void mod_kobject_remove(struct module *mod)
1343{ 1355{
@@ -1345,7 +1357,7 @@ static void mod_kobject_remove(struct module *mod)
1345 module_param_sysfs_remove(mod); 1357 module_param_sysfs_remove(mod);
1346 kobject_put(mod->mkobj.drivers_dir); 1358 kobject_put(mod->mkobj.drivers_dir);
1347 kobject_put(mod->holders_dir); 1359 kobject_put(mod->holders_dir);
1348 kobject_put(&mod->mkobj.kobj); 1360 mod_sysfs_fini(mod);
1349} 1361}
1350 1362
1351/* 1363/*
@@ -1780,7 +1792,7 @@ static struct module *load_module(void __user *umod,
1780 1792
1781 /* Sanity checks against insmoding binaries or wrong arch, 1793 /* Sanity checks against insmoding binaries or wrong arch,
1782 weird elf version */ 1794 weird elf version */
1783 if (memcmp(hdr->e_ident, ELFMAG, 4) != 0 1795 if (memcmp(hdr->e_ident, ELFMAG, SELFMAG) != 0
1784 || hdr->e_type != ET_REL 1796 || hdr->e_type != ET_REL
1785 || !elf_check_arch(hdr) 1797 || !elf_check_arch(hdr)
1786 || hdr->e_shentsize != sizeof(*sechdrs)) { 1798 || hdr->e_shentsize != sizeof(*sechdrs)) {
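
load_module() now compares SELFMAG bytes of e_ident against ELFMAG rather than a hard-coded 4, leaning on the constants from the ELF headers. The same sanity check is easy to reproduce against a file on disk; the example below assumes a 64-bit ELF and uses /bin/true only as a convenient default path.

#include <elf.h>
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/bin/true";
        Elf64_Ehdr hdr;                 /* assumes a 64-bit ELF */
        FILE *f = fopen(path, "rb");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fread(&hdr, sizeof(hdr), 1, f) != 1) {
                fprintf(stderr, "%s: short read\n", path);
                fclose(f);
                return 1;
        }
        fclose(f);

        /* Same check as load_module(): magic bytes and friends. */
        if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) != 0) {
                fprintf(stderr, "%s: not an ELF file\n", path);
                return 1;
        }
        printf("%s: ELF, e_type=%u, %u-byte section headers\n",
               path, hdr.e_type, hdr.e_shentsize);
        return 0;
}
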
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3aaa06c561de..1d94160eb532 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock)
79 if (unlikely(!debug_locks)) 79 if (unlikely(!debug_locks))
80 return; 80 return;
81 81
82 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
83 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 82 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
86} 86}
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d046a345d365..bcdc9ac8ef60 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -165,10 +165,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
165 * got a signal? (This code gets eliminated in the 165 * got a signal? (This code gets eliminated in the
166 * TASK_UNINTERRUPTIBLE case.) 166 * TASK_UNINTERRUPTIBLE case.)
167 */ 167 */
168 if (unlikely((state == TASK_INTERRUPTIBLE && 168 if (unlikely(signal_pending_state(state, task))) {
169 signal_pending(task)) ||
170 (state == TASK_KILLABLE &&
171 fatal_signal_pending(task)))) {
172 mutex_remove_waiter(lock, &waiter, 169 mutex_remove_waiter(lock, &waiter,
173 task_thread_info(task)); 170 task_thread_info(task));
174 mutex_release(&lock->dep_map, 1, ip); 171 mutex_release(&lock->dep_map, 1, ip);
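
The mutex.c hunk collapses the open-coded "interruptible and signalled, or killable and fatally signalled" test into signal_pending_state(). Below is a condensed model of that predicate's decision table, with my own stand-in flags and a stub task so it can run on its own; the real helper naturally uses the TASK_* state bits and signal_pending()/fatal_signal_pending().

#include <stdbool.h>
#include <stdio.h>

#define ST_INTERRUPTIBLE 0x01
#define ST_WAKEKILL      0x02
#define ST_KILLABLE      (ST_WAKEKILL | 0x04)   /* wakekill + uninterruptible */

struct task { bool has_signal; bool has_fatal_signal; };

/* Sleeps in ST_INTERRUPTIBLE abort on any pending signal; sleeps that
 * are merely "killable" abort only on a fatal signal; everything else
 * ignores signals.  This is the shape of signal_pending_state(). */
static bool signal_pending_state(int state, const struct task *t)
{
        if (!(state & (ST_INTERRUPTIBLE | ST_WAKEKILL)))
                return false;
        if (!t->has_signal)
                return false;
        return (state & ST_INTERRUPTIBLE) || t->has_fatal_signal;
}

int main(void)
{
        struct task t = { .has_signal = true, .has_fatal_signal = false };

        printf("interruptible: %d\n",
               signal_pending_state(ST_INTERRUPTIBLE, &t));
        printf("killable:      %d\n",
               signal_pending_state(ST_KILLABLE, &t));
        return 0;
}
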
diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..30bd5d4b2ac7 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h>
33#include <linux/bootmem.h> 34#include <linux/bootmem.h>
34#include <linux/hash.h> 35#include <linux/hash.h>
35#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0afe32be4c85..8cb757026386 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -29,6 +29,7 @@
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/slab.h> 34#include <linux/slab.h>
34#include <linux/time.h> 35#include <linux/time.h>
@@ -358,15 +359,19 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
358 int ret; 359 int ret;
359 long pm_qos_class; 360 long pm_qos_class;
360 361
362 lock_kernel();
361 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 363 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
362 if (pm_qos_class >= 0) { 364 if (pm_qos_class >= 0) {
363 filp->private_data = (void *)pm_qos_class; 365 filp->private_data = (void *)pm_qos_class;
364 sprintf(name, "process_%d", current->pid); 366 sprintf(name, "process_%d", current->pid);
365 ret = pm_qos_add_requirement(pm_qos_class, name, 367 ret = pm_qos_add_requirement(pm_qos_class, name,
366 PM_QOS_DEFAULT_VALUE); 368 PM_QOS_DEFAULT_VALUE);
367 if (ret >= 0) 369 if (ret >= 0) {
370 unlock_kernel();
368 return 0; 371 return 0;
372 }
369 } 373 }
374 unlock_kernel();
370 375
371 return -EPERM; 376 return -EPERM;
372} 377}
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc652..f011e0870b52 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -180,6 +180,17 @@ static void platform_restore_cleanup(int platform_mode)
180} 180}
181 181
182/** 182/**
183 * platform_recover - recover the platform from a failure to suspend
184 * devices.
185 */
186
187static void platform_recover(int platform_mode)
188{
189 if (platform_mode && hibernation_ops && hibernation_ops->recover)
190 hibernation_ops->recover();
191}
192
193/**
183 * create_image - freeze devices that need to be frozen with interrupts 194 * create_image - freeze devices that need to be frozen with interrupts
184 * off, create the hibernation image and thaw those devices. Control 195 * off, create the hibernation image and thaw those devices. Control
185 * reappears in this routine after a restore. 196 * reappears in this routine after a restore.
@@ -193,6 +204,7 @@ static int create_image(int platform_mode)
193 if (error) 204 if (error)
194 return error; 205 return error;
195 206
207 device_pm_lock();
196 local_irq_disable(); 208 local_irq_disable();
197 /* At this point, device_suspend() has been called, but *not* 209 /* At this point, device_suspend() has been called, but *not*
198 * device_power_down(). We *must* call device_power_down() now. 210 * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +236,11 @@ static int create_image(int platform_mode)
224 /* NOTE: device_power_up() is just a resume() for devices 236 /* NOTE: device_power_up() is just a resume() for devices
225 * that suspended with irqs off ... no overall powerup. 237 * that suspended with irqs off ... no overall powerup.
226 */ 238 */
227 device_power_up(); 239 device_power_up(in_suspend ?
240 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
228 Enable_irqs: 241 Enable_irqs:
229 local_irq_enable(); 242 local_irq_enable();
243 device_pm_unlock();
230 return error; 244 return error;
231} 245}
232 246
@@ -255,10 +269,10 @@ int hibernation_snapshot(int platform_mode)
255 suspend_console(); 269 suspend_console();
256 error = device_suspend(PMSG_FREEZE); 270 error = device_suspend(PMSG_FREEZE);
257 if (error) 271 if (error)
258 goto Resume_console; 272 goto Recover_platform;
259 273
260 if (hibernation_test(TEST_DEVICES)) 274 if (hibernation_test(TEST_DEVICES))
261 goto Resume_devices; 275 goto Recover_platform;
262 276
263 error = platform_pre_snapshot(platform_mode); 277 error = platform_pre_snapshot(platform_mode);
264 if (error || hibernation_test(TEST_PLATFORM)) 278 if (error || hibernation_test(TEST_PLATFORM))
@@ -280,12 +294,16 @@ int hibernation_snapshot(int platform_mode)
280 Finish: 294 Finish:
281 platform_finish(platform_mode); 295 platform_finish(platform_mode);
282 Resume_devices: 296 Resume_devices:
283 device_resume(); 297 device_resume(in_suspend ?
284 Resume_console: 298 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
285 resume_console(); 299 resume_console();
286 Close: 300 Close:
287 platform_end(platform_mode); 301 platform_end(platform_mode);
288 return error; 302 return error;
303
304 Recover_platform:
305 platform_recover(platform_mode);
306 goto Resume_devices;
289} 307}
290 308
291/** 309/**
@@ -300,8 +318,9 @@ static int resume_target_kernel(void)
300{ 318{
301 int error; 319 int error;
302 320
321 device_pm_lock();
303 local_irq_disable(); 322 local_irq_disable();
304 error = device_power_down(PMSG_PRETHAW); 323 error = device_power_down(PMSG_QUIESCE);
305 if (error) { 324 if (error) {
306 printk(KERN_ERR "PM: Some devices failed to power down, " 325 printk(KERN_ERR "PM: Some devices failed to power down, "
307 "aborting resume\n"); 326 "aborting resume\n");
@@ -329,9 +348,10 @@ static int resume_target_kernel(void)
329 swsusp_free(); 348 swsusp_free();
330 restore_processor_state(); 349 restore_processor_state();
331 touch_softlockup_watchdog(); 350 touch_softlockup_watchdog();
332 device_power_up(); 351 device_power_up(PMSG_RECOVER);
333 Enable_irqs: 352 Enable_irqs:
334 local_irq_enable(); 353 local_irq_enable();
354 device_pm_unlock();
335 return error; 355 return error;
336} 356}
337 357
@@ -350,7 +370,7 @@ int hibernation_restore(int platform_mode)
350 370
351 pm_prepare_console(); 371 pm_prepare_console();
352 suspend_console(); 372 suspend_console();
353 error = device_suspend(PMSG_PRETHAW); 373 error = device_suspend(PMSG_QUIESCE);
354 if (error) 374 if (error)
355 goto Finish; 375 goto Finish;
356 376
@@ -362,7 +382,7 @@ int hibernation_restore(int platform_mode)
362 enable_nonboot_cpus(); 382 enable_nonboot_cpus();
363 } 383 }
364 platform_restore_cleanup(platform_mode); 384 platform_restore_cleanup(platform_mode);
365 device_resume(); 385 device_resume(PMSG_RECOVER);
366 Finish: 386 Finish:
367 resume_console(); 387 resume_console();
368 pm_restore_console(); 388 pm_restore_console();
@@ -392,8 +412,11 @@ int hibernation_platform_enter(void)
392 412
393 suspend_console(); 413 suspend_console();
394 error = device_suspend(PMSG_HIBERNATE); 414 error = device_suspend(PMSG_HIBERNATE);
395 if (error) 415 if (error) {
396 goto Resume_console; 416 if (hibernation_ops->recover)
417 hibernation_ops->recover();
418 goto Resume_devices;
419 }
397 420
398 error = hibernation_ops->prepare(); 421 error = hibernation_ops->prepare();
399 if (error) 422 if (error)
@@ -403,6 +426,7 @@ int hibernation_platform_enter(void)
403 if (error) 426 if (error)
404 goto Finish; 427 goto Finish;
405 428
429 device_pm_lock();
406 local_irq_disable(); 430 local_irq_disable();
407 error = device_power_down(PMSG_HIBERNATE); 431 error = device_power_down(PMSG_HIBERNATE);
408 if (!error) { 432 if (!error) {
@@ -411,6 +435,7 @@ int hibernation_platform_enter(void)
411 while (1); 435 while (1);
412 } 436 }
413 local_irq_enable(); 437 local_irq_enable();
438 device_pm_unlock();
414 439
415 /* 440 /*
416 * We don't need to reenable the nonboot CPUs or resume consoles, since 441 * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,8 +444,7 @@ int hibernation_platform_enter(void)
419 Finish: 444 Finish:
420 hibernation_ops->finish(); 445 hibernation_ops->finish();
421 Resume_devices: 446 Resume_devices:
422 device_resume(); 447 device_resume(PMSG_RESTORE);
423 Resume_console:
424 resume_console(); 448 resume_console();
425 Close: 449 Close:
426 hibernation_ops->end(); 450 hibernation_ops->end();
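
hibernation_snapshot() above (and suspend_devices_and_enter() in main.c below) gains a Recover_platform label: when device_suspend() fails or a test mode bails out early, the platform's ->recover() callback runs and control then joins the normal resume path. The goto-based unwind is a standard kernel error-handling idiom; here is a neutral standalone sketch of its shape, with hypothetical step names.

#include <stdio.h>

static int suspend_devices(void)   { return -1; }  /* pretend this fails */
static void resume_devices(void)   { printf("resume devices\n"); }
static void resume_console(void)   { printf("resume console\n"); }
static void platform_recover(void) { printf("platform recover\n"); }

static int enter_sleep(void)
{
        int error;

        error = suspend_devices();
        if (error)
                goto Recover_platform;

        /* ... the actual sleep would happen here ... */

 Resume_devices:
        resume_devices();
        resume_console();
        return error;

 Recover_platform:
        /* Give the platform a chance to undo its preparations, then
         * join the common unwind path - the same shape as the patch. */
        platform_recover();
        goto Resume_devices;
}

int main(void)
{
        return enter_sleep() ? 1 : 0;
}
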
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524e..3398f4651aa1 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state)
228{ 228{
229 int error = 0; 229 int error = 0;
230 230
231 device_pm_lock();
231 arch_suspend_disable_irqs(); 232 arch_suspend_disable_irqs();
232 BUG_ON(!irqs_disabled()); 233 BUG_ON(!irqs_disabled());
233 234
@@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state)
239 if (!suspend_test(TEST_CORE)) 240 if (!suspend_test(TEST_CORE))
240 error = suspend_ops->enter(state); 241 error = suspend_ops->enter(state);
241 242
242 device_power_up(); 243 device_power_up(PMSG_RESUME);
243 Done: 244 Done:
244 arch_suspend_enable_irqs(); 245 arch_suspend_enable_irqs();
245 BUG_ON(irqs_disabled()); 246 BUG_ON(irqs_disabled());
247 device_pm_unlock();
246 return error; 248 return error;
247} 249}
248 250
@@ -267,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state)
267 error = device_suspend(PMSG_SUSPEND); 269 error = device_suspend(PMSG_SUSPEND);
268 if (error) { 270 if (error) {
269 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 271 printk(KERN_ERR "PM: Some devices failed to suspend\n");
270 goto Resume_console; 272 goto Recover_platform;
271 } 273 }
272 274
273 if (suspend_test(TEST_DEVICES)) 275 if (suspend_test(TEST_DEVICES))
274 goto Resume_devices; 276 goto Recover_platform;
275 277
276 if (suspend_ops->prepare) { 278 if (suspend_ops->prepare) {
277 error = suspend_ops->prepare(); 279 error = suspend_ops->prepare();
@@ -291,13 +293,17 @@ int suspend_devices_and_enter(suspend_state_t state)
291 if (suspend_ops->finish) 293 if (suspend_ops->finish)
292 suspend_ops->finish(); 294 suspend_ops->finish();
293 Resume_devices: 295 Resume_devices:
294 device_resume(); 296 device_resume(PMSG_RESUME);
295 Resume_console:
296 resume_console(); 297 resume_console();
297 Close: 298 Close:
298 if (suspend_ops->end) 299 if (suspend_ops->end)
299 suspend_ops->end(); 300 suspend_ops->end();
300 return error; 301 return error;
302
303 Recover_platform:
304 if (suspend_ops->recover)
305 suspend_ops->recover();
306 goto Resume_devices;
301} 307}
302 308
303/** 309/**
diff --git a/kernel/power/process.c b/kernel/power/process.c
index f1d0b345c9ba..5fb87652f214 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,9 +19,6 @@
19 */ 19 */
20#define TIMEOUT (20 * HZ) 20#define TIMEOUT (20 * HZ)
21 21
22#define FREEZER_KERNEL_THREADS 0
23#define FREEZER_USER_SPACE 1
24
25static inline int freezeable(struct task_struct * p) 22static inline int freezeable(struct task_struct * p)
26{ 23{
27 if ((p == current) || 24 if ((p == current) ||
@@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p)
84 spin_unlock_irqrestore(&p->sighand->siglock, flags); 81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85} 82}
86 83
87static int has_mm(struct task_struct *p) 84static inline bool should_send_signal(struct task_struct *p)
88{ 85{
89 return (p->mm && !(p->flags & PF_BORROWED_MM)); 86 return !(p->flags & PF_FREEZER_NOSIG);
90} 87}
91 88
92/** 89/**
93 * freeze_task - send a freeze request to given task 90 * freeze_task - send a freeze request to given task
94 * @p: task to send the request to 91 * @p: task to send the request to
95 * @with_mm_only: if set, the request will only be sent if the task has its 92 * @sig_only: if set, the request will only be sent if the task has the
96 * own mm 93 * PF_FREEZER_NOSIG flag unset
97 * Return value: 0, if @with_mm_only is set and the task has no mm of its 94 * Return value: 'false', if @sig_only is set and the task has
98 * own or the task is frozen, 1, otherwise 95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
99 * 96 *
 101 * The freeze request is sent by seting the tasks's TIF_FREEZE flag and 97 * The freeze request is sent by setting the task's TIF_FREEZE flag and
101 * either sending a fake signal to it or waking it up, depending on whether 98 * either sending a fake signal to it or waking it up, depending on whether
102 * or not it has its own mm (ie. it is a user land task). If @with_mm_only 99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
103 * is set and the task has no mm of its own (ie. it is a kernel thread), 100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
104 * its TIF_FREEZE flag should not be set. 101 * TIF_FREEZE flag will not be set.
105 *
106 * The task_lock() is necessary to prevent races with exit_mm() or
 107 * use_mm()/unuse_mm() from occurring.
108 */ 102 */
109static int freeze_task(struct task_struct *p, int with_mm_only) 103static bool freeze_task(struct task_struct *p, bool sig_only)
110{ 104{
111 int ret = 1; 105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
112 114
113 task_lock(p); 115 if (!sig_only || should_send_signal(p))
114 if (freezing(p)) { 116 set_freeze_flag(p);
115 if (has_mm(p)) { 117 else
116 if (!signal_pending(p)) 118 return false;
117 fake_signal_wake_up(p); 119 }
118 } else { 120
119 if (with_mm_only) 121 if (should_send_signal(p)) {
120 ret = 0; 122 if (!signal_pending(p))
121 else 123 fake_signal_wake_up(p);
122 wake_up_state(p, TASK_INTERRUPTIBLE); 124 } else if (sig_only) {
123 } 125 return false;
124 } else { 126 } else {
125 rmb(); 127 wake_up_state(p, TASK_INTERRUPTIBLE);
126 if (frozen(p)) {
127 ret = 0;
128 } else {
129 if (has_mm(p)) {
130 set_freeze_flag(p);
131 fake_signal_wake_up(p);
132 } else {
133 if (with_mm_only) {
134 ret = 0;
135 } else {
136 set_freeze_flag(p);
137 wake_up_state(p, TASK_INTERRUPTIBLE);
138 }
139 }
140 }
141 } 128 }
142 task_unlock(p); 129
143 return ret; 130 return true;
144} 131}
145 132
146static void cancel_freezing(struct task_struct *p) 133static void cancel_freezing(struct task_struct *p)
@@ -156,7 +143,7 @@ static void cancel_freezing(struct task_struct *p)
156 } 143 }
157} 144}
158 145
159static int try_to_freeze_tasks(int freeze_user_space) 146static int try_to_freeze_tasks(bool sig_only)
160{ 147{
161 struct task_struct *g, *p; 148 struct task_struct *g, *p;
162 unsigned long end_time; 149 unsigned long end_time;
@@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
175 if (frozen(p) || !freezeable(p)) 162 if (frozen(p) || !freezeable(p))
176 continue; 163 continue;
177 164
178 if (!freeze_task(p, freeze_user_space)) 165 if (!freeze_task(p, sig_only))
179 continue; 166 continue;
180 167
181 /* 168 /*
@@ -235,13 +222,13 @@ int freeze_processes(void)
235 int error; 222 int error;
236 223
237 printk("Freezing user space processes ... "); 224 printk("Freezing user space processes ... ");
238 error = try_to_freeze_tasks(FREEZER_USER_SPACE); 225 error = try_to_freeze_tasks(true);
239 if (error) 226 if (error)
240 goto Exit; 227 goto Exit;
241 printk("done.\n"); 228 printk("done.\n");
242 229
243 printk("Freezing remaining freezable tasks ... "); 230 printk("Freezing remaining freezable tasks ... ");
244 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 231 error = try_to_freeze_tasks(false);
245 if (error) 232 if (error)
246 goto Exit; 233 goto Exit;
247 printk("done."); 234 printk("done.");
@@ -251,7 +238,7 @@ int freeze_processes(void)
251 return error; 238 return error;
252} 239}
253 240
254static void thaw_tasks(int thaw_user_space) 241static void thaw_tasks(bool nosig_only)
255{ 242{
256 struct task_struct *g, *p; 243 struct task_struct *g, *p;
257 244
@@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space)
260 if (!freezeable(p)) 247 if (!freezeable(p))
261 continue; 248 continue;
262 249
263 if (!p->mm == thaw_user_space) 250 if (nosig_only && should_send_signal(p))
264 continue; 251 continue;
265 252
266 thaw_process(p); 253 thaw_process(p);
@@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space)
271void thaw_processes(void) 258void thaw_processes(void)
272{ 259{
273 printk("Restarting tasks ... "); 260 printk("Restarting tasks ... ");
274 thaw_tasks(FREEZER_KERNEL_THREADS); 261 thaw_tasks(true);
275 thaw_tasks(FREEZER_USER_SPACE); 262 thaw_tasks(false);
276 schedule(); 263 schedule();
277 printk("done.\n"); 264 printk("done.\n");
278} 265}
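The two thaw_tasks() calls above deliberately run in two passes: first only the PF_FREEZER_NOSIG tasks (kernel threads), then everything. A small sketch of that ordering, with a hypothetical task array and a wants_signal flag standing in for should_send_signal(); thaw_process() in the kernel tolerates being called on an already-thawed task, which is why the second pass can simply walk everyone again.

#include <stdbool.h>
#include <stdio.h>

struct task_sketch { const char *comm; bool wants_signal; };

/* One pass, mirroring thaw_tasks(nosig_only). */
static void thaw_tasks_sketch(struct task_sketch *tasks, int n, bool nosig_only)
{
	for (int i = 0; i < n; i++) {
		if (nosig_only && tasks[i].wants_signal)
			continue;   /* skip user-space tasks on the first pass */
		printf("thaw %s\n", tasks[i].comm);
	}
}

int main(void)
{
	struct task_sketch tasks[] = {
		{ "kswapd0", false }, { "bash", true },
	};
	thaw_tasks_sketch(tasks, 2, true);   /* kernel threads first        */
	thaw_tasks_sketch(tasks, 2, false);  /* then everything (repeat ok) */
	return 0;
}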
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f5512cb3aa86..a6332a313262 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,6 +23,7 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
@@ -69,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp)
69 struct snapshot_data *data; 70 struct snapshot_data *data;
70 int error; 71 int error;
71 72
72 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 73 mutex_lock(&pm_mutex);
73 return -EBUSY; 74
75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
76 error = -EBUSY;
77 goto Unlock;
78 }
74 79
75 if ((filp->f_flags & O_ACCMODE) == O_RDWR) { 80 if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
76 atomic_inc(&snapshot_device_available); 81 atomic_inc(&snapshot_device_available);
77 return -ENOSYS; 82 error = -ENOSYS;
83 goto Unlock;
78 } 84 }
79 if(create_basic_memory_bitmaps()) { 85 if(create_basic_memory_bitmaps()) {
80 atomic_inc(&snapshot_device_available); 86 atomic_inc(&snapshot_device_available);
81 return -ENOMEM; 87 error = -ENOMEM;
88 goto Unlock;
82 } 89 }
83 nonseekable_open(inode, filp); 90 nonseekable_open(inode, filp);
84 data = &snapshot_state; 91 data = &snapshot_state;
@@ -98,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp)
98 if (error) 105 if (error)
99 pm_notifier_call_chain(PM_POST_HIBERNATION); 106 pm_notifier_call_chain(PM_POST_HIBERNATION);
100 } 107 }
101 if (error) { 108 if (error)
102 atomic_inc(&snapshot_device_available); 109 atomic_inc(&snapshot_device_available);
103 return error;
104 }
105 data->frozen = 0; 110 data->frozen = 0;
106 data->ready = 0; 111 data->ready = 0;
107 data->platform_support = 0; 112 data->platform_support = 0;
108 113
109 return 0; 114 Unlock:
115 mutex_unlock(&pm_mutex);
116
117 return error;
110} 118}
111 119
112static int snapshot_release(struct inode *inode, struct file *filp) 120static int snapshot_release(struct inode *inode, struct file *filp)
113{ 121{
114 struct snapshot_data *data; 122 struct snapshot_data *data;
115 123
124 mutex_lock(&pm_mutex);
125
116 swsusp_free(); 126 swsusp_free();
117 free_basic_memory_bitmaps(); 127 free_basic_memory_bitmaps();
118 data = filp->private_data; 128 data = filp->private_data;
119 free_all_swap_pages(data->swap); 129 free_all_swap_pages(data->swap);
120 if (data->frozen) { 130 if (data->frozen)
121 mutex_lock(&pm_mutex);
122 thaw_processes(); 131 thaw_processes();
123 mutex_unlock(&pm_mutex);
124 }
125 pm_notifier_call_chain(data->mode == O_WRONLY ? 132 pm_notifier_call_chain(data->mode == O_WRONLY ?
126 PM_POST_HIBERNATION : PM_POST_RESTORE); 133 PM_POST_HIBERNATION : PM_POST_RESTORE);
127 atomic_inc(&snapshot_device_available); 134 atomic_inc(&snapshot_device_available);
135
136 mutex_unlock(&pm_mutex);
137
128 return 0; 138 return 0;
129} 139}
130 140
@@ -134,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
134 struct snapshot_data *data; 144 struct snapshot_data *data;
135 ssize_t res; 145 ssize_t res;
136 146
147 mutex_lock(&pm_mutex);
148
137 data = filp->private_data; 149 data = filp->private_data;
138 if (!data->ready) 150 if (!data->ready) {
139 return -ENODATA; 151 res = -ENODATA;
152 goto Unlock;
153 }
140 res = snapshot_read_next(&data->handle, count); 154 res = snapshot_read_next(&data->handle, count);
141 if (res > 0) { 155 if (res > 0) {
142 if (copy_to_user(buf, data_of(data->handle), res)) 156 if (copy_to_user(buf, data_of(data->handle), res))
@@ -144,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
144 else 158 else
145 *offp = data->handle.offset; 159 *offp = data->handle.offset;
146 } 160 }
161
162 Unlock:
163 mutex_unlock(&pm_mutex);
164
147 return res; 165 return res;
148} 166}
149 167
@@ -153,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
153 struct snapshot_data *data; 171 struct snapshot_data *data;
154 ssize_t res; 172 ssize_t res;
155 173
174 mutex_lock(&pm_mutex);
175
156 data = filp->private_data; 176 data = filp->private_data;
157 res = snapshot_write_next(&data->handle, count); 177 res = snapshot_write_next(&data->handle, count);
158 if (res > 0) { 178 if (res > 0) {
@@ -161,11 +181,14 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
161 else 181 else
162 *offp = data->handle.offset; 182 *offp = data->handle.offset;
163 } 183 }
184
185 mutex_unlock(&pm_mutex);
186
164 return res; 187 return res;
165} 188}
166 189
167static int snapshot_ioctl(struct inode *inode, struct file *filp, 190static long snapshot_ioctl(struct file *filp, unsigned int cmd,
168 unsigned int cmd, unsigned long arg) 191 unsigned long arg)
169{ 192{
170 int error = 0; 193 int error = 0;
171 struct snapshot_data *data; 194 struct snapshot_data *data;
@@ -179,6 +202,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
179 if (!capable(CAP_SYS_ADMIN)) 202 if (!capable(CAP_SYS_ADMIN))
180 return -EPERM; 203 return -EPERM;
181 204
205 if (!mutex_trylock(&pm_mutex))
206 return -EBUSY;
207
182 data = filp->private_data; 208 data = filp->private_data;
183 209
184 switch (cmd) { 210 switch (cmd) {
@@ -186,7 +212,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
186 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
187 if (data->frozen) 213 if (data->frozen)
188 break; 214 break;
189 mutex_lock(&pm_mutex);
190 printk("Syncing filesystems ... "); 215 printk("Syncing filesystems ... ");
191 sys_sync(); 216 sys_sync();
192 printk("done.\n"); 217 printk("done.\n");
@@ -194,7 +219,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
194 error = freeze_processes(); 219 error = freeze_processes();
195 if (error) 220 if (error)
196 thaw_processes(); 221 thaw_processes();
197 mutex_unlock(&pm_mutex);
198 if (!error) 222 if (!error)
199 data->frozen = 1; 223 data->frozen = 1;
200 break; 224 break;
@@ -202,9 +226,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
202 case SNAPSHOT_UNFREEZE: 226 case SNAPSHOT_UNFREEZE:
203 if (!data->frozen || data->ready) 227 if (!data->frozen || data->ready)
204 break; 228 break;
205 mutex_lock(&pm_mutex);
206 thaw_processes(); 229 thaw_processes();
207 mutex_unlock(&pm_mutex);
208 data->frozen = 0; 230 data->frozen = 0;
209 break; 231 break;
210 232
@@ -307,16 +329,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
307 error = -EPERM; 329 error = -EPERM;
308 break; 330 break;
309 } 331 }
310 if (!mutex_trylock(&pm_mutex)) {
311 error = -EBUSY;
312 break;
313 }
314 /* 332 /*
315 * Tasks are frozen and the notifiers have been called with 333 * Tasks are frozen and the notifiers have been called with
316 * PM_HIBERNATION_PREPARE 334 * PM_HIBERNATION_PREPARE
317 */ 335 */
318 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 336 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
319 mutex_unlock(&pm_mutex);
320 break; 337 break;
321 338
322 case SNAPSHOT_PLATFORM_SUPPORT: 339 case SNAPSHOT_PLATFORM_SUPPORT:
@@ -390,6 +407,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
390 407
391 } 408 }
392 409
410 mutex_unlock(&pm_mutex);
411
393 return error; 412 return error;
394} 413}
395 414
@@ -399,7 +418,7 @@ static const struct file_operations snapshot_fops = {
399 .read = snapshot_read, 418 .read = snapshot_read,
400 .write = snapshot_write, 419 .write = snapshot_write,
401 .llseek = no_llseek, 420 .llseek = no_llseek,
402 .ioctl = snapshot_ioctl, 421 .unlocked_ioctl = snapshot_ioctl,
403}; 422};
404 423
405static struct miscdevice snapshot_device = { 424static struct miscdevice snapshot_device = {
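The user.c changes above push all pm_mutex handling into the snapshot file operations themselves: open, release, read and write take the mutex unconditionally, while the new unlocked_ioctl only trylocks and fails with -EBUSY. A user-space sketch of that per-operation pattern, using a pthread mutex as a stand-in for pm_mutex; the function names below are illustrative, not the driver's.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pm_mutex_sketch = PTHREAD_MUTEX_INITIALIZER;

/* Blocking operations (open/release/read/write style): take the lock. */
static int snapshot_read_sketch(void)
{
	pthread_mutex_lock(&pm_mutex_sketch);
	int res = 0;                      /* ... do the work ... */
	pthread_mutex_unlock(&pm_mutex_sketch);
	return res;
}

/* ioctl style: refuse with -EBUSY instead of sleeping on the lock. */
static int snapshot_ioctl_sketch(void)
{
	if (pthread_mutex_trylock(&pm_mutex_sketch))
		return -EBUSY;
	int error = 0;                    /* ... handle the command ... */
	pthread_mutex_unlock(&pm_mutex_sketch);
	return error;
}

int main(void)
{
	printf("read: %d\n", snapshot_read_sketch());
	printf("ioctl: %d\n", snapshot_ioctl_sketch());
	return 0;
}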
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..07ad9e7f7a66 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -38,7 +38,7 @@
38/* 38/*
39 * Architectures can override it: 39 * Architectures can override it:
40 */ 40 */
41void __attribute__((weak)) early_printk(const char *fmt, ...) 41void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
42{ 42{
43} 43}
44 44
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress);
75static DECLARE_MUTEX(console_sem); 75static DECLARE_MUTEX(console_sem);
76static DECLARE_MUTEX(secondary_console_sem); 76static DECLARE_MUTEX(secondary_console_sem);
77struct console *console_drivers; 77struct console *console_drivers;
78EXPORT_SYMBOL_GPL(console_drivers);
79
78/* 80/*
79 * This is used for debugging the mess that is the VT code by 81 * This is used for debugging the mess that is the VT code by
80 * keeping track if we have the console semaphore held. It's 82 * keeping track if we have the console semaphore held. It's
@@ -121,6 +123,8 @@ struct console_cmdline
121static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 123static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
122static int selected_console = -1; 124static int selected_console = -1;
123static int preferred_console = -1; 125static int preferred_console = -1;
126int console_set_on_cmdline;
127EXPORT_SYMBOL(console_set_on_cmdline);
124 128
125/* Flag: console code may call schedule() */ 129/* Flag: console code may call schedule() */
126static int console_may_schedule; 130static int console_may_schedule;
@@ -231,7 +235,7 @@ static inline void boot_delay_msec(void)
231/* 235/*
232 * Return the number of unread characters in the log buffer. 236 * Return the number of unread characters in the log buffer.
233 */ 237 */
234int log_buf_get_len(void) 238static int log_buf_get_len(void)
235{ 239{
236 return logged_chars; 240 return logged_chars;
237} 241}
@@ -268,19 +272,6 @@ int log_buf_copy(char *dest, int idx, int len)
268} 272}
269 273
270/* 274/*
271 * Extract a single character from the log buffer.
272 */
273int log_buf_read(int idx)
274{
275 char ret;
276
277 if (log_buf_copy(&ret, idx, 1) == 1)
278 return ret;
279 else
280 return -1;
281}
282
283/*
284 * Commands to do_syslog: 275 * Commands to do_syslog:
285 * 276 *
286 * 0 -- Close the log. Currently a NOP. 277 * 0 -- Close the log. Currently a NOP.
@@ -665,18 +656,17 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
665 spin_unlock(&logbuf_lock); 656 spin_unlock(&logbuf_lock);
666 return retval; 657 return retval;
667} 658}
668 659static const char recursion_bug_msg [] =
669const char printk_recursion_bug_msg [] = 660 KERN_CRIT "BUG: recent printk recursion!\n";
670 KERN_CRIT "BUG: recent printk recursion!\n"; 661static int recursion_bug;
671static int printk_recursion_bug; 662 static int new_text_line = 1;
663static char printk_buf[1024];
672 664
673asmlinkage int vprintk(const char *fmt, va_list args) 665asmlinkage int vprintk(const char *fmt, va_list args)
674{ 666{
675 static int log_level_unknown = 1;
676 static char printk_buf[1024];
677
678 unsigned long flags;
679 int printed_len = 0; 667 int printed_len = 0;
668 int current_log_level = default_message_loglevel;
669 unsigned long flags;
680 int this_cpu; 670 int this_cpu;
681 char *p; 671 char *p;
682 672
@@ -699,7 +689,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
699 * it can be printed at the next appropriate moment: 689 * it can be printed at the next appropriate moment:
700 */ 690 */
701 if (!oops_in_progress) { 691 if (!oops_in_progress) {
702 printk_recursion_bug = 1; 692 recursion_bug = 1;
703 goto out_restore_irqs; 693 goto out_restore_irqs;
704 } 694 }
705 zap_locks(); 695 zap_locks();
@@ -709,70 +699,62 @@ asmlinkage int vprintk(const char *fmt, va_list args)
709 spin_lock(&logbuf_lock); 699 spin_lock(&logbuf_lock);
710 printk_cpu = this_cpu; 700 printk_cpu = this_cpu;
711 701
712 if (printk_recursion_bug) { 702 if (recursion_bug) {
713 printk_recursion_bug = 0; 703 recursion_bug = 0;
714 strcpy(printk_buf, printk_recursion_bug_msg); 704 strcpy(printk_buf, recursion_bug_msg);
715 printed_len = sizeof(printk_recursion_bug_msg); 705 printed_len = sizeof(recursion_bug_msg);
716 } 706 }
717 /* Emit the output into the temporary buffer */ 707 /* Emit the output into the temporary buffer */
718 printed_len += vscnprintf(printk_buf + printed_len, 708 printed_len += vscnprintf(printk_buf + printed_len,
719 sizeof(printk_buf) - printed_len, fmt, args); 709 sizeof(printk_buf) - printed_len, fmt, args);
720 710
711
721 /* 712 /*
722 * Copy the output into log_buf. If the caller didn't provide 713 * Copy the output into log_buf. If the caller didn't provide
723 * appropriate log level tags, we insert them here 714 * appropriate log level tags, we insert them here
724 */ 715 */
725 for (p = printk_buf; *p; p++) { 716 for (p = printk_buf; *p; p++) {
726 if (log_level_unknown) { 717 if (new_text_line) {
727 /* log_level_unknown signals the start of a new line */ 718 /* If a token, set current_log_level and skip over */
719 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
720 p[2] == '>') {
721 current_log_level = p[1] - '0';
722 p += 3;
723 printed_len -= 3;
724 }
725
726 /* Always output the token */
727 emit_log_char('<');
728 emit_log_char(current_log_level + '0');
729 emit_log_char('>');
730 printed_len += 3;
731 new_text_line = 0;
732
728 if (printk_time) { 733 if (printk_time) {
729 int loglev_char; 734 /* Follow the token with the time */
730 char tbuf[50], *tp; 735 char tbuf[50], *tp;
731 unsigned tlen; 736 unsigned tlen;
732 unsigned long long t; 737 unsigned long long t;
733 unsigned long nanosec_rem; 738 unsigned long nanosec_rem;
734 739
735 /*
736 * force the log level token to be
737 * before the time output.
738 */
739 if (p[0] == '<' && p[1] >='0' &&
740 p[1] <= '7' && p[2] == '>') {
741 loglev_char = p[1];
742 p += 3;
743 printed_len -= 3;
744 } else {
745 loglev_char = default_message_loglevel
746 + '0';
747 }
748 t = cpu_clock(printk_cpu); 740 t = cpu_clock(printk_cpu);
749 nanosec_rem = do_div(t, 1000000000); 741 nanosec_rem = do_div(t, 1000000000);
750 tlen = sprintf(tbuf, 742 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
751 "<%c>[%5lu.%06lu] ", 743 (unsigned long) t,
752 loglev_char, 744 nanosec_rem / 1000);
753 (unsigned long)t,
754 nanosec_rem/1000);
755 745
756 for (tp = tbuf; tp < tbuf + tlen; tp++) 746 for (tp = tbuf; tp < tbuf + tlen; tp++)
757 emit_log_char(*tp); 747 emit_log_char(*tp);
758 printed_len += tlen; 748 printed_len += tlen;
759 } else {
760 if (p[0] != '<' || p[1] < '0' ||
761 p[1] > '7' || p[2] != '>') {
762 emit_log_char('<');
763 emit_log_char(default_message_loglevel
764 + '0');
765 emit_log_char('>');
766 printed_len += 3;
767 }
768 } 749 }
769 log_level_unknown = 0; 750
770 if (!*p) 751 if (!*p)
771 break; 752 break;
772 } 753 }
754
773 emit_log_char(*p); 755 emit_log_char(*p);
774 if (*p == '\n') 756 if (*p == '\n')
775 log_level_unknown = 1; 757 new_text_line = 1;
776 } 758 }
777 759
778 /* 760 /*
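The reworked vprintk() loop above normalizes every line in the temporary buffer: it consumes an optional "<0>".."<7>" prefix, always re-emits a level token, and only then appends the timestamp. A compact user-space sketch of that token handling, with putchar standing in for emit_log_char(), the timestamp omitted, and a hard-coded default level; all of those are assumptions for illustration.

#include <stdio.h>

#define DEFAULT_LOGLEVEL 4   /* stand-in for default_message_loglevel */

/* Re-emit buf with a "<N>" token at the start of every line. */
static void emit_with_loglevel(const char *buf)
{
	int new_text_line = 1;
	int current_log_level = DEFAULT_LOGLEVEL;

	for (const char *p = buf; *p; p++) {
		if (new_text_line) {
			/* If a token is present, remember it and skip over it. */
			if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' && p[2] == '>') {
				current_log_level = p[1] - '0';
				p += 3;
			}
			/* Always output a token, even if the caller omitted one. */
			putchar('<');
			putchar(current_log_level + '0');
			putchar('>');
			new_text_line = 0;
			if (!*p)
				break;
		}
		putchar(*p);
		if (*p == '\n')
			new_text_line = 1;
	}
}

int main(void)
{
	emit_with_loglevel("<3>explicit level\nno level given\n");
	return 0;
}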
@@ -890,6 +872,7 @@ static int __init console_setup(char *str)
890 *s = 0; 872 *s = 0;
891 873
892 __add_preferred_console(buf, idx, options, brl_options); 874 __add_preferred_console(buf, idx, options, brl_options);
875 console_set_on_cmdline = 1;
893 return 1; 876 return 1;
894} 877}
895__setup("console=", console_setup); 878__setup("console=", console_setup);
@@ -1041,7 +1024,9 @@ void release_console_sem(void)
1041 _log_end = log_end; 1024 _log_end = log_end;
1042 con_start = log_end; /* Flush */ 1025 con_start = log_end; /* Flush */
1043 spin_unlock(&logbuf_lock); 1026 spin_unlock(&logbuf_lock);
1027 stop_critical_timings(); /* don't trace print latency */
1044 call_console_drivers(_con_start, _log_end); 1028 call_console_drivers(_con_start, _log_end);
1029 start_critical_timings();
1045 local_irq_restore(flags); 1030 local_irq_restore(flags);
1046 } 1031 }
1047 console_locked = 0; 1032 console_locked = 0;
@@ -1172,8 +1157,11 @@ void register_console(struct console *console)
1172 console->index = 0; 1157 console->index = 0;
1173 if (console->setup == NULL || 1158 if (console->setup == NULL ||
1174 console->setup(console, NULL) == 0) { 1159 console->setup(console, NULL) == 0) {
1175 console->flags |= CON_ENABLED | CON_CONSDEV; 1160 console->flags |= CON_ENABLED;
1176 preferred_console = 0; 1161 if (console->device) {
1162 console->flags |= CON_CONSDEV;
1163 preferred_console = 0;
1164 }
1177 } 1165 }
1178 } 1166 }
1179 1167
diff --git a/kernel/profile.c b/kernel/profile.c
index ae7ead82cbc9..58926411eb2a 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -252,7 +252,7 @@ static void profile_flip_buffers(void)
252 mutex_lock(&profile_flip_mutex); 252 mutex_lock(&profile_flip_mutex);
253 j = per_cpu(cpu_profile_flip, get_cpu()); 253 j = per_cpu(cpu_profile_flip, get_cpu());
254 put_cpu(); 254 put_cpu();
255 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 255 on_each_cpu(__profile_flip_buffers, NULL, 1);
256 for_each_online_cpu(cpu) { 256 for_each_online_cpu(cpu) {
257 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; 257 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
258 for (i = 0; i < NR_PROFILE_HIT; ++i) { 258 for (i = 0; i < NR_PROFILE_HIT; ++i) {
@@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void)
275 mutex_lock(&profile_flip_mutex); 275 mutex_lock(&profile_flip_mutex);
276 i = per_cpu(cpu_profile_flip, get_cpu()); 276 i = per_cpu(cpu_profile_flip, get_cpu());
277 put_cpu(); 277 put_cpu();
278 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 278 on_each_cpu(__profile_flip_buffers, NULL, 1);
279 for_each_online_cpu(cpu) { 279 for_each_online_cpu(cpu) {
280 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 280 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
281 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 281 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
@@ -558,7 +558,7 @@ static int __init create_hash_tables(void)
558out_cleanup: 558out_cleanup:
559 prof_on = 0; 559 prof_on = 0;
560 smp_mb(); 560 smp_mb();
561 on_each_cpu(profile_nop, NULL, 0, 1); 561 on_each_cpu(profile_nop, NULL, 1);
562 for_each_online_cpu(cpu) { 562 for_each_online_cpu(cpu) {
563 struct page *page; 563 struct page *page;
564 564
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6c19e94fd0a5..8392a9da6450 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
33 */ 33 */
34void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) 34void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
35{ 35{
36 BUG_ON(!list_empty(&child->ptrace_list)); 36 BUG_ON(!list_empty(&child->ptrace_entry));
37 if (child->parent == new_parent) 37 list_add(&child->ptrace_entry, &new_parent->ptraced);
38 return;
39 list_add(&child->ptrace_list, &child->parent->ptrace_children);
40 remove_parent(child);
41 child->parent = new_parent; 38 child->parent = new_parent;
42 add_parent(child);
43} 39}
44 40
45/* 41/*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 69 BUG_ON(!child->ptrace);
74 70
75 child->ptrace = 0; 71 child->ptrace = 0;
76 if (ptrace_reparented(child)) { 72 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_list); 73 list_del_init(&child->ptrace_entry);
78 remove_parent(child);
79 child->parent = child->real_parent;
80 add_parent(child);
81 }
82 74
83 if (task_is_traced(child)) 75 if (task_is_traced(child))
84 ptrace_untrace(child); 76 ptrace_untrace(child);
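The __ptrace_link()/__ptrace_unlink() rewrite above keeps tracees on a flat per-tracer ->ptraced list via ->ptrace_entry. A self-contained sketch of that bookkeeping, using a hypothetical minimal intrusive list in place of <linux/list.h> and heavily simplified task structs.

#include <stdio.h>

/* Minimal circular intrusive list, a stand-in for struct list_head. */
struct list_node { struct list_node *next, *prev; };

static void list_init(struct list_node *n) { n->next = n->prev = n; }

static void list_add_head(struct list_node *n, struct list_node *head)
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static void list_del_init_node(struct list_node *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	list_init(n);
}

struct task_sketch {
	const char *comm;
	struct task_sketch *parent, *real_parent;
	struct list_node ptraced;        /* tasks this task is tracing  */
	struct list_node ptrace_entry;   /* our link on a tracer's list */
};

/* Mirrors the new __ptrace_link(): hang the child on the tracer's list. */
static void ptrace_link_sketch(struct task_sketch *child, struct task_sketch *tracer)
{
	list_add_head(&child->ptrace_entry, &tracer->ptraced);
	child->parent = tracer;
}

/* Mirrors the new __ptrace_unlink(): back to the real parent. */
static void ptrace_unlink_sketch(struct task_sketch *child)
{
	child->parent = child->real_parent;
	list_del_init_node(&child->ptrace_entry);
}

int main(void)
{
	struct task_sketch init = { .comm = "init" }, gdb = { .comm = "gdb" },
			   child = { .comm = "victim", .real_parent = &init };
	list_init(&gdb.ptraced);
	list_init(&child.ptrace_entry);

	ptrace_link_sketch(&child, &gdb);
	printf("parent while traced: %s\n", child.parent->comm);
	ptrace_unlink_sketch(&child);
	printf("parent after detach: %s\n", child.parent->comm);
	return 0;
}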
@@ -121,7 +113,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
121 return ret; 113 return ret;
122} 114}
123 115
124int __ptrace_may_attach(struct task_struct *task) 116int __ptrace_may_access(struct task_struct *task, unsigned int mode)
125{ 117{
126 /* May we inspect the given task? 118 /* May we inspect the given task?
127 * This check is used both for attaching with ptrace 119 * This check is used both for attaching with ptrace
@@ -148,16 +140,16 @@ int __ptrace_may_attach(struct task_struct *task)
148 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
149 return -EPERM; 141 return -EPERM;
150 142
151 return security_ptrace(current, task); 143 return security_ptrace(current, task, mode);
152} 144}
153 145
154int ptrace_may_attach(struct task_struct *task) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
155{ 147{
156 int err; 148 int err;
157 task_lock(task); 149 task_lock(task);
158 err = __ptrace_may_attach(task); 150 err = __ptrace_may_access(task, mode);
159 task_unlock(task); 151 task_unlock(task);
160 return !err; 152 return (!err ? true : false);
161} 153}
162 154
163int ptrace_attach(struct task_struct *task) 155int ptrace_attach(struct task_struct *task)
@@ -195,7 +187,7 @@ repeat:
195 /* the same process cannot be attached many times */ 187 /* the same process cannot be attached many times */
196 if (task->ptrace & PT_PTRACED) 188 if (task->ptrace & PT_PTRACED)
197 goto bad; 189 goto bad;
198 retval = __ptrace_may_attach(task); 190 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
199 if (retval) 191 if (retval)
200 goto bad; 192 goto bad;
201 193
@@ -492,14 +484,34 @@ int ptrace_traceme(void)
492 /* 484 /*
493 * Are we already being traced? 485 * Are we already being traced?
494 */ 486 */
487repeat:
495 task_lock(current); 488 task_lock(current);
496 if (!(current->ptrace & PT_PTRACED)) { 489 if (!(current->ptrace & PT_PTRACED)) {
497 ret = security_ptrace(current->parent, current); 490 /*
491 * See ptrace_attach() comments about the locking here.
492 */
493 unsigned long flags;
494 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
495 task_unlock(current);
496 do {
497 cpu_relax();
498 } while (!write_can_lock(&tasklist_lock));
499 goto repeat;
500 }
501
502 ret = security_ptrace(current->parent, current,
503 PTRACE_MODE_ATTACH);
504
498 /* 505 /*
499 * Set the ptrace bit in the process ptrace flags. 506 * Set the ptrace bit in the process ptrace flags.
507 * Then link us on our parent's ptraced list.
500 */ 508 */
501 if (!ret) 509 if (!ret) {
502 current->ptrace |= PT_PTRACED; 510 current->ptrace |= PT_PTRACED;
511 __ptrace_link(current, current->real_parent);
512 }
513
514 write_unlock_irqrestore(&tasklist_lock, flags);
503 } 515 }
504 task_unlock(current); 516 task_unlock(current);
505 return ret; 517 return ret;
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..16eeeaa9d618 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -89,8 +89,22 @@ static void force_quiescent_state(struct rcu_data *rdp,
89 /* 89 /*
90 * Don't send IPI to itself. With irqs disabled, 90 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu. 91 * rdp->cpu is the current cpu.
92 *
93 * cpu_online_map is updated by the _cpu_down()
94 * using stop_machine_run(). Since we're in irqs disabled
 95 * section, stop_machine_run() is not executing, hence
96 * the cpu_online_map is stable.
97 *
98 * However, a cpu might have been offlined _just_ before
99 * we disabled irqs while entering here.
100 * And rcu subsystem might not yet have handled the CPU_DEAD
101 * notification, leading to the offlined cpu's bit
102 * being set in the rcp->cpumask.
103 *
104 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
105 * sending smp_reschedule() to an offlined CPU.
92 */ 106 */
93 cpumask = rcp->cpumask; 107 cpus_and(cpumask, rcp->cpumask, cpu_online_map);
94 cpu_clear(rdp->cpu, cpumask); 108 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask) 109 for_each_cpu_mask(cpu, cpumask)
96 smp_send_reschedule(cpu); 110 smp_send_reschedule(cpu);
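The force_quiescent_state() hunk above ANDs rcp->cpumask with cpu_online_map before any IPIs go out, so a CPU that was offlined just before irqs were disabled is never targeted. A hedged sketch of that masking with plain bit operations; the bitmask type and send_ipi_sketch() are illustrative assumptions, not kernel API.

#include <stdio.h>

typedef unsigned long cpumask_sketch;   /* one bit per CPU, up to 64 CPUs */

static void send_ipi_sketch(int cpu)
{
	printf("smp_send_reschedule(%d)\n", cpu);
}

/* Only IPI CPUs that still need a quiescent state AND are still online. */
static void force_qs_sketch(cpumask_sketch need_qs, cpumask_sketch online, int self)
{
	cpumask_sketch mask = need_qs & online;     /* cpus_and(...)       */

	mask &= ~(1UL << self);                     /* cpu_clear(rdp->cpu) */
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1UL << cpu))
			send_ipi_sketch(cpu);
}

int main(void)
{
	/* CPUs 0-3 still owe a quiescent state, but CPU 3 just went offline. */
	force_qs_sketch(0xfUL, 0x7UL, /* self = */ 0);
	return 0;
}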
@@ -373,6 +387,10 @@ static void __rcu_offline_cpu(struct rcu_data *this_rdp,
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); 389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen;
393 local_irq_enable();
376} 394}
377 395
378static void rcu_offline_cpu(int cpu) 396static void rcu_offline_cpu(int cpu)
@@ -502,10 +520,38 @@ void rcu_check_callbacks(int cpu, int user)
502 if (user || 520 if (user ||
503 (idle_cpu(cpu) && !in_softirq() && 521 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 522 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
523
524 /*
525 * Get here if this CPU took its interrupt from user
526 * mode or from the idle loop, and if this is not a
527 * nested interrupt. In this case, the CPU is in
528 * a quiescent state, so count it.
529 *
530 * Also do a memory barrier. This is needed to handle
531 * the case where writes from a preempt-disable section
532 * of code get reordered into schedule() by this CPU's
533 * write buffer. The memory barrier makes sure that
 534 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
535 * by other CPUs to happen after any such write.
536 */
537
538 smp_mb(); /* See above block comment. */
505 rcu_qsctr_inc(cpu); 539 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu); 540 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq()) 541
542 } else if (!in_softirq()) {
543
544 /*
545 * Get here if this CPU did not take its interrupt from
546 * softirq, in other words, if it is not interrupting
547 * a rcu_bh read-side critical section. This is an _bh
548 * critical section, so count it. The memory barrier
549 * is needed for the same reason as is the above one.
550 */
551
552 smp_mb(); /* See above block comment. */
508 rcu_bh_qsctr_inc(cpu); 553 rcu_bh_qsctr_inc(cpu);
554 }
509 raise_rcu_softirq(); 555 raise_rcu_softirq();
510} 556}
511 557
@@ -529,7 +575,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
529 575
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); 576 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); 577 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); 578 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
533} 579}
534 580
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 581static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..f14f372cf6f5 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,16 +39,16 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <asm/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/completion.h>
43#include <linux/percpu.h> 42#include <linux/percpu.h>
44#include <linux/notifier.h> 43#include <linux/notifier.h>
45#include <linux/cpu.h> 44#include <linux/cpu.h>
46#include <linux/mutex.h> 45#include <linux/mutex.h>
47#include <linux/module.h> 46#include <linux/module.h>
48 47
49struct rcu_synchronize { 48enum rcu_barrier {
50 struct rcu_head head; 49 RCU_BARRIER_STD,
51 struct completion completion; 50 RCU_BARRIER_BH,
51 RCU_BARRIER_SCHED,
52}; 52};
53 53
54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion;
60 * Awaken the corresponding synchronize_rcu() instance now that a 60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed. 61 * grace period has elapsed.
62 */ 62 */
63static void wakeme_after_rcu(struct rcu_head *head) 63void wakeme_after_rcu(struct rcu_head *head)
64{ 64{
65 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
66 66
@@ -77,17 +77,7 @@ static void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void) 80synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81{
82 struct rcu_synchronize rcu;
83
84 init_completion(&rcu.completion);
85 /* Will wake me after RCU finished */
86 call_rcu(&rcu.head, wakeme_after_rcu);
87
88 /* Wait for it */
89 wait_for_completion(&rcu.completion);
90}
91EXPORT_SYMBOL_GPL(synchronize_rcu); 81EXPORT_SYMBOL_GPL(synchronize_rcu);
92 82
93static void rcu_barrier_callback(struct rcu_head *notused) 83static void rcu_barrier_callback(struct rcu_head *notused)
@@ -99,19 +89,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
99/* 89/*
100 * Called with preemption disabled, and from cross-cpu IRQ context. 90 * Called with preemption disabled, and from cross-cpu IRQ context.
101 */ 91 */
102static void rcu_barrier_func(void *notused) 92static void rcu_barrier_func(void *type)
103{ 93{
104 int cpu = smp_processor_id(); 94 int cpu = smp_processor_id();
105 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 95 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
106 96
107 atomic_inc(&rcu_barrier_cpu_count); 97 atomic_inc(&rcu_barrier_cpu_count);
108 call_rcu(head, rcu_barrier_callback); 98 switch ((enum rcu_barrier)type) {
99 case RCU_BARRIER_STD:
100 call_rcu(head, rcu_barrier_callback);
101 break;
102 case RCU_BARRIER_BH:
103 call_rcu_bh(head, rcu_barrier_callback);
104 break;
105 case RCU_BARRIER_SCHED:
106 call_rcu_sched(head, rcu_barrier_callback);
107 break;
108 }
109} 109}
110 110
111/** 111/*
112 * rcu_barrier - Wait until all the in-flight RCUs are complete. 112 * Orchestrate the specified type of RCU barrier, waiting for all
113 * RCU callbacks of the specified type to complete.
113 */ 114 */
114void rcu_barrier(void) 115static void _rcu_barrier(enum rcu_barrier type)
115{ 116{
116 BUG_ON(in_interrupt()); 117 BUG_ON(in_interrupt());
117 /* Take cpucontrol mutex to protect against CPU hotplug */ 118 /* Take cpucontrol mutex to protect against CPU hotplug */
@@ -127,13 +128,39 @@ void rcu_barrier(void)
127 * until all the callbacks are queued. 128 * until all the callbacks are queued.
128 */ 129 */
129 rcu_read_lock(); 130 rcu_read_lock();
130 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 131 on_each_cpu(rcu_barrier_func, (void *)type, 1);
131 rcu_read_unlock(); 132 rcu_read_unlock();
132 wait_for_completion(&rcu_barrier_completion); 133 wait_for_completion(&rcu_barrier_completion);
133 mutex_unlock(&rcu_barrier_mutex); 134 mutex_unlock(&rcu_barrier_mutex);
134} 135}
136
137/**
138 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
139 */
140void rcu_barrier(void)
141{
142 _rcu_barrier(RCU_BARRIER_STD);
143}
135EXPORT_SYMBOL_GPL(rcu_barrier); 144EXPORT_SYMBOL_GPL(rcu_barrier);
136 145
146/**
147 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
148 */
149void rcu_barrier_bh(void)
150{
151 _rcu_barrier(RCU_BARRIER_BH);
152}
153EXPORT_SYMBOL_GPL(rcu_barrier_bh);
154
155/**
156 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
157 */
158void rcu_barrier_sched(void)
159{
160 _rcu_barrier(RCU_BARRIER_SCHED);
161}
162EXPORT_SYMBOL_GPL(rcu_barrier_sched);
163
137void __init rcu_init(void) 164void __init rcu_init(void)
138{ 165{
139 __rcu_init(); 166 __rcu_init();
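The rcupdate.c changes above fold rcu_barrier(), rcu_barrier_bh() and rcu_barrier_sched() into one _rcu_barrier(type) helper that dispatches per CPU on an enum. A minimal sketch of that dispatch shape; the queue_* functions stand in for call_rcu(), call_rcu_bh() and call_rcu_sched(), and the completion/counting machinery is left out.

#include <stdio.h>

enum rcu_barrier_sketch { BARRIER_STD, BARRIER_BH, BARRIER_SCHED };

static void queue_std(void)   { printf("call_rcu(barrier cb)\n"); }
static void queue_bh(void)    { printf("call_rcu_bh(barrier cb)\n"); }
static void queue_sched(void) { printf("call_rcu_sched(barrier cb)\n"); }

/* Runs on each CPU: pick the flavor-specific enqueue based on the type. */
static void rcu_barrier_func_sketch(enum rcu_barrier_sketch type)
{
	switch (type) {
	case BARRIER_STD:
		queue_std();
		break;
	case BARRIER_BH:
		queue_bh();
		break;
	case BARRIER_SCHED:
		queue_sched();
		break;
	}
}

/* The public entry points stay trivial wrappers around one helper. */
static void rcu_barrier_sketch(void)       { rcu_barrier_func_sketch(BARRIER_STD); }
static void rcu_barrier_bh_sketch(void)    { rcu_barrier_func_sketch(BARRIER_BH); }
static void rcu_barrier_sched_sketch(void) { rcu_barrier_func_sketch(BARRIER_SCHED); }

int main(void)
{
	rcu_barrier_sketch();
	rcu_barrier_bh_sketch();
	rcu_barrier_sched_sketch();
	return 0;
}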
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index e1cdf196a515..6f62b77d93c4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,11 +46,11 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/bitops.h> 47#include <linux/bitops.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/kthread.h>
49#include <linux/completion.h> 50#include <linux/completion.h>
50#include <linux/moduleparam.h> 51#include <linux/moduleparam.h>
51#include <linux/percpu.h> 52#include <linux/percpu.h>
52#include <linux/notifier.h> 53#include <linux/notifier.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
@@ -82,14 +82,18 @@ struct rcu_data {
82 spinlock_t lock; /* Protect rcu_data fields. */ 82 spinlock_t lock; /* Protect rcu_data fields. */
83 long completed; /* Number of last completed batch. */ 83 long completed; /* Number of last completed batch. */
84 int waitlistcount; 84 int waitlistcount;
85 struct tasklet_struct rcu_tasklet;
86 struct rcu_head *nextlist; 85 struct rcu_head *nextlist;
87 struct rcu_head **nexttail; 86 struct rcu_head **nexttail;
88 struct rcu_head *waitlist[GP_STAGES]; 87 struct rcu_head *waitlist[GP_STAGES];
89 struct rcu_head **waittail[GP_STAGES]; 88 struct rcu_head **waittail[GP_STAGES];
90 struct rcu_head *donelist; 89 struct rcu_head *donelist; /* from waitlist & waitschedlist */
91 struct rcu_head **donetail; 90 struct rcu_head **donetail;
92 long rcu_flipctr[2]; 91 long rcu_flipctr[2];
92 struct rcu_head *nextschedlist;
93 struct rcu_head **nextschedtail;
94 struct rcu_head *waitschedlist;
95 struct rcu_head **waitschedtail;
96 int rcu_sched_sleeping;
93#ifdef CONFIG_RCU_TRACE 97#ifdef CONFIG_RCU_TRACE
94 struct rcupreempt_trace trace; 98 struct rcupreempt_trace trace;
95#endif /* #ifdef CONFIG_RCU_TRACE */ 99#endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +135,24 @@ enum rcu_try_flip_states {
131 rcu_try_flip_waitmb_state, 135 rcu_try_flip_waitmb_state,
132}; 136};
133 137
138/*
139 * States for rcu_ctrlblk.rcu_sched_sleep.
140 */
141
142enum rcu_sched_sleep_states {
143 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
144 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
145 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
146};
147
134struct rcu_ctrlblk { 148struct rcu_ctrlblk {
135 spinlock_t fliplock; /* Protect state-machine transitions. */ 149 spinlock_t fliplock; /* Protect state-machine transitions. */
136 long completed; /* Number of last completed batch. */ 150 long completed; /* Number of last completed batch. */
137 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of 151 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
138 the rcu state machine */ 152 the rcu state machine */
153 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
154 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
155 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
139}; 156};
140 157
141static DEFINE_PER_CPU(struct rcu_data, rcu_data); 158static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +160,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
143 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 160 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
144 .completed = 0, 161 .completed = 0,
145 .rcu_try_flip_state = rcu_try_flip_idle_state, 162 .rcu_try_flip_state = rcu_try_flip_idle_state,
163 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
164 .sched_sleep = rcu_sched_not_sleeping,
165 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
146}; 166};
147 167
168static struct task_struct *rcu_sched_grace_period_task;
148 169
149#ifdef CONFIG_RCU_TRACE 170#ifdef CONFIG_RCU_TRACE
150static char *rcu_try_flip_state_names[] = 171static char *rcu_try_flip_state_names[] =
@@ -207,6 +228,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
207 */ 228 */
208#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); 229#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
209 230
231#define RCU_SCHED_BATCH_TIME (HZ / 50)
232
210/* 233/*
211 * Return the number of RCU batches processed thus far. Useful 234 * Return the number of RCU batches processed thus far. Useful
212 * for debug and statistics. 235 * for debug and statistics.
@@ -217,8 +240,6 @@ long rcu_batches_completed(void)
217} 240}
218EXPORT_SYMBOL_GPL(rcu_batches_completed); 241EXPORT_SYMBOL_GPL(rcu_batches_completed);
219 242
220EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
221
222void __rcu_read_lock(void) 243void __rcu_read_lock(void)
223{ 244{
224 int idx; 245 int idx;
@@ -413,32 +434,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
413 } 434 }
414} 435}
415 436
416#ifdef CONFIG_NO_HZ 437DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
438 .dynticks = 1,
439};
417 440
418DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; 441#ifdef CONFIG_NO_HZ
419static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
420static DEFINE_PER_CPU(int, rcu_update_flag); 442static DEFINE_PER_CPU(int, rcu_update_flag);
421 443
422/** 444/**
423 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. 445 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
424 * 446 *
425 * If the CPU was idle with dynamic ticks active, this updates the 447 * If the CPU was idle with dynamic ticks active, this updates the
426 * dynticks_progress_counter to let the RCU handling know that the 448 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
427 * CPU is active. 449 * CPU is active.
428 */ 450 */
429void rcu_irq_enter(void) 451void rcu_irq_enter(void)
430{ 452{
431 int cpu = smp_processor_id(); 453 int cpu = smp_processor_id();
454 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
432 455
433 if (per_cpu(rcu_update_flag, cpu)) 456 if (per_cpu(rcu_update_flag, cpu))
434 per_cpu(rcu_update_flag, cpu)++; 457 per_cpu(rcu_update_flag, cpu)++;
435 458
436 /* 459 /*
437 * Only update if we are coming from a stopped ticks mode 460 * Only update if we are coming from a stopped ticks mode
438 * (dynticks_progress_counter is even). 461 * (rcu_dyntick_sched.dynticks is even).
439 */ 462 */
440 if (!in_interrupt() && 463 if (!in_interrupt() &&
441 (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { 464 (rdssp->dynticks & 0x1) == 0) {
442 /* 465 /*
443 * The following might seem like we could have a race 466 * The following might seem like we could have a race
444 * with NMI/SMIs. But this really isn't a problem. 467 * with NMI/SMIs. But this really isn't a problem.
@@ -461,12 +484,12 @@ void rcu_irq_enter(void)
461 * RCU read-side critical sections on this CPU would 484 * RCU read-side critical sections on this CPU would
462 * have already completed. 485 * have already completed.
463 */ 486 */
464 per_cpu(dynticks_progress_counter, cpu)++; 487 rdssp->dynticks++;
465 /* 488 /*
466 * The following memory barrier ensures that any 489 * The following memory barrier ensures that any
467 * rcu_read_lock() primitives in the irq handler 490 * rcu_read_lock() primitives in the irq handler
468 * are seen by other CPUs to follow the above 491 * are seen by other CPUs to follow the above
469 * increment to dynticks_progress_counter. This is 492 * increment to rcu_dyntick_sched.dynticks. This is
470 * required in order for other CPUs to correctly 493 * required in order for other CPUs to correctly
471 * determine when it is safe to advance the RCU 494 * determine when it is safe to advance the RCU
472 * grace-period state machine. 495 * grace-period state machine.
@@ -474,7 +497,7 @@ void rcu_irq_enter(void)
474 smp_mb(); /* see above block comment. */ 497 smp_mb(); /* see above block comment. */
475 /* 498 /*
476 * Since we can't determine the dynamic tick mode from 499 * Since we can't determine the dynamic tick mode from
477 * the dynticks_progress_counter after this routine, 500 * the rcu_dyntick_sched.dynticks after this routine,
478 * we use a second flag to acknowledge that we came 501 * we use a second flag to acknowledge that we came
479 * from an idle state with ticks stopped. 502 * from an idle state with ticks stopped.
480 */ 503 */
@@ -482,7 +505,7 @@ void rcu_irq_enter(void)
482 /* 505 /*
483 * If we take an NMI/SMI now, they will also increment 506 * If we take an NMI/SMI now, they will also increment
484 * the rcu_update_flag, and will not update the 507 * the rcu_update_flag, and will not update the
485 * dynticks_progress_counter on exit. That is for 508 * rcu_dyntick_sched.dynticks on exit. That is for
486 * this IRQ to do. 509 * this IRQ to do.
487 */ 510 */
488 } 511 }
@@ -492,12 +515,13 @@ void rcu_irq_enter(void)
492 * rcu_irq_exit - Called from exiting Hard irq context. 515 * rcu_irq_exit - Called from exiting Hard irq context.
493 * 516 *
494 * If the CPU was idle with dynamic ticks active, update the 517 * If the CPU was idle with dynamic ticks active, update the
 495 * dynticks_progress_counter to put let the RCU handling be 518 * rcu_dyntick_sched.dynticks to let the RCU handling be
496 * aware that the CPU is going back to idle with no ticks. 519 * aware that the CPU is going back to idle with no ticks.
497 */ 520 */
498void rcu_irq_exit(void) 521void rcu_irq_exit(void)
499{ 522{
500 int cpu = smp_processor_id(); 523 int cpu = smp_processor_id();
524 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
501 525
502 /* 526 /*
503 * rcu_update_flag is set if we interrupted the CPU 527 * rcu_update_flag is set if we interrupted the CPU
@@ -505,7 +529,7 @@ void rcu_irq_exit(void)
505 * Once this occurs, we keep track of interrupt nesting 529 * Once this occurs, we keep track of interrupt nesting
506 * because a NMI/SMI could also come in, and we still 530 * because a NMI/SMI could also come in, and we still
507 * only want the IRQ that started the increment of the 531 * only want the IRQ that started the increment of the
508 * dynticks_progress_counter to be the one that modifies 532 * rcu_dyntick_sched.dynticks to be the one that modifies
509 * it on exit. 533 * it on exit.
510 */ 534 */
511 if (per_cpu(rcu_update_flag, cpu)) { 535 if (per_cpu(rcu_update_flag, cpu)) {
@@ -517,28 +541,29 @@ void rcu_irq_exit(void)
517 541
518 /* 542 /*
519 * If an NMI/SMI happens now we are still 543 * If an NMI/SMI happens now we are still
520 * protected by the dynticks_progress_counter being odd. 544 * protected by the rcu_dyntick_sched.dynticks being odd.
521 */ 545 */
522 546
523 /* 547 /*
524 * The following memory barrier ensures that any 548 * The following memory barrier ensures that any
525 * rcu_read_unlock() primitives in the irq handler 549 * rcu_read_unlock() primitives in the irq handler
526 * are seen by other CPUs to preceed the following 550 * are seen by other CPUs to preceed the following
527 * increment to dynticks_progress_counter. This 551 * increment to rcu_dyntick_sched.dynticks. This
528 * is required in order for other CPUs to determine 552 * is required in order for other CPUs to determine
529 * when it is safe to advance the RCU grace-period 553 * when it is safe to advance the RCU grace-period
530 * state machine. 554 * state machine.
531 */ 555 */
532 smp_mb(); /* see above block comment. */ 556 smp_mb(); /* see above block comment. */
533 per_cpu(dynticks_progress_counter, cpu)++; 557 rdssp->dynticks++;
534 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); 558 WARN_ON(rdssp->dynticks & 0x1);
535 } 559 }
536} 560}
537 561
538static void dyntick_save_progress_counter(int cpu) 562static void dyntick_save_progress_counter(int cpu)
539{ 563{
540 per_cpu(rcu_dyntick_snapshot, cpu) = 564 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
541 per_cpu(dynticks_progress_counter, cpu); 565
566 rdssp->dynticks_snap = rdssp->dynticks;
542} 567}
543 568
544static inline int 569static inline int
@@ -546,9 +571,10 @@ rcu_try_flip_waitack_needed(int cpu)
546{ 571{
547 long curr; 572 long curr;
548 long snap; 573 long snap;
574 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
549 575
550 curr = per_cpu(dynticks_progress_counter, cpu); 576 curr = rdssp->dynticks;
551 snap = per_cpu(rcu_dyntick_snapshot, cpu); 577 snap = rdssp->dynticks_snap;
552 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 578 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
553 579
554 /* 580 /*
@@ -569,7 +595,7 @@ rcu_try_flip_waitack_needed(int cpu)
569 * that this CPU already acknowledged the counter. 595 * that this CPU already acknowledged the counter.
570 */ 596 */
571 597
572 if ((curr - snap) > 2 || (snap & 0x1) == 0) 598 if ((curr - snap) > 2 || (curr & 0x1) == 0)
573 return 0; 599 return 0;
574 600
575 /* We need this CPU to explicitly acknowledge the counter flip. */ 601 /* We need this CPU to explicitly acknowledge the counter flip. */
@@ -582,9 +608,10 @@ rcu_try_flip_waitmb_needed(int cpu)
582{ 608{
583 long curr; 609 long curr;
584 long snap; 610 long snap;
611 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
585 612
586 curr = per_cpu(dynticks_progress_counter, cpu); 613 curr = rdssp->dynticks;
587 snap = per_cpu(rcu_dyntick_snapshot, cpu); 614 snap = rdssp->dynticks_snap;
588 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 615 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
589 616
590 /* 617 /*
@@ -611,14 +638,86 @@ rcu_try_flip_waitmb_needed(int cpu)
611 return 1; 638 return 1;
612} 639}
613 640
641static void dyntick_save_progress_counter_sched(int cpu)
642{
643 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
644
645 rdssp->sched_dynticks_snap = rdssp->dynticks;
646}
647
648static int rcu_qsctr_inc_needed_dyntick(int cpu)
649{
650 long curr;
651 long snap;
652 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
653
654 curr = rdssp->dynticks;
655 snap = rdssp->sched_dynticks_snap;
656 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
657
658 /*
659 * If the CPU remained in dynticks mode for the entire time
660 * and didn't take any interrupts, NMIs, SMIs, or whatever,
661 * then it cannot be in the middle of an rcu_read_lock(), so
662 * the next rcu_read_lock() it executes must use the new value
663 * of the counter. Therefore, this CPU has been in a quiescent
664 * state the entire time, and we don't need to wait for it.
665 */
666
667 if ((curr == snap) && ((curr & 0x1) == 0))
668 return 0;
669
670 /*
671 * If the CPU passed through or entered a dynticks idle phase with
672 * no active irq handlers, then, as above, this CPU has already
673 * passed through a quiescent state.
674 */
675
676 if ((curr - snap) > 2 || (snap & 0x1) == 0)
677 return 0;
678
679 /* We need this CPU to go through a quiescent state. */
680
681 return 1;
682}
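rcu_qsctr_inc_needed_dyntick() above, together with the plain rcu_qsctr_inc_needed() added just below, decides from counter snapshots alone whether a CPU still owes a quiescent state. A compact sketch of those two tests, with plain longs standing in for the per-CPU rcu_dyntick_sched fields; illustration only.

#include <stdio.h>

/* dynticks is even while idle, odd while in irq/process context. */
static int needs_qs_dyntick(long curr, long snap)
{
	if (curr == snap && (curr & 0x1) == 0)
		return 0;   /* stayed in dynticks-idle the whole time    */
	if (curr - snap > 2 || (snap & 0x1) == 0)
		return 0;   /* passed through an idle period => quiesced */
	return 1;           /* still need this CPU to reach a QS         */
}

/* sched_qs is bumped on every context switch (rcu_qsctr_inc()). */
static int needs_qs_counter(long sched_qs, long sched_qs_snap)
{
	return sched_qs == sched_qs_snap;   /* unchanged => still waiting */
}

int main(void)
{
	printf("%d\n", needs_qs_dyntick(8, 8));   /* 0: idle all along     */
	printf("%d\n", needs_qs_dyntick(9, 9));   /* 1: busy, no idle seen */
	printf("%d\n", needs_qs_counter(5, 4));   /* 0: a switch happened  */
	return 0;
}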
683
614#else /* !CONFIG_NO_HZ */ 684#else /* !CONFIG_NO_HZ */
615 685
616# define dyntick_save_progress_counter(cpu) do { } while (0) 686# define dyntick_save_progress_counter(cpu) do { } while (0)
617# define rcu_try_flip_waitack_needed(cpu) (1) 687# define rcu_try_flip_waitack_needed(cpu) (1)
618# define rcu_try_flip_waitmb_needed(cpu) (1) 688# define rcu_try_flip_waitmb_needed(cpu) (1)
689
690# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
691# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
619 692
620#endif /* CONFIG_NO_HZ */ 693#endif /* CONFIG_NO_HZ */
621 694
695static void save_qsctr_sched(int cpu)
696{
697 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
698
699 rdssp->sched_qs_snap = rdssp->sched_qs;
700}
701
702static inline int rcu_qsctr_inc_needed(int cpu)
703{
704 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
705
706 /*
707 * If there has been a quiescent state, no more need to wait
708 * on this CPU.
709 */
710
711 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
712 smp_mb(); /* force ordering with cpu entering schedule(). */
713 return 0;
714 }
715
716 /* We need this CPU to go through a quiescent state. */
717
718 return 1;
719}
720
622/* 721/*
623 * Get here when RCU is idle. Decide whether we need to 722 * Get here when RCU is idle. Decide whether we need to
624 * move out of idle state, and return non-zero if so. 723 * move out of idle state, and return non-zero if so.
@@ -821,6 +920,26 @@ void rcu_check_callbacks(int cpu, int user)
821 unsigned long flags; 920 unsigned long flags;
822 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 921 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
823 922
923 /*
924 * If this CPU took its interrupt from user mode or from the
925 * idle loop, and this is not a nested interrupt, then
 926 * this CPU has to have exited all prior preempt-disable
927 * sections of code. So increment the counter to note this.
928 *
929 * The memory barrier is needed to handle the case where
930 * writes from a preempt-disable section of code get reordered
931 * into schedule() by this CPU's write buffer. So the memory
932 * barrier makes sure that the rcu_qsctr_inc() is seen by other
933 * CPUs to happen after any such write.
934 */
935
936 if (user ||
937 (idle_cpu(cpu) && !in_softirq() &&
938 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
939 smp_mb(); /* Guard against aggressive schedule(). */
940 rcu_qsctr_inc(cpu);
941 }
942
824 rcu_check_mb(cpu); 943 rcu_check_mb(cpu);
825 if (rcu_ctrlblk.completed == rdp->completed) 944 if (rcu_ctrlblk.completed == rdp->completed)
826 rcu_try_flip(); 945 rcu_try_flip();
@@ -871,6 +990,8 @@ void rcu_offline_cpu(int cpu)
871 struct rcu_head *list = NULL; 990 struct rcu_head *list = NULL;
872 unsigned long flags; 991 unsigned long flags;
873 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 992 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
993 struct rcu_head *schedlist = NULL;
994 struct rcu_head **schedtail = &schedlist;
874 struct rcu_head **tail = &list; 995 struct rcu_head **tail = &list;
875 996
876 /* 997 /*
@@ -884,6 +1005,11 @@ void rcu_offline_cpu(int cpu)
884 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], 1005 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
885 list, tail); 1006 list, tail);
886 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); 1007 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1008 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1009 schedlist, schedtail);
1010 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1011 schedlist, schedtail);
1012 rdp->rcu_sched_sleeping = 0;
887 spin_unlock_irqrestore(&rdp->lock, flags); 1013 spin_unlock_irqrestore(&rdp->lock, flags);
888 rdp->waitlistcount = 0; 1014 rdp->waitlistcount = 0;
889 1015
@@ -918,36 +1044,50 @@ void rcu_offline_cpu(int cpu)
918 * fix. 1044 * fix.
919 */ 1045 */
920 1046
921 local_irq_save(flags); 1047 local_irq_save(flags); /* disable preempt till we know what lock. */
922 rdp = RCU_DATA_ME(); 1048 rdp = RCU_DATA_ME();
923 spin_lock(&rdp->lock); 1049 spin_lock(&rdp->lock);
924 *rdp->nexttail = list; 1050 *rdp->nexttail = list;
925 if (list) 1051 if (list)
926 rdp->nexttail = tail; 1052 rdp->nexttail = tail;
1053 *rdp->nextschedtail = schedlist;
1054 if (schedlist)
1055 rdp->nextschedtail = schedtail;
927 spin_unlock_irqrestore(&rdp->lock, flags); 1056 spin_unlock_irqrestore(&rdp->lock, flags);
928} 1057}
929 1058
930void __devinit rcu_online_cpu(int cpu) 1059#else /* #ifdef CONFIG_HOTPLUG_CPU */
1060
1061void rcu_offline_cpu(int cpu)
1062{
1063}
1064
1065#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1066
1067void __cpuinit rcu_online_cpu(int cpu)
931{ 1068{
932 unsigned long flags; 1069 unsigned long flags;
1070 struct rcu_data *rdp;
933 1071
934 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1072 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
935 cpu_set(cpu, rcu_cpu_online_map); 1073 cpu_set(cpu, rcu_cpu_online_map);
936 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1074 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
937}
938
939#else /* #ifdef CONFIG_HOTPLUG_CPU */
940 1075
941void rcu_offline_cpu(int cpu) 1076 /*
942{ 1077 * The rcu_sched grace-period processing might have bypassed
943} 1078 * this CPU, given that it was not in the rcu_cpu_online_map
1079 * when the grace-period scan started. This means that the
1080 * grace-period task might sleep. So make sure that if this
1081 * should happen, the first callback posted to this CPU will
1082 * wake up the grace-period task if need be.
1083 */
944 1084
945void __devinit rcu_online_cpu(int cpu) 1085 rdp = RCU_DATA_CPU(cpu);
946{ 1086 spin_lock_irqsave(&rdp->lock, flags);
1087 rdp->rcu_sched_sleeping = 1;
1088 spin_unlock_irqrestore(&rdp->lock, flags);
947} 1089}
948 1090
949#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
950
951static void rcu_process_callbacks(struct softirq_action *unused) 1091static void rcu_process_callbacks(struct softirq_action *unused)
952{ 1092{
953 unsigned long flags; 1093 unsigned long flags;
@@ -988,31 +1128,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
988 *rdp->nexttail = head; 1128 *rdp->nexttail = head;
989 rdp->nexttail = &head->next; 1129 rdp->nexttail = &head->next;
990 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); 1130 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
991 spin_unlock(&rdp->lock); 1131 spin_unlock_irqrestore(&rdp->lock, flags);
992 local_irq_restore(flags);
993} 1132}
994EXPORT_SYMBOL_GPL(call_rcu); 1133EXPORT_SYMBOL_GPL(call_rcu);
995 1134
1135void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1136{
1137 unsigned long flags;
1138 struct rcu_data *rdp;
1139 int wake_gp = 0;
1140
1141 head->func = func;
1142 head->next = NULL;
1143 local_irq_save(flags);
1144 rdp = RCU_DATA_ME();
1145 spin_lock(&rdp->lock);
1146 *rdp->nextschedtail = head;
1147 rdp->nextschedtail = &head->next;
1148 if (rdp->rcu_sched_sleeping) {
1149
1150 /* Grace-period processing might be sleeping... */
1151
1152 rdp->rcu_sched_sleeping = 0;
1153 wake_gp = 1;
1154 }
1155 spin_unlock_irqrestore(&rdp->lock, flags);
1156 if (wake_gp) {
1157
1158 /* Wake up grace-period processing, unless someone beat us. */
1159
1160 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1161 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1162 wake_gp = 0;
1163 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1164 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1165 if (wake_gp)
1166 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1167 }
1168}
1169EXPORT_SYMBOL_GPL(call_rcu_sched);
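call_rcu_sched() above queues the callback and, only when this CPU last saw the grace-period kthread flag itself as possibly sleeping, re-checks the global sched_sleep state under schedlock before issuing a wakeup. A simplified single-threaded sketch of that handshake; the booleans, enum and printf stand in for the per-CPU and global state, and no real locking or kthread is involved.

#include <stdbool.h>
#include <stdio.h>

enum sleep_state { NOT_SLEEPING, SLEEP_PREP, SLEEPING };

static enum sleep_state sched_sleep = SLEEPING;  /* global (rcu_ctrlblk) */

/* Mirrors the wake-up decision in call_rcu_sched(). */
static void call_rcu_sched_sketch(bool *cpu_saw_gp_sleeping)
{
	bool wake_gp = false;

	/* ...the callback is queued on this CPU's nextschedlist here... */
	if (*cpu_saw_gp_sleeping) {
		*cpu_saw_gp_sleeping = false;   /* only the first caller wakes */
		wake_gp = true;
	}
	if (wake_gp) {
		/* Re-check under schedlock: someone may have beaten us. */
		if (sched_sleep != SLEEPING)
			wake_gp = false;
		sched_sleep = NOT_SLEEPING;
		if (wake_gp)
			printf("wake_up_interruptible(sched_wq)\n");
	}
}

int main(void)
{
	bool sleeping_hint = true;
	call_rcu_sched_sketch(&sleeping_hint);  /* wakes the kthread */
	call_rcu_sched_sketch(&sleeping_hint);  /* no second wakeup  */
	return 0;
}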
1170
996/* 1171/*
997 * Wait until all currently running preempt_disable() code segments 1172 * Wait until all currently running preempt_disable() code segments
998 * (including hardware-irq-disable segments) complete. Note that 1173 * (including hardware-irq-disable segments) complete. Note that
999 * in -rt this does -not- necessarily result in all currently executing 1174 * in -rt this does -not- necessarily result in all currently executing
1000 * interrupt -handlers- having completed. 1175 * interrupt -handlers- having completed.
1001 */ 1176 */
1002void __synchronize_sched(void) 1177synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
1178EXPORT_SYMBOL_GPL(__synchronize_sched);
1179
1180/*
1181 * kthread function that manages call_rcu_sched grace periods.
1182 */
1183static int rcu_sched_grace_period(void *arg)
1003{ 1184{
1004 cpumask_t oldmask; 1185 int couldsleep; /* might sleep after current pass. */
1186 int couldsleepnext = 0; /* might sleep after next pass. */
1005 int cpu; 1187 int cpu;
1188 unsigned long flags;
1189 struct rcu_data *rdp;
1190 int ret;
1006 1191
1007 if (sched_getaffinity(0, &oldmask) < 0) 1192 /*
1008 oldmask = cpu_possible_map; 1193 * Each pass through the following loop handles one
1009 for_each_online_cpu(cpu) { 1194 * rcu_sched grace period cycle.
1010 sched_setaffinity(0, &cpumask_of_cpu(cpu)); 1195 */
1011 schedule(); 1196 do {
1012 } 1197 /* Save each CPU's current state. */
1013 sched_setaffinity(0, &oldmask); 1198
1199 for_each_online_cpu(cpu) {
1200 dyntick_save_progress_counter_sched(cpu);
1201 save_qsctr_sched(cpu);
1202 }
1203
1204 /*
1205 * Sleep for about an RCU grace-period's worth to
1206 * allow better batching and to consume less CPU.
1207 */
1208 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1209
1210 /*
1211 * If there was nothing to do last time, prepare to
1212 * sleep at the end of the current grace period cycle.
1213 */
1214 couldsleep = couldsleepnext;
1215 couldsleepnext = 1;
1216 if (couldsleep) {
1217 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1218 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1219 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1220 }
1221
1222 /*
1223 * Wait on each CPU in turn to have either visited
1224 * a quiescent state or been in dynticks-idle mode.
1225 */
1226 for_each_online_cpu(cpu) {
1227 while (rcu_qsctr_inc_needed(cpu) &&
1228 rcu_qsctr_inc_needed_dyntick(cpu)) {
1229 /* resched_cpu(cpu); @@@ */
1230 schedule_timeout_interruptible(1);
1231 }
1232 }
1233
1234 /* Advance callbacks for each CPU. */
1235
1236 for_each_online_cpu(cpu) {
1237
1238 rdp = RCU_DATA_CPU(cpu);
1239 spin_lock_irqsave(&rdp->lock, flags);
1240
1241 /*
1242 * We are running on this CPU irq-disabled, so no
1243 * CPU can go offline until we re-enable irqs.
1244 * The current CPU might have already gone
1245 * offline (between the for_each_online_cpu and
1246 * the spin_lock_irqsave), but in that case all its
1247 * callback lists will be empty, so no harm done.
1248 *
1249 * Advance the callbacks! We share normal RCU's
1250 * donelist, since callbacks are invoked the
1251 * same way in either case.
1252 */
1253 if (rdp->waitschedlist != NULL) {
1254 *rdp->donetail = rdp->waitschedlist;
1255 rdp->donetail = rdp->waitschedtail;
1256
1257 /*
1258 * Next rcu_check_callbacks() will
1259 * do the required raise_softirq().
1260 */
1261 }
1262 if (rdp->nextschedlist != NULL) {
1263 rdp->waitschedlist = rdp->nextschedlist;
1264 rdp->waitschedtail = rdp->nextschedtail;
1265 couldsleep = 0;
1266 couldsleepnext = 0;
1267 } else {
1268 rdp->waitschedlist = NULL;
1269 rdp->waitschedtail = &rdp->waitschedlist;
1270 }
1271 rdp->nextschedlist = NULL;
1272 rdp->nextschedtail = &rdp->nextschedlist;
1273
1274 /* Mark sleep intention. */
1275
1276 rdp->rcu_sched_sleeping = couldsleep;
1277
1278 spin_unlock_irqrestore(&rdp->lock, flags);
1279 }
1280
1281 /* If we saw callbacks on the last scan, go deal with them. */
1282
1283 if (!couldsleep)
1284 continue;
1285
1286 /* Attempt to block... */
1287
1288 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1289 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1290
1291 /*
1292 * Someone posted a callback after we scanned.
1293 * Go take care of it.
1294 */
1295 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1296 couldsleepnext = 0;
1297 continue;
1298 }
1299
1300 /* Block until the next person posts a callback. */
1301
1302 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1303 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1304 ret = 0;
1305 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1306 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1307 ret);
1308
1309 /*
1310 * Signals would prevent us from sleeping, and we cannot
1311 * do much with them in any case. So flush them.
1312 */
1313 if (ret)
1314 flush_signals(current);
1315 couldsleepnext = 0;
1316
1317 } while (!kthread_should_stop());
1318
1319 return (0);
1014} 1320}
1015EXPORT_SYMBOL_GPL(__synchronize_sched);
1016 1321
1017/* 1322/*
1018 * Check to see if any future RCU-related work will need to be done 1323 * Check to see if any future RCU-related work will need to be done
@@ -1029,7 +1334,9 @@ int rcu_needs_cpu(int cpu)
1029 1334
1030 return (rdp->donelist != NULL || 1335 return (rdp->donelist != NULL ||
1031 !!rdp->waitlistcount || 1336 !!rdp->waitlistcount ||
1032 rdp->nextlist != NULL); 1337 rdp->nextlist != NULL ||
1338 rdp->nextschedlist != NULL ||
1339 rdp->waitschedlist != NULL);
1033} 1340}
1034 1341
1035int rcu_pending(int cpu) 1342int rcu_pending(int cpu)
@@ -1040,7 +1347,9 @@ int rcu_pending(int cpu)
1040 1347
1041 if (rdp->donelist != NULL || 1348 if (rdp->donelist != NULL ||
1042 !!rdp->waitlistcount || 1349 !!rdp->waitlistcount ||
1043 rdp->nextlist != NULL) 1350 rdp->nextlist != NULL ||
1351 rdp->nextschedlist != NULL ||
1352 rdp->waitschedlist != NULL)
1044 return 1; 1353 return 1;
1045 1354
1046 /* The RCU core needs an acknowledgement from this CPU. */ 1355 /* The RCU core needs an acknowledgement from this CPU. */
@@ -1107,6 +1416,11 @@ void __init __rcu_init(void)
1107 rdp->donetail = &rdp->donelist; 1416 rdp->donetail = &rdp->donelist;
1108 rdp->rcu_flipctr[0] = 0; 1417 rdp->rcu_flipctr[0] = 0;
1109 rdp->rcu_flipctr[1] = 0; 1418 rdp->rcu_flipctr[1] = 0;
1419 rdp->nextschedlist = NULL;
1420 rdp->nextschedtail = &rdp->nextschedlist;
1421 rdp->waitschedlist = NULL;
1422 rdp->waitschedtail = &rdp->waitschedlist;
1423 rdp->rcu_sched_sleeping = 0;
1110 } 1424 }
1111 register_cpu_notifier(&rcu_nb); 1425 register_cpu_notifier(&rcu_nb);
1112 1426
@@ -1125,15 +1439,19 @@ void __init __rcu_init(void)
1125 for_each_online_cpu(cpu) 1439 for_each_online_cpu(cpu)
1126 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); 1440 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1127 1441
1128 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); 1442 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1129} 1443}
1130 1444
1131/* 1445/*
1132 * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 1446 * Late-boot-time RCU initialization that must wait until after scheduler
1447 * has been initialized.
1133 */ 1448 */
1134void synchronize_kernel(void) 1449void __init rcu_init_sched(void)
1135{ 1450{
1136 synchronize_rcu(); 1451 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1452 NULL,
1453 "rcu_sched_grace_period");
1454 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1137} 1455}
1138 1456
1139#ifdef CONFIG_RCU_TRACE 1457#ifdef CONFIG_RCU_TRACE
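
The __synchronize_sched() rewrite above drops the old set-affinity-to-every-CPU trick and instead lets the generic synchronize_rcu_xxx() helper build the function on top of call_rcu_sched(). A minimal sketch of the queue-a-callback-and-wait pattern that helper generates is below; the struct and helper names (rcu_synchronize, wakeme_after_rcu) follow the rcupdate.c convention of this era and should be read as assumptions about the expansion, not the literal macro body.

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>

struct rcu_synchronize {
	struct rcu_head head;
	struct completion completion;
};

/* Callback run at the end of a grace period: wake the waiter. */
static void wakeme_after_rcu(struct rcu_head *head)
{
	struct rcu_synchronize *rcu;

	rcu = container_of(head, struct rcu_synchronize, head);
	complete(&rcu->completion);
}

void __synchronize_sched(void)
{
	struct rcu_synchronize rcu;

	init_completion(&rcu.completion);
	/* Queue a callback; rcu_sched_grace_period() invokes it once every
	 * CPU has either passed a quiescent state or sat in dynticks idle. */
	call_rcu_sched(&rcu.head, wakeme_after_rcu);
	wait_for_completion(&rcu.completion);
}
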
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..5edf82c34bbc 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
38#include <linux/moduleparam.h> 38#include <linux/moduleparam.h>
39#include <linux/percpu.h> 39#include <linux/percpu.h>
40#include <linux/notifier.h> 40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h> 41#include <linux/cpu.h>
43#include <linux/mutex.h> 42#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h> 43#include <linux/rcupreempt_trace.h>
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 33acc424667e..90b5b123f7a1 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -57,7 +57,9 @@ static int stat_interval; /* Interval between stats, in seconds. */
57 /* Defaults to "only at end of test". */ 57 /* Defaults to "only at end of test". */
58static int verbose; /* Print more debug info. */ 58static int verbose; /* Print more debug info. */
59static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 59static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
60static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 60static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
61static int stutter = 5; /* Start/stop testing interval (in sec) */
62static int irqreader = 1; /* RCU readers from irq (timers). */
61static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 63static char *torture_type = "rcu"; /* What RCU implementation to torture. */
62 64
63module_param(nreaders, int, 0444); 65module_param(nreaders, int, 0444);
@@ -72,6 +74,10 @@ module_param(test_no_idle_hz, bool, 0444);
72MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 74MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
73module_param(shuffle_interval, int, 0444); 75module_param(shuffle_interval, int, 0444);
74MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 76MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
77module_param(stutter, int, 0444);
78MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
79module_param(irqreader, int, 0444);
80MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
75module_param(torture_type, charp, 0444); 81module_param(torture_type, charp, 0444);
76MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 82MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
77 83
@@ -91,6 +97,7 @@ static struct task_struct **fakewriter_tasks;
91static struct task_struct **reader_tasks; 97static struct task_struct **reader_tasks;
92static struct task_struct *stats_task; 98static struct task_struct *stats_task;
93static struct task_struct *shuffler_task; 99static struct task_struct *shuffler_task;
100static struct task_struct *stutter_task;
94 101
95#define RCU_TORTURE_PIPE_LEN 10 102#define RCU_TORTURE_PIPE_LEN 10
96 103
@@ -117,8 +124,18 @@ static atomic_t n_rcu_torture_alloc_fail;
117static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
118static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
119static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0;
120static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
121 129
130static int stutter_pause_test = 0;
131
132#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
133#define RCUTORTURE_RUNNABLE_INIT 1
134#else
135#define RCUTORTURE_RUNNABLE_INIT 0
136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138
122/* 139/*
123 * Allocate an element from the rcu_tortures pool. 140 * Allocate an element from the rcu_tortures pool.
124 */ 141 */
@@ -179,6 +196,16 @@ rcu_random(struct rcu_random_state *rrsp)
179 return swahw32(rrsp->rrs_state); 196 return swahw32(rrsp->rrs_state);
180} 197}
181 198
199static void
200rcu_stutter_wait(void)
201{
202 while (stutter_pause_test || !rcutorture_runnable)
203 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1);
205 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ));
207}
208
182/* 209/*
183 * Operations vector for selecting different types of tests. 210 * Operations vector for selecting different types of tests.
184 */ 211 */
@@ -192,7 +219,9 @@ struct rcu_torture_ops {
192 int (*completed)(void); 219 int (*completed)(void);
193 void (*deferredfree)(struct rcu_torture *p); 220 void (*deferredfree)(struct rcu_torture *p);
194 void (*sync)(void); 221 void (*sync)(void);
222 void (*cb_barrier)(void);
195 int (*stats)(char *page); 223 int (*stats)(char *page);
224 int irqcapable;
196 char *name; 225 char *name;
197}; 226};
198static struct rcu_torture_ops *cur_ops = NULL; 227static struct rcu_torture_ops *cur_ops = NULL;
@@ -265,7 +294,9 @@ static struct rcu_torture_ops rcu_ops = {
265 .completed = rcu_torture_completed, 294 .completed = rcu_torture_completed,
266 .deferredfree = rcu_torture_deferred_free, 295 .deferredfree = rcu_torture_deferred_free,
267 .sync = synchronize_rcu, 296 .sync = synchronize_rcu,
297 .cb_barrier = rcu_barrier,
268 .stats = NULL, 298 .stats = NULL,
299 .irqcapable = 1,
269 .name = "rcu" 300 .name = "rcu"
270}; 301};
271 302
@@ -304,7 +335,9 @@ static struct rcu_torture_ops rcu_sync_ops = {
304 .completed = rcu_torture_completed, 335 .completed = rcu_torture_completed,
305 .deferredfree = rcu_sync_torture_deferred_free, 336 .deferredfree = rcu_sync_torture_deferred_free,
306 .sync = synchronize_rcu, 337 .sync = synchronize_rcu,
338 .cb_barrier = NULL,
307 .stats = NULL, 339 .stats = NULL,
340 .irqcapable = 1,
308 .name = "rcu_sync" 341 .name = "rcu_sync"
309}; 342};
310 343
@@ -364,7 +397,9 @@ static struct rcu_torture_ops rcu_bh_ops = {
364 .completed = rcu_bh_torture_completed, 397 .completed = rcu_bh_torture_completed,
365 .deferredfree = rcu_bh_torture_deferred_free, 398 .deferredfree = rcu_bh_torture_deferred_free,
366 .sync = rcu_bh_torture_synchronize, 399 .sync = rcu_bh_torture_synchronize,
400 .cb_barrier = rcu_barrier_bh,
367 .stats = NULL, 401 .stats = NULL,
402 .irqcapable = 1,
368 .name = "rcu_bh" 403 .name = "rcu_bh"
369}; 404};
370 405
@@ -377,7 +412,9 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
377 .completed = rcu_bh_torture_completed, 412 .completed = rcu_bh_torture_completed,
378 .deferredfree = rcu_sync_torture_deferred_free, 413 .deferredfree = rcu_sync_torture_deferred_free,
379 .sync = rcu_bh_torture_synchronize, 414 .sync = rcu_bh_torture_synchronize,
415 .cb_barrier = NULL,
380 .stats = NULL, 416 .stats = NULL,
417 .irqcapable = 1,
381 .name = "rcu_bh_sync" 418 .name = "rcu_bh_sync"
382}; 419};
383 420
@@ -458,6 +495,7 @@ static struct rcu_torture_ops srcu_ops = {
458 .completed = srcu_torture_completed, 495 .completed = srcu_torture_completed,
459 .deferredfree = rcu_sync_torture_deferred_free, 496 .deferredfree = rcu_sync_torture_deferred_free,
460 .sync = srcu_torture_synchronize, 497 .sync = srcu_torture_synchronize,
498 .cb_barrier = NULL,
461 .stats = srcu_torture_stats, 499 .stats = srcu_torture_stats,
462 .name = "srcu" 500 .name = "srcu"
463}; 501};
@@ -482,6 +520,11 @@ static int sched_torture_completed(void)
482 return 0; 520 return 0;
483} 521}
484 522
523static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
524{
525 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
526}
527
485static void sched_torture_synchronize(void) 528static void sched_torture_synchronize(void)
486{ 529{
487 synchronize_sched(); 530 synchronize_sched();
@@ -494,12 +537,28 @@ static struct rcu_torture_ops sched_ops = {
494 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 537 .readdelay = rcu_read_delay, /* just reuse rcu's version. */
495 .readunlock = sched_torture_read_unlock, 538 .readunlock = sched_torture_read_unlock,
496 .completed = sched_torture_completed, 539 .completed = sched_torture_completed,
497 .deferredfree = rcu_sync_torture_deferred_free, 540 .deferredfree = rcu_sched_torture_deferred_free,
498 .sync = sched_torture_synchronize, 541 .sync = sched_torture_synchronize,
542 .cb_barrier = rcu_barrier_sched,
499 .stats = NULL, 543 .stats = NULL,
544 .irqcapable = 1,
500 .name = "sched" 545 .name = "sched"
501}; 546};
502 547
548static struct rcu_torture_ops sched_ops_sync = {
549 .init = rcu_sync_torture_init,
550 .cleanup = NULL,
551 .readlock = sched_torture_read_lock,
552 .readdelay = rcu_read_delay, /* just reuse rcu's version. */
553 .readunlock = sched_torture_read_unlock,
554 .completed = sched_torture_completed,
555 .deferredfree = rcu_sync_torture_deferred_free,
556 .sync = sched_torture_synchronize,
557 .cb_barrier = NULL,
558 .stats = NULL,
559 .name = "sched_sync"
560};
561
503/* 562/*
504 * RCU torture writer kthread. Repeatedly substitutes a new structure 563 * RCU torture writer kthread. Repeatedly substitutes a new structure
505 * for that pointed to by rcu_torture_current, freeing the old structure 564 * for that pointed to by rcu_torture_current, freeing the old structure
@@ -537,6 +596,7 @@ rcu_torture_writer(void *arg)
537 } 596 }
538 rcu_torture_current_version++; 597 rcu_torture_current_version++;
539 oldbatch = cur_ops->completed(); 598 oldbatch = cur_ops->completed();
599 rcu_stutter_wait();
540 } while (!kthread_should_stop() && !fullstop); 600 } while (!kthread_should_stop() && !fullstop);
541 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
542 while (!kthread_should_stop()) 602 while (!kthread_should_stop())
@@ -560,6 +620,7 @@ rcu_torture_fakewriter(void *arg)
560 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 620 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
561 udelay(rcu_random(&rand) & 0x3ff); 621 udelay(rcu_random(&rand) & 0x3ff);
562 cur_ops->sync(); 622 cur_ops->sync();
623 rcu_stutter_wait();
563 } while (!kthread_should_stop() && !fullstop); 624 } while (!kthread_should_stop() && !fullstop);
564 625
565 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
@@ -569,6 +630,52 @@ rcu_torture_fakewriter(void *arg)
569} 630}
570 631
571/* 632/*
633 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
634 * incrementing the corresponding element of the pipeline array. The
635 * counter in the element should never be greater than 1, otherwise, the
636 * RCU implementation is broken.
637 */
638static void rcu_torture_timer(unsigned long unused)
639{
640 int idx;
641 int completed;
642 static DEFINE_RCU_RANDOM(rand);
643 static DEFINE_SPINLOCK(rand_lock);
644 struct rcu_torture *p;
645 int pipe_count;
646
647 idx = cur_ops->readlock();
648 completed = cur_ops->completed();
649 p = rcu_dereference(rcu_torture_current);
650 if (p == NULL) {
651 /* Leave because rcu_torture_writer is not yet underway */
652 cur_ops->readunlock(idx);
653 return;
654 }
655 if (p->rtort_mbtest == 0)
656 atomic_inc(&n_rcu_torture_mberror);
657 spin_lock(&rand_lock);
658 cur_ops->readdelay(&rand);
659 n_rcu_torture_timers++;
660 spin_unlock(&rand_lock);
661 preempt_disable();
662 pipe_count = p->rtort_pipe_count;
663 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
664 /* Should not happen, but... */
665 pipe_count = RCU_TORTURE_PIPE_LEN;
666 }
667 ++__get_cpu_var(rcu_torture_count)[pipe_count];
668 completed = cur_ops->completed() - completed;
669 if (completed > RCU_TORTURE_PIPE_LEN) {
670 /* Should not happen, but... */
671 completed = RCU_TORTURE_PIPE_LEN;
672 }
673 ++__get_cpu_var(rcu_torture_batch)[completed];
674 preempt_enable();
675 cur_ops->readunlock(idx);
676}
677
678/*
572 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, 679 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
573 * incrementing the corresponding element of the pipeline array. The 680 * incrementing the corresponding element of the pipeline array. The
574 * counter in the element should never be greater than 1, otherwise, the 681 * counter in the element should never be greater than 1, otherwise, the
@@ -582,11 +689,18 @@ rcu_torture_reader(void *arg)
582 DEFINE_RCU_RANDOM(rand); 689 DEFINE_RCU_RANDOM(rand);
583 struct rcu_torture *p; 690 struct rcu_torture *p;
584 int pipe_count; 691 int pipe_count;
692 struct timer_list t;
585 693
586 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 694 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
587 set_user_nice(current, 19); 695 set_user_nice(current, 19);
696 if (irqreader && cur_ops->irqcapable)
697 setup_timer_on_stack(&t, rcu_torture_timer, 0);
588 698
589 do { 699 do {
700 if (irqreader && cur_ops->irqcapable) {
701 if (!timer_pending(&t))
702 mod_timer(&t, 1);
703 }
590 idx = cur_ops->readlock(); 704 idx = cur_ops->readlock();
591 completed = cur_ops->completed(); 705 completed = cur_ops->completed();
592 p = rcu_dereference(rcu_torture_current); 706 p = rcu_dereference(rcu_torture_current);
@@ -615,8 +729,11 @@ rcu_torture_reader(void *arg)
615 preempt_enable(); 729 preempt_enable();
616 cur_ops->readunlock(idx); 730 cur_ops->readunlock(idx);
617 schedule(); 731 schedule();
732 rcu_stutter_wait();
618 } while (!kthread_should_stop() && !fullstop); 733 } while (!kthread_should_stop() && !fullstop);
619 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t);
620 while (!kthread_should_stop()) 737 while (!kthread_should_stop())
621 schedule_timeout_uninterruptible(1); 738 schedule_timeout_uninterruptible(1);
622 return 0; 739 return 0;
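
When irqreader is set and the flavor's irqcapable flag allows it, the reader kthread above also drives a reader from timer (softirq) context via an on-stack timer. A stripped-down sketch of that pattern follows; my_timer_fn and my_thread are illustrative names, not part of the patch, and the expiry arithmetic is simplified.

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/timer.h>

static void my_timer_fn(unsigned long unused)
{
	/* Runs in timer (softirq) context, like rcu_torture_timer() above. */
}

static int my_thread(void *arg)
{
	struct timer_list t;

	setup_timer_on_stack(&t, my_timer_fn, 0);
	while (!kthread_should_stop()) {
		if (!timer_pending(&t))
			mod_timer(&t, jiffies + 1);	/* re-arm once it has fired */
		schedule_timeout_interruptible(1);
	}
	del_timer_sync(&t);	/* never leave a pending timer on a dying stack */
	return 0;
}
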
@@ -647,20 +764,22 @@ rcu_torture_printk(char *page)
647 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 764 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
648 cnt += sprintf(&page[cnt], 765 cnt += sprintf(&page[cnt],
649 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 766 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
650 "rtmbe: %d", 767 "rtmbe: %d nt: %ld",
651 rcu_torture_current, 768 rcu_torture_current,
652 rcu_torture_current_version, 769 rcu_torture_current_version,
653 list_empty(&rcu_torture_freelist), 770 list_empty(&rcu_torture_freelist),
654 atomic_read(&n_rcu_torture_alloc), 771 atomic_read(&n_rcu_torture_alloc),
655 atomic_read(&n_rcu_torture_alloc_fail), 772 atomic_read(&n_rcu_torture_alloc_fail),
656 atomic_read(&n_rcu_torture_free), 773 atomic_read(&n_rcu_torture_free),
657 atomic_read(&n_rcu_torture_mberror)); 774 atomic_read(&n_rcu_torture_mberror),
775 n_rcu_torture_timers);
658 if (atomic_read(&n_rcu_torture_mberror) != 0) 776 if (atomic_read(&n_rcu_torture_mberror) != 0)
659 cnt += sprintf(&page[cnt], " !!!"); 777 cnt += sprintf(&page[cnt], " !!!");
660 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 778 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
661 if (i > 1) { 779 if (i > 1) {
662 cnt += sprintf(&page[cnt], "!!! "); 780 cnt += sprintf(&page[cnt], "!!! ");
663 atomic_inc(&n_rcu_torture_error); 781 atomic_inc(&n_rcu_torture_error);
782 WARN_ON_ONCE(1);
664 } 783 }
665 cnt += sprintf(&page[cnt], "Reader Pipe: "); 784 cnt += sprintf(&page[cnt], "Reader Pipe: ");
666 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 785 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -785,15 +904,34 @@ rcu_torture_shuffle(void *arg)
785 return 0; 904 return 0;
786} 905}
787 906
907/* Cause the rcutorture test to "stutter", starting and stopping all
908 * threads periodically.
909 */
910static int
911rcu_torture_stutter(void *arg)
912{
913 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
914 do {
915 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1;
917 if (!kthread_should_stop())
918 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0;
920 } while (!kthread_should_stop());
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0;
923}
924
788static inline void 925static inline void
789rcu_torture_print_module_parms(char *tag) 926rcu_torture_print_module_parms(char *tag)
790{ 927{
791 printk(KERN_ALERT "%s" TORTURE_FLAG 928 printk(KERN_ALERT "%s" TORTURE_FLAG
792 "--- %s: nreaders=%d nfakewriters=%d " 929 "--- %s: nreaders=%d nfakewriters=%d "
793 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 930 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
794 "shuffle_interval = %d\n", 931 "shuffle_interval=%d stutter=%d irqreader=%d\n",
795 torture_type, tag, nrealreaders, nfakewriters, 932 torture_type, tag, nrealreaders, nfakewriters,
796 stat_interval, verbose, test_no_idle_hz, shuffle_interval); 933 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
934 stutter, irqreader);
797} 935}
798 936
799static void 937static void
@@ -802,6 +940,11 @@ rcu_torture_cleanup(void)
802 int i; 940 int i;
803 941
804 fullstop = 1; 942 fullstop = 1;
943 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task);
946 }
947 stutter_task = NULL;
805 if (shuffler_task) { 948 if (shuffler_task) {
806 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 949 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
807 kthread_stop(shuffler_task); 950 kthread_stop(shuffler_task);
@@ -848,7 +991,9 @@ rcu_torture_cleanup(void)
848 stats_task = NULL; 991 stats_task = NULL;
849 992
850 /* Wait for all RCU callbacks to fire. */ 993 /* Wait for all RCU callbacks to fire. */
851 rcu_barrier(); 994
995 if (cur_ops->cb_barrier != NULL)
996 cur_ops->cb_barrier();
852 997
853 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 998 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
854 999
@@ -868,7 +1013,7 @@ rcu_torture_init(void)
868 int firsterr = 0; 1013 int firsterr = 0;
869 static struct rcu_torture_ops *torture_ops[] = 1014 static struct rcu_torture_ops *torture_ops[] =
870 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
871 &srcu_ops, &sched_ops, }; 1016 &srcu_ops, &sched_ops, &sched_ops_sync, };
872 1017
873 /* Process args and tell the world that the torturer is on the job. */ 1018 /* Process args and tell the world that the torturer is on the job. */
874 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -988,6 +1133,19 @@ rcu_torture_init(void)
988 goto unwind; 1133 goto unwind;
989 } 1134 }
990 } 1135 }
1136 if (stutter < 0)
1137 stutter = 0;
1138 if (stutter) {
1139 /* Create the stutter thread */
1140 stutter_task = kthread_run(rcu_torture_stutter, NULL,
1141 "rcu_torture_stutter");
1142 if (IS_ERR(stutter_task)) {
1143 firsterr = PTR_ERR(stutter_task);
1144 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
1145 stutter_task = NULL;
1146 goto unwind;
1147 }
1148 }
991 return 0; 1149 return 0;
992 1150
993unwind: 1151unwind:
diff --git a/kernel/relay.c b/kernel/relay.c
index bc24dcdc570f..7de644cdec43 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1191,7 +1191,7 @@ static ssize_t relay_file_splice_read(struct file *in,
1191 ret = 0; 1191 ret = 0;
1192 spliced = 0; 1192 spliced = 0;
1193 1193
1194 while (len) { 1194 while (len && !spliced) {
1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret); 1195 ret = subbuf_splice_actor(in, ppos, pipe, len, flags, &nonpad_ret);
1196 if (ret < 0) 1196 if (ret < 0)
1197 break; 1197 break;
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..99e6d850ecab 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -136,7 +139,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
136 139
137static inline int rt_policy(int policy) 140static inline int rt_policy(int policy)
138{ 141{
139 if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) 142 if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
140 return 1; 143 return 1;
141 return 0; 144 return 0;
142} 145}
@@ -289,15 +292,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 292static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 293/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 294static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 295#endif /* CONFIG_FAIR_GROUP_SCHED */
293 296
294#ifdef CONFIG_RT_GROUP_SCHED 297#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 300#endif /* CONFIG_RT_GROUP_SCHED */
298#else 301#else /* !CONFIG_FAIR_GROUP_SCHED */
299#define root_task_group init_task_group 302#define root_task_group init_task_group
300#endif 303#endif /* CONFIG_FAIR_GROUP_SCHED */
301 304
302/* task_group_lock serializes add/remove of task groups and also changes to 305/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 306 * a task group's cpu shares.
@@ -307,17 +310,20 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 310#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 311#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 312# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 313#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 314# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 315#endif /* CONFIG_USER_SCHED */
313 316
314/* 317/*
315 * A weight of 0, 1 or ULONG_MAX can cause arithmetic problems. 318 * A weight of 0 or 1 can cause arithmetic problems.
319 * The weight of a cfs_rq is the sum of the weights of the entities
320 * queued on it, so neither the weight of an entity nor the shares
321 * value of a task group should be too large.
316 * (The default weight is 1024 - so there's no practical 322 * (The default weight is 1024 - so there's no practical
317 * limitation from this.) 323 * limitation from this.)
318 */ 324 */
319#define MIN_SHARES 2 325#define MIN_SHARES 2
320#define MAX_SHARES (ULONG_MAX - 1) 326#define MAX_SHARES (1UL << 18)
321 327
322static int init_task_group_load = INIT_TASK_GROUP_LOAD; 328static int init_task_group_load = INIT_TASK_GROUP_LOAD;
323#endif 329#endif
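
As a rough bound check on the new MAX_SHARES (my numbers, not taken from the patch): a cfs_rq's load.weight is the sum of its entities' weights and feeds the roughly 2^32 fixed-point constant (WMULT_CONST) used by calc_delta_mine(), so each individual weight has to stay well below that.

	new MAX_SHARES      = 1 << 18  = 262144   (256 * NICE_0_LOAD)
	2^14 such entities  -> sum = 2^32         (only the extreme corner case)
	old MAX_SHARES      = ULONG_MAX - 1       (a single entity already at or
	                                           beyond 2^32 on 64-bit, and past
	                                           WMULT_CONST on 32-bit)
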
@@ -360,6 +366,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
360#else 366#else
361 367
362static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 368static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
369static inline struct task_group *task_group(struct task_struct *p)
370{
371 return NULL;
372}
363 373
364#endif /* CONFIG_GROUP_SCHED */ 374#endif /* CONFIG_GROUP_SCHED */
365 375
@@ -370,6 +380,7 @@ struct cfs_rq {
370 380
371 u64 exec_clock; 381 u64 exec_clock;
372 u64 min_vruntime; 382 u64 min_vruntime;
383 u64 pair_start;
373 384
374 struct rb_root tasks_timeline; 385 struct rb_root tasks_timeline;
375 struct rb_node *rb_leftmost; 386 struct rb_node *rb_leftmost;
@@ -400,40 +411,28 @@ struct cfs_rq {
400 struct task_group *tg; /* group that "owns" this runqueue */ 411 struct task_group *tg; /* group that "owns" this runqueue */
401 412
402#ifdef CONFIG_SMP 413#ifdef CONFIG_SMP
403 unsigned long task_weight;
404 unsigned long shares;
405 /* 414 /*
406 * We need space to build a sched_domain wide view of the full task 415 * the part of load.weight contributed by tasks
407 * group tree, in order to avoid depending on dynamic memory allocation
408 * during the load balancing we place this in the per cpu task group
409 * hierarchy. This limits the load balancing to one instance per cpu,
410 * but more should not be needed anyway.
411 */ 416 */
412 struct aggregate_struct { 417 unsigned long task_weight;
413 /*
414 * load = weight(cpus) * f(tg)
415 *
416 * Where f(tg) is the recursive weight fraction assigned to
417 * this group.
418 */
419 unsigned long load;
420 418
421 /* 419 /*
422 * part of the group weight distributed to this span. 420 * h_load = weight * f(tg)
423 */ 421 *
424 unsigned long shares; 422 * Where f(tg) is the recursive weight fraction assigned to
423 * this group.
424 */
425 unsigned long h_load;
425 426
426 /* 427 /*
427 * The sum of all runqueue weights within this span. 428 * this cpu's part of tg->shares
428 */ 429 */
429 unsigned long rq_weight; 430 unsigned long shares;
430 431
431 /* 432 /*
432 * Weight contributed by tasks; this is the part we can 433 * load.weight at the time we set shares
433 * influence by moving tasks around. 434 */
434 */ 435 unsigned long rq_weight;
435 unsigned long task_weight;
436 } aggregate;
437#endif 436#endif
438#endif 437#endif
439}; 438};
@@ -486,6 +485,9 @@ struct root_domain {
486 */ 485 */
487 cpumask_t rto_mask; 486 cpumask_t rto_mask;
488 atomic_t rto_count; 487 atomic_t rto_count;
488#ifdef CONFIG_SMP
489 struct cpupri cpupri;
490#endif
489}; 491};
490 492
491/* 493/*
@@ -560,6 +562,9 @@ struct rq {
560 int push_cpu; 562 int push_cpu;
561 /* cpu of this runqueue: */ 563 /* cpu of this runqueue: */
562 int cpu; 564 int cpu;
565 int online;
566
567 unsigned long avg_load_per_task;
563 568
564 struct task_struct *migration_thread; 569 struct task_struct *migration_thread;
565 struct list_head migration_queue; 570 struct list_head migration_queue;
@@ -641,6 +646,24 @@ static inline void update_rq_clock(struct rq *rq)
641# define const_debug static const 646# define const_debug static const
642#endif 647#endif
643 648
649/**
650 * runqueue_is_locked
651 *
652 * Returns true if the current cpu runqueue is locked.
653 * This interface allows printk to be called with the runqueue lock
654 * held and know whether or not it is OK to wake up the klogd.
655 */
656int runqueue_is_locked(void)
657{
658 int cpu = get_cpu();
659 struct rq *rq = cpu_rq(cpu);
660 int ret;
661
662 ret = spin_is_locked(&rq->lock);
663 put_cpu();
664 return ret;
665}
666
644/* 667/*
645 * Debugging: various feature bits 668 * Debugging: various feature bits
646 */ 669 */
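
runqueue_is_locked() is added for printk's benefit, per the kernel-doc above. A hypothetical caller might look like the sketch below; wake_up_klogd() is the existing printk wakeup helper, but the exact call site in kernel/printk.c is an assumption here, not something shown in this diff.

/* Hypothetical: skip the klogd wakeup while this CPU's runqueue lock is
 * held, so a printk() from scheduler internals cannot self-deadlock. */
static void try_wake_klogd(void)
{
	if (!runqueue_is_locked())
		wake_up_klogd();
}
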
@@ -783,6 +806,12 @@ late_initcall(sched_init_debug);
783const_debug unsigned int sysctl_sched_nr_migrate = 32; 806const_debug unsigned int sysctl_sched_nr_migrate = 32;
784 807
785/* 808/*
809 * ratelimit for updating the group shares.
810 * default: 0.5ms
811 */
812const_debug unsigned int sysctl_sched_shares_ratelimit = 500000;
813
814/*
786 * period over which we measure -rt task cpu usage in us. 815 * period over which we measure -rt task cpu usage in us.
787 * default: 1s 816 * default: 1s
788 */ 817 */
@@ -809,82 +838,6 @@ static inline u64 global_rt_runtime(void)
809 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
810} 839}
811 840
812unsigned long long time_sync_thresh = 100000;
813
814static DEFINE_PER_CPU(unsigned long long, time_offset);
815static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
816
817/*
818 * Global lock which we take every now and then to synchronize
819 * the CPUs time. This method is not warp-safe, but it's good
820 * enough to synchronize slowly diverging time sources and thus
821 * it's good enough for tracing:
822 */
823static DEFINE_SPINLOCK(time_sync_lock);
824static unsigned long long prev_global_time;
825
826static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
827{
828 /*
829 * We want this inlined, to not get tracer function calls
830 * in this critical section:
831 */
832 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
833 __raw_spin_lock(&time_sync_lock.raw_lock);
834
835 if (time < prev_global_time) {
836 per_cpu(time_offset, cpu) += prev_global_time - time;
837 time = prev_global_time;
838 } else {
839 prev_global_time = time;
840 }
841
842 __raw_spin_unlock(&time_sync_lock.raw_lock);
843 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
844
845 return time;
846}
847
848static unsigned long long __cpu_clock(int cpu)
849{
850 unsigned long long now;
851
852 /*
853 * Only call sched_clock() if the scheduler has already been
854 * initialized (some code might call cpu_clock() very early):
855 */
856 if (unlikely(!scheduler_running))
857 return 0;
858
859 now = sched_clock_cpu(cpu);
860
861 return now;
862}
863
864/*
865 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
866 * clock constructed from sched_clock():
867 */
868unsigned long long cpu_clock(int cpu)
869{
870 unsigned long long prev_cpu_time, time, delta_time;
871 unsigned long flags;
872
873 local_irq_save(flags);
874 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
875 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
876 delta_time = time-prev_cpu_time;
877
878 if (unlikely(delta_time > time_sync_thresh)) {
879 time = __sync_cpu_clock(time, cpu);
880 per_cpu(prev_cpu_time, cpu) = time;
881 }
882 local_irq_restore(flags);
883
884 return time;
885}
886EXPORT_SYMBOL_GPL(cpu_clock);
887
888#ifndef prepare_arch_switch 841#ifndef prepare_arch_switch
889# define prepare_arch_switch(next) do { } while (0) 842# define prepare_arch_switch(next) do { } while (0)
890#endif 843#endif
@@ -1161,6 +1114,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1161 return HRTIMER_NORESTART; 1114 return HRTIMER_NORESTART;
1162} 1115}
1163 1116
1117#ifdef CONFIG_SMP
1164static void hotplug_hrtick_disable(int cpu) 1118static void hotplug_hrtick_disable(int cpu)
1165{ 1119{
1166 struct rq *rq = cpu_rq(cpu); 1120 struct rq *rq = cpu_rq(cpu);
@@ -1216,6 +1170,7 @@ static void init_hrtick(void)
1216{ 1170{
1217 hotcpu_notifier(hotplug_hrtick, 0); 1171 hotcpu_notifier(hotplug_hrtick, 0);
1218} 1172}
1173#endif /* CONFIG_SMP */
1219 1174
1220static void init_rq_hrtick(struct rq *rq) 1175static void init_rq_hrtick(struct rq *rq)
1221{ 1176{
@@ -1345,15 +1300,15 @@ void wake_up_idle_cpu(int cpu)
1345 if (!tsk_is_polling(rq->idle)) 1300 if (!tsk_is_polling(rq->idle))
1346 smp_send_reschedule(cpu); 1301 smp_send_reschedule(cpu);
1347} 1302}
1348#endif 1303#endif /* CONFIG_NO_HZ */
1349 1304
1350#else 1305#else /* !CONFIG_SMP */
1351static void __resched_task(struct task_struct *p, int tif_bit) 1306static void __resched_task(struct task_struct *p, int tif_bit)
1352{ 1307{
1353 assert_spin_locked(&task_rq(p)->lock); 1308 assert_spin_locked(&task_rq(p)->lock);
1354 set_tsk_thread_flag(p, tif_bit); 1309 set_tsk_thread_flag(p, tif_bit);
1355} 1310}
1356#endif 1311#endif /* CONFIG_SMP */
1357 1312
1358#if BITS_PER_LONG == 32 1313#if BITS_PER_LONG == 32
1359# define WMULT_CONST (~0UL) 1314# define WMULT_CONST (~0UL)
@@ -1377,8 +1332,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1377{ 1332{
1378 u64 tmp; 1333 u64 tmp;
1379 1334
1380 if (!lw->inv_weight) 1335 if (!lw->inv_weight) {
1381 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1); 1336 if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
1337 lw->inv_weight = 1;
1338 else
1339 lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
1340 / (lw->weight+1);
1341 }
1382 1342
1383 tmp = (u64)delta_exec * weight; 1343 tmp = (u64)delta_exec * weight;
1384 /* 1344 /*
@@ -1503,63 +1463,35 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1503#ifdef CONFIG_SMP 1463#ifdef CONFIG_SMP
1504static unsigned long source_load(int cpu, int type); 1464static unsigned long source_load(int cpu, int type);
1505static unsigned long target_load(int cpu, int type); 1465static unsigned long target_load(int cpu, int type);
1506static unsigned long cpu_avg_load_per_task(int cpu);
1507static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1466static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1508 1467
1509#ifdef CONFIG_FAIR_GROUP_SCHED 1468static unsigned long cpu_avg_load_per_task(int cpu)
1469{
1470 struct rq *rq = cpu_rq(cpu);
1510 1471
1511/* 1472 if (rq->nr_running)
1512 * Group load balancing. 1473 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1513 *
1514 * We calculate a few balance domain wide aggregate numbers; load and weight.
1515 * Given the pictures below, and assuming each item has equal weight:
1516 *
1517 * root 1 - thread
1518 * / | \ A - group
1519 * A 1 B
1520 * /|\ / \
1521 * C 2 D 3 4
1522 * | |
1523 * 5 6
1524 *
1525 * load:
1526 * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1527 * which equals 1/9-th of the total load.
1528 *
1529 * shares:
1530 * The weight of this group on the selected cpus.
1531 *
1532 * rq_weight:
1533 * Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
1534 * B would get 2.
1535 *
1536 * task_weight:
1537 * Part of the rq_weight contributed by tasks; all groups except B would
1538 * get 1, B gets 2.
1539 */
1540 1474
1541static inline struct aggregate_struct * 1475 return rq->avg_load_per_task;
1542aggregate(struct task_group *tg, struct sched_domain *sd)
1543{
1544 return &tg->cfs_rq[sd->first_cpu]->aggregate;
1545} 1476}
1546 1477
1547typedef void (*aggregate_func)(struct task_group *, struct sched_domain *); 1478#ifdef CONFIG_FAIR_GROUP_SCHED
1479
1480typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1548 1481
1549/* 1482/*
1550 * Iterate the full tree, calling @down when first entering a node and @up when 1483 * Iterate the full tree, calling @down when first entering a node and @up when
1551 * leaving it for the final time. 1484 * leaving it for the final time.
1552 */ 1485 */
1553static 1486static void
1554void aggregate_walk_tree(aggregate_func down, aggregate_func up, 1487walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1555 struct sched_domain *sd)
1556{ 1488{
1557 struct task_group *parent, *child; 1489 struct task_group *parent, *child;
1558 1490
1559 rcu_read_lock(); 1491 rcu_read_lock();
1560 parent = &root_task_group; 1492 parent = &root_task_group;
1561down: 1493down:
1562 (*down)(parent, sd); 1494 (*down)(parent, cpu, sd);
1563 list_for_each_entry_rcu(child, &parent->children, siblings) { 1495 list_for_each_entry_rcu(child, &parent->children, siblings) {
1564 parent = child; 1496 parent = child;
1565 goto down; 1497 goto down;
@@ -1567,7 +1499,7 @@ down:
1567up: 1499up:
1568 continue; 1500 continue;
1569 } 1501 }
1570 (*up)(parent, sd); 1502 (*up)(parent, cpu, sd);
1571 1503
1572 child = parent; 1504 child = parent;
1573 parent = parent->parent; 1505 parent = parent->parent;
@@ -1576,90 +1508,23 @@ up:
1576 rcu_read_unlock(); 1508 rcu_read_unlock();
1577} 1509}
1578 1510
1579/*
1580 * Calculate the aggregate runqueue weight.
1581 */
1582static
1583void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1584{
1585 unsigned long rq_weight = 0;
1586 unsigned long task_weight = 0;
1587 int i;
1588
1589 for_each_cpu_mask(i, sd->span) {
1590 rq_weight += tg->cfs_rq[i]->load.weight;
1591 task_weight += tg->cfs_rq[i]->task_weight;
1592 }
1593
1594 aggregate(tg, sd)->rq_weight = rq_weight;
1595 aggregate(tg, sd)->task_weight = task_weight;
1596}
1597
1598/*
1599 * Compute the weight of this group on the given cpus.
1600 */
1601static
1602void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1603{
1604 unsigned long shares = 0;
1605 int i;
1606
1607 for_each_cpu_mask(i, sd->span)
1608 shares += tg->cfs_rq[i]->shares;
1609
1610 if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
1611 shares = tg->shares;
1612
1613 aggregate(tg, sd)->shares = shares;
1614}
1615
1616/*
1617 * Compute the load fraction assigned to this group, relies on the aggregate
1618 * weight and this group's parent's load, i.e. top-down.
1619 */
1620static
1621void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
1622{
1623 unsigned long load;
1624
1625 if (!tg->parent) {
1626 int i;
1627
1628 load = 0;
1629 for_each_cpu_mask(i, sd->span)
1630 load += cpu_rq(i)->load.weight;
1631
1632 } else {
1633 load = aggregate(tg->parent, sd)->load;
1634
1635 /*
1636 * shares is our weight in the parent's rq so
1637 * shares/parent->rq_weight gives our fraction of the load
1638 */
1639 load *= aggregate(tg, sd)->shares;
1640 load /= aggregate(tg->parent, sd)->rq_weight + 1;
1641 }
1642
1643 aggregate(tg, sd)->load = load;
1644}
1645
1646static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1511static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1647 1512
1648/* 1513/*
1649 * Calculate and set the cpu's group shares. 1514 * Calculate and set the cpu's group shares.
1650 */ 1515 */
1651static void 1516static void
1652__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd, 1517__update_group_shares_cpu(struct task_group *tg, int cpu,
1653 int tcpu) 1518 unsigned long sd_shares, unsigned long sd_rq_weight)
1654{ 1519{
1655 int boost = 0; 1520 int boost = 0;
1656 unsigned long shares; 1521 unsigned long shares;
1657 unsigned long rq_weight; 1522 unsigned long rq_weight;
1658 1523
1659 if (!tg->se[tcpu]) 1524 if (!tg->se[cpu])
1660 return; 1525 return;
1661 1526
1662 rq_weight = tg->cfs_rq[tcpu]->load.weight; 1527 rq_weight = tg->cfs_rq[cpu]->load.weight;
1663 1528
1664 /* 1529 /*
1665 * If there are currently no tasks on the cpu pretend there is one of 1530 * If there are currently no tasks on the cpu pretend there is one of
@@ -1671,170 +1536,139 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
1671 rq_weight = NICE_0_LOAD; 1536 rq_weight = NICE_0_LOAD;
1672 } 1537 }
1673 1538
1539 if (unlikely(rq_weight > sd_rq_weight))
1540 rq_weight = sd_rq_weight;
1541
1674 /* 1542 /*
1675 * \Sum shares * rq_weight 1543 * \Sum shares * rq_weight
1676 * shares = ----------------------- 1544 * shares = -----------------------
1677 * \Sum rq_weight 1545 * \Sum rq_weight
1678 * 1546 *
1679 */ 1547 */
1680 shares = aggregate(tg, sd)->shares * rq_weight; 1548 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1681 shares /= aggregate(tg, sd)->rq_weight + 1;
1682 1549
1683 /* 1550 /*
1684 * record the actual number of shares, not the boosted amount. 1551 * record the actual number of shares, not the boosted amount.
1685 */ 1552 */
1686 tg->cfs_rq[tcpu]->shares = boost ? 0 : shares; 1553 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1554 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1687 1555
1688 if (shares < MIN_SHARES) 1556 if (shares < MIN_SHARES)
1689 shares = MIN_SHARES; 1557 shares = MIN_SHARES;
1690 else if (shares > MAX_SHARES) 1558 else if (shares > MAX_SHARES)
1691 shares = MAX_SHARES; 1559 shares = MAX_SHARES;
1692 1560
1693 __set_se_shares(tg->se[tcpu], shares); 1561 __set_se_shares(tg->se[cpu], shares);
1694} 1562}
1695 1563
1696/* 1564/*
1697 * Re-adjust the weights on the cpu the task came from and on the cpu the 1565 * Re-compute the task group's per-cpu shares over the given domain.
1698 * task went to. 1566 * This needs to be done in a bottom-up fashion because the rq weight of a
1567 * parent group depends on the shares of its child groups.
1699 */ 1568 */
1700static void 1569static void
1701__move_group_shares(struct task_group *tg, struct sched_domain *sd, 1570tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1702 int scpu, int dcpu)
1703{ 1571{
1704 unsigned long shares; 1572 unsigned long rq_weight = 0;
1705 1573 unsigned long shares = 0;
1706 shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares; 1574 int i;
1707 1575
1708 __update_group_shares_cpu(tg, sd, scpu); 1576 for_each_cpu_mask(i, sd->span) {
1709 __update_group_shares_cpu(tg, sd, dcpu); 1577 rq_weight += tg->cfs_rq[i]->load.weight;
1578 shares += tg->cfs_rq[i]->shares;
1579 }
1710 1580
1711 /* 1581 if ((!shares && rq_weight) || shares > tg->shares)
1712 * ensure we never loose shares due to rounding errors in the 1582 shares = tg->shares;
1713 * above redistribution.
1714 */
1715 shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
1716 if (shares)
1717 tg->cfs_rq[dcpu]->shares += shares;
1718}
1719 1583
1720/* 1584 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1721 * Because changing a group's shares changes the weight of the super-group 1585 shares = tg->shares;
1722 * we need to walk up the tree and change all shares until we hit the root.
1723 */
1724static void
1725move_group_shares(struct task_group *tg, struct sched_domain *sd,
1726 int scpu, int dcpu)
1727{
1728 while (tg) {
1729 __move_group_shares(tg, sd, scpu, dcpu);
1730 tg = tg->parent;
1731 }
1732}
1733 1586
1734static 1587 if (!rq_weight)
1735void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd) 1588 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1736{
1737 unsigned long shares = aggregate(tg, sd)->shares;
1738 int i;
1739 1589
1740 for_each_cpu_mask(i, sd->span) { 1590 for_each_cpu_mask(i, sd->span) {
1741 struct rq *rq = cpu_rq(i); 1591 struct rq *rq = cpu_rq(i);
1742 unsigned long flags; 1592 unsigned long flags;
1743 1593
1744 spin_lock_irqsave(&rq->lock, flags); 1594 spin_lock_irqsave(&rq->lock, flags);
1745 __update_group_shares_cpu(tg, sd, i); 1595 __update_group_shares_cpu(tg, i, shares, rq_weight);
1746 spin_unlock_irqrestore(&rq->lock, flags); 1596 spin_unlock_irqrestore(&rq->lock, flags);
1747 } 1597 }
1748
1749 aggregate_group_shares(tg, sd);
1750
1751 /*
1752 * ensure we never loose shares due to rounding errors in the
1753 * above redistribution.
1754 */
1755 shares -= aggregate(tg, sd)->shares;
1756 if (shares) {
1757 tg->cfs_rq[sd->first_cpu]->shares += shares;
1758 aggregate(tg, sd)->shares += shares;
1759 }
1760} 1598}
1761 1599
1762/* 1600/*
1763 * Calculate the accumulative weight and recursive load of each task group 1601 * Compute the cpu's hierarchical load factor for each task group.
1764 * while walking down the tree. 1602 * This needs to be done in a top-down fashion because the load of a child
1603 * group is a fraction of its parents load.
1765 */ 1603 * group is a fraction of its parent's load.
1766static 1605static void
1767void aggregate_get_down(struct task_group *tg, struct sched_domain *sd) 1606tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1768{ 1607{
1769 aggregate_group_weight(tg, sd); 1608 unsigned long load;
1770 aggregate_group_shares(tg, sd);
1771 aggregate_group_load(tg, sd);
1772}
1773 1609
1774/* 1610 if (!tg->parent) {
1775 * Rebalance the cpu shares while walking back up the tree. 1611 load = cpu_rq(cpu)->load.weight;
1776 */ 1612 } else {
1777static 1613 load = tg->parent->cfs_rq[cpu]->h_load;
1778void aggregate_get_up(struct task_group *tg, struct sched_domain *sd) 1614 load *= tg->cfs_rq[cpu]->shares;
1779{ 1615 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1780 aggregate_group_set_shares(tg, sd); 1616 }
1781}
1782 1617
1783static DEFINE_PER_CPU(spinlock_t, aggregate_lock); 1618 tg->cfs_rq[cpu]->h_load = load;
1619}
1784 1620
1785static void __init init_aggregate(void) 1621static void
1622tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1786{ 1623{
1787 int i;
1788
1789 for_each_possible_cpu(i)
1790 spin_lock_init(&per_cpu(aggregate_lock, i));
1791} 1624}
1792 1625
1793static int get_aggregate(struct sched_domain *sd) 1626static void update_shares(struct sched_domain *sd)
1794{ 1627{
1795 if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu))) 1628 u64 now = cpu_clock(raw_smp_processor_id());
1796 return 0; 1629 s64 elapsed = now - sd->last_update;
1797 1630
1798 aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd); 1631 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1799 return 1; 1632 sd->last_update = now;
1633 walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1634 }
1800} 1635}
1801 1636
1802static void put_aggregate(struct sched_domain *sd) 1637static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1803{ 1638{
1804 spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu)); 1639 spin_unlock(&rq->lock);
1640 update_shares(sd);
1641 spin_lock(&rq->lock);
1805} 1642}
1806 1643
1807static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1644static void update_h_load(int cpu)
1808{ 1645{
1809 cfs_rq->shares = shares; 1646 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1810} 1647}
1811 1648
1812#else 1649#else
1813 1650
1814static inline void init_aggregate(void) 1651static inline void update_shares(struct sched_domain *sd)
1815{ 1652{
1816} 1653}
1817 1654
1818static inline int get_aggregate(struct sched_domain *sd) 1655static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1819{ 1656{
1820 return 0;
1821} 1657}
1822 1658
1823static inline void put_aggregate(struct sched_domain *sd)
1824{
1825}
1826#endif 1659#endif
1827 1660
1828#else /* CONFIG_SMP */ 1661#endif
1829 1662
1830#ifdef CONFIG_FAIR_GROUP_SCHED 1663#ifdef CONFIG_FAIR_GROUP_SCHED
1831static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1664static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1832{ 1665{
1666#ifdef CONFIG_SMP
1667 cfs_rq->shares = shares;
1668#endif
1833} 1669}
1834#endif 1670#endif
1835 1671
1836#endif /* CONFIG_SMP */
1837
1838#include "sched_stats.h" 1672#include "sched_stats.h"
1839#include "sched_idletask.c" 1673#include "sched_idletask.c"
1840#include "sched_fair.c" 1674#include "sched_fair.c"
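
To make the shares redistribution above concrete, here is a toy userspace calculation of the split that tg_shares_up() and __update_group_shares_cpu() perform; the numbers are invented, the +1 mirrors the divide in the kernel code, and the boost/clamping paths are ignored.

#include <stdio.h>

int main(void)
{
	unsigned long tg_shares = 1024;			/* group's total shares */
	unsigned long rq_weight[2] = { 3072, 1024 };	/* per-cpu cfs_rq weights */
	unsigned long sum = rq_weight[0] + rq_weight[1];
	int i;

	/* shares_i = tg_shares * rq_weight_i / (sum of rq_weight + 1) */
	for (i = 0; i < 2; i++)
		printf("cpu%d gets %lu shares\n", i,
		       tg_shares * rq_weight[i] / (sum + 1));
	/* Prints 767 and 255: the busier CPU carries most of the group weight. */
	return 0;
}
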
@@ -1844,6 +1678,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1844#endif 1678#endif
1845 1679
1846#define sched_class_highest (&rt_sched_class) 1680#define sched_class_highest (&rt_sched_class)
1681#define for_each_class(class) \
1682 for (class = sched_class_highest; class; class = class->next)
1847 1683
1848static void inc_nr_running(struct rq *rq) 1684static void inc_nr_running(struct rq *rq)
1849{ 1685{
@@ -1876,6 +1712,12 @@ static void set_load_weight(struct task_struct *p)
1876 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1712 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1877} 1713}
1878 1714
1715static void update_avg(u64 *avg, u64 sample)
1716{
1717 s64 diff = sample - *avg;
1718 *avg += diff >> 3;
1719}
1720
1879static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1721static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1880{ 1722{
1881 sched_info_queued(p); 1723 sched_info_queued(p);
@@ -1885,6 +1727,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1885 1727
1886static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1728static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1887{ 1729{
1730 if (sleep && p->se.last_wakeup) {
1731 update_avg(&p->se.avg_overlap,
1732 p->se.sum_exec_runtime - p->se.last_wakeup);
1733 p->se.last_wakeup = 0;
1734 }
1735
1736 sched_info_dequeued(p);
1888 p->sched_class->dequeue_task(rq, p, sleep); 1737 p->sched_class->dequeue_task(rq, p, sleep);
1889 p->se.on_rq = 0; 1738 p->se.on_rq = 0;
1890} 1739}
@@ -1968,12 +1817,6 @@ inline int task_curr(const struct task_struct *p)
1968 return cpu_curr(task_cpu(p)) == p; 1817 return cpu_curr(task_cpu(p)) == p;
1969} 1818}
1970 1819
1971/* Used instead of source_load when we know the type == 0 */
1972unsigned long weighted_cpuload(const int cpu)
1973{
1974 return cpu_rq(cpu)->load.weight;
1975}
1976
1977static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1820static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1978{ 1821{
1979 set_task_rq(p, cpu); 1822 set_task_rq(p, cpu);
@@ -2002,6 +1845,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2002 1845
2003#ifdef CONFIG_SMP 1846#ifdef CONFIG_SMP
2004 1847
1848/* Used instead of source_load when we know the type == 0 */
1849static unsigned long weighted_cpuload(const int cpu)
1850{
1851 return cpu_rq(cpu)->load.weight;
1852}
1853
2005/* 1854/*
2006 * Is this task likely cache-hot: 1855 * Is this task likely cache-hot:
2007 */ 1856 */
@@ -2212,7 +2061,7 @@ static unsigned long source_load(int cpu, int type)
2212 struct rq *rq = cpu_rq(cpu); 2061 struct rq *rq = cpu_rq(cpu);
2213 unsigned long total = weighted_cpuload(cpu); 2062 unsigned long total = weighted_cpuload(cpu);
2214 2063
2215 if (type == 0) 2064 if (type == 0 || !sched_feat(LB_BIAS))
2216 return total; 2065 return total;
2217 2066
2218 return min(rq->cpu_load[type-1], total); 2067 return min(rq->cpu_load[type-1], total);
@@ -2227,25 +2076,13 @@ static unsigned long target_load(int cpu, int type)
2227 struct rq *rq = cpu_rq(cpu); 2076 struct rq *rq = cpu_rq(cpu);
2228 unsigned long total = weighted_cpuload(cpu); 2077 unsigned long total = weighted_cpuload(cpu);
2229 2078
2230 if (type == 0) 2079 if (type == 0 || !sched_feat(LB_BIAS))
2231 return total; 2080 return total;
2232 2081
2233 return max(rq->cpu_load[type-1], total); 2082 return max(rq->cpu_load[type-1], total);
2234} 2083}
2235 2084
2236/* 2085/*
2237 * Return the average load per task on the cpu's run queue
2238 */
2239static unsigned long cpu_avg_load_per_task(int cpu)
2240{
2241 struct rq *rq = cpu_rq(cpu);
2242 unsigned long total = weighted_cpuload(cpu);
2243 unsigned long n = rq->nr_running;
2244
2245 return n ? total / n : SCHED_LOAD_SCALE;
2246}
2247
2248/*
2249 * find_idlest_group finds and returns the least busy CPU group within the 2086 * find_idlest_group finds and returns the least busy CPU group within the
2250 * domain. 2087 * domain.
2251 */ 2088 */
@@ -2351,6 +2188,9 @@ static int sched_balance_self(int cpu, int flag)
2351 sd = tmp; 2188 sd = tmp;
2352 } 2189 }
2353 2190
2191 if (sd)
2192 update_shares(sd);
2193
2354 while (sd) { 2194 while (sd) {
2355 cpumask_t span, tmpmask; 2195 cpumask_t span, tmpmask;
2356 struct sched_group *group; 2196 struct sched_group *group;
@@ -2417,6 +2257,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2417 if (!sched_feat(SYNC_WAKEUPS)) 2257 if (!sched_feat(SYNC_WAKEUPS))
2418 sync = 0; 2258 sync = 0;
2419 2259
2260#ifdef CONFIG_SMP
2261 if (sched_feat(LB_WAKEUP_UPDATE)) {
2262 struct sched_domain *sd;
2263
2264 this_cpu = raw_smp_processor_id();
2265 cpu = task_cpu(p);
2266
2267 for_each_domain(this_cpu, sd) {
2268 if (cpu_isset(cpu, sd->span)) {
2269 update_shares(sd);
2270 break;
2271 }
2272 }
2273 }
2274#endif
2275
2420 smp_wmb(); 2276 smp_wmb();
2421 rq = task_rq_lock(p, &flags); 2277 rq = task_rq_lock(p, &flags);
2422 old_state = p->state; 2278 old_state = p->state;
@@ -2463,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2463 } 2319 }
2464 } 2320 }
2465 } 2321 }
2466#endif 2322#endif /* CONFIG_SCHEDSTATS */
2467 2323
2468out_activate: 2324out_activate:
2469#endif /* CONFIG_SMP */ 2325#endif /* CONFIG_SMP */
@@ -2481,6 +2337,9 @@ out_activate:
2481 success = 1; 2337 success = 1;
2482 2338
2483out_running: 2339out_running:
2340 trace_mark(kernel_sched_wakeup,
2341 "pid %d state %ld ## rq %p task %p rq->curr %p",
2342 p->pid, p->state, rq, p, rq->curr);
2484 check_preempt_curr(rq, p); 2343 check_preempt_curr(rq, p);
2485 2344
2486 p->state = TASK_RUNNING; 2345 p->state = TASK_RUNNING;
@@ -2489,6 +2348,8 @@ out_running:
2489 p->sched_class->task_wake_up(rq, p); 2348 p->sched_class->task_wake_up(rq, p);
2490#endif 2349#endif
2491out: 2350out:
2351 current->se.last_wakeup = current->se.sum_exec_runtime;
2352
2492 task_rq_unlock(rq, &flags); 2353 task_rq_unlock(rq, &flags);
2493 2354
2494 return success; 2355 return success;
@@ -2611,6 +2472,9 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2611 p->sched_class->task_new(rq, p); 2472 p->sched_class->task_new(rq, p);
2612 inc_nr_running(rq); 2473 inc_nr_running(rq);
2613 } 2474 }
2475 trace_mark(kernel_sched_wakeup_new,
2476 "pid %d state %ld ## rq %p task %p rq->curr %p",
2477 p->pid, p->state, rq, p, rq->curr);
2614 check_preempt_curr(rq, p); 2478 check_preempt_curr(rq, p);
2615#ifdef CONFIG_SMP 2479#ifdef CONFIG_SMP
2616 if (p->sched_class->task_wake_up) 2480 if (p->sched_class->task_wake_up)
@@ -2663,7 +2527,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2663 notifier->ops->sched_out(notifier, next); 2527 notifier->ops->sched_out(notifier, next);
2664} 2528}
2665 2529
2666#else 2530#else /* !CONFIG_PREEMPT_NOTIFIERS */
2667 2531
2668static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2532static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2669{ 2533{
@@ -2675,7 +2539,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2675{ 2539{
2676} 2540}
2677 2541
2678#endif 2542#endif /* CONFIG_PREEMPT_NOTIFIERS */
2679 2543
2680/** 2544/**
2681 * prepare_task_switch - prepare to switch tasks 2545 * prepare_task_switch - prepare to switch tasks
@@ -2783,6 +2647,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2783 struct mm_struct *mm, *oldmm; 2647 struct mm_struct *mm, *oldmm;
2784 2648
2785 prepare_task_switch(rq, prev, next); 2649 prepare_task_switch(rq, prev, next);
2650 trace_mark(kernel_sched_schedule,
2651 "prev_pid %d next_pid %d prev_state %ld "
2652 "## rq %p prev %p next %p",
2653 prev->pid, next->pid, prev->state,
2654 rq, prev, next);
2786 mm = next->mm; 2655 mm = next->mm;
2787 oldmm = prev->active_mm; 2656 oldmm = prev->active_mm;
2788 /* 2657 /*
@@ -3117,7 +2986,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3117 enum cpu_idle_type idle, int *all_pinned, 2986 enum cpu_idle_type idle, int *all_pinned,
3118 int *this_best_prio, struct rq_iterator *iterator) 2987 int *this_best_prio, struct rq_iterator *iterator)
3119{ 2988{
3120 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2989 int loops = 0, pulled = 0, pinned = 0;
3121 struct task_struct *p; 2990 struct task_struct *p;
3122 long rem_load_move = max_load_move; 2991 long rem_load_move = max_load_move;
3123 2992
@@ -3133,14 +3002,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3133next: 3002next:
3134 if (!p || loops++ > sysctl_sched_nr_migrate) 3003 if (!p || loops++ > sysctl_sched_nr_migrate)
3135 goto out; 3004 goto out;
3136 /* 3005
3137 * To help distribute high priority tasks across CPUs we don't 3006 if ((p->se.load.weight >> 1) > rem_load_move ||
3138 * skip a task if it will be the highest priority task (i.e. smallest
3139 * prio value) on its new queue regardless of its load weight
3140 */
3141 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
3142 SCHED_LOAD_SCALE_FUZZ;
3143 if ((skip_for_load && p->prio >= *this_best_prio) ||
3144 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 3007 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3145 p = iterator->next(iterator->arg); 3008 p = iterator->next(iterator->arg);
3146 goto next; 3009 goto next;
@@ -3195,6 +3058,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3195 max_load_move - total_load_moved, 3058 max_load_move - total_load_moved,
3196 sd, idle, all_pinned, &this_best_prio); 3059 sd, idle, all_pinned, &this_best_prio);
3197 class = class->next; 3060 class = class->next;
3061
3062 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3063 break;
3064
3198 } while (class && max_load_move > total_load_moved); 3065 } while (class && max_load_move > total_load_moved);
3199 3066
3200 return total_load_moved > 0; 3067 return total_load_moved > 0;
@@ -3271,6 +3138,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3271 max_load = this_load = total_load = total_pwr = 0; 3138 max_load = this_load = total_load = total_pwr = 0;
3272 busiest_load_per_task = busiest_nr_running = 0; 3139 busiest_load_per_task = busiest_nr_running = 0;
3273 this_load_per_task = this_nr_running = 0; 3140 this_load_per_task = this_nr_running = 0;
3141
3274 if (idle == CPU_NOT_IDLE) 3142 if (idle == CPU_NOT_IDLE)
3275 load_idx = sd->busy_idx; 3143 load_idx = sd->busy_idx;
3276 else if (idle == CPU_NEWLY_IDLE) 3144 else if (idle == CPU_NEWLY_IDLE)
@@ -3285,6 +3153,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3285 int __group_imb = 0; 3153 int __group_imb = 0;
3286 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3154 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3287 unsigned long sum_nr_running, sum_weighted_load; 3155 unsigned long sum_nr_running, sum_weighted_load;
3156 unsigned long sum_avg_load_per_task;
3157 unsigned long avg_load_per_task;
3288 3158
3289 local_group = cpu_isset(this_cpu, group->cpumask); 3159 local_group = cpu_isset(this_cpu, group->cpumask);
3290 3160
@@ -3293,6 +3163,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3293 3163
3294 /* Tally up the load of all CPUs in the group */ 3164 /* Tally up the load of all CPUs in the group */
3295 sum_weighted_load = sum_nr_running = avg_load = 0; 3165 sum_weighted_load = sum_nr_running = avg_load = 0;
3166 sum_avg_load_per_task = avg_load_per_task = 0;
3167
3296 max_cpu_load = 0; 3168 max_cpu_load = 0;
3297 min_cpu_load = ~0UL; 3169 min_cpu_load = ~0UL;
3298 3170
@@ -3326,6 +3198,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3326 avg_load += load; 3198 avg_load += load;
3327 sum_nr_running += rq->nr_running; 3199 sum_nr_running += rq->nr_running;
3328 sum_weighted_load += weighted_cpuload(i); 3200 sum_weighted_load += weighted_cpuload(i);
3201
3202 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3329 } 3203 }
3330 3204
3331 /* 3205 /*
@@ -3347,7 +3221,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3347 avg_load = sg_div_cpu_power(group, 3221 avg_load = sg_div_cpu_power(group,
3348 avg_load * SCHED_LOAD_SCALE); 3222 avg_load * SCHED_LOAD_SCALE);
3349 3223
3350 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3224
3225 /*
3226 * Consider the group unbalanced when the imbalance is larger
3227 * than the average weight of two tasks.
3228 *
3229 * APZ: with cgroup the avg task weight can vary wildly and
3230 * might not be a suitable number - should we keep a
3231 * normalized nr_running number somewhere that negates
3232 * the hierarchy?
3233 */
3234 avg_load_per_task = sg_div_cpu_power(group,
3235 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3236
3237 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3351 __group_imb = 1; 3238 __group_imb = 1;
3352 3239
3353 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3240 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
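
The group-imbalance test above now compares the spread between the most and least loaded cpu in a group against twice the group's average per-task load instead of a fixed SCHED_LOAD_SCALE. A rough standalone model of that comparison, with the group-power scaling omitted and all names illustrative:

#include <stdio.h>

#define NR_CPUS_IN_GROUP 4

/*
 * Flag a group as imbalanced when the spread between its most and
 * least loaded cpu exceeds the average weight of two tasks.
 */
static int group_imbalanced(const unsigned long load[],
			    const unsigned long load_per_task[])
{
	unsigned long max_load = 0, min_load = ~0UL, sum_avg = 0;
	unsigned long avg_load_per_task;
	int i;

	for (i = 0; i < NR_CPUS_IN_GROUP; i++) {
		if (load[i] > max_load)
			max_load = load[i];
		if (load[i] < min_load)
			min_load = load[i];
		sum_avg += load_per_task[i];
	}
	/* average per-task load across the group (power scaling omitted) */
	avg_load_per_task = sum_avg / NR_CPUS_IN_GROUP;

	return (max_load - min_load) > 2 * avg_load_per_task;
}

int main(void)
{
	unsigned long load[NR_CPUS_IN_GROUP]     = { 3072, 1024, 1024, 1024 };
	unsigned long per_task[NR_CPUS_IN_GROUP] = { 1024,  512,  512,  512 };

	printf("imbalanced: %d\n", group_imbalanced(load, per_task)); /* 1 */
	return 0;
}
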
@@ -3488,9 +3375,9 @@ small_imbalance:
3488 if (busiest_load_per_task > this_load_per_task) 3375 if (busiest_load_per_task > this_load_per_task)
3489 imbn = 1; 3376 imbn = 1;
3490 } else 3377 } else
3491 this_load_per_task = SCHED_LOAD_SCALE; 3378 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3492 3379
3493 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3380 if (max_load - this_load + 2*busiest_load_per_task >=
3494 busiest_load_per_task * imbn) { 3381 busiest_load_per_task * imbn) {
3495 *imbalance = busiest_load_per_task; 3382 *imbalance = busiest_load_per_task;
3496 return busiest; 3383 return busiest;
@@ -3600,12 +3487,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3600 unsigned long imbalance; 3487 unsigned long imbalance;
3601 struct rq *busiest; 3488 struct rq *busiest;
3602 unsigned long flags; 3489 unsigned long flags;
3603 int unlock_aggregate;
3604 3490
3605 cpus_setall(*cpus); 3491 cpus_setall(*cpus);
3606 3492
3607 unlock_aggregate = get_aggregate(sd);
3608
3609 /* 3493 /*
3610 * When power savings policy is enabled for the parent domain, idle 3494 * When power savings policy is enabled for the parent domain, idle
3611 * sibling can pick up load irrespective of busy siblings. In this case, 3495 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3619,6 +3503,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3619 schedstat_inc(sd, lb_count[idle]); 3503 schedstat_inc(sd, lb_count[idle]);
3620 3504
3621redo: 3505redo:
3506 update_shares(sd);
3622 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3507 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3623 cpus, balance); 3508 cpus, balance);
3624 3509
@@ -3742,8 +3627,8 @@ out_one_pinned:
3742 else 3627 else
3743 ld_moved = 0; 3628 ld_moved = 0;
3744out: 3629out:
3745 if (unlock_aggregate) 3630 if (ld_moved)
3746 put_aggregate(sd); 3631 update_shares(sd);
3747 return ld_moved; 3632 return ld_moved;
3748} 3633}
3749 3634
@@ -3779,6 +3664,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3779 3664
3780 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3665 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3781redo: 3666redo:
3667 update_shares_locked(this_rq, sd);
3782 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3668 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3783 &sd_idle, cpus, NULL); 3669 &sd_idle, cpus, NULL);
3784 if (!group) { 3670 if (!group) {
@@ -3822,6 +3708,7 @@ redo:
3822 } else 3708 } else
3823 sd->nr_balance_failed = 0; 3709 sd->nr_balance_failed = 0;
3824 3710
3711 update_shares_locked(this_rq, sd);
3825 return ld_moved; 3712 return ld_moved;
3826 3713
3827out_balanced: 3714out_balanced:
@@ -4013,6 +3900,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4013 /* Earliest time when we have to do rebalance again */ 3900 /* Earliest time when we have to do rebalance again */
4014 unsigned long next_balance = jiffies + 60*HZ; 3901 unsigned long next_balance = jiffies + 60*HZ;
4015 int update_next_balance = 0; 3902 int update_next_balance = 0;
3903 int need_serialize;
4016 cpumask_t tmp; 3904 cpumask_t tmp;
4017 3905
4018 for_each_domain(cpu, sd) { 3906 for_each_domain(cpu, sd) {
@@ -4030,8 +3918,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4030 if (interval > HZ*NR_CPUS/10) 3918 if (interval > HZ*NR_CPUS/10)
4031 interval = HZ*NR_CPUS/10; 3919 interval = HZ*NR_CPUS/10;
4032 3920
3921 need_serialize = sd->flags & SD_SERIALIZE;
4033 3922
4034 if (sd->flags & SD_SERIALIZE) { 3923 if (need_serialize) {
4035 if (!spin_trylock(&balancing)) 3924 if (!spin_trylock(&balancing))
4036 goto out; 3925 goto out;
4037 } 3926 }
@@ -4047,7 +3936,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4047 } 3936 }
4048 sd->last_balance = jiffies; 3937 sd->last_balance = jiffies;
4049 } 3938 }
4050 if (sd->flags & SD_SERIALIZE) 3939 if (need_serialize)
4051 spin_unlock(&balancing); 3940 spin_unlock(&balancing);
4052out: 3941out:
4053 if (time_after(next_balance, sd->last_balance + interval)) { 3942 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -4362,26 +4251,44 @@ void scheduler_tick(void)
4362#endif 4251#endif
4363} 4252}
4364 4253
4365#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4254#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4255 defined(CONFIG_PREEMPT_TRACER))
4256
4257static inline unsigned long get_parent_ip(unsigned long addr)
4258{
4259 if (in_lock_functions(addr)) {
4260 addr = CALLER_ADDR2;
4261 if (in_lock_functions(addr))
4262 addr = CALLER_ADDR3;
4263 }
4264 return addr;
4265}
4366 4266
4367void __kprobes add_preempt_count(int val) 4267void __kprobes add_preempt_count(int val)
4368{ 4268{
4269#ifdef CONFIG_DEBUG_PREEMPT
4369 /* 4270 /*
4370 * Underflow? 4271 * Underflow?
4371 */ 4272 */
4372 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4273 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4373 return; 4274 return;
4275#endif
4374 preempt_count() += val; 4276 preempt_count() += val;
4277#ifdef CONFIG_DEBUG_PREEMPT
4375 /* 4278 /*
4376 * Spinlock count overflowing soon? 4279 * Spinlock count overflowing soon?
4377 */ 4280 */
4378 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4281 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4379 PREEMPT_MASK - 10); 4282 PREEMPT_MASK - 10);
4283#endif
4284 if (preempt_count() == val)
4285 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4380} 4286}
4381EXPORT_SYMBOL(add_preempt_count); 4287EXPORT_SYMBOL(add_preempt_count);
4382 4288
4383void __kprobes sub_preempt_count(int val) 4289void __kprobes sub_preempt_count(int val)
4384{ 4290{
4291#ifdef CONFIG_DEBUG_PREEMPT
4385 /* 4292 /*
4386 * Underflow? 4293 * Underflow?
4387 */ 4294 */
@@ -4393,7 +4300,10 @@ void __kprobes sub_preempt_count(int val)
4393 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4300 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4394 !(preempt_count() & PREEMPT_MASK))) 4301 !(preempt_count() & PREEMPT_MASK)))
4395 return; 4302 return;
4303#endif
4396 4304
4305 if (preempt_count() == val)
4306 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4397 preempt_count() -= val; 4307 preempt_count() -= val;
4398} 4308}
4399EXPORT_SYMBOL(sub_preempt_count); 4309EXPORT_SYMBOL(sub_preempt_count);
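
add_preempt_count()/sub_preempt_count() now fire the preempt-off/on tracer only on the transition where the whole count equals the value being added or removed, i.e. when preemption actually toggles rather than on every nesting level. A small model of that edge detection, with printf standing in for the trace hooks (all names below are placeholders, not the tracer's API):

#include <stdio.h>

static int preempt_count;   /* models the per-task preempt counter */

static void trace_preempt_off(void) { printf("preemption disabled\n"); }
static void trace_preempt_on(void)  { printf("preemption re-enabled\n"); }

static void model_add_preempt_count(int val)
{
	preempt_count += val;
	/* Fire only when this increment took the count from 0 to val. */
	if (preempt_count == val)
		trace_preempt_off();
}

static void model_sub_preempt_count(int val)
{
	/* Fire only when this decrement is about to return the count to 0. */
	if (preempt_count == val)
		trace_preempt_on();
	preempt_count -= val;
}

int main(void)
{
	model_add_preempt_count(1);   /* traces "disabled"      */
	model_add_preempt_count(1);   /* nested: no trace       */
	model_sub_preempt_count(1);   /* still nested: no trace */
	model_sub_preempt_count(1);   /* traces "re-enabled"    */
	return 0;
}
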
@@ -4411,6 +4321,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4411 prev->comm, prev->pid, preempt_count()); 4321 prev->comm, prev->pid, preempt_count());
4412 4322
4413 debug_show_held_locks(prev); 4323 debug_show_held_locks(prev);
4324 print_modules();
4414 if (irqs_disabled()) 4325 if (irqs_disabled())
4415 print_irqtrace_events(prev); 4326 print_irqtrace_events(prev);
4416 4327
@@ -4430,7 +4341,7 @@ static inline void schedule_debug(struct task_struct *prev)
4430 * schedule() atomically, we ignore that path for now. 4341 * schedule() atomically, we ignore that path for now.
4431 * Otherwise, whine if we are scheduling when we should not be. 4342 * Otherwise, whine if we are scheduling when we should not be.
4432 */ 4343 */
4433 if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) 4344 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4434 __schedule_bug(prev); 4345 __schedule_bug(prev);
4435 4346
4436 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4347 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4484,7 +4395,7 @@ asmlinkage void __sched schedule(void)
4484 struct task_struct *prev, *next; 4395 struct task_struct *prev, *next;
4485 unsigned long *switch_count; 4396 unsigned long *switch_count;
4486 struct rq *rq; 4397 struct rq *rq;
4487 int cpu; 4398 int cpu, hrtick = sched_feat(HRTICK);
4488 4399
4489need_resched: 4400need_resched:
4490 preempt_disable(); 4401 preempt_disable();
@@ -4499,7 +4410,8 @@ need_resched_nonpreemptible:
4499 4410
4500 schedule_debug(prev); 4411 schedule_debug(prev);
4501 4412
4502 hrtick_clear(rq); 4413 if (hrtick)
4414 hrtick_clear(rq);
4503 4415
4504 /* 4416 /*
4505 * Do the rq-clock update outside the rq lock: 4417 * Do the rq-clock update outside the rq lock:
@@ -4510,12 +4422,10 @@ need_resched_nonpreemptible:
4510 clear_tsk_need_resched(prev); 4422 clear_tsk_need_resched(prev);
4511 4423
4512 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4424 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4513 if (unlikely((prev->state & TASK_INTERRUPTIBLE) && 4425 if (unlikely(signal_pending_state(prev->state, prev)))
4514 signal_pending(prev))) {
4515 prev->state = TASK_RUNNING; 4426 prev->state = TASK_RUNNING;
4516 } else { 4427 else
4517 deactivate_task(rq, prev, 1); 4428 deactivate_task(rq, prev, 1);
4518 }
4519 switch_count = &prev->nvcsw; 4429 switch_count = &prev->nvcsw;
4520 } 4430 }
4521 4431
@@ -4547,7 +4457,8 @@ need_resched_nonpreemptible:
4547 } else 4457 } else
4548 spin_unlock_irq(&rq->lock); 4458 spin_unlock_irq(&rq->lock);
4549 4459
4550 hrtick_set(rq); 4460 if (hrtick)
4461 hrtick_set(rq);
4551 4462
4552 if (unlikely(reacquire_kernel_lock(current) < 0)) 4463 if (unlikely(reacquire_kernel_lock(current) < 0))
4553 goto need_resched_nonpreemptible; 4464 goto need_resched_nonpreemptible;
@@ -4741,22 +4652,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4741 signal_pending(current)) || 4652 signal_pending(current)) ||
4742 (state == TASK_KILLABLE && 4653 (state == TASK_KILLABLE &&
4743 fatal_signal_pending(current))) { 4654 fatal_signal_pending(current))) {
4744 __remove_wait_queue(&x->wait, &wait); 4655 timeout = -ERESTARTSYS;
4745 return -ERESTARTSYS; 4656 break;
4746 } 4657 }
4747 __set_current_state(state); 4658 __set_current_state(state);
4748 spin_unlock_irq(&x->wait.lock); 4659 spin_unlock_irq(&x->wait.lock);
4749 timeout = schedule_timeout(timeout); 4660 timeout = schedule_timeout(timeout);
4750 spin_lock_irq(&x->wait.lock); 4661 spin_lock_irq(&x->wait.lock);
4751 if (!timeout) { 4662 } while (!x->done && timeout);
4752 __remove_wait_queue(&x->wait, &wait);
4753 return timeout;
4754 }
4755 } while (!x->done);
4756 __remove_wait_queue(&x->wait, &wait); 4663 __remove_wait_queue(&x->wait, &wait);
4664 if (!x->done)
4665 return timeout;
4757 } 4666 }
4758 x->done--; 4667 x->done--;
4759 return timeout; 4668 return timeout ?: 1;
4760} 4669}
4761 4670
4762static long __sched 4671static long __sched
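
The reworked do_wait_for_common() folds the exit paths into one: a signal leaves -ERESTARTSYS in timeout, an expired wait leaves 0, and a completed wait returns the remaining time, with the GNU "?:" shorthand guaranteeing a non-zero result so completion is never mistaken for a timeout. A compressed model of just that return convention (names illustrative):

#include <stdio.h>

#define ERESTARTSYS 512

/*
 * 'timeout' is whatever the wait loop left behind: the remaining time,
 * 0 on expiry, or -ERESTARTSYS after a signal. The GNU "?:" shorthand
 * mirrors the kernel's "timeout ?: 1".
 */
static long wait_result(int done, long timeout)
{
	if (!done)
		return timeout;     /* timed out or interrupted */
	return timeout ?: 1;        /* completion must never read as a timeout */
}

int main(void)
{
	printf("%ld\n", wait_result(1, 0));            /* completed right at expiry -> 1 */
	printf("%ld\n", wait_result(0, 0));            /* plain timeout -> 0             */
	printf("%ld\n", wait_result(0, -ERESTARTSYS)); /* signal -> -512                 */
	return 0;
}
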
@@ -5086,16 +4995,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
5086 set_load_weight(p); 4995 set_load_weight(p);
5087} 4996}
5088 4997
5089/** 4998static int __sched_setscheduler(struct task_struct *p, int policy,
5090 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 4999 struct sched_param *param, bool user)
5091 * @p: the task in question.
5092 * @policy: new policy.
5093 * @param: structure containing the new RT priority.
5094 *
5095 * NOTE that the task may be already dead.
5096 */
5097int sched_setscheduler(struct task_struct *p, int policy,
5098 struct sched_param *param)
5099{ 5000{
5100 int retval, oldprio, oldpolicy = -1, on_rq, running; 5001 int retval, oldprio, oldpolicy = -1, on_rq, running;
5101 unsigned long flags; 5002 unsigned long flags;
@@ -5127,7 +5028,7 @@ recheck:
5127 /* 5028 /*
5128 * Allow unprivileged RT tasks to decrease priority: 5029 * Allow unprivileged RT tasks to decrease priority:
5129 */ 5030 */
5130 if (!capable(CAP_SYS_NICE)) { 5031 if (user && !capable(CAP_SYS_NICE)) {
5131 if (rt_policy(policy)) { 5032 if (rt_policy(policy)) {
5132 unsigned long rlim_rtprio; 5033 unsigned long rlim_rtprio;
5133 5034
@@ -5163,7 +5064,8 @@ recheck:
5163 * Do not allow realtime tasks into groups that have no runtime 5064 * Do not allow realtime tasks into groups that have no runtime
5164 * assigned. 5065 * assigned.
5165 */ 5066 */
5166 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5067 if (user
5068 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0)
5167 return -EPERM; 5069 return -EPERM;
5168#endif 5070#endif
5169 5071
@@ -5212,8 +5114,39 @@ recheck:
5212 5114
5213 return 0; 5115 return 0;
5214} 5116}
5117
5118/**
5119 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5120 * @p: the task in question.
5121 * @policy: new policy.
5122 * @param: structure containing the new RT priority.
5123 *
5124 * NOTE that the task may be already dead.
5125 */
5126int sched_setscheduler(struct task_struct *p, int policy,
5127 struct sched_param *param)
5128{
5129 return __sched_setscheduler(p, policy, param, true);
5130}
5215EXPORT_SYMBOL_GPL(sched_setscheduler); 5131EXPORT_SYMBOL_GPL(sched_setscheduler);
5216 5132
5133/**
5134 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5135 * @p: the task in question.
5136 * @policy: new policy.
5137 * @param: structure containing the new RT priority.
5138 *
5139 * Just like sched_setscheduler, only don't bother checking if the
5140 * current context has permission. For example, this is needed in
5141 * stop_machine(): we create temporary high priority worker threads,
5142 * but our caller might not have that capability.
5143 */
5144int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5145 struct sched_param *param)
5146{
5147 return __sched_setscheduler(p, policy, param, false);
5148}
5149
5217static int 5150static int
5218do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5151do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
5219{ 5152{
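
After the split above, sched_setscheduler() and sched_setscheduler_nocheck() differ only in the user flag passed to __sched_setscheduler(), which gates the CAP_SYS_NICE and RT-bandwidth permission checks. A stripped-down model of that gating, with the policy reduced to a boolean and all names illustrative:

#include <stdio.h>

#define EPERM 1

static int capable_sys_nice;    /* models capable(CAP_SYS_NICE) */

/*
 * 'user' distinguishes a userspace request from a kernel-internal one
 * (e.g. a subsystem promoting its own worker thread).
 */
static int model_setscheduler(int rt_policy, int user)
{
	if (user && !capable_sys_nice && rt_policy)
		return -EPERM;      /* unprivileged user asking for RT */
	/* ... the actual priority/queueing changes would happen here ... */
	return 0;
}

static int model_setscheduler_user(int rt_policy)    { return model_setscheduler(rt_policy, 1); }
static int model_setscheduler_nocheck(int rt_policy) { return model_setscheduler(rt_policy, 0); }

int main(void)
{
	capable_sys_nice = 0;
	printf("user RT request:   %d\n", model_setscheduler_user(1));    /* -EPERM */
	printf("kernel RT request: %d\n", model_setscheduler_nocheck(1)); /* 0      */
	return 0;
}
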
@@ -5412,24 +5345,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5412 return sched_setaffinity(pid, &new_mask); 5345 return sched_setaffinity(pid, &new_mask);
5413} 5346}
5414 5347
5415/*
5416 * Represents all cpu's present in the system
5417 * In systems capable of hotplug, this map could dynamically grow
5418 * as new cpu's are detected in the system via any platform specific
5419 * method, such as ACPI for e.g.
5420 */
5421
5422cpumask_t cpu_present_map __read_mostly;
5423EXPORT_SYMBOL(cpu_present_map);
5424
5425#ifndef CONFIG_SMP
5426cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5427EXPORT_SYMBOL(cpu_online_map);
5428
5429cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5430EXPORT_SYMBOL(cpu_possible_map);
5431#endif
5432
5433long sched_getaffinity(pid_t pid, cpumask_t *mask) 5348long sched_getaffinity(pid_t pid, cpumask_t *mask)
5434{ 5349{
5435 struct task_struct *p; 5350 struct task_struct *p;
@@ -5726,7 +5641,7 @@ out_unlock:
5726 return retval; 5641 return retval;
5727} 5642}
5728 5643
5729static const char stat_nam[] = "RSDTtZX"; 5644static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5730 5645
5731void sched_show_task(struct task_struct *p) 5646void sched_show_task(struct task_struct *p)
5732{ 5647{
@@ -5913,6 +5828,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5913 goto out; 5828 goto out;
5914 } 5829 }
5915 5830
5831 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5832 !cpus_equal(p->cpus_allowed, *new_mask))) {
5833 ret = -EINVAL;
5834 goto out;
5835 }
5836
5916 if (p->sched_class->set_cpus_allowed) 5837 if (p->sched_class->set_cpus_allowed)
5917 p->sched_class->set_cpus_allowed(p, new_mask); 5838 p->sched_class->set_cpus_allowed(p, new_mask);
5918 else { 5839 else {
@@ -5964,10 +5885,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5964 double_rq_lock(rq_src, rq_dest); 5885 double_rq_lock(rq_src, rq_dest);
5965 /* Already moved. */ 5886 /* Already moved. */
5966 if (task_cpu(p) != src_cpu) 5887 if (task_cpu(p) != src_cpu)
5967 goto out; 5888 goto done;
5968 /* Affinity changed (again). */ 5889 /* Affinity changed (again). */
5969 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 5890 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5970 goto out; 5891 goto fail;
5971 5892
5972 on_rq = p->se.on_rq; 5893 on_rq = p->se.on_rq;
5973 if (on_rq) 5894 if (on_rq)
@@ -5978,8 +5899,9 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5978 activate_task(rq_dest, p, 0); 5899 activate_task(rq_dest, p, 0);
5979 check_preempt_curr(rq_dest, p); 5900 check_preempt_curr(rq_dest, p);
5980 } 5901 }
5902done:
5981 ret = 1; 5903 ret = 1;
5982out: 5904fail:
5983 double_rq_unlock(rq_src, rq_dest); 5905 double_rq_unlock(rq_src, rq_dest);
5984 return ret; 5906 return ret;
5985} 5907}
@@ -6229,6 +6151,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6229 next = pick_next_task(rq, rq->curr); 6151 next = pick_next_task(rq, rq->curr);
6230 if (!next) 6152 if (!next)
6231 break; 6153 break;
6154 next->sched_class->put_prev_task(rq, next);
6232 migrate_dead(dead_cpu, next); 6155 migrate_dead(dead_cpu, next);
6233 6156
6234 } 6157 }
@@ -6400,6 +6323,36 @@ static void unregister_sched_domain_sysctl(void)
6400} 6323}
6401#endif 6324#endif
6402 6325
6326static void set_rq_online(struct rq *rq)
6327{
6328 if (!rq->online) {
6329 const struct sched_class *class;
6330
6331 cpu_set(rq->cpu, rq->rd->online);
6332 rq->online = 1;
6333
6334 for_each_class(class) {
6335 if (class->rq_online)
6336 class->rq_online(rq);
6337 }
6338 }
6339}
6340
6341static void set_rq_offline(struct rq *rq)
6342{
6343 if (rq->online) {
6344 const struct sched_class *class;
6345
6346 for_each_class(class) {
6347 if (class->rq_offline)
6348 class->rq_offline(rq);
6349 }
6350
6351 cpu_clear(rq->cpu, rq->rd->online);
6352 rq->online = 0;
6353 }
6354}
6355
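
set_rq_online()/set_rq_offline() above centralise the root-domain online bookkeeping and give each scheduling class a hook to react to the transition; the checks on rq->online make the calls idempotent. A schematic of that callback walk with a toy class list (names illustrative):

#include <stdio.h>

struct toy_class {
	const char *name;
	void (*rq_online)(void);
	void (*rq_offline)(void);
	const struct toy_class *next;
};

static void rt_online(void)  { printf("rt: cpu joined root domain\n"); }
static void rt_offline(void) { printf("rt: cpu left root domain\n"); }

static const struct toy_class rt_class = { "rt", rt_online, rt_offline, NULL };
static const struct toy_class *class_list = &rt_class;

struct toy_rq { int online; };

static void model_set_rq_online(struct toy_rq *rq)
{
	if (!rq->online) {          /* only act on an actual transition */
		rq->online = 1;
		for (const struct toy_class *c = class_list; c; c = c->next)
			if (c->rq_online)
				c->rq_online();
	}
}

static void model_set_rq_offline(struct toy_rq *rq)
{
	if (rq->online) {
		for (const struct toy_class *c = class_list; c; c = c->next)
			if (c->rq_offline)
				c->rq_offline();
		rq->online = 0;
	}
}

int main(void)
{
	struct toy_rq rq = { 0 };

	model_set_rq_online(&rq);
	model_set_rq_online(&rq);   /* second call is a no-op */
	model_set_rq_offline(&rq);
	return 0;
}
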
6403/* 6356/*
6404 * migration_call - callback that gets triggered when a CPU is added. 6357 * migration_call - callback that gets triggered when a CPU is added.
6405 * Here we can start up the necessary migration thread for the new CPU. 6358 * Here we can start up the necessary migration thread for the new CPU.
@@ -6437,7 +6390,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6437 spin_lock_irqsave(&rq->lock, flags); 6390 spin_lock_irqsave(&rq->lock, flags);
6438 if (rq->rd) { 6391 if (rq->rd) {
6439 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6392 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6440 cpu_set(cpu, rq->rd->online); 6393
6394 set_rq_online(rq);
6441 } 6395 }
6442 spin_unlock_irqrestore(&rq->lock, flags); 6396 spin_unlock_irqrestore(&rq->lock, flags);
6443 break; 6397 break;
@@ -6498,7 +6452,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6498 spin_lock_irqsave(&rq->lock, flags); 6452 spin_lock_irqsave(&rq->lock, flags);
6499 if (rq->rd) { 6453 if (rq->rd) {
6500 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6454 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6501 cpu_clear(cpu, rq->rd->online); 6455 set_rq_offline(rq);
6502 } 6456 }
6503 spin_unlock_irqrestore(&rq->lock, flags); 6457 spin_unlock_irqrestore(&rq->lock, flags);
6504 break; 6458 break;
@@ -6532,6 +6486,28 @@ void __init migration_init(void)
6532 6486
6533#ifdef CONFIG_SCHED_DEBUG 6487#ifdef CONFIG_SCHED_DEBUG
6534 6488
6489static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6490{
6491 switch (lvl) {
6492 case SD_LV_NONE:
6493 return "NONE";
6494 case SD_LV_SIBLING:
6495 return "SIBLING";
6496 case SD_LV_MC:
6497 return "MC";
6498 case SD_LV_CPU:
6499 return "CPU";
6500 case SD_LV_NODE:
6501 return "NODE";
6502 case SD_LV_ALLNODES:
6503 return "ALLNODES";
6504 case SD_LV_MAX:
6505 return "MAX";
6506
6507 }
6508 return "MAX";
6509}
6510
6535static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6511static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6536 cpumask_t *groupmask) 6512 cpumask_t *groupmask)
6537{ 6513{
@@ -6551,7 +6527,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6551 return -1; 6527 return -1;
6552 } 6528 }
6553 6529
6554 printk(KERN_CONT "span %s\n", str); 6530 printk(KERN_CONT "span %s level %s\n",
6531 str, sd_level_to_string(sd->level));
6555 6532
6556 if (!cpu_isset(cpu, sd->span)) { 6533 if (!cpu_isset(cpu, sd->span)) {
6557 printk(KERN_ERR "ERROR: domain->span does not contain " 6534 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6635,9 +6612,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6635 } 6612 }
6636 kfree(groupmask); 6613 kfree(groupmask);
6637} 6614}
6638#else 6615#else /* !CONFIG_SCHED_DEBUG */
6639# define sched_domain_debug(sd, cpu) do { } while (0) 6616# define sched_domain_debug(sd, cpu) do { } while (0)
6640#endif 6617#endif /* CONFIG_SCHED_DEBUG */
6641 6618
6642static int sd_degenerate(struct sched_domain *sd) 6619static int sd_degenerate(struct sched_domain *sd)
6643{ 6620{
@@ -6697,20 +6674,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6697static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6674static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6698{ 6675{
6699 unsigned long flags; 6676 unsigned long flags;
6700 const struct sched_class *class;
6701 6677
6702 spin_lock_irqsave(&rq->lock, flags); 6678 spin_lock_irqsave(&rq->lock, flags);
6703 6679
6704 if (rq->rd) { 6680 if (rq->rd) {
6705 struct root_domain *old_rd = rq->rd; 6681 struct root_domain *old_rd = rq->rd;
6706 6682
6707 for (class = sched_class_highest; class; class = class->next) { 6683 if (cpu_isset(rq->cpu, old_rd->online))
6708 if (class->leave_domain) 6684 set_rq_offline(rq);
6709 class->leave_domain(rq);
6710 }
6711 6685
6712 cpu_clear(rq->cpu, old_rd->span); 6686 cpu_clear(rq->cpu, old_rd->span);
6713 cpu_clear(rq->cpu, old_rd->online);
6714 6687
6715 if (atomic_dec_and_test(&old_rd->refcount)) 6688 if (atomic_dec_and_test(&old_rd->refcount))
6716 kfree(old_rd); 6689 kfree(old_rd);
@@ -6721,12 +6694,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6721 6694
6722 cpu_set(rq->cpu, rd->span); 6695 cpu_set(rq->cpu, rd->span);
6723 if (cpu_isset(rq->cpu, cpu_online_map)) 6696 if (cpu_isset(rq->cpu, cpu_online_map))
6724 cpu_set(rq->cpu, rd->online); 6697 set_rq_online(rq);
6725
6726 for (class = sched_class_highest; class; class = class->next) {
6727 if (class->join_domain)
6728 class->join_domain(rq);
6729 }
6730 6698
6731 spin_unlock_irqrestore(&rq->lock, flags); 6699 spin_unlock_irqrestore(&rq->lock, flags);
6732} 6700}
@@ -6737,6 +6705,8 @@ static void init_rootdomain(struct root_domain *rd)
6737 6705
6738 cpus_clear(rd->span); 6706 cpus_clear(rd->span);
6739 cpus_clear(rd->online); 6707 cpus_clear(rd->online);
6708
6709 cpupri_init(&rd->cpupri);
6740} 6710}
6741 6711
6742static void init_defrootdomain(void) 6712static void init_defrootdomain(void)
@@ -6879,9 +6849,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6879 6849
6880 min_val = INT_MAX; 6850 min_val = INT_MAX;
6881 6851
6882 for (i = 0; i < MAX_NUMNODES; i++) { 6852 for (i = 0; i < nr_node_ids; i++) {
6883 /* Start at @node */ 6853 /* Start at @node */
6884 n = (node + i) % MAX_NUMNODES; 6854 n = (node + i) % nr_node_ids;
6885 6855
6886 if (!nr_cpus_node(n)) 6856 if (!nr_cpus_node(n))
6887 continue; 6857 continue;
@@ -6931,7 +6901,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6931 cpus_or(*span, *span, *nodemask); 6901 cpus_or(*span, *span, *nodemask);
6932 } 6902 }
6933} 6903}
6934#endif 6904#endif /* CONFIG_NUMA */
6935 6905
6936int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6906int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6937 6907
@@ -6950,7 +6920,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6950 *sg = &per_cpu(sched_group_cpus, cpu); 6920 *sg = &per_cpu(sched_group_cpus, cpu);
6951 return cpu; 6921 return cpu;
6952} 6922}
6953#endif 6923#endif /* CONFIG_SCHED_SMT */
6954 6924
6955/* 6925/*
6956 * multi-core sched-domains: 6926 * multi-core sched-domains:
@@ -6958,7 +6928,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6958#ifdef CONFIG_SCHED_MC 6928#ifdef CONFIG_SCHED_MC
6959static DEFINE_PER_CPU(struct sched_domain, core_domains); 6929static DEFINE_PER_CPU(struct sched_domain, core_domains);
6960static DEFINE_PER_CPU(struct sched_group, sched_group_core); 6930static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6961#endif 6931#endif /* CONFIG_SCHED_MC */
6962 6932
6963#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6933#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6964static int 6934static int
@@ -7060,7 +7030,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7060 sg = sg->next; 7030 sg = sg->next;
7061 } while (sg != group_head); 7031 } while (sg != group_head);
7062} 7032}
7063#endif 7033#endif /* CONFIG_NUMA */
7064 7034
7065#ifdef CONFIG_NUMA 7035#ifdef CONFIG_NUMA
7066/* Free memory allocated for various sched_group structures */ 7036/* Free memory allocated for various sched_group structures */
@@ -7075,7 +7045,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7075 if (!sched_group_nodes) 7045 if (!sched_group_nodes)
7076 continue; 7046 continue;
7077 7047
7078 for (i = 0; i < MAX_NUMNODES; i++) { 7048 for (i = 0; i < nr_node_ids; i++) {
7079 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7049 struct sched_group *oldsg, *sg = sched_group_nodes[i];
7080 7050
7081 *nodemask = node_to_cpumask(i); 7051 *nodemask = node_to_cpumask(i);
@@ -7097,11 +7067,11 @@ next_sg:
7097 sched_group_nodes_bycpu[cpu] = NULL; 7067 sched_group_nodes_bycpu[cpu] = NULL;
7098 } 7068 }
7099} 7069}
7100#else 7070#else /* !CONFIG_NUMA */
7101static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7071static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7102{ 7072{
7103} 7073}
7104#endif 7074#endif /* CONFIG_NUMA */
7105 7075
7106/* 7076/*
7107 * Initialize sched groups cpu_power. 7077 * Initialize sched groups cpu_power.
@@ -7219,7 +7189,12 @@ static int default_relax_domain_level = -1;
7219 7189
7220static int __init setup_relax_domain_level(char *str) 7190static int __init setup_relax_domain_level(char *str)
7221{ 7191{
7222 default_relax_domain_level = simple_strtoul(str, NULL, 0); 7192 unsigned long val;
7193
7194 val = simple_strtoul(str, NULL, 0);
7195 if (val < SD_LV_MAX)
7196 default_relax_domain_level = val;
7197
7223 return 1; 7198 return 1;
7224} 7199}
7225__setup("relax_domain_level=", setup_relax_domain_level); 7200__setup("relax_domain_level=", setup_relax_domain_level);
@@ -7263,7 +7238,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7263 /* 7238 /*
7264 * Allocate the per-node list of sched groups 7239 * Allocate the per-node list of sched groups
7265 */ 7240 */
7266 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7241 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
7267 GFP_KERNEL); 7242 GFP_KERNEL);
7268 if (!sched_group_nodes) { 7243 if (!sched_group_nodes) {
7269 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7244 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -7316,7 +7291,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7316 SD_INIT(sd, ALLNODES); 7291 SD_INIT(sd, ALLNODES);
7317 set_domain_attribute(sd, attr); 7292 set_domain_attribute(sd, attr);
7318 sd->span = *cpu_map; 7293 sd->span = *cpu_map;
7319 sd->first_cpu = first_cpu(sd->span);
7320 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask); 7294 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
7321 p = sd; 7295 p = sd;
7322 sd_allnodes = 1; 7296 sd_allnodes = 1;
@@ -7327,7 +7301,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7327 SD_INIT(sd, NODE); 7301 SD_INIT(sd, NODE);
7328 set_domain_attribute(sd, attr); 7302 set_domain_attribute(sd, attr);
7329 sched_domain_node_span(cpu_to_node(i), &sd->span); 7303 sched_domain_node_span(cpu_to_node(i), &sd->span);
7330 sd->first_cpu = first_cpu(sd->span);
7331 sd->parent = p; 7304 sd->parent = p;
7332 if (p) 7305 if (p)
7333 p->child = sd; 7306 p->child = sd;
@@ -7339,7 +7312,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7339 SD_INIT(sd, CPU); 7312 SD_INIT(sd, CPU);
7340 set_domain_attribute(sd, attr); 7313 set_domain_attribute(sd, attr);
7341 sd->span = *nodemask; 7314 sd->span = *nodemask;
7342 sd->first_cpu = first_cpu(sd->span);
7343 sd->parent = p; 7315 sd->parent = p;
7344 if (p) 7316 if (p)
7345 p->child = sd; 7317 p->child = sd;
@@ -7351,7 +7323,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7351 SD_INIT(sd, MC); 7323 SD_INIT(sd, MC);
7352 set_domain_attribute(sd, attr); 7324 set_domain_attribute(sd, attr);
7353 sd->span = cpu_coregroup_map(i); 7325 sd->span = cpu_coregroup_map(i);
7354 sd->first_cpu = first_cpu(sd->span);
7355 cpus_and(sd->span, sd->span, *cpu_map); 7326 cpus_and(sd->span, sd->span, *cpu_map);
7356 sd->parent = p; 7327 sd->parent = p;
7357 p->child = sd; 7328 p->child = sd;
@@ -7364,7 +7335,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7364 SD_INIT(sd, SIBLING); 7335 SD_INIT(sd, SIBLING);
7365 set_domain_attribute(sd, attr); 7336 set_domain_attribute(sd, attr);
7366 sd->span = per_cpu(cpu_sibling_map, i); 7337 sd->span = per_cpu(cpu_sibling_map, i);
7367 sd->first_cpu = first_cpu(sd->span);
7368 cpus_and(sd->span, sd->span, *cpu_map); 7338 cpus_and(sd->span, sd->span, *cpu_map);
7369 sd->parent = p; 7339 sd->parent = p;
7370 p->child = sd; 7340 p->child = sd;
@@ -7407,7 +7377,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7407#endif 7377#endif
7408 7378
7409 /* Set up physical groups */ 7379 /* Set up physical groups */
7410 for (i = 0; i < MAX_NUMNODES; i++) { 7380 for (i = 0; i < nr_node_ids; i++) {
7411 SCHED_CPUMASK_VAR(nodemask, allmasks); 7381 SCHED_CPUMASK_VAR(nodemask, allmasks);
7412 SCHED_CPUMASK_VAR(send_covered, allmasks); 7382 SCHED_CPUMASK_VAR(send_covered, allmasks);
7413 7383
@@ -7431,7 +7401,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7431 send_covered, tmpmask); 7401 send_covered, tmpmask);
7432 } 7402 }
7433 7403
7434 for (i = 0; i < MAX_NUMNODES; i++) { 7404 for (i = 0; i < nr_node_ids; i++) {
7435 /* Set up node groups */ 7405 /* Set up node groups */
7436 struct sched_group *sg, *prev; 7406 struct sched_group *sg, *prev;
7437 SCHED_CPUMASK_VAR(nodemask, allmasks); 7407 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7470,9 +7440,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7470 cpus_or(*covered, *covered, *nodemask); 7440 cpus_or(*covered, *covered, *nodemask);
7471 prev = sg; 7441 prev = sg;
7472 7442
7473 for (j = 0; j < MAX_NUMNODES; j++) { 7443 for (j = 0; j < nr_node_ids; j++) {
7474 SCHED_CPUMASK_VAR(notcovered, allmasks); 7444 SCHED_CPUMASK_VAR(notcovered, allmasks);
7475 int n = (i + j) % MAX_NUMNODES; 7445 int n = (i + j) % nr_node_ids;
7476 node_to_cpumask_ptr(pnodemask, n); 7446 node_to_cpumask_ptr(pnodemask, n);
7477 7447
7478 cpus_complement(*notcovered, *covered); 7448 cpus_complement(*notcovered, *covered);
@@ -7525,7 +7495,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7525 } 7495 }
7526 7496
7527#ifdef CONFIG_NUMA 7497#ifdef CONFIG_NUMA
7528 for (i = 0; i < MAX_NUMNODES; i++) 7498 for (i = 0; i < nr_node_ids; i++)
7529 init_numa_sched_groups_power(sched_group_nodes[i]); 7499 init_numa_sched_groups_power(sched_group_nodes[i]);
7530 7500
7531 if (sd_allnodes) { 7501 if (sd_allnodes) {
@@ -7568,8 +7538,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
7568 7538
7569static cpumask_t *doms_cur; /* current sched domains */ 7539static cpumask_t *doms_cur; /* current sched domains */
7570static int ndoms_cur; /* number of sched domains in 'doms_cur' */ 7540static int ndoms_cur; /* number of sched domains in 'doms_cur' */
7571static struct sched_domain_attr *dattr_cur; /* attribues of custom domains 7541static struct sched_domain_attr *dattr_cur;
7572 in 'doms_cur' */ 7542 /* attribues of custom domains in 'doms_cur' */
7573 7543
7574/* 7544/*
7575 * Special case: If a kmalloc of a doms_cur partition (array of 7545 * Special case: If a kmalloc of a doms_cur partition (array of
@@ -7583,6 +7553,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7583} 7553}
7584 7554
7585/* 7555/*
7556 * Free current domain masks.
7557 * Called after all cpus are attached to NULL domain.
7558 */
7559static void free_sched_domains(void)
7560{
7561 ndoms_cur = 0;
7562 if (doms_cur != &fallback_doms)
7563 kfree(doms_cur);
7564 doms_cur = &fallback_doms;
7565}
7566
7567/*
7586 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7568 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7587 * For now this just excludes isolated cpus, but could be used to 7569 * For now this just excludes isolated cpus, but could be used to
7588 * exclude other special cases in the future. 7570 * exclude other special cases in the future.
@@ -7729,6 +7711,7 @@ int arch_reinit_sched_domains(void)
7729 get_online_cpus(); 7711 get_online_cpus();
7730 mutex_lock(&sched_domains_mutex); 7712 mutex_lock(&sched_domains_mutex);
7731 detach_destroy_domains(&cpu_online_map); 7713 detach_destroy_domains(&cpu_online_map);
7714 free_sched_domains();
7732 err = arch_init_sched_domains(&cpu_online_map); 7715 err = arch_init_sched_domains(&cpu_online_map);
7733 mutex_unlock(&sched_domains_mutex); 7716 mutex_unlock(&sched_domains_mutex);
7734 put_online_cpus(); 7717 put_online_cpus();
@@ -7797,7 +7780,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7797#endif 7780#endif
7798 return err; 7781 return err;
7799} 7782}
7800#endif 7783#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7801 7784
7802/* 7785/*
7803 * Force a reinitialization of the sched domains hierarchy. The domains 7786 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -7808,20 +7791,28 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7808static int update_sched_domains(struct notifier_block *nfb, 7791static int update_sched_domains(struct notifier_block *nfb,
7809 unsigned long action, void *hcpu) 7792 unsigned long action, void *hcpu)
7810{ 7793{
7794 int cpu = (int)(long)hcpu;
7795
7811 switch (action) { 7796 switch (action) {
7812 case CPU_UP_PREPARE:
7813 case CPU_UP_PREPARE_FROZEN:
7814 case CPU_DOWN_PREPARE: 7797 case CPU_DOWN_PREPARE:
7815 case CPU_DOWN_PREPARE_FROZEN: 7798 case CPU_DOWN_PREPARE_FROZEN:
7799 disable_runtime(cpu_rq(cpu));
7800 /* fall-through */
7801 case CPU_UP_PREPARE:
7802 case CPU_UP_PREPARE_FROZEN:
7816 detach_destroy_domains(&cpu_online_map); 7803 detach_destroy_domains(&cpu_online_map);
7804 free_sched_domains();
7817 return NOTIFY_OK; 7805 return NOTIFY_OK;
7818 7806
7819 case CPU_UP_CANCELED: 7807
7820 case CPU_UP_CANCELED_FROZEN:
7821 case CPU_DOWN_FAILED: 7808 case CPU_DOWN_FAILED:
7822 case CPU_DOWN_FAILED_FROZEN: 7809 case CPU_DOWN_FAILED_FROZEN:
7823 case CPU_ONLINE: 7810 case CPU_ONLINE:
7824 case CPU_ONLINE_FROZEN: 7811 case CPU_ONLINE_FROZEN:
7812 enable_runtime(cpu_rq(cpu));
7813 /* fall-through */
7814 case CPU_UP_CANCELED:
7815 case CPU_UP_CANCELED_FROZEN:
7825 case CPU_DEAD: 7816 case CPU_DEAD:
7826 case CPU_DEAD_FROZEN: 7817 case CPU_DEAD_FROZEN:
7827 /* 7818 /*
@@ -7832,8 +7823,16 @@ static int update_sched_domains(struct notifier_block *nfb,
7832 return NOTIFY_DONE; 7823 return NOTIFY_DONE;
7833 } 7824 }
7834 7825
7826#ifndef CONFIG_CPUSETS
7827 /*
7828 * Create default domain partitioning if cpusets are disabled.
7829 * Otherwise we let cpusets rebuild the domains based on the
7830 * current setup.
7831 */
7832
7835 /* The hotplug lock is already held by cpu_up/cpu_down */ 7833 /* The hotplug lock is already held by cpu_up/cpu_down */
7836 arch_init_sched_domains(&cpu_online_map); 7834 arch_init_sched_domains(&cpu_online_map);
7835#endif
7837 7836
7838 return NOTIFY_OK; 7837 return NOTIFY_OK;
7839} 7838}
@@ -7973,7 +7972,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7973 else 7972 else
7974 rt_se->rt_rq = parent->my_q; 7973 rt_se->rt_rq = parent->my_q;
7975 7974
7976 rt_se->rt_rq = &rq->rt;
7977 rt_se->my_q = rt_rq; 7975 rt_se->my_q = rt_rq;
7978 rt_se->parent = parent; 7976 rt_se->parent = parent;
7979 INIT_LIST_HEAD(&rt_se->run_list); 7977 INIT_LIST_HEAD(&rt_se->run_list);
@@ -8014,8 +8012,8 @@ void __init sched_init(void)
8014 8012
8015 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8013 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8016 ptr += nr_cpu_ids * sizeof(void **); 8014 ptr += nr_cpu_ids * sizeof(void **);
8017#endif 8015#endif /* CONFIG_USER_SCHED */
8018#endif 8016#endif /* CONFIG_FAIR_GROUP_SCHED */
8019#ifdef CONFIG_RT_GROUP_SCHED 8017#ifdef CONFIG_RT_GROUP_SCHED
8020 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8018 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
8021 ptr += nr_cpu_ids * sizeof(void **); 8019 ptr += nr_cpu_ids * sizeof(void **);
@@ -8029,12 +8027,11 @@ void __init sched_init(void)
8029 8027
8030 root_task_group.rt_rq = (struct rt_rq **)ptr; 8028 root_task_group.rt_rq = (struct rt_rq **)ptr;
8031 ptr += nr_cpu_ids * sizeof(void **); 8029 ptr += nr_cpu_ids * sizeof(void **);
8032#endif 8030#endif /* CONFIG_USER_SCHED */
8033#endif 8031#endif /* CONFIG_RT_GROUP_SCHED */
8034 } 8032 }
8035 8033
8036#ifdef CONFIG_SMP 8034#ifdef CONFIG_SMP
8037 init_aggregate();
8038 init_defrootdomain(); 8035 init_defrootdomain();
8039#endif 8036#endif
8040 8037
@@ -8047,8 +8044,8 @@ void __init sched_init(void)
8047#ifdef CONFIG_USER_SCHED 8044#ifdef CONFIG_USER_SCHED
8048 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8045 init_rt_bandwidth(&root_task_group.rt_bandwidth,
8049 global_rt_period(), RUNTIME_INF); 8046 global_rt_period(), RUNTIME_INF);
8050#endif 8047#endif /* CONFIG_USER_SCHED */
8051#endif 8048#endif /* CONFIG_RT_GROUP_SCHED */
8052 8049
8053#ifdef CONFIG_GROUP_SCHED 8050#ifdef CONFIG_GROUP_SCHED
8054 list_add(&init_task_group.list, &task_groups); 8051 list_add(&init_task_group.list, &task_groups);
@@ -8058,8 +8055,8 @@ void __init sched_init(void)
8058 INIT_LIST_HEAD(&root_task_group.children); 8055 INIT_LIST_HEAD(&root_task_group.children);
8059 init_task_group.parent = &root_task_group; 8056 init_task_group.parent = &root_task_group;
8060 list_add(&init_task_group.siblings, &root_task_group.children); 8057 list_add(&init_task_group.siblings, &root_task_group.children);
8061#endif 8058#endif /* CONFIG_USER_SCHED */
8062#endif 8059#endif /* CONFIG_GROUP_SCHED */
8063 8060
8064 for_each_possible_cpu(i) { 8061 for_each_possible_cpu(i) {
8065 struct rq *rq; 8062 struct rq *rq;
@@ -8139,6 +8136,7 @@ void __init sched_init(void)
8139 rq->next_balance = jiffies; 8136 rq->next_balance = jiffies;
8140 rq->push_cpu = 0; 8137 rq->push_cpu = 0;
8141 rq->cpu = i; 8138 rq->cpu = i;
8139 rq->online = 0;
8142 rq->migration_thread = NULL; 8140 rq->migration_thread = NULL;
8143 INIT_LIST_HEAD(&rq->migration_queue); 8141 INIT_LIST_HEAD(&rq->migration_queue);
8144 rq_attach_root(rq, &def_root_domain); 8142 rq_attach_root(rq, &def_root_domain);
@@ -8154,7 +8152,7 @@ void __init sched_init(void)
8154#endif 8152#endif
8155 8153
8156#ifdef CONFIG_SMP 8154#ifdef CONFIG_SMP
8157 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8155 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8158#endif 8156#endif
8159 8157
8160#ifdef CONFIG_RT_MUTEXES 8158#ifdef CONFIG_RT_MUTEXES
@@ -8378,7 +8376,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8378{ 8376{
8379 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8377 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8380} 8378}
8381#else 8379#else /* !CONFIG_FAIR_GROUP_SCHED */
8382static inline void free_fair_sched_group(struct task_group *tg) 8380static inline void free_fair_sched_group(struct task_group *tg)
8383{ 8381{
8384} 8382}
@@ -8396,7 +8394,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8396static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8394static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8397{ 8395{
8398} 8396}
8399#endif 8397#endif /* CONFIG_FAIR_GROUP_SCHED */
8400 8398
8401#ifdef CONFIG_RT_GROUP_SCHED 8399#ifdef CONFIG_RT_GROUP_SCHED
8402static void free_rt_sched_group(struct task_group *tg) 8400static void free_rt_sched_group(struct task_group *tg)
@@ -8467,7 +8465,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8467{ 8465{
8468 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8466 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8469} 8467}
8470#else 8468#else /* !CONFIG_RT_GROUP_SCHED */
8471static inline void free_rt_sched_group(struct task_group *tg) 8469static inline void free_rt_sched_group(struct task_group *tg)
8472{ 8470{
8473} 8471}
@@ -8485,7 +8483,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8485static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8483static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8486{ 8484{
8487} 8485}
8488#endif 8486#endif /* CONFIG_RT_GROUP_SCHED */
8489 8487
8490#ifdef CONFIG_GROUP_SCHED 8488#ifdef CONFIG_GROUP_SCHED
8491static void free_sched_group(struct task_group *tg) 8489static void free_sched_group(struct task_group *tg)
@@ -8596,7 +8594,7 @@ void sched_move_task(struct task_struct *tsk)
8596 8594
8597 task_rq_unlock(rq, &flags); 8595 task_rq_unlock(rq, &flags);
8598} 8596}
8599#endif 8597#endif /* CONFIG_GROUP_SCHED */
8600 8598
8601#ifdef CONFIG_FAIR_GROUP_SCHED 8599#ifdef CONFIG_FAIR_GROUP_SCHED
8602static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8600static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -8731,7 +8729,7 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8731 } 8729 }
8732 rcu_read_unlock(); 8730 rcu_read_unlock();
8733 8731
8734 return total + to_ratio(period, runtime) < 8732 return total + to_ratio(period, runtime) <=
8735 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8733 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period),
8736 parent->rt_bandwidth.rt_runtime); 8734 parent->rt_bandwidth.rt_runtime);
8737} 8735}
@@ -8834,6 +8832,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8834 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8832 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8835 rt_runtime = tg->rt_bandwidth.rt_runtime; 8833 rt_runtime = tg->rt_bandwidth.rt_runtime;
8836 8834
8835 if (rt_period == 0)
8836 return -EINVAL;
8837
8837 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8838 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8838} 8839}
8839 8840
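
The new rt_period == 0 check keeps tg_set_bandwidth()/to_ratio() away from a divide-by-zero when userspace writes 0 to the period. A sketch of the guard, assuming to_ratio() has the usual runtime-as-fraction-of-period shape (the shift width here is illustrative, not the kernel's):

#include <stdio.h>
#include <stdint.h>

#define EINVAL 22

/* Rough shape of to_ratio(): runtime as a fixed-point fraction of the period. */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << 16) / period_ns;
}

/*
 * Model of sched_group_set_rt_period(): convert to nanoseconds, then
 * refuse a zero period before any ratio is computed with it.
 */
static int set_rt_period_us(uint64_t *period_ns, uint64_t new_period_us)
{
	uint64_t p = new_period_us * 1000ULL;

	if (p == 0)
		return -EINVAL;     /* would make to_ratio() divide by zero */
	*period_ns = p;
	return 0;
}

int main(void)
{
	uint64_t period = 1000000000ULL;    /* 1s */

	printf("write 0us: %d\n", set_rt_period_us(&period, 0));
	printf("ratio(1s, 950ms): %llu\n",
	       (unsigned long long)to_ratio(period, 950000000ULL));
	return 0;
}
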
@@ -8848,16 +8849,21 @@ long sched_group_rt_period(struct task_group *tg)
8848 8849
8849static int sched_rt_global_constraints(void) 8850static int sched_rt_global_constraints(void)
8850{ 8851{
8852 struct task_group *tg = &root_task_group;
8853 u64 rt_runtime, rt_period;
8851 int ret = 0; 8854 int ret = 0;
8852 8855
8856 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8857 rt_runtime = tg->rt_bandwidth.rt_runtime;
8858
8853 mutex_lock(&rt_constraints_mutex); 8859 mutex_lock(&rt_constraints_mutex);
8854 if (!__rt_schedulable(NULL, 1, 0)) 8860 if (!__rt_schedulable(tg, rt_period, rt_runtime))
8855 ret = -EINVAL; 8861 ret = -EINVAL;
8856 mutex_unlock(&rt_constraints_mutex); 8862 mutex_unlock(&rt_constraints_mutex);
8857 8863
8858 return ret; 8864 return ret;
8859} 8865}
8860#else 8866#else /* !CONFIG_RT_GROUP_SCHED */
8861static int sched_rt_global_constraints(void) 8867static int sched_rt_global_constraints(void)
8862{ 8868{
8863 unsigned long flags; 8869 unsigned long flags;
@@ -8875,7 +8881,7 @@ static int sched_rt_global_constraints(void)
8875 8881
8876 return 0; 8882 return 0;
8877} 8883}
8878#endif 8884#endif /* CONFIG_RT_GROUP_SCHED */
8879 8885
8880int sched_rt_handler(struct ctl_table *table, int write, 8886int sched_rt_handler(struct ctl_table *table, int write,
8881 struct file *filp, void __user *buffer, size_t *lenp, 8887 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8983,7 +8989,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8983 8989
8984 return (u64) tg->shares; 8990 return (u64) tg->shares;
8985} 8991}
8986#endif 8992#endif /* CONFIG_FAIR_GROUP_SCHED */
8987 8993
8988#ifdef CONFIG_RT_GROUP_SCHED 8994#ifdef CONFIG_RT_GROUP_SCHED
8989static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 8995static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -9007,7 +9013,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
9007{ 9013{
9008 return sched_group_rt_period(cgroup_tg(cgrp)); 9014 return sched_group_rt_period(cgroup_tg(cgrp));
9009} 9015}
9010#endif 9016#endif /* CONFIG_RT_GROUP_SCHED */
9011 9017
9012static struct cftype cpu_files[] = { 9018static struct cftype cpu_files[] = {
9013#ifdef CONFIG_FAIR_GROUP_SCHED 9019#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9c597e37f7de..22ed55d1167f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,6 +3,9 @@
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
6 * Based on code by: 9 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
@@ -32,6 +35,11 @@
32 35
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 36#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34 37
38#define MULTI_SHIFT 15
39/* Max is double, Min is 1/2 */
40#define MAX_MULTI (2LL << MULTI_SHIFT)
41#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
42
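
MULTI_SHIFT introduces a Q15 fixed-point multiplier for scaling raw sched_clock() deltas, bounded between half speed (MIN_MULTI) and double speed (MAX_MULTI) per the comment above; the multiplier itself is recomputed elsewhere, outside this hunk. The arithmetic is plain shift-based fixed point, sketched standalone below (the clamp placement is an assumption):

#include <stdio.h>
#include <stdint.h>

#define MULTI_SHIFT 15
#define MAX_MULTI  (2LL << MULTI_SHIFT)        /* at most 2x   */
#define MIN_MULTI  (1LL << (MULTI_SHIFT - 1))  /* at least 0.5x */

/* Scale a raw clock delta by a Q15 fixed-point multiplier. */
static int64_t scale_delta(int64_t delta, int64_t multi)
{
	if (multi > MAX_MULTI)
		multi = MAX_MULTI;
	if (multi < MIN_MULTI)
		multi = MIN_MULTI;
	return (delta * multi) >> MULTI_SHIFT;
}

int main(void)
{
	int64_t one = 1 << MULTI_SHIFT;    /* multiplier of exactly 1.0 */

	printf("%lld\n", (long long)scale_delta(1000, one));     /* 1000            */
	printf("%lld\n", (long long)scale_delta(1000, one * 3)); /* clamped to 2000 */
	printf("%lld\n", (long long)scale_delta(1000, one / 4)); /* clamped to 500  */
	return 0;
}
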
35struct sched_clock_data { 43struct sched_clock_data {
36 /* 44 /*
37 * Raw spinlock - this is a special case: this might be called 45 * Raw spinlock - this is a special case: this might be called
@@ -40,11 +48,15 @@ struct sched_clock_data {
40 */ 48 */
41 raw_spinlock_t lock; 49 raw_spinlock_t lock;
42 50
43 unsigned long prev_jiffies; 51 unsigned long tick_jiffies;
44 u64 prev_raw; 52 u64 prev_raw;
45 u64 tick_raw; 53 u64 tick_raw;
46 u64 tick_gtod; 54 u64 tick_gtod;
47 u64 clock; 55 u64 clock;
56 s64 multi;
57#ifdef CONFIG_NO_HZ
58 int check_max;
59#endif
48}; 60};
49 61
50static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 62static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
@@ -59,49 +71,103 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
59 return &per_cpu(sched_clock_data, cpu); 71 return &per_cpu(sched_clock_data, cpu);
60} 72}
61 73
74static __read_mostly int sched_clock_running;
75
62void sched_clock_init(void) 76void sched_clock_init(void)
63{ 77{
64 u64 ktime_now = ktime_to_ns(ktime_get()); 78 u64 ktime_now = ktime_to_ns(ktime_get());
65 u64 now = 0; 79 unsigned long now_jiffies = jiffies;
66 int cpu; 80 int cpu;
67 81
68 for_each_possible_cpu(cpu) { 82 for_each_possible_cpu(cpu) {
69 struct sched_clock_data *scd = cpu_sdc(cpu); 83 struct sched_clock_data *scd = cpu_sdc(cpu);
70 84
71 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 85 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
72 scd->prev_jiffies = jiffies; 86 scd->tick_jiffies = now_jiffies;
73 scd->prev_raw = now; 87 scd->prev_raw = 0;
74 scd->tick_raw = now; 88 scd->tick_raw = 0;
75 scd->tick_gtod = ktime_now; 89 scd->tick_gtod = ktime_now;
76 scd->clock = ktime_now; 90 scd->clock = ktime_now;
91 scd->multi = 1 << MULTI_SHIFT;
92#ifdef CONFIG_NO_HZ
93 scd->check_max = 1;
94#endif
77 } 95 }
96
97 sched_clock_running = 1;
98}
99
100#ifdef CONFIG_NO_HZ
101/*
102 * Dynamic ticks make the delta jiffies inaccurate. This
103 * prevents us from checking the maximum time update.
104 * Disable the maximum check during stopped ticks.
105 */
106void sched_clock_tick_stop(int cpu)
107{
108 struct sched_clock_data *scd = cpu_sdc(cpu);
109
110 scd->check_max = 0;
111}
112
113void sched_clock_tick_start(int cpu)
114{
115 struct sched_clock_data *scd = cpu_sdc(cpu);
116
117 scd->check_max = 1;
78} 118}
79 119
120static int check_max(struct sched_clock_data *scd)
121{
122 return scd->check_max;
123}
124#else
125static int check_max(struct sched_clock_data *scd)
126{
127 return 1;
128}
129#endif /* CONFIG_NO_HZ */
130
80/* 131/*
81 * update the percpu scd from the raw @now value 132 * update the percpu scd from the raw @now value
82 * 133 *
83 * - filter out backward motion 134 * - filter out backward motion
84 * - use jiffies to generate a min,max window to clip the raw values 135 * - use jiffies to generate a min,max window to clip the raw values
85 */ 136 */
86static void __update_sched_clock(struct sched_clock_data *scd, u64 now) 137static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time)
87{ 138{
88 unsigned long now_jiffies = jiffies; 139 unsigned long now_jiffies = jiffies;
89 long delta_jiffies = now_jiffies - scd->prev_jiffies; 140 long delta_jiffies = now_jiffies - scd->tick_jiffies;
90 u64 clock = scd->clock; 141 u64 clock = scd->clock;
91 u64 min_clock, max_clock; 142 u64 min_clock, max_clock;
92 s64 delta = now - scd->prev_raw; 143 s64 delta = now - scd->prev_raw;
93 144
94 WARN_ON_ONCE(!irqs_disabled()); 145 WARN_ON_ONCE(!irqs_disabled());
95 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; 146
147 /*
148 * At the scheduler tick the clock can be just under the gtod. We don't
149 * want to push it forward prematurely.
150 */
151 min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
152 if (min_clock > TICK_NSEC)
153 min_clock -= TICK_NSEC / 2;
96 154
97 if (unlikely(delta < 0)) { 155 if (unlikely(delta < 0)) {
98 clock++; 156 clock++;
99 goto out; 157 goto out;
100 } 158 }
101 159
102 max_clock = min_clock + TICK_NSEC; 160 /*
161 * The clock must stay within a jiffy of the gtod.
162 * But since we may be at the start of a jiffy or the end of one,
163 * we add another jiffy of buffer.
164 */
165 max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
103 166
104 if (unlikely(clock + delta > max_clock)) { 167 delta *= scd->multi;
168 delta >>= MULTI_SHIFT;
169
170 if (unlikely(clock + delta > max_clock) && check_max(scd)) {
105 if (clock < max_clock) 171 if (clock < max_clock)
106 clock = max_clock; 172 clock = max_clock;
107 else 173 else
@@ -114,9 +180,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now)
114 if (unlikely(clock < min_clock)) 180 if (unlikely(clock < min_clock))
115 clock = min_clock; 181 clock = min_clock;
116 182
117 scd->prev_raw = now; 183 if (time)
118 scd->prev_jiffies = now_jiffies; 184 *time = clock;
119 scd->clock = clock; 185 else {
186 scd->prev_raw = now;
187 scd->clock = clock;
188 }
120} 189}
121 190
122static void lock_double_clock(struct sched_clock_data *data1, 191static void lock_double_clock(struct sched_clock_data *data1,
@@ -136,6 +205,9 @@ u64 sched_clock_cpu(int cpu)
136 struct sched_clock_data *scd = cpu_sdc(cpu); 205 struct sched_clock_data *scd = cpu_sdc(cpu);
137 u64 now, clock; 206 u64 now, clock;
138 207
208 if (unlikely(!sched_clock_running))
209 return 0ull;
210
139 WARN_ON_ONCE(!irqs_disabled()); 211 WARN_ON_ONCE(!irqs_disabled());
140 now = sched_clock(); 212 now = sched_clock();
141 213
@@ -153,41 +225,64 @@ u64 sched_clock_cpu(int cpu)
153 now -= my_scd->tick_raw; 225 now -= my_scd->tick_raw;
154 now += scd->tick_raw; 226 now += scd->tick_raw;
155 227
156 now -= my_scd->tick_gtod; 228 now += my_scd->tick_gtod;
157 now += scd->tick_gtod; 229 now -= scd->tick_gtod;
158 230
159 __raw_spin_unlock(&my_scd->lock); 231 __raw_spin_unlock(&my_scd->lock);
232
233 __update_sched_clock(scd, now, &clock);
234
235 __raw_spin_unlock(&scd->lock);
236
160 } else { 237 } else {
161 __raw_spin_lock(&scd->lock); 238 __raw_spin_lock(&scd->lock);
239 __update_sched_clock(scd, now, NULL);
240 clock = scd->clock;
241 __raw_spin_unlock(&scd->lock);
162 } 242 }
163 243
164 __update_sched_clock(scd, now);
165 clock = scd->clock;
166
167 __raw_spin_unlock(&scd->lock);
168
169 return clock; 244 return clock;
170} 245}
171 246
172void sched_clock_tick(void) 247void sched_clock_tick(void)
173{ 248{
174 struct sched_clock_data *scd = this_scd(); 249 struct sched_clock_data *scd = this_scd();
250 unsigned long now_jiffies = jiffies;
251 s64 mult, delta_gtod, delta_raw;
175 u64 now, now_gtod; 252 u64 now, now_gtod;
176 253
254 if (unlikely(!sched_clock_running))
255 return;
256
177 WARN_ON_ONCE(!irqs_disabled()); 257 WARN_ON_ONCE(!irqs_disabled());
178 258
179 now = sched_clock();
180 now_gtod = ktime_to_ns(ktime_get()); 259 now_gtod = ktime_to_ns(ktime_get());
260 now = sched_clock();
181 261
182 __raw_spin_lock(&scd->lock); 262 __raw_spin_lock(&scd->lock);
183 __update_sched_clock(scd, now); 263 __update_sched_clock(scd, now, NULL);
184 /* 264 /*
185 * update tick_gtod after __update_sched_clock() because that will 265 * update tick_gtod after __update_sched_clock() because that will
186 * already observe 1 new jiffy; adding a new tick_gtod to that would 266 * already observe 1 new jiffy; adding a new tick_gtod to that would
187 * increase the clock 2 jiffies. 267 * increase the clock 2 jiffies.
188 */ 268 */
269 delta_gtod = now_gtod - scd->tick_gtod;
270 delta_raw = now - scd->tick_raw;
271
272 if ((long)delta_raw > 0) {
273 mult = delta_gtod << MULTI_SHIFT;
274 do_div(mult, delta_raw);
275 scd->multi = mult;
276 if (scd->multi > MAX_MULTI)
277 scd->multi = MAX_MULTI;
278 else if (scd->multi < MIN_MULTI)
279 scd->multi = MIN_MULTI;
280 } else
281 scd->multi = 1 << MULTI_SHIFT;
282
189 scd->tick_raw = now; 283 scd->tick_raw = now;
190 scd->tick_gtod = now_gtod; 284 scd->tick_gtod = now_gtod;
285 scd->tick_jiffies = now_jiffies;
191 __raw_spin_unlock(&scd->lock); 286 __raw_spin_unlock(&scd->lock);
192} 287}
193 288
@@ -217,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
217 __raw_spin_lock(&scd->lock); 312 __raw_spin_lock(&scd->lock);
218 scd->prev_raw = now; 313 scd->prev_raw = now;
219 scd->clock += delta_ns; 314 scd->clock += delta_ns;
315 scd->multi = 1 << MULTI_SHIFT;
220 __raw_spin_unlock(&scd->lock); 316 __raw_spin_unlock(&scd->lock);
221 317
222 touch_softlockup_watchdog(); 318 touch_softlockup_watchdog();
@@ -234,3 +330,16 @@ unsigned long long __attribute__((weak)) sched_clock(void)
234{ 330{
235 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
236} 332}
333
334unsigned long long cpu_clock(int cpu)
335{
336 unsigned long long clock;
337 unsigned long flags;
338
339 local_irq_save(flags);
340 clock = sched_clock_cpu(cpu);
341 local_irq_restore(flags);
342
343 return clock;
344}
345EXPORT_SYMBOL_GPL(cpu_clock);
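The sched_clock.c changes above derive a 15-bit fixed-point multiplier from the gtod-vs-raw delta seen at each tick, then scale the inter-tick deltas with it and clamp the result into a window around the gtod. Below is a minimal standalone sketch of that arithmetic, not kernel code; the 1 ms TICK_NSEC and the sample numbers are assumptions.

/* toy_sched_clock.c: standalone sketch, not kernel code */
#include <stdint.h>
#include <stdio.h>

#define MULTI_SHIFT	15
#define MAX_MULTI	(2LL << MULTI_SHIFT)		/* at most double the raw delta */
#define MIN_MULTI	(1LL << (MULTI_SHIFT - 1))	/* at least halve it */
#define TICK_NSEC	1000000LL			/* assumed 1 ms tick */

/* At tick time: multiplier = (gtod delta / raw delta) in 15-bit fixed point. */
static int64_t tick_multi(int64_t delta_gtod, int64_t delta_raw)
{
	int64_t mult;

	if (delta_raw <= 0)
		return 1LL << MULTI_SHIFT;

	mult = (delta_gtod << MULTI_SHIFT) / delta_raw;
	if (mult > MAX_MULTI)
		mult = MAX_MULTI;
	else if (mult < MIN_MULTI)
		mult = MIN_MULTI;
	return mult;
}

/* Between ticks: scale the raw delta and clamp into the per-tick window. */
static uint64_t advance_clock(uint64_t clock, int64_t delta, int64_t multi,
			      uint64_t tick_gtod, long delta_jiffies)
{
	uint64_t min_clock = tick_gtod + delta_jiffies * TICK_NSEC;
	uint64_t max_clock = tick_gtod + (2 + delta_jiffies) * TICK_NSEC;

	if (min_clock > TICK_NSEC)		/* don't push past the gtod prematurely */
		min_clock -= TICK_NSEC / 2;

	if (delta < 0) {			/* filter out backward motion */
		clock++;
	} else {
		delta = (delta * multi) >> MULTI_SHIFT;
		if (clock + delta > max_clock)
			clock = clock < max_clock ? max_clock : clock + 1;
		else
			clock += delta;
	}

	return clock < min_clock ? min_clock : clock;
}

int main(void)
{
	/* a TSC that ran 20% slow over the last tick gets multi ~ 1.25 */
	int64_t multi = tick_multi(1000000, 800000);

	printf("multi = %lld/32768, clock = %llu\n", (long long)multi,
	       (unsigned long long)advance_clock(5000000, 400000, multi,
						 5000000, 0));
	return 0;
}

With these numbers the raw delta of 400 us is stretched to 500 us of clock time, while still being boxed into the [min_clock, max_clock] window the patch maintains.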
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2-dimensional bitmap (the first dimension for priority class, the second
18 * for CPUs in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140-based task->prio and our 102-based cpupri value */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
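To make the two-level lookup concrete, here is a userspace sketch of the same idea, not kernel code; the toy_* names and the 8-CPU/8-level sizes are made up. A bitmap records which priority levels are occupied and a per-level CPU mask records who sits there, so finding a CPU running at lower priority is a couple of bit searches.

/* toy_cpupri.c: standalone sketch of the cpupri idea, not kernel code */
#include <stdint.h>
#include <stdio.h>
#include <strings.h>		/* ffs() */

#define NR_CPUS	8
#define NR_PRI	8		/* 0 = idle, 1 = normal, 2.. = "RT" levels */

struct toy_cpupri {
	uint32_t pri_active;		/* bit i set when level i has CPUs */
	uint8_t  level_mask[NR_PRI];	/* which CPUs sit at each level    */
	int      cpu_level[NR_CPUS];	/* current level of each CPU       */
};

static void toy_set(struct toy_cpupri *cp, int cpu, int newpri)
{
	int oldpri = cp->cpu_level[cpu];

	/* unmap the old level first, as cpupri_set() does */
	if (oldpri >= 0) {
		cp->level_mask[oldpri] &= ~(1u << cpu);
		if (!cp->level_mask[oldpri])
			cp->pri_active &= ~(1u << oldpri);
	}
	cp->level_mask[newpri] |= 1u << cpu;
	cp->pri_active |= 1u << newpri;
	cp->cpu_level[cpu] = newpri;
}

/* Mask of allowed CPUs running at a strictly lower level than task_pri. */
static uint8_t toy_find(struct toy_cpupri *cp, int task_pri, uint8_t allowed)
{
	uint32_t active = cp->pri_active;

	while (active) {
		int idx = ffs(active) - 1;	/* lowest occupied level */

		if (idx >= task_pri)
			break;
		if (cp->level_mask[idx] & allowed)
			return cp->level_mask[idx] & allowed;
		active &= ~(1u << idx);
	}
	return 0;
}

int main(void)
{
	struct toy_cpupri cp = { 0 };
	int i;

	for (i = 0; i < NR_CPUS; i++)
		cp.cpu_level[i] = -1;

	toy_set(&cp, 0, 1);	/* cpu0 runs a SCHED_OTHER task */
	toy_set(&cp, 1, 5);	/* cpu1 runs a higher RT task   */
	printf("push target mask = %#x\n",
	       (unsigned)toy_find(&cp, 5, 0xff));	/* -> cpu0 only */
	return 0;
}

In the patch itself this is what lets find_lowest_rq() call cpupri_find() to fill lowest_mask instead of scanning every runqueue, as the removed find_lowest_cpus() did.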
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init(cp) do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
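A quick sketch of the index mapping the header comment describes; the MAX_RT_PRIO/MAX_PRIO values are the usual 100/140 from <linux/sched.h> and are assumed here, not taken from this patch.

/* toy_convert_prio.c: sketch of the 140-based -> 102-based mapping */
#include <stdio.h>

#define MAX_RT_PRIO	100
#define MAX_PRIO	(MAX_RT_PRIO + 40)

#define CPUPRI_INVALID	-1
#define CPUPRI_IDLE	0
#define CPUPRI_NORMAL	1

static int convert_prio(int prio)
{
	if (prio == CPUPRI_INVALID)
		return CPUPRI_INVALID;
	if (prio == MAX_PRIO)
		return CPUPRI_IDLE;
	if (prio >= MAX_RT_PRIO)
		return CPUPRI_NORMAL;
	return MAX_RT_PRIO - prio + 1;	/* task->prio 99..0 -> cpupri 2..101 */
}

int main(void)
{
	printf("idle=%d nice0=%d rt1=%d rt99=%d\n",
	       convert_prio(MAX_PRIO),		/* 0   */
	       convert_prio(120),		/* 1   */
	       convert_prio(98),		/* 3   */
	       convert_prio(0));		/* 101 */
	return 0;
}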
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..bbe6b31c3c56 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -162,8 +162,23 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS 164#ifdef CONFIG_SCHEDSTATS
165 SEQ_printf(m, " .%-30s: %d\n", "bkl_count", 165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166 rq->bkl_count); 166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
167#endif 182#endif
168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
169 cfs_rq->nr_spread_over); 184 cfs_rq->nr_spread_over);
@@ -174,6 +189,39 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
174#endif 189#endif
175} 190}
176 191
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg;
198
199 if (tg)
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else
207 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
208#endif
209
210
211#define P(x) \
212 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
213#define PN(x) \
214 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
215
216 P(rt_nr_running);
217 P(rt_throttled);
218 PN(rt_time);
219 PN(rt_runtime);
220
221#undef PN
222#undef P
223}
224
177static void print_cpu(struct seq_file *m, int cpu) 225static void print_cpu(struct seq_file *m, int cpu)
178{ 226{
179 struct rq *rq = &per_cpu(runqueues, cpu); 227 struct rq *rq = &per_cpu(runqueues, cpu);
@@ -213,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
213#undef PN 261#undef PN
214 262
215 print_cfs_stats(m, cpu); 263 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu);
216 265
217 print_rq(m, rq, cpu); 266 print_rq(m, rq, cpu);
218} 267}
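The schedstat dump above leans on a local stringizing macro instead of spelling out each SEQ_printf by hand. A minimal userspace sketch of that pattern follows; the toy struct and plain printf stand in for rq and SEQ_printf and are not part of the patch.

/* toy_schedstat_dump.c: sketch of the stringizing P() macro pattern */
#include <stdio.h>

struct toy_rq {
	int yld_count;
	int sched_switch;
	int ttwu_count;
};

int main(void)
{
	struct toy_rq rq = { .yld_count = 3, .sched_switch = 42, .ttwu_count = 7 };

#define P(n) printf("  .%-30s: %d\n", #n, rq.n)
	P(yld_count);
	P(sched_switch);
	P(ttwu_count);
#undef P
	return 0;
}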
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..f2aa987027d6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 10000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -429,12 +429,38 @@ calc_delta_asym(unsigned long delta, struct sched_entity *se)
429 429
430 for_each_sched_entity(se) { 430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load; 431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_GROUP_SCHED
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL;
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
432 452
433 if (se->load.weight < NICE_0_LOAD)
434 se_lw = &lw; 453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
435 457
436 delta = calc_delta_mine(delta, 458 if (se->load.weight < NICE_0_LOAD) {
437 cfs_rq_of(se)->load.weight, se_lw); 459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
438 } 464 }
439 465
440 return delta; 466 return delta;
@@ -700,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
700 __enqueue_entity(cfs_rq, se); 726 __enqueue_entity(cfs_rq, se);
701} 727}
702 728
703static void update_avg(u64 *avg, u64 sample)
704{
705 s64 diff = sample - *avg;
706 *avg += diff >> 3;
707}
708
709static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
710{
711 if (!se->last_wakeup)
712 return;
713
714 update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
715 se->last_wakeup = 0;
716}
717
718static void 729static void
719dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 730dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
720{ 731{
@@ -725,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
725 736
726 update_stats_dequeue(cfs_rq, se); 737 update_stats_dequeue(cfs_rq, se);
727 if (sleep) { 738 if (sleep) {
728 update_avg_stats(cfs_rq, se);
729#ifdef CONFIG_SCHEDSTATS 739#ifdef CONFIG_SCHEDSTATS
730 if (entity_is_task(se)) { 740 if (entity_is_task(se)) {
731 struct task_struct *tsk = task_of(se); 741 struct task_struct *tsk = task_of(se);
@@ -787,17 +797,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
787 se->prev_sum_exec_runtime = se->sum_exec_runtime; 797 se->prev_sum_exec_runtime = se->sum_exec_runtime;
788} 798}
789 799
790static int
791wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
792
793static struct sched_entity * 800static struct sched_entity *
794pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 801pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
795{ 802{
796 if (!cfs_rq->next) 803 struct rq *rq = rq_of(cfs_rq);
797 return se; 804 u64 pair_slice = rq->clock - cfs_rq->pair_start;
798 805
799 if (wakeup_preempt_entity(cfs_rq->next, se) != 0) 806 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
807 cfs_rq->pair_start = rq->clock;
800 return se; 808 return se;
809 }
801 810
802 return cfs_rq->next; 811 return cfs_rq->next;
803} 812}
@@ -896,7 +905,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
896 hrtick_start(rq, delta, requeue); 905 hrtick_start(rq, delta, requeue);
897 } 906 }
898} 907}
899#else 908#else /* !CONFIG_SCHED_HRTICK */
900static inline void 909static inline void
901hrtick_start_fair(struct rq *rq, struct task_struct *p) 910hrtick_start_fair(struct rq *rq, struct task_struct *p)
902{ 911{
@@ -1037,7 +1046,7 @@ static int wake_idle(int cpu, struct task_struct *p)
1037 } 1046 }
1038 return cpu; 1047 return cpu;
1039} 1048}
1040#else 1049#else /* !ARCH_HAS_SCHED_WAKE_IDLE */
1041static inline int wake_idle(int cpu, struct task_struct *p) 1050static inline int wake_idle(int cpu, struct task_struct *p)
1042{ 1051{
1043 return cpu; 1052 return cpu;
@@ -1048,6 +1057,89 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1048 1057
1049static const struct sched_class fair_sched_class; 1058static const struct sched_class fair_sched_class;
1050 1059
1060#ifdef CONFIG_FAIR_GROUP_SCHED
1061/*
1062 * effective_load() calculates the load change as seen from the root_task_group
1063 *
1064 * Adding load to a group doesn't make a group heavier, but can cause movement
1065 * of group shares between cpus. Assuming the shares were perfectly aligned one
1066 * can calculate the shift in shares.
1067 *
1068 * The problem is that perfectly aligning the shares is rather expensive, hence
1069 * we try to avoid doing that too often - see update_shares(), which ratelimits
1070 * this change.
1071 *
1072 * We compensate this by not only taking the current delta into account, but
1073 * also considering the delta between when the shares were last adjusted and
1074 * now.
1075 *
1076 * We still saw a performance dip; some tracing showed us that between
1077 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1078 * significantly. Therefore try to bias the error in the direction of failing
1079 * the affine wakeup.
1080 *
1081 */
1082static long effective_load(struct task_group *tg, int cpu,
1083 long wl, long wg)
1084{
1085 struct sched_entity *se = tg->se[cpu];
1086 long more_w;
1087
1088 if (!tg->parent)
1089 return wl;
1090
1091 /*
1092 * By not taking the decrease of shares on the other cpu into
1093 * account, our error leans towards reducing the affine wakeups.
1094 */
1095 if (!wl && sched_feat(ASYM_EFF_LOAD))
1096 return wl;
1097
1098 /*
1099 * Instead of using this increment, also add the difference
1100 * between when the shares were last updated and now.
1101 */
1102 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1103 wl += more_w;
1104 wg += more_w;
1105
1106 for_each_sched_entity(se) {
1107#define D(n) (likely(n) ? (n) : 1)
1108
1109 long S, rw, s, a, b;
1110
1111 S = se->my_q->tg->shares;
1112 s = se->my_q->shares;
1113 rw = se->my_q->rq_weight;
1114
1115 a = S*(rw + wl);
1116 b = S*rw + s*wg;
1117
1118 wl = s*(a-b)/D(b);
1119 /*
1120 * Assume the group is already running and will
1121 * thus already be accounted for in the weight.
1122 *
1123 * That is, moving shares between CPUs, does not
1124 * alter the group weight.
1125 */
1126 wg = 0;
1127#undef D
1128 }
1129
1130 return wl;
1131}
1132
1133#else
1134
1135static inline unsigned long effective_load(struct task_group *tg, int cpu,
1136 unsigned long wl, unsigned long wg)
1137{
1138 return wl;
1139}
1140
1141#endif
1142
1051static int 1143static int
1052wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1144wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1053 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1145 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
@@ -1055,36 +1147,50 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1055 unsigned int imbalance) 1147 unsigned int imbalance)
1056{ 1148{
1057 struct task_struct *curr = this_rq->curr; 1149 struct task_struct *curr = this_rq->curr;
1150 struct task_group *tg;
1058 unsigned long tl = this_load; 1151 unsigned long tl = this_load;
1059 unsigned long tl_per_task; 1152 unsigned long tl_per_task;
1153 unsigned long weight;
1154 int balanced;
1060 1155
1061 if (!(this_sd->flags & SD_WAKE_AFFINE)) 1156 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1062 return 0; 1157 return 0;
1063 1158
1064 /* 1159 /*
1160 * If sync wakeup then subtract the (maximum possible)
1161 * effect of the currently running task from the load
1162 * of the current CPU:
1163 */
1164 if (sync) {
1165 tg = task_group(current);
1166 weight = current->se.load.weight;
1167
1168 tl += effective_load(tg, this_cpu, -weight, -weight);
1169 load += effective_load(tg, prev_cpu, 0, -weight);
1170 }
1171
1172 tg = task_group(p);
1173 weight = p->se.load.weight;
1174
1175 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1176 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1177
1178 /*
1065 * If the currently running task will sleep within 1179 * If the currently running task will sleep within
1066 * a reasonable amount of time then attract this newly 1180 * a reasonable amount of time then attract this newly
1067 * woken task: 1181 * woken task:
1068 */ 1182 */
1069 if (sync && curr->sched_class == &fair_sched_class) { 1183 if (sync && balanced) {
1070 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1184 if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1071 p->se.avg_overlap < sysctl_sched_migration_cost) 1185 p->se.avg_overlap < sysctl_sched_migration_cost)
1072 return 1; 1186 return 1;
1073 } 1187 }
1074 1188
1075 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1189 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1076 tl_per_task = cpu_avg_load_per_task(this_cpu); 1190 tl_per_task = cpu_avg_load_per_task(this_cpu);
1077 1191
1078 /*
1079 * If sync wakeup then subtract the (maximum possible)
1080 * effect of the currently running task from the load
1081 * of the current CPU:
1082 */
1083 if (sync)
1084 tl -= current->se.load.weight;
1085
1086 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1192 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
1087 100*(tl + p->se.load.weight) <= imbalance*load) { 1193 balanced) {
1088 /* 1194 /*
1089 * This domain has SD_WAKE_AFFINE and 1195 * This domain has SD_WAKE_AFFINE and
1090 * p is cache cold in this domain, and 1196 * p is cache cold in this domain, and
@@ -1172,7 +1278,10 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1172 * More easily preempt - nice tasks, while not making it harder for 1278 * More easily preempt - nice tasks, while not making it harder for
1173 * + nice tasks. 1279 * + nice tasks.
1174 */ 1280 */
1175 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1281 if (sched_feat(ASYM_GRAN))
1282 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
1283 else
1284 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1176 1285
1177 return gran; 1286 return gran;
1178} 1287}
@@ -1234,7 +1343,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1234 return; 1343 return;
1235 } 1344 }
1236 1345
1237 se->last_wakeup = se->sum_exec_runtime;
1238 if (unlikely(se == pse)) 1346 if (unlikely(se == pse))
1239 return; 1347 return;
1240 1348
@@ -1332,23 +1440,18 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1332 struct task_struct *p = NULL; 1440 struct task_struct *p = NULL;
1333 struct sched_entity *se; 1441 struct sched_entity *se;
1334 1442
1335 if (next == &cfs_rq->tasks) 1443 while (next != &cfs_rq->tasks) {
1336 return NULL;
1337
1338 /* Skip over entities that are not tasks */
1339 do {
1340 se = list_entry(next, struct sched_entity, group_node); 1444 se = list_entry(next, struct sched_entity, group_node);
1341 next = next->next; 1445 next = next->next;
1342 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1343 1446
1344 if (next == &cfs_rq->tasks) 1447 /* Skip over entities that are not tasks */
1345 return NULL; 1448 if (entity_is_task(se)) {
1449 p = task_of(se);
1450 break;
1451 }
1452 }
1346 1453
1347 cfs_rq->balance_iterator = next; 1454 cfs_rq->balance_iterator = next;
1348
1349 if (entity_is_task(se))
1350 p = task_of(se);
1351
1352 return p; 1455 return p;
1353} 1456}
1354 1457
@@ -1395,40 +1498,32 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1395 struct task_group *tg; 1498 struct task_group *tg;
1396 1499
1397 rcu_read_lock(); 1500 rcu_read_lock();
1501 update_h_load(busiest_cpu);
1502
1398 list_for_each_entry(tg, &task_groups, list) { 1503 list_for_each_entry(tg, &task_groups, list) {
1399 long imbalance; 1504 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1400 unsigned long this_weight, busiest_weight; 1505 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1401 long rem_load, max_load, moved_load; 1506 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
1507 u64 rem_load, moved_load;
1402 1508
1403 /* 1509 /*
1404 * empty group 1510 * empty group
1405 */ 1511 */
1406 if (!aggregate(tg, sd)->task_weight) 1512 if (!busiest_cfs_rq->task_weight)
1407 continue; 1513 continue;
1408 1514
1409 rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; 1515 rem_load = (u64)rem_load_move * busiest_weight;
1410 rem_load /= aggregate(tg, sd)->load + 1; 1516 rem_load = div_u64(rem_load, busiest_h_load + 1);
1411 1517
1412 this_weight = tg->cfs_rq[this_cpu]->task_weight;
1413 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1414
1415 imbalance = (busiest_weight - this_weight) / 2;
1416
1417 if (imbalance < 0)
1418 imbalance = busiest_weight;
1419
1420 max_load = max(rem_load, imbalance);
1421 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 1518 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1422 max_load, sd, idle, all_pinned, this_best_prio, 1519 rem_load, sd, idle, all_pinned, this_best_prio,
1423 tg->cfs_rq[busiest_cpu]); 1520 tg->cfs_rq[busiest_cpu]);
1424 1521
1425 if (!moved_load) 1522 if (!moved_load)
1426 continue; 1523 continue;
1427 1524
1428 move_group_shares(tg, sd, busiest_cpu, this_cpu); 1525 moved_load *= busiest_h_load;
1429 1526 moved_load = div_u64(moved_load, busiest_weight + 1);
1430 moved_load *= aggregate(tg, sd)->load;
1431 moved_load /= aggregate(tg, sd)->rq_weight + 1;
1432 1527
1433 rem_load_move -= moved_load; 1528 rem_load_move -= moved_load;
1434 if (rem_load_move < 0) 1529 if (rem_load_move < 0)
@@ -1474,7 +1569,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1474 1569
1475 return 0; 1570 return 0;
1476} 1571}
1477#endif 1572#endif /* CONFIG_SMP */
1478 1573
1479/* 1574/*
1480 * scheduler tick hitting a task of our scheduling class: 1575 * scheduler tick hitting a task of our scheduling class:
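The arithmetic inside effective_load() above is compact enough to check by hand. The sketch below is standalone with assumed share and weight numbers, not kernel code: with group shares S, this cpu's shares s and runqueue weight rw, adding wl to this cpu and wg to the group moves the root-visible weight by roughly s*(a - b)/b where a = S*(rw + wl) and b = S*rw + s*wg; the result feeds the next level with wg reset to 0, as the loop in the patch does.

/* toy_effective_load.c: one level of the effective_load() update */
#include <stdio.h>

static long level_delta(long S, long s, long rw, long wl, long wg)
{
	long a = S * (rw + wl);
	long b = S * rw + s * wg;

	if (!b)			/* the D() guard in the patch */
		b = 1;
	return s * (a - b) / b;
}

int main(void)
{
	/* one task of weight 1024 woken into a group with S=1024 shares */
	long wl = 1024, wg = 1024;
	long S = 1024, s = 512, rw = 2048;

	wl = level_delta(S, s, rw, wl, wg);
	wg = 0;			/* upper levels see no extra group weight */
	printf("root-visible delta fed to the next level: wl=%ld\n", wl);
	return 0;
}

wake_affine() above then uses two such evaluations, comparing 100*(tl + effective_load(tg, this_cpu, weight, weight)) against imbalance*(load + effective_load(tg, prev_cpu, 0, weight)) to decide whether an affine wakeup still looks balanced.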
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..862b06bd560a 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1) 3SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1) 4SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -6,5 +7,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
10SCHED_FEAT(DEADLINE, 1) 11SCHED_FEAT(LB_BIAS, 0)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1)
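For reference, a feature list in this style is typically consumed by redefining SCHED_FEAT before including the file, once to build an enum of bit numbers and once to build the default bitmask that sched_feat() tests. The standalone sketch below reproduces that X-macro trick with the list inlined instead of #included; the toy names are assumptions, not the kernel's exact definitions.

/* toy_features.c: sketch of the SCHED_FEAT()/sched_feat() X-macro pattern */
#include <stdio.h>

/* stand-in for #include "sched_features.h" */
#define FEATURE_LIST(F)			\
	F(NEW_FAIR_SLEEPERS, 1)		\
	F(ASYM_GRAN, 1)			\
	F(LB_BIAS, 0)

/* pass 1: an enum giving each feature a bit number */
#define F_ENUM(name, enabled)	__FEAT_##name,
enum { FEATURE_LIST(F_ENUM) };
#undef F_ENUM

/* pass 2: the default bitmask, analogous to sysctl_sched_features */
#define F_MASK(name, enabled)	((enabled) << __FEAT_##name) |
static const unsigned int features = FEATURE_LIST(F_MASK) 0;
#undef F_MASK

#define feat(name)	(features & (1u << __FEAT_##name))

int main(void)
{
	printf("ASYM_GRAN=%d LB_BIAS=%d\n", !!feat(ASYM_GRAN), !!feat(LB_BIAS));
	return 0;
}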
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..47ceac9e8552 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
155 return &rt_rq->tg->rt_bandwidth; 161 return &rt_rq->tg->rt_bandwidth;
156} 162}
157 163
158#else 164#else /* !CONFIG_RT_GROUP_SCHED */
159 165
160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 166static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
161{ 167{
@@ -220,48 +226,10 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
220 return &def_rt_bandwidth; 226 return &def_rt_bandwidth;
221} 227}
222 228
223#endif 229#endif /* CONFIG_RT_GROUP_SCHED */
224
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{
227 int i, idle = 1;
228 cpumask_t span;
229
230 if (rt_b->rt_runtime == RUNTIME_INF)
231 return 1;
232
233 span = sched_rt_period_mask();
234 for_each_cpu_mask(i, span) {
235 int enqueue = 0;
236 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
237 struct rq *rq = rq_of_rt_rq(rt_rq);
238
239 spin_lock(&rq->lock);
240 if (rt_rq->rt_time) {
241 u64 runtime;
242
243 spin_lock(&rt_rq->rt_runtime_lock);
244 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
247 rt_rq->rt_throttled = 0;
248 enqueue = 1;
249 }
250 if (rt_rq->rt_time || rt_rq->rt_nr_running)
251 idle = 0;
252 spin_unlock(&rt_rq->rt_runtime_lock);
253 }
254
255 if (enqueue)
256 sched_rt_rq_enqueue(rt_rq);
257 spin_unlock(&rq->lock);
258 }
259
260 return idle;
261}
262 230
263#ifdef CONFIG_SMP 231#ifdef CONFIG_SMP
264static int balance_runtime(struct rt_rq *rt_rq) 232static int do_balance_runtime(struct rt_rq *rt_rq)
265{ 233{
266 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
267 struct root_domain *rd = cpu_rq(smp_processor_id())->rd; 235 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
@@ -280,6 +248,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
280 continue; 248 continue;
281 249
282 spin_lock(&iter->rt_runtime_lock); 250 spin_lock(&iter->rt_runtime_lock);
251 if (iter->rt_runtime == RUNTIME_INF)
252 goto next;
253
283 diff = iter->rt_runtime - iter->rt_time; 254 diff = iter->rt_runtime - iter->rt_time;
284 if (diff > 0) { 255 if (diff > 0) {
285 do_div(diff, weight); 256 do_div(diff, weight);
@@ -293,13 +264,163 @@ static int balance_runtime(struct rt_rq *rt_rq)
293 break; 264 break;
294 } 265 }
295 } 266 }
267next:
296 spin_unlock(&iter->rt_runtime_lock); 268 spin_unlock(&iter->rt_runtime_lock);
297 } 269 }
298 spin_unlock(&rt_b->rt_runtime_lock); 270 spin_unlock(&rt_b->rt_runtime_lock);
299 271
300 return more; 272 return more;
301} 273}
302#endif 274
275static void __disable_runtime(struct rq *rq)
276{
277 struct root_domain *rd = rq->rd;
278 struct rt_rq *rt_rq;
279
280 if (unlikely(!scheduler_running))
281 return;
282
283 for_each_leaf_rt_rq(rt_rq, rq) {
284 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
285 s64 want;
286 int i;
287
288 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock);
290 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock);
294
295 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296
297 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff;
300
301 if (iter == rt_rq)
302 continue;
303
304 spin_lock(&iter->rt_runtime_lock);
305 if (want > 0) {
306 diff = min_t(s64, iter->rt_runtime, want);
307 iter->rt_runtime -= diff;
308 want -= diff;
309 } else {
310 iter->rt_runtime -= want;
311 want -= want;
312 }
313 spin_unlock(&iter->rt_runtime_lock);
314
315 if (!want)
316 break;
317 }
318
319 spin_lock(&rt_rq->rt_runtime_lock);
320 BUG_ON(want);
321balanced:
322 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock);
325 }
326}
327
328static void disable_runtime(struct rq *rq)
329{
330 unsigned long flags;
331
332 spin_lock_irqsave(&rq->lock, flags);
333 __disable_runtime(rq);
334 spin_unlock_irqrestore(&rq->lock, flags);
335}
336
337static void __enable_runtime(struct rq *rq)
338{
339 struct rt_rq *rt_rq;
340
341 if (unlikely(!scheduler_running))
342 return;
343
344 for_each_leaf_rt_rq(rt_rq, rq) {
345 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
346
347 spin_lock(&rt_b->rt_runtime_lock);
348 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock);
353 }
354}
355
356static void enable_runtime(struct rq *rq)
357{
358 unsigned long flags;
359
360 spin_lock_irqsave(&rq->lock, flags);
361 __enable_runtime(rq);
362 spin_unlock_irqrestore(&rq->lock, flags);
363}
364
365static int balance_runtime(struct rt_rq *rt_rq)
366{
367 int more = 0;
368
369 if (rt_rq->rt_time > rt_rq->rt_runtime) {
370 spin_unlock(&rt_rq->rt_runtime_lock);
371 more = do_balance_runtime(rt_rq);
372 spin_lock(&rt_rq->rt_runtime_lock);
373 }
374
375 return more;
376}
377#else /* !CONFIG_SMP */
378static inline int balance_runtime(struct rt_rq *rt_rq)
379{
380 return 0;
381}
382#endif /* CONFIG_SMP */
383
384static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
385{
386 int i, idle = 1;
387 cpumask_t span;
388
389 if (rt_b->rt_runtime == RUNTIME_INF)
390 return 1;
391
392 span = sched_rt_period_mask();
393 for_each_cpu_mask(i, span) {
394 int enqueue = 0;
395 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
396 struct rq *rq = rq_of_rt_rq(rt_rq);
397
398 spin_lock(&rq->lock);
399 if (rt_rq->rt_time) {
400 u64 runtime;
401
402 spin_lock(&rt_rq->rt_runtime_lock);
403 if (rt_rq->rt_throttled)
404 balance_runtime(rt_rq);
405 runtime = rt_rq->rt_runtime;
406 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
407 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
408 rt_rq->rt_throttled = 0;
409 enqueue = 1;
410 }
411 if (rt_rq->rt_time || rt_rq->rt_nr_running)
412 idle = 0;
413 spin_unlock(&rt_rq->rt_runtime_lock);
414 } else if (rt_rq->rt_nr_running)
415 idle = 0;
416
417 if (enqueue)
418 sched_rt_rq_enqueue(rt_rq);
419 spin_unlock(&rq->lock);
420 }
421
422 return idle;
423}
303 424
304static inline int rt_se_prio(struct sched_rt_entity *rt_se) 425static inline int rt_se_prio(struct sched_rt_entity *rt_se)
305{ 426{
@@ -326,18 +447,10 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
326 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 447 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
327 return 0; 448 return 0;
328 449
329#ifdef CONFIG_SMP 450 balance_runtime(rt_rq);
330 if (rt_rq->rt_time > runtime) { 451 runtime = sched_rt_runtime(rt_rq);
331 int more; 452 if (runtime == RUNTIME_INF)
332 453 return 0;
333 spin_unlock(&rt_rq->rt_runtime_lock);
334 more = balance_runtime(rt_rq);
335 spin_lock(&rt_rq->rt_runtime_lock);
336
337 if (more)
338 runtime = sched_rt_runtime(rt_rq);
339 }
340#endif
341 454
342 if (rt_rq->rt_time > runtime) { 455 if (rt_rq->rt_time > runtime) {
343 rt_rq->rt_throttled = 1; 456 rt_rq->rt_throttled = 1;
@@ -391,12 +504,21 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
391 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 504 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
392 rt_rq->rt_nr_running++; 505 rt_rq->rt_nr_running++;
393#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 506#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
394 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 507 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
508 struct rq *rq = rq_of_rt_rq(rt_rq);
509
395 rt_rq->highest_prio = rt_se_prio(rt_se); 510 rt_rq->highest_prio = rt_se_prio(rt_se);
511#ifdef CONFIG_SMP
512 if (rq->online)
513 cpupri_set(&rq->rd->cpupri, rq->cpu,
514 rt_se_prio(rt_se));
515#endif
516 }
396#endif 517#endif
397#ifdef CONFIG_SMP 518#ifdef CONFIG_SMP
398 if (rt_se->nr_cpus_allowed > 1) { 519 if (rt_se->nr_cpus_allowed > 1) {
399 struct rq *rq = rq_of_rt_rq(rt_rq); 520 struct rq *rq = rq_of_rt_rq(rt_rq);
521
400 rq->rt.rt_nr_migratory++; 522 rq->rt.rt_nr_migratory++;
401 } 523 }
402 524
@@ -416,6 +538,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
416static inline 538static inline
417void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 539void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
418{ 540{
541#ifdef CONFIG_SMP
542 int highest_prio = rt_rq->highest_prio;
543#endif
544
419 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 545 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
420 WARN_ON(!rt_rq->rt_nr_running); 546 WARN_ON(!rt_rq->rt_nr_running);
421 rt_rq->rt_nr_running--; 547 rt_rq->rt_nr_running--;
@@ -439,6 +565,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
439 rq->rt.rt_nr_migratory--; 565 rq->rt.rt_nr_migratory--;
440 } 566 }
441 567
568 if (rt_rq->highest_prio != highest_prio) {
569 struct rq *rq = rq_of_rt_rq(rt_rq);
570
571 if (rq->online)
572 cpupri_set(&rq->rd->cpupri, rq->cpu,
573 rt_rq->highest_prio);
574 }
575
442 update_rt_migration(rq_of_rt_rq(rt_rq)); 576 update_rt_migration(rq_of_rt_rq(rt_rq));
443#endif /* CONFIG_SMP */ 577#endif /* CONFIG_SMP */
444#ifdef CONFIG_RT_GROUP_SCHED 578#ifdef CONFIG_RT_GROUP_SCHED
@@ -449,22 +583,33 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
449#endif 583#endif
450} 584}
451 585
452static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 586static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
453{ 587{
454 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 588 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
455 struct rt_prio_array *array = &rt_rq->active; 589 struct rt_prio_array *array = &rt_rq->active;
456 struct rt_rq *group_rq = group_rt_rq(rt_se); 590 struct rt_rq *group_rq = group_rt_rq(rt_se);
591 struct list_head *queue = array->queue + rt_se_prio(rt_se);
457 592
458 if (group_rq && rt_rq_throttled(group_rq)) 593 /*
594 * Don't enqueue the group if it's throttled, or when empty.
595 * The latter is a consequence of the former when a child group
596 * gets throttled and the current group doesn't have any other
597 * active members.
598 */
599 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
459 return; 600 return;
460 601
461 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 602 if (rt_se->nr_cpus_allowed == 1)
603 list_add(&rt_se->run_list, queue);
604 else
605 list_add_tail(&rt_se->run_list, queue);
606
462 __set_bit(rt_se_prio(rt_se), array->bitmap); 607 __set_bit(rt_se_prio(rt_se), array->bitmap);
463 608
464 inc_rt_tasks(rt_se, rt_rq); 609 inc_rt_tasks(rt_se, rt_rq);
465} 610}
466 611
467static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 612static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
468{ 613{
469 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 614 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
470 struct rt_prio_array *array = &rt_rq->active; 615 struct rt_prio_array *array = &rt_rq->active;
@@ -480,11 +625,10 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
480 * Because the prio of an upper entry depends on the lower 625 * Because the prio of an upper entry depends on the lower
481 * entries, we must remove entries top - down. 626 * entries, we must remove entries top - down.
482 */ 627 */
483static void dequeue_rt_stack(struct task_struct *p) 628static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
484{ 629{
485 struct sched_rt_entity *rt_se, *back = NULL; 630 struct sched_rt_entity *back = NULL;
486 631
487 rt_se = &p->rt;
488 for_each_sched_rt_entity(rt_se) { 632 for_each_sched_rt_entity(rt_se) {
489 rt_se->back = back; 633 rt_se->back = back;
490 back = rt_se; 634 back = rt_se;
@@ -492,7 +636,26 @@ static void dequeue_rt_stack(struct task_struct *p)
492 636
493 for (rt_se = back; rt_se; rt_se = rt_se->back) { 637 for (rt_se = back; rt_se; rt_se = rt_se->back) {
494 if (on_rt_rq(rt_se)) 638 if (on_rt_rq(rt_se))
495 dequeue_rt_entity(rt_se); 639 __dequeue_rt_entity(rt_se);
640 }
641}
642
643static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
644{
645 dequeue_rt_stack(rt_se);
646 for_each_sched_rt_entity(rt_se)
647 __enqueue_rt_entity(rt_se);
648}
649
650static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
651{
652 dequeue_rt_stack(rt_se);
653
654 for_each_sched_rt_entity(rt_se) {
655 struct rt_rq *rt_rq = group_rt_rq(rt_se);
656
657 if (rt_rq && rt_rq->rt_nr_running)
658 __enqueue_rt_entity(rt_se);
496 } 659 }
497} 660}
498 661
@@ -506,13 +669,7 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
506 if (wakeup) 669 if (wakeup)
507 rt_se->timeout = 0; 670 rt_se->timeout = 0;
508 671
509 dequeue_rt_stack(p); 672 enqueue_rt_entity(rt_se);
510
511 /*
512 * enqueue everybody, bottom - up.
513 */
514 for_each_sched_rt_entity(rt_se)
515 enqueue_rt_entity(rt_se);
516 673
517 inc_cpu_load(rq, p->se.load.weight); 674 inc_cpu_load(rq, p->se.load.weight);
518} 675}
@@ -520,20 +677,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
520static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 677static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
521{ 678{
522 struct sched_rt_entity *rt_se = &p->rt; 679 struct sched_rt_entity *rt_se = &p->rt;
523 struct rt_rq *rt_rq;
524 680
525 update_curr_rt(rq); 681 update_curr_rt(rq);
526 682 dequeue_rt_entity(rt_se);
527 dequeue_rt_stack(p);
528
529 /*
530 * re-enqueue all non-empty rt_rq entities.
531 */
532 for_each_sched_rt_entity(rt_se) {
533 rt_rq = group_rt_rq(rt_se);
534 if (rt_rq && rt_rq->rt_nr_running)
535 enqueue_rt_entity(rt_se);
536 }
537 683
538 dec_cpu_load(rq, p->se.load.weight); 684 dec_cpu_load(rq, p->se.load.weight);
539} 685}
@@ -547,7 +693,11 @@ void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
547{ 693{
548 struct rt_prio_array *array = &rt_rq->active; 694 struct rt_prio_array *array = &rt_rq->active;
549 695
550 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 696 if (on_rt_rq(rt_se)) {
697 list_del_init(&rt_se->run_list);
698 list_add_tail(&rt_se->run_list,
699 array->queue + rt_se_prio(rt_se));
700 }
551} 701}
552 702
553static void requeue_task_rt(struct rq *rq, struct task_struct *p) 703static void requeue_task_rt(struct rq *rq, struct task_struct *p)
@@ -610,8 +760,37 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
610 */ 760 */
611static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 761static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
612{ 762{
613 if (p->prio < rq->curr->prio) 763 if (p->prio < rq->curr->prio) {
614 resched_task(rq->curr); 764 resched_task(rq->curr);
765 return;
766 }
767
768#ifdef CONFIG_SMP
769 /*
770 * If:
771 *
772 * - the newly woken task is of equal priority to the current task
773 * - the newly woken task is non-migratable while current is migratable
774 * - current will be preempted on the next reschedule
775 *
776 * we should check to see if current can readily move to a different
777 * cpu. If so, we will reschedule to allow the push logic to try
778 * to move current somewhere else, making room for our non-migratable
779 * task.
780 */
781 if ((p->prio == rq->curr->prio)
782 && p->rt.nr_cpus_allowed == 1
783 && rq->curr->rt.nr_cpus_allowed != 1) {
784 cpumask_t mask;
785
786 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
787 /*
788 * There appear to be other cpus that can accept
789 * current, so let's reschedule to try and push it away
790 */
791 resched_task(rq->curr);
792 }
793#endif
615} 794}
616 795
617static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 796static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -714,73 +893,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
714 893
715static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 894static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
716 895
717static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
718{
719 int lowest_prio = -1;
720 int lowest_cpu = -1;
721 int count = 0;
722 int cpu;
723
724 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
725
726 /*
727 * Scan each rq for the lowest prio.
728 */
729 for_each_cpu_mask(cpu, *lowest_mask) {
730 struct rq *rq = cpu_rq(cpu);
731
732 /* We look for lowest RT prio or non-rt CPU */
733 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
734 /*
735 * if we already found a low RT queue
736 * and now we found this non-rt queue
737 * clear the mask and set our bit.
738 * Otherwise just return the queue as is
739 * and the count==1 will cause the algorithm
740 * to use the first bit found.
741 */
742 if (lowest_cpu != -1) {
743 cpus_clear(*lowest_mask);
744 cpu_set(rq->cpu, *lowest_mask);
745 }
746 return 1;
747 }
748
749 /* no locking for now */
750 if ((rq->rt.highest_prio > task->prio)
751 && (rq->rt.highest_prio >= lowest_prio)) {
752 if (rq->rt.highest_prio > lowest_prio) {
753 /* new low - clear old data */
754 lowest_prio = rq->rt.highest_prio;
755 lowest_cpu = cpu;
756 count = 0;
757 }
758 count++;
759 } else
760 cpu_clear(cpu, *lowest_mask);
761 }
762
763 /*
764 * Clear out all the set bits that represent
765 * runqueues that were of higher prio than
766 * the lowest_prio.
767 */
768 if (lowest_cpu > 0) {
769 /*
770 * Perhaps we could add another cpumask op to
771 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
772 * Then that could be optimized to use memset and such.
773 */
774 for_each_cpu_mask(cpu, *lowest_mask) {
775 if (cpu >= lowest_cpu)
776 break;
777 cpu_clear(cpu, *lowest_mask);
778 }
779 }
780
781 return count;
782}
783
784static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 896static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
785{ 897{
786 int first; 898 int first;
@@ -802,17 +914,12 @@ static int find_lowest_rq(struct task_struct *task)
802 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 914 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
803 int this_cpu = smp_processor_id(); 915 int this_cpu = smp_processor_id();
804 int cpu = task_cpu(task); 916 int cpu = task_cpu(task);
805 int count = find_lowest_cpus(task, lowest_mask);
806 917
807 if (!count) 918 if (task->rt.nr_cpus_allowed == 1)
808 return -1; /* No targets found */ 919 return -1; /* No other targets possible */
809 920
810 /* 921 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
811 * There is no sense in performing an optimal search if only one 922 return -1; /* No targets found */
812 * target is found.
813 */
814 if (count == 1)
815 return first_cpu(*lowest_mask);
816 923
817 /* 924 /*
818 * At this point we have built a mask of cpus representing the 925 * At this point we have built a mask of cpus representing the
@@ -1157,17 +1264,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1157} 1264}
1158 1265
1159/* Assumes rq->lock is held */ 1266/* Assumes rq->lock is held */
1160static void join_domain_rt(struct rq *rq) 1267static void rq_online_rt(struct rq *rq)
1161{ 1268{
1162 if (rq->rt.overloaded) 1269 if (rq->rt.overloaded)
1163 rt_set_overload(rq); 1270 rt_set_overload(rq);
1271
1272 __enable_runtime(rq);
1273
1274 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1164} 1275}
1165 1276
1166/* Assumes rq->lock is held */ 1277/* Assumes rq->lock is held */
1167static void leave_domain_rt(struct rq *rq) 1278static void rq_offline_rt(struct rq *rq)
1168{ 1279{
1169 if (rq->rt.overloaded) 1280 if (rq->rt.overloaded)
1170 rt_clear_overload(rq); 1281 rt_clear_overload(rq);
1282
1283 __disable_runtime(rq);
1284
1285 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1171} 1286}
1172 1287
1173/* 1288/*
@@ -1330,8 +1445,8 @@ static const struct sched_class rt_sched_class = {
1330 .load_balance = load_balance_rt, 1445 .load_balance = load_balance_rt,
1331 .move_one_task = move_one_task_rt, 1446 .move_one_task = move_one_task_rt,
1332 .set_cpus_allowed = set_cpus_allowed_rt, 1447 .set_cpus_allowed = set_cpus_allowed_rt,
1333 .join_domain = join_domain_rt, 1448 .rq_online = rq_online_rt,
1334 .leave_domain = leave_domain_rt, 1449 .rq_offline = rq_offline_rt,
1335 .pre_schedule = pre_schedule_rt, 1450 .pre_schedule = pre_schedule_rt,
1336 .post_schedule = post_schedule_rt, 1451 .post_schedule = post_schedule_rt,
1337 .task_wake_up = task_wake_up_rt, 1452 .task_wake_up = task_wake_up_rt,
@@ -1344,3 +1459,17 @@ static const struct sched_class rt_sched_class = {
1344 .prio_changed = prio_changed_rt, 1459 .prio_changed = prio_changed_rt,
1345 .switched_to = switched_to_rt, 1460 .switched_to = switched_to_rt,
1346}; 1461};
1462
1463#ifdef CONFIG_SCHED_DEBUG
1464extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1465
1466static void print_rt_stats(struct seq_file *m, int cpu)
1467{
1468 struct rt_rq *rt_rq;
1469
1470 rcu_read_lock();
1471 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1472 print_rt_rq(m, cpu, rt_rq);
1473 rcu_read_unlock();
1474}
1475#endif /* CONFIG_SCHED_DEBUG */
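The runtime borrowing in do_balance_runtime() above is easy to simulate. The sketch below is userspace with assumed period and per-CPU numbers, not kernel code: a throttled runqueue borrows at most an equal share of each peer's spare runtime until it reaches the period, mirroring the loop in the patch minus the locking.

/* toy_rt_balance.c: sketch of RT runtime borrowing across runqueues */
#include <stdio.h>

#define NR_RQ		4
#define RT_PERIOD	1000000000LL	/* assumed 1 s period, used as the cap */

struct toy_rt_rq {
	long long rt_runtime;	/* allotted runtime this period */
	long long rt_time;	/* runtime consumed so far      */
};

static void balance_runtime(struct toy_rt_rq *rq, int self, int nr)
{
	long long period = RT_PERIOD;
	int weight = nr;	/* weight of the span, as in the patch */
	int i;

	for (i = 0; i < nr; i++) {
		long long diff;

		if (i == self)
			continue;

		diff = rq[i].rt_runtime - rq[i].rt_time;
		if (diff <= 0)
			continue;

		diff /= weight;	/* take at most an equal share of the spare */
		if (rq[self].rt_runtime + diff > period)
			diff = period - rq[self].rt_runtime;

		rq[i].rt_runtime -= diff;
		rq[self].rt_runtime += diff;
		if (rq[self].rt_runtime == period)
			break;
	}
}

int main(void)
{
	struct toy_rt_rq rq[NR_RQ] = {
		{ 950000000, 950000000 },	/* cpu0: throttled      */
		{ 950000000, 100000000 },	/* cpu1..3: mostly idle */
		{ 950000000, 0 },
		{ 950000000, 0 },
	};

	balance_runtime(rq, 0, NR_RQ);
	printf("cpu0 runtime after borrowing: %lld ns\n", rq[0].rt_runtime);
	return 0;
}

The new __disable_runtime()/__enable_runtime() hooks above unwind this borrowing when a runqueue goes offline and reset the runtime to its defaults when it comes back online.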
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 5bae2e0c3ff2..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -67,6 +67,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
67 preempt_enable(); 67 preempt_enable();
68#endif 68#endif
69 } 69 }
70 kfree(mask_str);
70 return 0; 71 return 0;
71} 72}
72 73
@@ -117,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
117 if (rq) 118 if (rq)
118 rq->rq_sched_info.cpu_time += delta; 119 rq->rq_sched_info.cpu_time += delta;
119} 120}
121
122static inline void
123rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
124{
125 if (rq)
126 rq->rq_sched_info.run_delay += delta;
127}
120# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 128# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
121# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 129# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
122# define schedstat_set(var, val) do { var = (val); } while (0) 130# define schedstat_set(var, val) do { var = (val); } while (0)
@@ -125,6 +133,9 @@ static inline void
125rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 133rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
126{} 134{}
127static inline void 135static inline void
136rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
137{}
138static inline void
128rq_sched_info_depart(struct rq *rq, unsigned long long delta) 139rq_sched_info_depart(struct rq *rq, unsigned long long delta)
129{} 140{}
130# define schedstat_inc(rq, field) do { } while (0) 141# define schedstat_inc(rq, field) do { } while (0)
@@ -133,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
133#endif 144#endif
134 145
135#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 146#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
147static inline void sched_info_reset_dequeued(struct task_struct *t)
148{
149 t->sched_info.last_queued = 0;
150}
151
136/* 152/*
137 * Called when a process is dequeued from the active array and given 153 * Called when a process is dequeued from the active array and given
138 * the cpu. We should note that with the exception of interactive 154 * the cpu. We should note that with the exception of interactive
@@ -142,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
142 * active queue, thus delaying tasks in the expired queue from running; 158 * active queue, thus delaying tasks in the expired queue from running;
143 * see scheduler_tick()). 159 * see scheduler_tick()).
144 * 160 *
145 * This function is only called from sched_info_arrive(), rather than 161 * Though we are interested in knowing how long it was from the *first* time a
146 * dequeue_task(). Even though a task may be queued and dequeued multiple 162 * task was queued to the time that it finally hit a cpu, we call this routine
147 * times as it is shuffled about, we're really interested in knowing how 163 * from dequeue_task() to account for possible rq->clock skew across cpus. The
148 * long it was from the *first* time it was queued to the time that it 164 * delta taken on each cpu would annul the skew.
149 * finally hit a cpu.
150 */ 165 */
151static inline void sched_info_dequeued(struct task_struct *t) 166static inline void sched_info_dequeued(struct task_struct *t)
152{ 167{
153 t->sched_info.last_queued = 0; 168 unsigned long long now = task_rq(t)->clock, delta = 0;
169
170 if (unlikely(sched_info_on()))
171 if (t->sched_info.last_queued)
172 delta = now - t->sched_info.last_queued;
173 sched_info_reset_dequeued(t);
174 t->sched_info.run_delay += delta;
175
176 rq_sched_info_dequeued(task_rq(t), delta);
154} 177}
155 178
156/* 179/*
@@ -164,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
164 187
165 if (t->sched_info.last_queued) 188 if (t->sched_info.last_queued)
166 delta = now - t->sched_info.last_queued; 189 delta = now - t->sched_info.last_queued;
167 sched_info_dequeued(t); 190 sched_info_reset_dequeued(t);
168 t->sched_info.run_delay += delta; 191 t->sched_info.run_delay += delta;
169 t->sched_info.last_arrival = now; 192 t->sched_info.last_arrival = now;
170 t->sched_info.pcount++; 193 t->sched_info.pcount++;
@@ -197,6 +220,9 @@ static inline void sched_info_queued(struct task_struct *t)
197/* 220/*
198 * Called when a process ceases being the active-running process, either 221 * Called when a process ceases being the active-running process, either
199 * voluntarily or involuntarily. Now we can calculate how long we ran. 222 * voluntarily or involuntarily. Now we can calculate how long we ran.
223 * Also, if the process is still in the TASK_RUNNING state, call
224 * sched_info_queued() to mark that it has now again started waiting on
225 * the runqueue.
200 */ 226 */
201static inline void sched_info_depart(struct task_struct *t) 227static inline void sched_info_depart(struct task_struct *t)
202{ 228{
@@ -205,6 +231,9 @@ static inline void sched_info_depart(struct task_struct *t)
205 231
206 t->sched_info.cpu_time += delta; 232 t->sched_info.cpu_time += delta;
207 rq_sched_info_depart(task_rq(t), delta); 233 rq_sched_info_depart(task_rq(t), delta);
234
235 if (t->state == TASK_RUNNING)
236 sched_info_queued(t);
208} 237}
209 238
210/* 239/*
@@ -235,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
235 __sched_info_switch(prev, next); 264 __sched_info_switch(prev, next);
236} 265}
237#else 266#else
238#define sched_info_queued(t) do { } while (0) 267#define sched_info_queued(t) do { } while (0)
239#define sched_info_switch(t, next) do { } while (0) 268#define sched_info_reset_dequeued(t) do { } while (0)
269#define sched_info_dequeued(t) do { } while (0)
270#define sched_info_switch(t, next) do { } while (0)
240#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
241 272
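The run_delay bookkeeping above (sched_info_dequeued() now charges the wait on the CPU doing the dequeue, so per-CPU rq->clock skew cancels out) can be illustrated with a minimal user-space model; the struct and function names below are invented for illustration and are not kernel code.

/* Minimal model of the run_delay accounting in sched_stats.h above. */
#include <stdio.h>

struct sched_info_model {
	unsigned long long last_queued;	/* 0 means "not currently waiting" */
	unsigned long long run_delay;	/* total time spent runnable but not running */
};

static void model_queued(struct sched_info_model *si, unsigned long long now)
{
	if (!si->last_queued)
		si->last_queued = now;	/* remember the *first* time it was queued */
}

/* Mirrors sched_info_dequeued()/sched_info_arrive(): charge the wait, then reset. */
static void model_dequeued(struct sched_info_model *si, unsigned long long now)
{
	unsigned long long delta = 0;

	if (si->last_queued)
		delta = now - si->last_queued;
	si->last_queued = 0;
	si->run_delay += delta;
}

int main(void)
{
	struct sched_info_model si = { 0, 0 };

	model_queued(&si, 100);		/* task becomes runnable at t=100 */
	model_dequeued(&si, 130);	/* it leaves the runqueue at t=130 */
	printf("run_delay = %llu\n", si.run_delay);	/* prints 30 */
	return 0;
}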
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..aaaeae8244e7 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -31,6 +31,7 @@
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/ftrace.h>
34 35
35static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
36static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
diff --git a/kernel/signal.c b/kernel/signal.c
index 13fab9838354..c5bf0c0df658 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -231,6 +231,40 @@ void flush_signals(struct task_struct *t)
231 spin_unlock_irqrestore(&t->sighand->siglock, flags); 231 spin_unlock_irqrestore(&t->sighand->siglock, flags);
232} 232}
233 233
234static void __flush_itimer_signals(struct sigpending *pending)
235{
236 sigset_t signal, retain;
237 struct sigqueue *q, *n;
238
239 signal = pending->signal;
240 sigemptyset(&retain);
241
242 list_for_each_entry_safe(q, n, &pending->list, list) {
243 int sig = q->info.si_signo;
244
245 if (likely(q->info.si_code != SI_TIMER)) {
246 sigaddset(&retain, sig);
247 } else {
248 sigdelset(&signal, sig);
249 list_del_init(&q->list);
250 __sigqueue_free(q);
251 }
252 }
253
254 sigorsets(&pending->signal, &signal, &retain);
255}
256
257void flush_itimer_signals(void)
258{
259 struct task_struct *tsk = current;
260 unsigned long flags;
261
262 spin_lock_irqsave(&tsk->sighand->siglock, flags);
263 __flush_itimer_signals(&tsk->pending);
264 __flush_itimer_signals(&tsk->signal->shared_pending);
265 spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
266}
267
234void ignore_signals(struct task_struct *t) 268void ignore_signals(struct task_struct *t)
235{ 269{
236 int i; 270 int i;
@@ -1240,17 +1274,22 @@ void sigqueue_free(struct sigqueue *q)
1240 1274
1241 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC)); 1275 BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
1242 /* 1276 /*
1243 * If the signal is still pending remove it from the 1277 * We must hold ->siglock while testing q->list
1244 * pending queue. We must hold ->siglock while testing 1278 * to serialize with collect_signal() or with
1245 * q->list to serialize with collect_signal(). 1279 * __exit_signal()->flush_sigqueue().
1246 */ 1280 */
1247 spin_lock_irqsave(lock, flags); 1281 spin_lock_irqsave(lock, flags);
1282 q->flags &= ~SIGQUEUE_PREALLOC;
1283 /*
1284 * If it is queued it will be freed when dequeued,
1285 * like the "regular" sigqueue.
1286 */
1248 if (!list_empty(&q->list)) 1287 if (!list_empty(&q->list))
1249 list_del_init(&q->list); 1288 q = NULL;
1250 spin_unlock_irqrestore(lock, flags); 1289 spin_unlock_irqrestore(lock, flags);
1251 1290
1252 q->flags &= ~SIGQUEUE_PREALLOC; 1291 if (q)
1253 __sigqueue_free(q); 1292 __sigqueue_free(q);
1254} 1293}
1255 1294
1256int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) 1295int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
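The sigqueue_free() rework above is an instance of the usual "free now or let the holder free it" hand-off: SIGQUEUE_PREALLOC is cleared while ->siglock is held, and the entry is freed immediately only if it is no longer on a pending list; otherwise the normal dequeue path frees it like a regular sigqueue. A minimal pthread-based sketch of that pattern, with invented names and no kernel APIs:

/* Illustration of the "free now or defer to the dequeuer" hand-off above. */
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>

struct entry {
	bool queued;		/* stands in for !list_empty(&q->list) */
	bool preallocated;	/* stands in for SIGQUEUE_PREALLOC */
};

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER;

static void entry_release(struct entry *e)
{
	bool free_now;

	pthread_mutex_lock(&queue_lock);
	e->preallocated = false;	/* from now on it behaves like a regular entry */
	free_now = !e->queued;		/* still queued? the dequeue path will free it */
	pthread_mutex_unlock(&queue_lock);

	if (free_now)
		free(e);
}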
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 000000000000..462c785ca1ee
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,383 @@
1/*
2 * Generic helpers for smp ipi calls
3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 *
6 */
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/percpu.h>
10#include <linux/rcupdate.h>
11#include <linux/rculist.h>
12#include <linux/smp.h>
13
14static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
15static LIST_HEAD(call_function_queue);
16__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
17
18enum {
19 CSD_FLAG_WAIT = 0x01,
20 CSD_FLAG_ALLOC = 0x02,
21};
22
23struct call_function_data {
24 struct call_single_data csd;
25 spinlock_t lock;
26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head;
29};
30
31struct call_single_queue {
32 struct list_head list;
33 spinlock_t lock;
34};
35
36void __cpuinit init_call_single_data(void)
37{
38 int i;
39
40 for_each_possible_cpu(i) {
41 struct call_single_queue *q = &per_cpu(call_single_queue, i);
42
43 spin_lock_init(&q->lock);
44 INIT_LIST_HEAD(&q->list);
45 }
46}
47
48static void csd_flag_wait(struct call_single_data *data)
49{
50 /* Wait for response */
51 do {
52 /*
53 * We need to see the flags store in the IPI handler
54 */
55 smp_mb();
56 if (!(data->flags & CSD_FLAG_WAIT))
57 break;
58 cpu_relax();
59 } while (1);
60}
61
62/*
63 * Insert a previously allocated call_single_data element for execution
64 * on the given CPU. data must already have ->func, ->info, and ->flags set.
65 */
66static void generic_exec_single(int cpu, struct call_single_data *data)
67{
68 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
69 int wait = data->flags & CSD_FLAG_WAIT, ipi;
70 unsigned long flags;
71
72 spin_lock_irqsave(&dst->lock, flags);
73 ipi = list_empty(&dst->list);
74 list_add_tail(&data->list, &dst->list);
75 spin_unlock_irqrestore(&dst->lock, flags);
76
77 if (ipi)
78 arch_send_call_function_single_ipi(cpu);
79
80 if (wait)
81 csd_flag_wait(data);
82}
83
84static void rcu_free_call_data(struct rcu_head *head)
85{
86 struct call_function_data *data;
87
88 data = container_of(head, struct call_function_data, rcu_head);
89
90 kfree(data);
91}
92
93/*
94 * Invoked by arch to handle an IPI for call function. Must be called with
95 * interrupts disabled.
96 */
97void generic_smp_call_function_interrupt(void)
98{
99 struct call_function_data *data;
100 int cpu = get_cpu();
101
102 /*
103 * It's ok to use list_for_each_rcu() here even though we may delete
104 * 'pos', since list_del_rcu() doesn't clear ->next
105 */
106 rcu_read_lock();
107 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
108 int refs;
109
110 if (!cpu_isset(cpu, data->cpumask))
111 continue;
112
113 data->csd.func(data->csd.info);
114
115 spin_lock(&data->lock);
116 cpu_clear(cpu, data->cpumask);
117 WARN_ON(data->refs == 0);
118 data->refs--;
119 refs = data->refs;
120 spin_unlock(&data->lock);
121
122 if (refs)
123 continue;
124
125 spin_lock(&call_function_lock);
126 list_del_rcu(&data->csd.list);
127 spin_unlock(&call_function_lock);
128
129 if (data->csd.flags & CSD_FLAG_WAIT) {
130 /*
131 * serialize stores to data with the flag clear
132 * and wakeup
133 */
134 smp_wmb();
135 data->csd.flags &= ~CSD_FLAG_WAIT;
136 } else
137 call_rcu(&data->rcu_head, rcu_free_call_data);
138 }
139 rcu_read_unlock();
140
141 put_cpu();
142}
143
144/*
145 * Invoked by arch to handle an IPI for call function single. Must be called
146 * from the arch with interrupts disabled.
147 */
148void generic_smp_call_function_single_interrupt(void)
149{
150 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
151 LIST_HEAD(list);
152
153 /*
154 * Need to see other stores to list head for checking whether
155 * list is empty without holding q->lock
156 */
157 smp_mb();
158 while (!list_empty(&q->list)) {
159 unsigned int data_flags;
160
161 spin_lock(&q->lock);
162 list_replace_init(&q->list, &list);
163 spin_unlock(&q->lock);
164
165 while (!list_empty(&list)) {
166 struct call_single_data *data;
167
168 data = list_entry(list.next, struct call_single_data,
169 list);
170 list_del(&data->list);
171
172 /*
173 * 'data' can be invalid after this call if
174 * flags == 0 (when called through
175 * generic_exec_single()), so save them away before
176 * making the call.
177 */
178 data_flags = data->flags;
179
180 data->func(data->info);
181
182 if (data_flags & CSD_FLAG_WAIT) {
183 smp_wmb();
184 data->flags &= ~CSD_FLAG_WAIT;
185 } else if (data_flags & CSD_FLAG_ALLOC)
186 kfree(data);
187 }
188 /*
189 * See comment on outer loop
190 */
191 smp_mb();
192 }
193}
194
195/*
196 * smp_call_function_single - Run a function on a specific CPU
197 * @func: The function to run. This must be fast and non-blocking.
198 * @info: An arbitrary pointer to pass to the function.
199 * @wait: If true, wait until function has completed on other CPUs.
200 *
201 * Returns 0 on success, else a negative status code. Note that @wait
202 * will be implicitly turned on in case of allocation failures, since
203 * we fall back to on-stack allocation.
204 */
205int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
206 int wait)
207{
208 struct call_single_data d;
209 unsigned long flags;
210 /* prevent preemption and reschedule on another processor */
211 int me = get_cpu();
212
213 /* Can deadlock when called with interrupts disabled */
214 WARN_ON(irqs_disabled());
215
216 if (cpu == me) {
217 local_irq_save(flags);
218 func(info);
219 local_irq_restore(flags);
220 } else {
221 struct call_single_data *data = NULL;
222
223 if (!wait) {
224 data = kmalloc(sizeof(*data), GFP_ATOMIC);
225 if (data)
226 data->flags = CSD_FLAG_ALLOC;
227 }
228 if (!data) {
229 data = &d;
230 data->flags = CSD_FLAG_WAIT;
231 }
232
233 data->func = func;
234 data->info = info;
235 generic_exec_single(cpu, data);
236 }
237
238 put_cpu();
239 return 0;
240}
241EXPORT_SYMBOL(smp_call_function_single);
242
243/**
244 * __smp_call_function_single(): Run a function on another CPU
245 * @cpu: The CPU to run on.
246 * @data: Pre-allocated and setup data structure
247 *
248 * Like smp_call_function_single(), but allows the caller to pass in a pre-allocated
249 * data structure. Useful for embedding @data inside other structures, for
250 * instance.
251 *
252 */
253void __smp_call_function_single(int cpu, struct call_single_data *data)
254{
255 /* Can deadlock when called with interrupts disabled */
256 WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
257
258 generic_exec_single(cpu, data);
259}
260
261/**
262 * smp_call_function_mask(): Run a function on a set of other CPUs.
263 * @mask: The set of cpus to run on.
264 * @func: The function to run. This must be fast and non-blocking.
265 * @info: An arbitrary pointer to pass to the function.
266 * @wait: If true, wait (atomically) until function has completed on other CPUs.
267 *
268 * Returns 0 on success, else a negative status code.
269 *
270 * If @wait is true, then returns once @func has returned. Note that @wait
271 * will be implicitly turned on in case of allocation failures, since
272 * we fall back to on-stack allocation.
273 *
274 * You must not call this function with disabled interrupts or from a
275 * hardware interrupt handler or from a bottom half handler. Preemption
276 * must be disabled when calling this function.
277 */
278int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
279 int wait)
280{
281 struct call_function_data d;
282 struct call_function_data *data = NULL;
283 cpumask_t allbutself;
284 unsigned long flags;
285 int cpu, num_cpus;
286
287 /* Can deadlock when called with interrupts disabled */
288 WARN_ON(irqs_disabled());
289
290 cpu = smp_processor_id();
291 allbutself = cpu_online_map;
292 cpu_clear(cpu, allbutself);
293 cpus_and(mask, mask, allbutself);
294 num_cpus = cpus_weight(mask);
295
296 /*
297 * If zero CPUs, return. If just a single CPU, turn this request
298 * into a targeted single call instead since it's faster.
299 */
300 if (!num_cpus)
301 return 0;
302 else if (num_cpus == 1) {
303 cpu = first_cpu(mask);
304 return smp_call_function_single(cpu, func, info, wait);
305 }
306
307 if (!wait) {
308 data = kmalloc(sizeof(*data), GFP_ATOMIC);
309 if (data)
310 data->csd.flags = CSD_FLAG_ALLOC;
311 }
312 if (!data) {
313 data = &d;
314 data->csd.flags = CSD_FLAG_WAIT;
315 wait = 1;
316 }
317
318 spin_lock_init(&data->lock);
319 data->csd.func = func;
320 data->csd.info = info;
321 data->refs = num_cpus;
322 data->cpumask = mask;
323
324 spin_lock_irqsave(&call_function_lock, flags);
325 list_add_tail_rcu(&data->csd.list, &call_function_queue);
326 spin_unlock_irqrestore(&call_function_lock, flags);
327
328 /* Send a message to all CPUs in the map */
329 arch_send_call_function_ipi(mask);
330
331 /* optionally wait for the CPUs to complete */
332 if (wait)
333 csd_flag_wait(&data->csd);
334
335 return 0;
336}
337EXPORT_SYMBOL(smp_call_function_mask);
338
339/**
340 * smp_call_function(): Run a function on all other CPUs.
341 * @func: The function to run. This must be fast and non-blocking.
342 * @info: An arbitrary pointer to pass to the function.
343 * @wait: If true, wait (atomically) until function has completed on other CPUs.
344 *
345 * Returns 0 on success, else a negative status code.
346 *
347 * If @wait is true, then returns once @func has returned; otherwise
348 * it returns just before the target cpu calls @func. In case of allocation
349 * failure, @wait will be implicitly turned on.
350 *
351 * You must not call this function with disabled interrupts or from a
352 * hardware interrupt handler or from a bottom half handler.
353 */
354int smp_call_function(void (*func)(void *), void *info, int wait)
355{
356 int ret;
357
358 preempt_disable();
359 ret = smp_call_function_mask(cpu_online_map, func, info, wait);
360 preempt_enable();
361 return ret;
362}
363EXPORT_SYMBOL(smp_call_function);
364
365void ipi_call_lock(void)
366{
367 spin_lock(&call_function_lock);
368}
369
370void ipi_call_unlock(void)
371{
372 spin_unlock(&call_function_lock);
373}
374
375void ipi_call_lock_irq(void)
376{
377 spin_lock_irq(&call_function_lock);
378}
379
380void ipi_call_unlock_irq(void)
381{
382 spin_unlock_irq(&call_function_lock);
383}
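As a usage sketch of the helpers added above (the signatures are taken from this file; the callback, CPU number and payload are invented), a caller that needs a fast, non-blocking function to run on another CPU, or on all other CPUs, and wants to wait for completion could do:

#include <linux/smp.h>
#include <linux/kernel.h>

/* Runs on the target CPU in IPI context: must be fast and non-blocking. */
static void flush_local_state(void *info)
{
	unsigned int *generation = info;

	pr_debug("flushing state, generation %u\n", *generation);
}

static void flush_everywhere(int cpu)
{
	unsigned int generation = 42;

	/* interrupts must be enabled here, see the WARN_ON()s above */
	smp_call_function_single(cpu, flush_local_state, &generation, 1);

	/* or reach every other online CPU; wait == 1 blocks until all are done */
	smp_call_function(flush_local_state, &generation, 1);
}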
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..f6b03d56c2bf 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -131,23 +131,17 @@ void _local_bh_enable(void)
131 131
132EXPORT_SYMBOL(_local_bh_enable); 132EXPORT_SYMBOL(_local_bh_enable);
133 133
134void local_bh_enable(void) 134static inline void _local_bh_enable_ip(unsigned long ip)
135{ 135{
136 WARN_ON_ONCE(in_irq() || irqs_disabled());
136#ifdef CONFIG_TRACE_IRQFLAGS 137#ifdef CONFIG_TRACE_IRQFLAGS
137 unsigned long flags; 138 local_irq_disable();
138
139 WARN_ON_ONCE(in_irq());
140#endif
141 WARN_ON_ONCE(irqs_disabled());
142
143#ifdef CONFIG_TRACE_IRQFLAGS
144 local_irq_save(flags);
145#endif 139#endif
146 /* 140 /*
147 * Are softirqs going to be turned on now: 141 * Are softirqs going to be turned on now:
148 */ 142 */
149 if (softirq_count() == SOFTIRQ_OFFSET) 143 if (softirq_count() == SOFTIRQ_OFFSET)
150 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 144 trace_softirqs_on(ip);
151 /* 145 /*
152 * Keep preemption disabled until we are done with 146 * Keep preemption disabled until we are done with
153 * softirq processing: 147 * softirq processing:
@@ -159,40 +153,20 @@ void local_bh_enable(void)
159 153
160 dec_preempt_count(); 154 dec_preempt_count();
161#ifdef CONFIG_TRACE_IRQFLAGS 155#ifdef CONFIG_TRACE_IRQFLAGS
162 local_irq_restore(flags); 156 local_irq_enable();
163#endif 157#endif
164 preempt_check_resched(); 158 preempt_check_resched();
165} 159}
160
161void local_bh_enable(void)
162{
163 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
164}
166EXPORT_SYMBOL(local_bh_enable); 165EXPORT_SYMBOL(local_bh_enable);
167 166
168void local_bh_enable_ip(unsigned long ip) 167void local_bh_enable_ip(unsigned long ip)
169{ 168{
170#ifdef CONFIG_TRACE_IRQFLAGS 169 _local_bh_enable_ip(ip);
171 unsigned long flags;
172
173 WARN_ON_ONCE(in_irq());
174
175 local_irq_save(flags);
176#endif
177 /*
178 * Are softirqs going to be turned on now:
179 */
180 if (softirq_count() == SOFTIRQ_OFFSET)
181 trace_softirqs_on(ip);
182 /*
183 * Keep preemption disabled until we are done with
184 * softirq processing:
185 */
186 sub_preempt_count(SOFTIRQ_OFFSET - 1);
187
188 if (unlikely(!in_interrupt() && local_softirq_pending()))
189 do_softirq();
190
191 dec_preempt_count();
192#ifdef CONFIG_TRACE_IRQFLAGS
193 local_irq_restore(flags);
194#endif
195 preempt_check_resched();
196} 170}
197EXPORT_SYMBOL(local_bh_enable_ip); 171EXPORT_SYMBOL(local_bh_enable_ip);
198 172
@@ -312,7 +286,7 @@ void irq_exit(void)
312#ifdef CONFIG_NO_HZ 286#ifdef CONFIG_NO_HZ
313 /* Make sure that timer wheel updates are propagated */ 287 /* Make sure that timer wheel updates are propagated */
314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) 288 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
315 tick_nohz_stop_sched_tick(); 289 tick_nohz_stop_sched_tick(0);
316 rcu_irq_exit(); 290 rcu_irq_exit();
317#endif 291#endif
318 preempt_enable_no_resched(); 292 preempt_enable_no_resched();
@@ -347,9 +321,8 @@ void raise_softirq(unsigned int nr)
347 local_irq_restore(flags); 321 local_irq_restore(flags);
348} 322}
349 323
350void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) 324void open_softirq(int nr, void (*action)(struct softirq_action *))
351{ 325{
352 softirq_vec[nr].data = data;
353 softirq_vec[nr].action = action; 326 softirq_vec[nr].action = action;
354} 327}
355 328
@@ -360,10 +333,8 @@ struct tasklet_head
360 struct tasklet_struct **tail; 333 struct tasklet_struct **tail;
361}; 334};
362 335
363/* Some compilers disobey section attribute on statics when not 336static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
364 initialized -- RR */ 337static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
365static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
366static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
367 338
368void __tasklet_schedule(struct tasklet_struct *t) 339void __tasklet_schedule(struct tasklet_struct *t)
369{ 340{
@@ -503,8 +474,8 @@ void __init softirq_init(void)
503 &per_cpu(tasklet_hi_vec, cpu).head; 474 &per_cpu(tasklet_hi_vec, cpu).head;
504 } 475 }
505 476
506 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); 477 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
507 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); 478 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
508} 479}
509 480
510static int ksoftirqd(void * __bind_cpu) 481static int ksoftirqd(void * __bind_cpu)
@@ -645,7 +616,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
645 616
646 p = per_cpu(ksoftirqd, hotcpu); 617 p = per_cpu(ksoftirqd, hotcpu);
647 per_cpu(ksoftirqd, hotcpu) = NULL; 618 per_cpu(ksoftirqd, hotcpu) = NULL;
648 sched_setscheduler(p, SCHED_FIFO, &param); 619 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
649 kthread_stop(p); 620 kthread_stop(p);
650 takeover_tasklets(hotcpu); 621 takeover_tasklets(hotcpu);
651 break; 622 break;
@@ -674,12 +645,12 @@ __init int spawn_ksoftirqd(void)
674/* 645/*
675 * Call a function on all processors 646 * Call a function on all processors
676 */ 647 */
677int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) 648int on_each_cpu(void (*func) (void *info), void *info, int wait)
678{ 649{
679 int ret = 0; 650 int ret = 0;
680 651
681 preempt_disable(); 652 preempt_disable();
682 ret = smp_call_function(func, info, retry, wait); 653 ret = smp_call_function(func, info, wait);
683 local_irq_disable(); 654 local_irq_disable();
684 func(info); 655 func(info);
685 local_irq_enable(); 656 local_irq_enable();
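A minimal sketch of how callers adapt to the two signature changes above, namely open_softirq() losing its data pointer and on_each_cpu() losing the retry argument; the handler names are invented and the HI_SOFTIRQ slot is only borrowed to show the shape (it is really owned by the tasklet code above):

#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/smp.h>

static void my_softirq_action(struct softirq_action *a)
{
	/* no private data pointer any more: use per-cpu or global state,
	 * as the tasklet_vec/tasklet_hi_vec code above now does */
}

static void my_ipi_func(void *info)
{
	/* runs on every CPU with interrupts disabled */
}

static int __init my_init(void)
{
	/* two arguments now: softirq number and action */
	open_softirq(HI_SOFTIRQ, my_softirq_action);

	/* the "retry" argument is gone from on_each_cpu() as well */
	return on_each_cpu(my_ipi_func, NULL, 1);
}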
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 01b6522fd92b..a272d78185eb 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -49,12 +49,17 @@ static unsigned long get_timestamp(int this_cpu)
49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ 49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
50} 50}
51 51
52void touch_softlockup_watchdog(void) 52static void __touch_softlockup_watchdog(void)
53{ 53{
54 int this_cpu = raw_smp_processor_id(); 54 int this_cpu = raw_smp_processor_id();
55 55
56 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu); 56 __raw_get_cpu_var(touch_timestamp) = get_timestamp(this_cpu);
57} 57}
58
59void touch_softlockup_watchdog(void)
60{
61 __raw_get_cpu_var(touch_timestamp) = 0;
62}
58EXPORT_SYMBOL(touch_softlockup_watchdog); 63EXPORT_SYMBOL(touch_softlockup_watchdog);
59 64
60void touch_all_softlockup_watchdogs(void) 65void touch_all_softlockup_watchdogs(void)
@@ -80,7 +85,7 @@ void softlockup_tick(void)
80 unsigned long now; 85 unsigned long now;
81 86
82 if (touch_timestamp == 0) { 87 if (touch_timestamp == 0) {
83 touch_softlockup_watchdog(); 88 __touch_softlockup_watchdog();
84 return; 89 return;
85 } 90 }
86 91
@@ -95,7 +100,7 @@ void softlockup_tick(void)
95 100
96 /* do not print during early bootup: */ 101 /* do not print during early bootup: */
97 if (unlikely(system_state != SYSTEM_RUNNING)) { 102 if (unlikely(system_state != SYSTEM_RUNNING)) {
98 touch_softlockup_watchdog(); 103 __touch_softlockup_watchdog();
99 return; 104 return;
100 } 105 }
101 106
@@ -115,6 +120,7 @@ void softlockup_tick(void)
115 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 120 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
116 this_cpu, now - touch_timestamp, 121 this_cpu, now - touch_timestamp,
117 current->comm, task_pid_nr(current)); 122 current->comm, task_pid_nr(current));
123 print_modules();
118 if (regs) 124 if (regs)
119 show_regs(regs); 125 show_regs(regs);
120 else 126 else
@@ -214,7 +220,7 @@ static int watchdog(void *__bind_cpu)
214 sched_setscheduler(current, SCHED_FIFO, &param); 220 sched_setscheduler(current, SCHED_FIFO, &param);
215 221
216 /* initialize timestamp */ 222 /* initialize timestamp */
217 touch_softlockup_watchdog(); 223 __touch_softlockup_watchdog();
218 224
219 set_current_state(TASK_INTERRUPTIBLE); 225 set_current_state(TASK_INTERRUPTIBLE);
220 /* 226 /*
@@ -223,7 +229,7 @@ static int watchdog(void *__bind_cpu)
223 * debug-printout triggers in softlockup_tick(). 229 * debug-printout triggers in softlockup_tick().
224 */ 230 */
225 while (!kthread_should_stop()) { 231 while (!kthread_should_stop()) {
226 touch_softlockup_watchdog(); 232 __touch_softlockup_watchdog();
227 schedule(); 233 schedule();
228 234
229 if (kthread_should_stop()) 235 if (kthread_should_stop())
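touch_softlockup_watchdog() stays the external interface after the split above; it now just zeroes the per-cpu timestamp, which softlockup_tick() treats as "recently touched". A hedged usage sketch for code that legitimately keeps a CPU busy for a while (the loop and delay are invented):

#include <linux/sched.h>	/* touch_softlockup_watchdog() */
#include <linux/delay.h>

static void slow_hardware_reset(void)
{
	int i;

	for (i = 0; i < 1000; i++) {
		mdelay(10);			/* busy-waits with interrupts enabled */
		touch_softlockup_watchdog();	/* tell the detector this is expected */
	}
}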
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..a1fb54c93cdd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -436,7 +436,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
436} 436}
437EXPORT_SYMBOL(_spin_trylock_bh); 437EXPORT_SYMBOL(_spin_trylock_bh);
438 438
439int in_lock_functions(unsigned long addr) 439notrace int in_lock_functions(unsigned long addr)
440{ 440{
441 /* Linker adds these: start and end of __lockfunc functions */ 441 /* Linker adds these: start and end of __lockfunc functions */
442 extern char __lock_text_start[], __lock_text_end[]; 442 extern char __lock_text_start[], __lock_text_end[];
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b71816e47a30..94b527ef1d1e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,19 +6,21 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/module.h>
9#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
10#include <linux/stacktrace.h> 11#include <linux/stacktrace.h>
11 12
12void print_stack_trace(struct stack_trace *trace, int spaces) 13void print_stack_trace(struct stack_trace *trace, int spaces)
13{ 14{
14 int i, j; 15 int i;
15 16
16 for (i = 0; i < trace->nr_entries; i++) { 17 if (WARN_ON(!trace->entries))
17 unsigned long ip = trace->entries[i]; 18 return;
18 19
19 for (j = 0; j < spaces + 1; j++) 20 for (i = 0; i < trace->nr_entries; i++) {
20 printk(" "); 21 printk("%*c", 1 + spaces, ' ');
21 print_ip_sym(ip); 22 print_ip_sym(trace->entries[i]);
22 } 23 }
23} 24}
25EXPORT_SYMBOL_GPL(print_stack_trace);
24 26
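Since print_stack_trace() is now exported and checks trace->entries, a module built with CONFIG_STACKTRACE can pair it with save_stack_trace() from the same header roughly as below; the buffer depth and skip count are arbitrary choices for the sketch:

#include <linux/stacktrace.h>
#include <linux/kernel.h>

#define MY_TRACE_DEPTH	16

static void dump_current_stack(void)
{
	unsigned long entries[MY_TRACE_DEPTH];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= MY_TRACE_DEPTH,
		.entries	= entries,
		.skip		= 1,	/* skip dump_current_stack() itself */
	};

	save_stack_trace(&trace);
	print_stack_trace(&trace, 2);	/* indent each symbol by two extra spaces */
}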
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0101aeef7ed7..ba9b2054ecbd 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -62,8 +62,7 @@ static int stopmachine(void *cpu)
62 * help our sisters onto their CPUs. */ 62 * help our sisters onto their CPUs. */
63 if (!prepared && !irqs_disabled) 63 if (!prepared && !irqs_disabled)
64 yield(); 64 yield();
65 else 65 cpu_relax();
66 cpu_relax();
67 } 66 }
68 67
69 /* Ack: we are exiting. */ 68 /* Ack: we are exiting. */
@@ -106,8 +105,10 @@ static int stop_machine(void)
106 } 105 }
107 106
108 /* Wait for them all to come to life. */ 107 /* Wait for them all to come to life. */
109 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) 108 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
110 yield(); 109 yield();
110 cpu_relax();
111 }
111 112
112 /* If some failed, kill them all. */ 113 /* If some failed, kill them all. */
113 if (ret < 0) { 114 if (ret < 0) {
@@ -186,7 +187,7 @@ struct task_struct *__stop_machine_run(int (*fn)(void *), void *data,
186 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 187 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
187 188
188 /* One high-prio thread per cpu. We'll do this one. */ 189 /* One high-prio thread per cpu. We'll do this one. */
189 sched_setscheduler(p, SCHED_FIFO, &param); 190 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
190 kthread_bind(p, cpu); 191 kthread_bind(p, cpu);
191 wake_up_process(p); 192 wake_up_process(p);
192 wait_for_completion(&smdata.done); 193 wait_for_completion(&smdata.done);
diff --git a/kernel/sys.c b/kernel/sys.c
index 895d2d4c9493..14e97282eb6c 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1652,7 +1652,7 @@ asmlinkage long sys_umask(int mask)
1652asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, 1652asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1653 unsigned long arg4, unsigned long arg5) 1653 unsigned long arg4, unsigned long arg5)
1654{ 1654{
1655 long uninitialized_var(error); 1655 long error = 0;
1656 1656
1657 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error)) 1657 if (security_task_prctl(option, arg2, arg3, arg4, arg5, &error))
1658 return error; 1658 return error;
@@ -1701,9 +1701,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1701 error = PR_TIMING_STATISTICAL; 1701 error = PR_TIMING_STATISTICAL;
1702 break; 1702 break;
1703 case PR_SET_TIMING: 1703 case PR_SET_TIMING:
1704 if (arg2 == PR_TIMING_STATISTICAL) 1704 if (arg2 != PR_TIMING_STATISTICAL)
1705 error = 0;
1706 else
1707 error = -EINVAL; 1705 error = -EINVAL;
1708 break; 1706 break;
1709 1707
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..6b16e16428d8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -46,6 +46,7 @@
46#include <linux/nfs_fs.h> 46#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 47#include <linux/acpi.h>
48#include <linux/reboot.h> 48#include <linux/reboot.h>
49#include <linux/ftrace.h>
49 50
50#include <asm/uaccess.h> 51#include <asm/uaccess.h>
51#include <asm/processor.h> 52#include <asm/processor.h>
@@ -82,6 +83,9 @@ extern int maps_protect;
82extern int sysctl_stat_interval; 83extern int sysctl_stat_interval;
83extern int latencytop_enabled; 84extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 85extern int sysctl_nr_open_min, sysctl_nr_open_max;
86#ifdef CONFIG_RCU_TORTURE_TEST
87extern int rcutorture_runnable;
88#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
85 89
86/* Constants used for minimum and maximum */ 90/* Constants used for minimum and maximum */
87#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) 91#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
@@ -132,8 +136,6 @@ extern int sysctl_userprocess_debug;
132extern int spin_retry; 136extern int spin_retry;
133#endif 137#endif
134 138
135extern int sysctl_hz_timer;
136
137#ifdef CONFIG_BSD_PROCESS_ACCT 139#ifdef CONFIG_BSD_PROCESS_ACCT
138extern int acct_parm[]; 140extern int acct_parm[];
139#endif 141#endif
@@ -266,6 +268,14 @@ static struct ctl_table kern_table[] = {
266 }, 268 },
267 { 269 {
268 .ctl_name = CTL_UNNUMBERED, 270 .ctl_name = CTL_UNNUMBERED,
271 .procname = "sched_shares_ratelimit",
272 .data = &sysctl_sched_shares_ratelimit,
273 .maxlen = sizeof(unsigned int),
274 .mode = 0644,
275 .proc_handler = &proc_dointvec,
276 },
277 {
278 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_child_runs_first", 279 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_child_runs_first, 280 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int), 281 .maxlen = sizeof(unsigned int),
@@ -455,6 +465,16 @@ static struct ctl_table kern_table[] = {
455 .mode = 0644, 465 .mode = 0644,
456 .proc_handler = &proc_dointvec, 466 .proc_handler = &proc_dointvec,
457 }, 467 },
468#ifdef CONFIG_FTRACE
469 {
470 .ctl_name = CTL_UNNUMBERED,
471 .procname = "ftrace_enabled",
472 .data = &ftrace_enabled,
473 .maxlen = sizeof(int),
474 .mode = 0644,
475 .proc_handler = &ftrace_enable_sysctl,
476 },
477#endif
458#ifdef CONFIG_KMOD 478#ifdef CONFIG_KMOD
459 { 479 {
460 .ctl_name = KERN_MODPROBE, 480 .ctl_name = KERN_MODPROBE,
@@ -563,16 +583,6 @@ static struct ctl_table kern_table[] = {
563 .proc_handler = &proc_dointvec, 583 .proc_handler = &proc_dointvec,
564 }, 584 },
565#endif 585#endif
566#ifdef CONFIG_NO_IDLE_HZ
567 {
568 .ctl_name = KERN_HZ_TIMER,
569 .procname = "hz_timer",
570 .data = &sysctl_hz_timer,
571 .maxlen = sizeof(int),
572 .mode = 0644,
573 .proc_handler = &proc_dointvec,
574 },
575#endif
576 { 586 {
577 .ctl_name = KERN_S390_USER_DEBUG_LOGGING, 587 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
578 .procname = "userprocess_debug", 588 .procname = "userprocess_debug",
@@ -813,6 +823,16 @@ static struct ctl_table kern_table[] = {
813 .child = key_sysctls, 823 .child = key_sysctls,
814 }, 824 },
815#endif 825#endif
826#ifdef CONFIG_RCU_TORTURE_TEST
827 {
828 .ctl_name = CTL_UNNUMBERED,
829 .procname = "rcutorture_runnable",
830 .data = &rcutorture_runnable,
831 .maxlen = sizeof(int),
832 .mode = 0644,
833 .proc_handler = &proc_dointvec,
834 },
835#endif
816/* 836/*
817 * NOTE: do not add new entries to this table unless you have read 837 * NOTE: do not add new entries to this table unless you have read
818 * Documentation/sysctl/ctl_unnumbered.txt 838 * Documentation/sysctl/ctl_unnumbered.txt
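For readers unfamiliar with the CTL_UNNUMBERED convention used by the entries added above, they all share the same shape; a hypothetical subsystem exporting an integer knob would look like the sketch below (names invented, and, per the note above, real additions require reading Documentation/sysctl/ctl_unnumbered.txt first):

#include <linux/sysctl.h>

static int my_feature_enabled;

static struct ctl_table my_kern_table[] = {
	{
		.ctl_name	= CTL_UNNUMBERED,	/* no binary sysctl number assigned */
		.procname	= "my_feature_enabled",	/* name of the /proc/sys file */
		.data		= &my_feature_enabled,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }	/* terminator */
};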
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..f48d0f09d32f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -30,6 +30,7 @@
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock); 32static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force;
33 34
34#ifdef CONFIG_TICK_ONESHOT 35#ifdef CONFIG_TICK_ONESHOT
35static void tick_broadcast_clear_oneshot(int cpu); 36static void tick_broadcast_clear_oneshot(int cpu);
@@ -232,10 +233,11 @@ static void tick_do_broadcast_on_off(void *why)
232 CLOCK_EVT_MODE_SHUTDOWN); 233 CLOCK_EVT_MODE_SHUTDOWN);
233 } 234 }
234 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 235 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
235 dev->features |= CLOCK_EVT_FEAT_DUMMY; 236 tick_broadcast_force = 1;
236 break; 237 break;
237 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 238 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
238 if (cpu_isset(cpu, tick_broadcast_mask)) { 239 if (!tick_broadcast_force &&
240 cpu_isset(cpu, tick_broadcast_mask)) {
239 cpu_clear(cpu, tick_broadcast_mask); 241 cpu_clear(cpu, tick_broadcast_mask);
240 if (td->mode == TICKDEV_MODE_PERIODIC) 242 if (td->mode == TICKDEV_MODE_PERIODIC)
241 tick_setup_periodic(dev, 0); 243 tick_setup_periodic(dev, 0);
@@ -266,7 +268,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
266 "offline CPU #%d\n", *oncpu); 268 "offline CPU #%d\n", *oncpu);
267 else 269 else
268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 270 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
269 &reason, 1, 1); 271 &reason, 1);
270} 272}
271 273
272/* 274/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..a5c26d2b1323 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -48,6 +48,13 @@ static void tick_do_update_jiffies64(ktime_t now)
48 unsigned long ticks = 0; 48 unsigned long ticks = 0;
49 ktime_t delta; 49 ktime_t delta;
50 50
51 /*
52 * Do a quick check without holding xtime_lock:
53 */
54 delta = ktime_sub(now, last_jiffies_update);
55 if (delta.tv64 < tick_period.tv64)
56 return;
57
51 /* Reevaluate with xtime_lock held */ 58 /* Reevaluate with xtime_lock held */
52 write_seqlock(&xtime_lock); 59 write_seqlock(&xtime_lock);
53 60
@@ -188,7 +195,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
188 * Called either from the idle loop or from irq_exit() when an idle period was 195 * Called either from the idle loop or from irq_exit() when an idle period was
189 * just interrupted by an interrupt which did not cause a reschedule. 196 * just interrupted by an interrupt which did not cause a reschedule.
190 */ 197 */
191void tick_nohz_stop_sched_tick(void) 198void tick_nohz_stop_sched_tick(int inidle)
192{ 199{
193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 200 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
194 struct tick_sched *ts; 201 struct tick_sched *ts;
@@ -217,6 +224,11 @@ void tick_nohz_stop_sched_tick(void)
217 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 224 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
218 goto end; 225 goto end;
219 226
227 if (!inidle && !ts->inidle)
228 goto end;
229
230 ts->inidle = 1;
231
220 if (need_resched()) 232 if (need_resched())
221 goto end; 233 goto end;
222 234
@@ -228,6 +240,7 @@ void tick_nohz_stop_sched_tick(void)
228 local_softirq_pending()); 240 local_softirq_pending());
229 ratelimit++; 241 ratelimit++;
230 } 242 }
243 goto end;
231 } 244 }
232 245
233 ts->idle_calls++; 246 ts->idle_calls++;
@@ -276,6 +289,7 @@ void tick_nohz_stop_sched_tick(void)
276 ts->tick_stopped = 1; 289 ts->tick_stopped = 1;
277 ts->idle_jiffies = last_jiffies; 290 ts->idle_jiffies = last_jiffies;
278 rcu_enter_nohz(); 291 rcu_enter_nohz();
292 sched_clock_tick_stop(cpu);
279 } 293 }
280 294
281 /* 295 /*
@@ -364,17 +378,21 @@ void tick_nohz_restart_sched_tick(void)
364 local_irq_disable(); 378 local_irq_disable();
365 tick_nohz_stop_idle(cpu); 379 tick_nohz_stop_idle(cpu);
366 380
367 if (!ts->tick_stopped) { 381 if (!ts->inidle || !ts->tick_stopped) {
382 ts->inidle = 0;
368 local_irq_enable(); 383 local_irq_enable();
369 return; 384 return;
370 } 385 }
371 386
387 ts->inidle = 0;
388
372 rcu_exit_nohz(); 389 rcu_exit_nohz();
373 390
374 /* Update jiffies first */ 391 /* Update jiffies first */
375 select_nohz_load_balancer(0); 392 select_nohz_load_balancer(0);
376 now = ktime_get(); 393 now = ktime_get();
377 tick_do_update_jiffies64(now); 394 tick_do_update_jiffies64(now);
395 sched_clock_tick_start(cpu);
378 cpu_clear(cpu, nohz_cpu_mask); 396 cpu_clear(cpu, nohz_cpu_mask);
379 397
380 /* 398 /*
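The early return added to tick_do_update_jiffies64() is the common "cheap check without the lock, authoritative re-check under it" pattern; a sketch of its shape, reusing the file's own last_jiffies_update, tick_period and xtime_lock but eliding the actual jiffies update:

static void maybe_update_jiffies(ktime_t now)
{
	ktime_t delta;

	/* Quick check without holding xtime_lock: most calls bail out here. */
	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 < tick_period.tv64)
		return;

	write_seqlock(&xtime_lock);
	/* Re-evaluate: another CPU may have advanced last_jiffies_update meanwhile. */
	delta = ktime_sub(now, last_jiffies_update);
	if (delta.tv64 >= tick_period.tv64) {
		/* ... advance jiffies and last_jiffies_update (elided) ... */
	}
	write_sequnlock(&xtime_lock);
}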
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..03bc7f1f1593 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base)
812 spin_unlock_irq(&base->lock); 812 spin_unlock_irq(&base->lock);
813} 813}
814 814
815#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) 815#ifdef CONFIG_NO_HZ
816/* 816/*
817 * Find out when the next timer event is due to happen. This 817 * Find out when the next timer event is due to happen. This
818 * is used on S/390 to stop all activity when a cpu is idle. 818 * is used on S/390 to stop all activity when a cpu is idle.
@@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
947 947
948 return cmp_next_hrtimer_event(now, expires); 948 return cmp_next_hrtimer_event(now, expires);
949} 949}
950
951#ifdef CONFIG_NO_IDLE_HZ
952unsigned long next_timer_interrupt(void)
953{
954 return get_next_timer_interrupt(jiffies);
955}
956#endif
957
958#endif 950#endif
959 951
960#ifndef CONFIG_VIRT_CPU_ACCOUNTING 952#ifndef CONFIG_VIRT_CPU_ACCOUNTING
@@ -1502,7 +1494,7 @@ void __init init_timers(void)
1502 1494
1503 BUG_ON(err == NOTIFY_BAD); 1495 BUG_ON(err == NOTIFY_BAD);
1504 register_cpu_notifier(&timers_nb); 1496 register_cpu_notifier(&timers_nb);
1505 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1497 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1506} 1498}
1507 1499
1508/** 1500/**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..263e9e6bbd60
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,135 @@
1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
3#
4config HAVE_FTRACE
5 bool
6
7config HAVE_DYNAMIC_FTRACE
8 bool
9
10config TRACER_MAX_TRACE
11 bool
12
13config TRACING
14 bool
15 select DEBUG_FS
16 select STACKTRACE
17
18config FTRACE
19 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE
21 select FRAME_POINTER
22 select TRACING
23 select CONTEXT_SWITCH_TRACER
24 help
25 Enable the kernel to trace every kernel function. This is done
26 by using a compiler feature to insert a small, 5-byte No-Operation
27 instruction at the beginning of every kernel function; this NOP
28 sequence is then dynamically patched into a tracer call when
29 tracing is enabled by the administrator. If it's runtime disabled
30 (the bootup default), then the overhead of the instructions is very
31 small and not measurable even in micro-benchmarks.
32
33config IRQSOFF_TRACER
34 bool "Interrupts-off Latency Tracer"
35 default n
36 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME
38 depends on HAVE_FTRACE
39 select TRACE_IRQFLAGS
40 select TRACING
41 select TRACER_MAX_TRACE
42 help
43 This option measures the time spent in irqs-off critical
44 sections, with microsecond accuracy.
45
46 The default measurement method is a maximum search, which is
47 disabled by default and can be runtime (re-)started
48 via:
49
50 echo 0 > /debugfs/tracing/tracing_max_latency
51
52 (Note that kernel size and overhead increase with this option
53 enabled. This option and the preempt-off timing option can be
54 used together or separately.)
55
56config PREEMPT_TRACER
57 bool "Preemption-off Latency Tracer"
58 default n
59 depends on GENERIC_TIME
60 depends on PREEMPT
61 depends on HAVE_FTRACE
62 select TRACING
63 select TRACER_MAX_TRACE
64 help
65 This option measures the time spent in preemption off critical
66 sections, with microsecond accuracy.
67
68 The default measurement method is a maximum search, which is
69 disabled by default and can be runtime (re-)started
70 via:
71
72 echo 0 > /debugfs/tracing/tracing_max_latency
73
74 (Note that kernel size and overhead increase with this option
75 enabled. This option and the irqs-off timing option can be
76 used together or separately.)
77
78config SYSPROF_TRACER
79 bool "Sysprof Tracer"
80 depends on X86
81 select TRACING
82 help
83 This tracer provides the trace needed by the 'Sysprof' userspace
84 tool.
85
86config SCHED_TRACER
87 bool "Scheduling Latency Tracer"
88 depends on HAVE_FTRACE
89 select TRACING
90 select CONTEXT_SWITCH_TRACER
91 select TRACER_MAX_TRACE
92 help
93 This tracer tracks the latency of the highest priority task
94 to be scheduled in, starting from the point it has woken up.
95
96config CONTEXT_SWITCH_TRACER
97 bool "Trace process context switches"
98 depends on HAVE_FTRACE
99 select TRACING
100 select MARKERS
101 help
102 This tracer gets called from the context switch and records
103 all switching of tasks.
104
105config DYNAMIC_FTRACE
106 bool "enable/disable ftrace tracepoints dynamically"
107 depends on FTRACE
108 depends on HAVE_DYNAMIC_FTRACE
109 default y
110 help
111 This option will modify all the calls to ftrace dynamically
112 (will patch them out of the binary image and replace them
113 with a No-Op instruction) as they are called. A table is
114 created to dynamically enable them again.
115
116 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
117 has native performance as long as no tracing is active.
118
119 The changes to the code are done by a kernel thread that
120 wakes up once a second and checks to see if any ftrace calls
121 were made. If so, it runs stop_machine (stops all CPUs)
122 and modifies the code to jump over the call to ftrace.
123
124config FTRACE_SELFTEST
125 bool
126
127config FTRACE_STARTUP_TEST
128 bool "Perform a startup test on ftrace"
129 depends on TRACING
130 select FTRACE_SELFTEST
131 help
132 This option performs a series of startup tests on ftrace. On bootup
133 a series of tests is run to verify that the tracer is
134 functioning properly. It will do tests on all the configured
135 tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..71d17de17288
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,24 @@
1
2# Do not instrument the tracer itself:
3
4ifdef CONFIG_FTRACE
5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7
8# selftest needs instrumentation
9CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o
11endif
12
13obj-$(CONFIG_FTRACE) += libftrace.o
14
15obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
17obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
18obj-$(CONFIG_FTRACE) += trace_functions.o
19obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
22obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
23
24libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..4231a3dc224a
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1727 @@
1/*
2 * Infrastructure for profiling code inserted by 'gcc -pg'.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally ported from the -rt patch by:
8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code in the latency_tracer, that is:
11 *
12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III
14 */
15
16#include <linux/stop_machine.h>
17#include <linux/clocksource.h>
18#include <linux/kallsyms.h>
19#include <linux/seq_file.h>
20#include <linux/debugfs.h>
21#include <linux/hardirq.h>
22#include <linux/kthread.h>
23#include <linux/uaccess.h>
24#include <linux/kprobes.h>
25#include <linux/ftrace.h>
26#include <linux/sysctl.h>
27#include <linux/ctype.h>
28#include <linux/hash.h>
29#include <linux/list.h>
30
31#include <asm/ftrace.h>
32
33#include "trace.h"
34
35/* ftrace_enabled is a method to turn ftrace on or off */
36int ftrace_enabled __read_mostly;
37static int last_ftrace_enabled;
38
39/*
40 * ftrace_disabled is set when an anomaly is discovered.
41 * ftrace_disabled is much stronger than ftrace_enabled.
42 */
43static int ftrace_disabled __read_mostly;
44
45static DEFINE_SPINLOCK(ftrace_lock);
46static DEFINE_MUTEX(ftrace_sysctl_lock);
47
48static struct ftrace_ops ftrace_list_end __read_mostly =
49{
50 .func = ftrace_stub,
51};
52
53static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
54ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
55
56static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
57{
58 struct ftrace_ops *op = ftrace_list;
59
60 /* in case someone actually ports this to alpha! */
61 read_barrier_depends();
62
63 while (op != &ftrace_list_end) {
64 /* silly alpha */
65 read_barrier_depends();
66 op->func(ip, parent_ip);
67 op = op->next;
68 }
69}
70
71/**
72 * clear_ftrace_function - reset the ftrace function
73 *
74 * This NULLs the ftrace function and in essence stops
75 * tracing. There may be lag
76 */
77void clear_ftrace_function(void)
78{
79 ftrace_trace_function = ftrace_stub;
80}
81
82static int __register_ftrace_function(struct ftrace_ops *ops)
83{
84 /* Should never be called by interrupts */
85 spin_lock(&ftrace_lock);
86
87 ops->next = ftrace_list;
88 /*
89 * We are entering ops into the ftrace_list but another
90 * CPU might be walking that list. We need to make sure
91 * the ops->next pointer is valid before another CPU sees
92 * the ops pointer included into the ftrace_list.
93 */
94 smp_wmb();
95 ftrace_list = ops;
96
97 if (ftrace_enabled) {
98 /*
99 * For one func, simply call it directly.
100 * For more than one func, call the chain.
101 */
102 if (ops->next == &ftrace_list_end)
103 ftrace_trace_function = ops->func;
104 else
105 ftrace_trace_function = ftrace_list_func;
106 }
107
108 spin_unlock(&ftrace_lock);
109
110 return 0;
111}
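/*
 * Editor's note, not part of the patch: a hedged usage sketch. A tracer gets
 * its callback onto the ftrace_list above through the public
 * register_ftrace_function() wrapper (declared in linux/ftrace.h, defined
 * later in this file); the callback matches ftrace_func_t. Names below are
 * illustrative only.
 */
#if 0	/* illustration only */
static void my_trace_func(unsigned long ip, unsigned long parent_ip)
{
	/* called for every traced function; must be fast and re-entrant */
}

static struct ftrace_ops my_trace_ops __read_mostly =
{
	.func = my_trace_func,
};

static int __init my_tracer_init(void)
{
	return register_ftrace_function(&my_trace_ops);
}
#endif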
112
113static int __unregister_ftrace_function(struct ftrace_ops *ops)
114{
115 struct ftrace_ops **p;
116 int ret = 0;
117
118 spin_lock(&ftrace_lock);
119
120 /*
121 * If we are removing the last function, then simply point
122 * to the ftrace_stub.
123 */
124 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
125 ftrace_trace_function = ftrace_stub;
126 ftrace_list = &ftrace_list_end;
127 goto out;
128 }
129
130 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
131 if (*p == ops)
132 break;
133
134 if (*p != ops) {
135 ret = -1;
136 goto out;
137 }
138
139 *p = (*p)->next;
140
141 if (ftrace_enabled) {
142 /* If we only have one func left, then call that directly */
143 if (ftrace_list == &ftrace_list_end ||
144 ftrace_list->next == &ftrace_list_end)
145 ftrace_trace_function = ftrace_list->func;
146 }
147
148 out:
149 spin_unlock(&ftrace_lock);
150
151 return ret;
152}
153
154#ifdef CONFIG_DYNAMIC_FTRACE
155
156static struct task_struct *ftraced_task;
157
158enum {
159 FTRACE_ENABLE_CALLS = (1 << 0),
160 FTRACE_DISABLE_CALLS = (1 << 1),
161 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
162 FTRACE_ENABLE_MCOUNT = (1 << 3),
163 FTRACE_DISABLE_MCOUNT = (1 << 4),
164};
165
166static int ftrace_filtered;
167static int tracing_on;
168static int frozen_record_count;
169
170static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
171
172static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
173
174static DEFINE_SPINLOCK(ftrace_shutdown_lock);
175static DEFINE_MUTEX(ftraced_lock);
176static DEFINE_MUTEX(ftrace_regex_lock);
177
178struct ftrace_page {
179 struct ftrace_page *next;
180 unsigned long index;
181 struct dyn_ftrace records[];
182};
183
184#define ENTRIES_PER_PAGE \
185 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
186
187/* estimate from running different kernels */
188#define NR_TO_INIT 10000
189
190static struct ftrace_page *ftrace_pages_start;
191static struct ftrace_page *ftrace_pages;
192
193static int ftraced_trigger;
194static int ftraced_suspend;
195static int ftraced_stop;
196
197static int ftrace_record_suspend;
198
199static struct dyn_ftrace *ftrace_free_records;
200
201
202#ifdef CONFIG_KPROBES
203static inline void freeze_record(struct dyn_ftrace *rec)
204{
205 if (!(rec->flags & FTRACE_FL_FROZEN)) {
206 rec->flags |= FTRACE_FL_FROZEN;
207 frozen_record_count++;
208 }
209}
210
211static inline void unfreeze_record(struct dyn_ftrace *rec)
212{
213 if (rec->flags & FTRACE_FL_FROZEN) {
214 rec->flags &= ~FTRACE_FL_FROZEN;
215 frozen_record_count--;
216 }
217}
218
219static inline int record_frozen(struct dyn_ftrace *rec)
220{
221 return rec->flags & FTRACE_FL_FROZEN;
222}
223#else
224# define freeze_record(rec) ({ 0; })
225# define unfreeze_record(rec) ({ 0; })
226# define record_frozen(rec) ({ 0; })
227#endif /* CONFIG_KPROBES */
228
229int skip_trace(unsigned long ip)
230{
231 unsigned long fl;
232 struct dyn_ftrace *rec;
233 struct hlist_node *t;
234 struct hlist_head *head;
235
236 if (frozen_record_count == 0)
237 return 0;
238
239 head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
240 hlist_for_each_entry_rcu(rec, t, head, node) {
241 if (rec->ip == ip) {
242 if (record_frozen(rec)) {
243 if (rec->flags & FTRACE_FL_FAILED)
244 return 1;
245
246 if (!(rec->flags & FTRACE_FL_CONVERTED))
247 return 1;
248
249 if (!tracing_on || !ftrace_enabled)
250 return 1;
251
252 if (ftrace_filtered) {
253 fl = rec->flags & (FTRACE_FL_FILTER |
254 FTRACE_FL_NOTRACE);
255 if (!fl || (fl & FTRACE_FL_NOTRACE))
256 return 1;
257 }
258 }
259 break;
260 }
261 }
262
263 return 0;
264}
265
266static inline int
267ftrace_ip_in_hash(unsigned long ip, unsigned long key)
268{
269 struct dyn_ftrace *p;
270 struct hlist_node *t;
271 int found = 0;
272
273 hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
274 if (p->ip == ip) {
275 found = 1;
276 break;
277 }
278 }
279
280 return found;
281}
282
283static inline void
284ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
285{
286 hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
287}
288
289/* called from kstop_machine */
290static inline void ftrace_del_hash(struct dyn_ftrace *node)
291{
292 hlist_del(&node->node);
293}
294
295static void ftrace_free_rec(struct dyn_ftrace *rec)
296{
297 /* no locking, only called from kstop_machine */
298
299 rec->ip = (unsigned long)ftrace_free_records;
300 ftrace_free_records = rec;
301 rec->flags |= FTRACE_FL_FREE;
302}
303
304static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
305{
306 struct dyn_ftrace *rec;
307
308 /* First check for freed records */
309 if (ftrace_free_records) {
310 rec = ftrace_free_records;
311
312 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
313 WARN_ON_ONCE(1);
314 ftrace_free_records = NULL;
315 ftrace_disabled = 1;
316 ftrace_enabled = 0;
317 return NULL;
318 }
319
320 ftrace_free_records = (void *)rec->ip;
321 memset(rec, 0, sizeof(*rec));
322 return rec;
323 }
324
325 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
326 if (!ftrace_pages->next)
327 return NULL;
328 ftrace_pages = ftrace_pages->next;
329 }
330
331 return &ftrace_pages->records[ftrace_pages->index++];
332}
333
334static void
335ftrace_record_ip(unsigned long ip)
336{
337 struct dyn_ftrace *node;
338 unsigned long flags;
339 unsigned long key;
340 int resched;
341 int atomic;
342 int cpu;
343
344 if (!ftrace_enabled || ftrace_disabled)
345 return;
346
347 resched = need_resched();
348 preempt_disable_notrace();
349
350 /*
351 * We simply need to protect against recursion.
352 * Use the raw version of smp_processor_id and not
353 * __get_cpu_var which can call debug hooks that can
354 * cause a recursive crash here.
355 */
356 cpu = raw_smp_processor_id();
357 per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
358 if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
359 goto out;
360
361 if (unlikely(ftrace_record_suspend))
362 goto out;
363
364 key = hash_long(ip, FTRACE_HASHBITS);
365
366 WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
367
368 if (ftrace_ip_in_hash(ip, key))
369 goto out;
370
371 atomic = irqs_disabled();
372
373 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
374
375 /* This ip may have hit the hash before the lock */
376 if (ftrace_ip_in_hash(ip, key))
377 goto out_unlock;
378
379 node = ftrace_alloc_dyn_node(ip);
380 if (!node)
381 goto out_unlock;
382
383 node->ip = ip;
384
385 ftrace_add_hash(node, key);
386
387 ftraced_trigger = 1;
388
389 out_unlock:
390 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
391 out:
392 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
393
394 /* prevent recursion with scheduler */
395 if (resched)
396 preempt_enable_no_resched_notrace();
397 else
398 preempt_enable_notrace();
399}
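/*
 * Editor's note, not part of the patch: the resched-aware pattern used in
 * ftrace_record_ip() above, shown in isolation. need_resched() is sampled
 * before disabling preemption; if a reschedule was already pending, the
 * *_no_resched_notrace variant is used on the way out so the tracer does not
 * itself call into the scheduler (see the "prevent recursion with scheduler"
 * comment above). Function name is invented.
 */
#if 0	/* illustration only */
static void traced_hot_path(void)
{
	int resched = need_resched();

	preempt_disable_notrace();
	/* ... work that must not recurse into the tracer or scheduler ... */
	if (resched)
		preempt_enable_no_resched_notrace();
	else
		preempt_enable_notrace();
}
#endif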
400
401#define FTRACE_ADDR ((long)(ftrace_caller))
402
403static int
404__ftrace_replace_code(struct dyn_ftrace *rec,
405 unsigned char *old, unsigned char *new, int enable)
406{
407 unsigned long ip, fl;
408
409 ip = rec->ip;
410
411 if (ftrace_filtered && enable) {
412 /*
413 * If filtering is on:
414 *
415 * If this record is set to be filtered and
416 * is enabled then do nothing.
417 *
418 * If this record is set to be filtered and
419 * it is not enabled, enable it.
420 *
421 * If this record is not set to be filtered
422 * and it is not enabled do nothing.
423 *
424 * If this record is set not to trace then
425 * do nothing.
426 *
427 * If this record is set not to trace and
428 * it is enabled then disable it.
429 *
430 * If this record is not set to be filtered and
431 * it is enabled, disable it.
432 */
433
434 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
435 FTRACE_FL_ENABLED);
436
437 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
438 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
439 !fl || (fl == FTRACE_FL_NOTRACE))
440 return 0;
441
442 /*
443 * If it is enabled disable it,
444 * otherwise enable it!
445 */
446 if (fl & FTRACE_FL_ENABLED) {
447 /* swap new and old */
448 new = old;
449 old = ftrace_call_replace(ip, FTRACE_ADDR);
450 rec->flags &= ~FTRACE_FL_ENABLED;
451 } else {
452 new = ftrace_call_replace(ip, FTRACE_ADDR);
453 rec->flags |= FTRACE_FL_ENABLED;
454 }
455 } else {
456
457 if (enable) {
458 /*
459 * If this record is set not to trace and is
460 * not enabled, do nothing.
461 */
462 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
463 if (fl == FTRACE_FL_NOTRACE)
464 return 0;
465
466 new = ftrace_call_replace(ip, FTRACE_ADDR);
467 } else
468 old = ftrace_call_replace(ip, FTRACE_ADDR);
469
470 if (enable) {
471 if (rec->flags & FTRACE_FL_ENABLED)
472 return 0;
473 rec->flags |= FTRACE_FL_ENABLED;
474 } else {
475 if (!(rec->flags & FTRACE_FL_ENABLED))
476 return 0;
477 rec->flags &= ~FTRACE_FL_ENABLED;
478 }
479 }
480
481 return ftrace_modify_code(ip, old, new);
482}
483
484static void ftrace_replace_code(int enable)
485{
486 int i, failed;
487 unsigned char *new = NULL, *old = NULL;
488 struct dyn_ftrace *rec;
489 struct ftrace_page *pg;
490
491 if (enable)
492 old = ftrace_nop_replace();
493 else
494 new = ftrace_nop_replace();
495
496 for (pg = ftrace_pages_start; pg; pg = pg->next) {
497 for (i = 0; i < pg->index; i++) {
498 rec = &pg->records[i];
499
500 /* don't modify code that has already faulted */
501 if (rec->flags & FTRACE_FL_FAILED)
502 continue;
503
504 /* ignore updates to this record's mcount site */
505 if (get_kprobe((void *)rec->ip)) {
506 freeze_record(rec);
507 continue;
508 } else {
509 unfreeze_record(rec);
510 }
511
512 failed = __ftrace_replace_code(rec, old, new, enable);
513 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
514 rec->flags |= FTRACE_FL_FAILED;
515 if ((system_state == SYSTEM_BOOTING) ||
516 !core_kernel_text(rec->ip)) {
517 ftrace_del_hash(rec);
518 ftrace_free_rec(rec);
519 }
520 }
521 }
522 }
523}
524
525static void ftrace_shutdown_replenish(void)
526{
527 if (ftrace_pages->next)
528 return;
529
530 /* allocate another page */
531 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
532}
533
534static int
535ftrace_code_disable(struct dyn_ftrace *rec)
536{
537 unsigned long ip;
538 unsigned char *nop, *call;
539 int failed;
540
541 ip = rec->ip;
542
543 nop = ftrace_nop_replace();
544 call = ftrace_call_replace(ip, MCOUNT_ADDR);
545
546 failed = ftrace_modify_code(ip, call, nop);
547 if (failed) {
548 rec->flags |= FTRACE_FL_FAILED;
549 return 0;
550 }
551 return 1;
552}
553
554static int __ftrace_update_code(void *ignore);
555
556static int __ftrace_modify_code(void *data)
557{
558 unsigned long addr;
559 int *command = data;
560
561 if (*command & FTRACE_ENABLE_CALLS) {
562 /*
563 * Update any recorded ips now that we have the
564 * machine stopped
565 */
566 __ftrace_update_code(NULL);
567 ftrace_replace_code(1);
568 tracing_on = 1;
569 } else if (*command & FTRACE_DISABLE_CALLS) {
570 ftrace_replace_code(0);
571 tracing_on = 0;
572 }
573
574 if (*command & FTRACE_UPDATE_TRACE_FUNC)
575 ftrace_update_ftrace_func(ftrace_trace_function);
576
577 if (*command & FTRACE_ENABLE_MCOUNT) {
578 addr = (unsigned long)ftrace_record_ip;
579 ftrace_mcount_set(&addr);
580 } else if (*command & FTRACE_DISABLE_MCOUNT) {
581 addr = (unsigned long)ftrace_stub;
582 ftrace_mcount_set(&addr);
583 }
584
585 return 0;
586}
587
588static void ftrace_run_update_code(int command)
589{
590 stop_machine_run(__ftrace_modify_code, &command, NR_CPUS);
591}
592
593void ftrace_disable_daemon(void)
594{
595 /* Stop the daemon from calling kstop_machine */
596 mutex_lock(&ftraced_lock);
597 ftraced_stop = 1;
598 mutex_unlock(&ftraced_lock);
599
600 ftrace_force_update();
601}
602
603void ftrace_enable_daemon(void)
604{
605 mutex_lock(&ftraced_lock);
606 ftraced_stop = 0;
607 mutex_unlock(&ftraced_lock);
608
609 ftrace_force_update();
610}
611
612static ftrace_func_t saved_ftrace_func;
613
614static void ftrace_startup(void)
615{
616 int command = 0;
617
618 if (unlikely(ftrace_disabled))
619 return;
620
621 mutex_lock(&ftraced_lock);
622 ftraced_suspend++;
623 if (ftraced_suspend == 1)
624 command |= FTRACE_ENABLE_CALLS;
625
626 if (saved_ftrace_func != ftrace_trace_function) {
627 saved_ftrace_func = ftrace_trace_function;
628 command |= FTRACE_UPDATE_TRACE_FUNC;
629 }
630
631 if (!command || !ftrace_enabled)
632 goto out;
633
634 ftrace_run_update_code(command);
635 out:
636 mutex_unlock(&ftraced_lock);
637}
638
639static void ftrace_shutdown(void)
640{
641 int command = 0;
642
643 if (unlikely(ftrace_disabled))
644 return;
645
646 mutex_lock(&ftraced_lock);
647 ftraced_suspend--;
648 if (!ftraced_suspend)
649 command |= FTRACE_DISABLE_CALLS;
650
651 if (saved_ftrace_func != ftrace_trace_function) {
652 saved_ftrace_func = ftrace_trace_function;
653 command |= FTRACE_UPDATE_TRACE_FUNC;
654 }
655
656 if (!command || !ftrace_enabled)
657 goto out;
658
659 ftrace_run_update_code(command);
660 out:
661 mutex_unlock(&ftraced_lock);
662}
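/*
 * ftraced_suspend above doubles as a use count: the first
 * ftrace_startup() turns the call sites on, the last ftrace_shutdown()
 * turns them off again.  A minimal userspace sketch of that pattern;
 * enable_calls()/disable_calls() stand in for the real code patching.
 */
#include <stdio.h>

static int users;

static void enable_calls(void)  { printf("calls enabled\n"); }
static void disable_calls(void) { printf("calls disabled\n"); }

static void startup(void)
{
	if (++users == 1)
		enable_calls();
}

static void shutdown(void)
{
	if (--users == 0)
		disable_calls();
}

int main(void)
{
	startup();	/* first user: enables */
	startup();	/* second user: no-op */
	shutdown();	/* one user still left: no-op */
	shutdown();	/* last user: disables */
	return 0;
}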
663
664static void ftrace_startup_sysctl(void)
665{
666 int command = FTRACE_ENABLE_MCOUNT;
667
668 if (unlikely(ftrace_disabled))
669 return;
670
671 mutex_lock(&ftraced_lock);
672 /* Force update next time */
673 saved_ftrace_func = NULL;
674 /* ftraced_suspend is true if we want ftrace running */
675 if (ftraced_suspend)
676 command |= FTRACE_ENABLE_CALLS;
677
678 ftrace_run_update_code(command);
679 mutex_unlock(&ftraced_lock);
680}
681
682static void ftrace_shutdown_sysctl(void)
683{
684 int command = FTRACE_DISABLE_MCOUNT;
685
686 if (unlikely(ftrace_disabled))
687 return;
688
689 mutex_lock(&ftraced_lock);
690 /* ftraced_suspend is true if ftrace is running */
691 if (ftraced_suspend)
692 command |= FTRACE_DISABLE_CALLS;
693
694 ftrace_run_update_code(command);
695 mutex_unlock(&ftraced_lock);
696}
697
698static cycle_t ftrace_update_time;
699static unsigned long ftrace_update_cnt;
700unsigned long ftrace_update_tot_cnt;
701
702static int __ftrace_update_code(void *ignore)
703{
704 int i, save_ftrace_enabled;
705 cycle_t start, stop;
706 struct dyn_ftrace *p;
707 struct hlist_node *t, *n;
708 struct hlist_head *head, temp_list;
709
710 /* Don't be recording funcs now */
711 ftrace_record_suspend++;
712 save_ftrace_enabled = ftrace_enabled;
713 ftrace_enabled = 0;
714
715 start = ftrace_now(raw_smp_processor_id());
716 ftrace_update_cnt = 0;
717
718 /* No locks needed, the machine is stopped! */
719 for (i = 0; i < FTRACE_HASHSIZE; i++) {
720 INIT_HLIST_HEAD(&temp_list);
721 head = &ftrace_hash[i];
722
723		/* all CPUs are stopped, we are safe to modify code */
724 hlist_for_each_entry_safe(p, t, n, head, node) {
725 /* Skip over failed records which have not been
726 * freed. */
727 if (p->flags & FTRACE_FL_FAILED)
728 continue;
729
730 /* Unconverted records are always at the head of the
731 * hash bucket. Once we encounter a converted record,
732 * simply skip over to the next bucket. Saves ftraced
733			 * some processor cycles (ftrace does its bit for
734 * global warming :-p ). */
735 if (p->flags & (FTRACE_FL_CONVERTED))
736 break;
737
738 /* Ignore updates to this record's mcount site.
739 * Reintroduce this record at the head of this
740 * bucket to attempt to "convert" it again if
741 * the kprobe on it is unregistered before the
742 * next run. */
743 if (get_kprobe((void *)p->ip)) {
744 ftrace_del_hash(p);
745 INIT_HLIST_NODE(&p->node);
746 hlist_add_head(&p->node, &temp_list);
747 freeze_record(p);
748 continue;
749 } else {
750 unfreeze_record(p);
751 }
752
753 /* convert record (i.e, patch mcount-call with NOP) */
754 if (ftrace_code_disable(p)) {
755 p->flags |= FTRACE_FL_CONVERTED;
756 ftrace_update_cnt++;
757 } else {
758 if ((system_state == SYSTEM_BOOTING) ||
759 !core_kernel_text(p->ip)) {
760 ftrace_del_hash(p);
761 ftrace_free_rec(p);
762 }
763 }
764 }
765
766 hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
767 hlist_del(&p->node);
768 INIT_HLIST_NODE(&p->node);
769 hlist_add_head(&p->node, head);
770 }
771 }
772
773 stop = ftrace_now(raw_smp_processor_id());
774 ftrace_update_time = stop - start;
775 ftrace_update_tot_cnt += ftrace_update_cnt;
776 ftraced_trigger = 0;
777
778 ftrace_enabled = save_ftrace_enabled;
779 ftrace_record_suspend--;
780
781 return 0;
782}
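/*
 * Sketch of the "park and re-add" step used in __ftrace_update_code()
 * above: records that cannot be converted yet are moved to a temporary
 * list while the bucket is walked and then pushed back at the head, so
 * the next pass retries them.  Plain singly linked lists replace the
 * kernel's hlists here; the data is made up.
 */
#include <stdio.h>
#include <stdlib.h>

struct rec { int id; int blocked; struct rec *next; };

static struct rec *push(struct rec *head, int id, int blocked)
{
	struct rec *r = malloc(sizeof(*r));

	if (!r)
		exit(1);
	r->id = id;
	r->blocked = blocked;
	r->next = head;
	return r;
}

int main(void)
{
	struct rec *bucket, *parked = NULL, *r, *next;

	bucket = push(push(push(NULL, 1, 0), 2, 1), 3, 0);

	/* walk the bucket: park blocked records, "convert" the rest */
	for (r = bucket, bucket = NULL; r; r = next) {
		next = r->next;
		if (r->blocked) {
			r->next = parked;
			parked = r;
		} else {
			printf("converted record %d\n", r->id);
			free(r);
		}
	}

	/* re-add the parked records at the head for the next pass */
	for (r = parked; r; r = next) {
		next = r->next;
		r->next = bucket;
		bucket = r;
	}

	for (r = bucket; r; r = r->next)
		printf("record %d still pending\n", r->id);
	return 0;
}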
783
784static int ftrace_update_code(void)
785{
786 if (unlikely(ftrace_disabled) ||
787 !ftrace_enabled || !ftraced_trigger)
788 return 0;
789
790 stop_machine_run(__ftrace_update_code, NULL, NR_CPUS);
791
792 return 1;
793}
794
795static int ftraced(void *ignore)
796{
797 unsigned long usecs;
798
799 while (!kthread_should_stop()) {
800
801 set_current_state(TASK_INTERRUPTIBLE);
802
803 /* check once a second */
804 schedule_timeout(HZ);
805
806 if (unlikely(ftrace_disabled))
807 continue;
808
809 mutex_lock(&ftrace_sysctl_lock);
810 mutex_lock(&ftraced_lock);
811 if (!ftraced_suspend && !ftraced_stop &&
812 ftrace_update_code()) {
813 usecs = nsecs_to_usecs(ftrace_update_time);
814 if (ftrace_update_tot_cnt > 100000) {
815 ftrace_update_tot_cnt = 0;
816 pr_info("hm, dftrace overflow: %lu change%s"
817 " (%lu total) in %lu usec%s\n",
818 ftrace_update_cnt,
819 ftrace_update_cnt != 1 ? "s" : "",
820 ftrace_update_tot_cnt,
821 usecs, usecs != 1 ? "s" : "");
822 ftrace_disabled = 1;
823 WARN_ON_ONCE(1);
824 }
825 }
826 mutex_unlock(&ftraced_lock);
827 mutex_unlock(&ftrace_sysctl_lock);
828
829 ftrace_shutdown_replenish();
830 }
831 __set_current_state(TASK_RUNNING);
832 return 0;
833}
834
835static int __init ftrace_dyn_table_alloc(void)
836{
837 struct ftrace_page *pg;
838 int cnt;
839 int i;
840
841 /* allocate a few pages */
842 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
843 if (!ftrace_pages_start)
844 return -1;
845
846 /*
847 * Allocate a few more pages.
848 *
849 * TODO: have some parser search vmlinux before
850 * final linking to find all calls to ftrace.
851 * Then we can:
852 * a) know how many pages to allocate.
853 * and/or
854 * b) set up the table then.
855 *
856 * The dynamic code is still necessary for
857 * modules.
858 */
859
860 pg = ftrace_pages = ftrace_pages_start;
861
862 cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
863
864 for (i = 0; i < cnt; i++) {
865 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
866
867 /* If we fail, we'll try later anyway */
868 if (!pg->next)
869 break;
870
871 pg = pg->next;
872 }
873
874 return 0;
875}
876
877enum {
878 FTRACE_ITER_FILTER = (1 << 0),
879 FTRACE_ITER_CONT = (1 << 1),
880 FTRACE_ITER_NOTRACE = (1 << 2),
881 FTRACE_ITER_FAILURES = (1 << 3),
882};
883
884#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
885
886struct ftrace_iterator {
887 loff_t pos;
888 struct ftrace_page *pg;
889 unsigned idx;
890 unsigned flags;
891 unsigned char buffer[FTRACE_BUFF_MAX+1];
892 unsigned buffer_idx;
893 unsigned filtered;
894};
895
896static void *
897t_next(struct seq_file *m, void *v, loff_t *pos)
898{
899 struct ftrace_iterator *iter = m->private;
900 struct dyn_ftrace *rec = NULL;
901
902 (*pos)++;
903
904 retry:
905 if (iter->idx >= iter->pg->index) {
906 if (iter->pg->next) {
907 iter->pg = iter->pg->next;
908 iter->idx = 0;
909 goto retry;
910 }
911 } else {
912 rec = &iter->pg->records[iter->idx++];
913 if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
914 (rec->flags & FTRACE_FL_FAILED)) ||
915
916 ((iter->flags & FTRACE_ITER_FAILURES) &&
917 (!(rec->flags & FTRACE_FL_FAILED) ||
918 (rec->flags & FTRACE_FL_FREE))) ||
919
920 ((iter->flags & FTRACE_ITER_FILTER) &&
921 !(rec->flags & FTRACE_FL_FILTER)) ||
922
923 ((iter->flags & FTRACE_ITER_NOTRACE) &&
924 !(rec->flags & FTRACE_FL_NOTRACE))) {
925 rec = NULL;
926 goto retry;
927 }
928 }
929
930 iter->pos = *pos;
931
932 return rec;
933}
934
935static void *t_start(struct seq_file *m, loff_t *pos)
936{
937 struct ftrace_iterator *iter = m->private;
938 void *p = NULL;
939 loff_t l = -1;
940
941 if (*pos != iter->pos) {
942 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
943 ;
944 } else {
945 l = *pos;
946 p = t_next(m, p, &l);
947 }
948
949 return p;
950}
951
952static void t_stop(struct seq_file *m, void *p)
953{
954}
955
956static int t_show(struct seq_file *m, void *v)
957{
958 struct dyn_ftrace *rec = v;
959 char str[KSYM_SYMBOL_LEN];
960
961 if (!rec)
962 return 0;
963
964 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
965
966 seq_printf(m, "%s\n", str);
967
968 return 0;
969}
970
971static struct seq_operations show_ftrace_seq_ops = {
972 .start = t_start,
973 .next = t_next,
974 .stop = t_stop,
975 .show = t_show,
976};
977
978static int
979ftrace_avail_open(struct inode *inode, struct file *file)
980{
981 struct ftrace_iterator *iter;
982 int ret;
983
984 if (unlikely(ftrace_disabled))
985 return -ENODEV;
986
987 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
988 if (!iter)
989 return -ENOMEM;
990
991 iter->pg = ftrace_pages_start;
992 iter->pos = -1;
993
994 ret = seq_open(file, &show_ftrace_seq_ops);
995 if (!ret) {
996 struct seq_file *m = file->private_data;
997
998 m->private = iter;
999 } else {
1000 kfree(iter);
1001 }
1002
1003 return ret;
1004}
1005
1006int ftrace_avail_release(struct inode *inode, struct file *file)
1007{
1008 struct seq_file *m = (struct seq_file *)file->private_data;
1009 struct ftrace_iterator *iter = m->private;
1010
1011 seq_release(inode, file);
1012 kfree(iter);
1013
1014 return 0;
1015}
1016
1017static int
1018ftrace_failures_open(struct inode *inode, struct file *file)
1019{
1020 int ret;
1021 struct seq_file *m;
1022 struct ftrace_iterator *iter;
1023
1024 ret = ftrace_avail_open(inode, file);
1025 if (!ret) {
1026 m = (struct seq_file *)file->private_data;
1027 iter = (struct ftrace_iterator *)m->private;
1028 iter->flags = FTRACE_ITER_FAILURES;
1029 }
1030
1031 return ret;
1032}
1033
1034
1035static void ftrace_filter_reset(int enable)
1036{
1037 struct ftrace_page *pg;
1038 struct dyn_ftrace *rec;
1039 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1040 unsigned i;
1041
1042 /* keep kstop machine from running */
1043 preempt_disable();
1044 if (enable)
1045 ftrace_filtered = 0;
1046 pg = ftrace_pages_start;
1047 while (pg) {
1048 for (i = 0; i < pg->index; i++) {
1049 rec = &pg->records[i];
1050 if (rec->flags & FTRACE_FL_FAILED)
1051 continue;
1052 rec->flags &= ~type;
1053 }
1054 pg = pg->next;
1055 }
1056 preempt_enable();
1057}
1058
1059static int
1060ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1061{
1062 struct ftrace_iterator *iter;
1063 int ret = 0;
1064
1065 if (unlikely(ftrace_disabled))
1066 return -ENODEV;
1067
1068 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1069 if (!iter)
1070 return -ENOMEM;
1071
1072 mutex_lock(&ftrace_regex_lock);
1073 if ((file->f_mode & FMODE_WRITE) &&
1074 !(file->f_flags & O_APPEND))
1075 ftrace_filter_reset(enable);
1076
1077 if (file->f_mode & FMODE_READ) {
1078 iter->pg = ftrace_pages_start;
1079 iter->pos = -1;
1080 iter->flags = enable ? FTRACE_ITER_FILTER :
1081 FTRACE_ITER_NOTRACE;
1082
1083 ret = seq_open(file, &show_ftrace_seq_ops);
1084 if (!ret) {
1085 struct seq_file *m = file->private_data;
1086 m->private = iter;
1087 } else
1088 kfree(iter);
1089 } else
1090 file->private_data = iter;
1091 mutex_unlock(&ftrace_regex_lock);
1092
1093 return ret;
1094}
1095
1096static int
1097ftrace_filter_open(struct inode *inode, struct file *file)
1098{
1099 return ftrace_regex_open(inode, file, 1);
1100}
1101
1102static int
1103ftrace_notrace_open(struct inode *inode, struct file *file)
1104{
1105 return ftrace_regex_open(inode, file, 0);
1106}
1107
1108static ssize_t
1109ftrace_regex_read(struct file *file, char __user *ubuf,
1110 size_t cnt, loff_t *ppos)
1111{
1112 if (file->f_mode & FMODE_READ)
1113 return seq_read(file, ubuf, cnt, ppos);
1114 else
1115 return -EPERM;
1116}
1117
1118static loff_t
1119ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1120{
1121 loff_t ret;
1122
1123 if (file->f_mode & FMODE_READ)
1124 ret = seq_lseek(file, offset, origin);
1125 else
1126 file->f_pos = ret = 1;
1127
1128 return ret;
1129}
1130
1131enum {
1132 MATCH_FULL,
1133 MATCH_FRONT_ONLY,
1134 MATCH_MIDDLE_ONLY,
1135 MATCH_END_ONLY,
1136};
1137
1138static void
1139ftrace_match(unsigned char *buff, int len, int enable)
1140{
1141 char str[KSYM_SYMBOL_LEN];
1142 char *search = NULL;
1143 struct ftrace_page *pg;
1144 struct dyn_ftrace *rec;
1145 int type = MATCH_FULL;
1146 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1147 unsigned i, match = 0, search_len = 0;
1148
1149 for (i = 0; i < len; i++) {
1150 if (buff[i] == '*') {
1151 if (!i) {
1152 search = buff + i + 1;
1153 type = MATCH_END_ONLY;
1154 search_len = len - (i + 1);
1155 } else {
1156 if (type == MATCH_END_ONLY) {
1157 type = MATCH_MIDDLE_ONLY;
1158 } else {
1159 match = i;
1160 type = MATCH_FRONT_ONLY;
1161 }
1162 buff[i] = 0;
1163 break;
1164 }
1165 }
1166 }
1167
1168 /* keep kstop machine from running */
1169 preempt_disable();
1170 if (enable)
1171 ftrace_filtered = 1;
1172 pg = ftrace_pages_start;
1173 while (pg) {
1174 for (i = 0; i < pg->index; i++) {
1175 int matched = 0;
1176 char *ptr;
1177
1178 rec = &pg->records[i];
1179 if (rec->flags & FTRACE_FL_FAILED)
1180 continue;
1181 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1182 switch (type) {
1183 case MATCH_FULL:
1184 if (strcmp(str, buff) == 0)
1185 matched = 1;
1186 break;
1187 case MATCH_FRONT_ONLY:
1188 if (memcmp(str, buff, match) == 0)
1189 matched = 1;
1190 break;
1191 case MATCH_MIDDLE_ONLY:
1192 if (strstr(str, search))
1193 matched = 1;
1194 break;
1195 case MATCH_END_ONLY:
1196 ptr = strstr(str, search);
1197 if (ptr && (ptr[search_len] == 0))
1198 matched = 1;
1199 break;
1200 }
1201 if (matched)
1202 rec->flags |= flag;
1203 }
1204 pg = pg->next;
1205 }
1206 preempt_enable();
1207}
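/*
 * Userspace sketch of the single-'*' pattern handling in ftrace_match()
 * above: the position of the wildcard selects full, front, middle or
 * end matching.  The same four cases are reused; the buffer size and
 * the sample symbol names below are made up for the example.
 */
#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

static int glob_match(const char *pattern, const char *str)
{
	char buff[128];
	char *star, *search = NULL;
	int type = MATCH_FULL;

	strncpy(buff, pattern, sizeof(buff) - 1);
	buff[sizeof(buff) - 1] = '\0';

	star = strchr(buff, '*');
	if (star == buff) {			/* "*foo" or "*foo*" */
		search = buff + 1;
		type = MATCH_END_ONLY;
		star = strchr(search, '*');
		if (star) {			/* "*foo*" */
			*star = '\0';
			type = MATCH_MIDDLE_ONLY;
		}
	} else if (star) {			/* "foo*" */
		*star = '\0';
		type = MATCH_FRONT_ONLY;
	}

	switch (type) {
	case MATCH_FULL:
		return strcmp(str, buff) == 0;
	case MATCH_FRONT_ONLY:
		return strncmp(str, buff, strlen(buff)) == 0;
	case MATCH_MIDDLE_ONLY:
		return strstr(str, search) != NULL;
	case MATCH_END_ONLY: {
		const char *p = strstr(str, search);

		return p && p[strlen(search)] == '\0';
	}
	}
	return 0;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       glob_match("schedule", "schedule"),
	       glob_match("sched*", "sched_clock"),
	       glob_match("*lock*", "spin_lock_irqsave"),
	       glob_match("*unlock", "spin_unlock"));
	return 0;
}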
1208
1209static ssize_t
1210ftrace_regex_write(struct file *file, const char __user *ubuf,
1211 size_t cnt, loff_t *ppos, int enable)
1212{
1213 struct ftrace_iterator *iter;
1214 char ch;
1215 size_t read = 0;
1216 ssize_t ret;
1217
1218 if (!cnt || cnt < 0)
1219 return 0;
1220
1221 mutex_lock(&ftrace_regex_lock);
1222
1223 if (file->f_mode & FMODE_READ) {
1224 struct seq_file *m = file->private_data;
1225 iter = m->private;
1226 } else
1227 iter = file->private_data;
1228
1229 if (!*ppos) {
1230 iter->flags &= ~FTRACE_ITER_CONT;
1231 iter->buffer_idx = 0;
1232 }
1233
1234 ret = get_user(ch, ubuf++);
1235 if (ret)
1236 goto out;
1237 read++;
1238 cnt--;
1239
1240 if (!(iter->flags & ~FTRACE_ITER_CONT)) {
1241 /* skip white space */
1242 while (cnt && isspace(ch)) {
1243 ret = get_user(ch, ubuf++);
1244 if (ret)
1245 goto out;
1246 read++;
1247 cnt--;
1248 }
1249
1250 if (isspace(ch)) {
1251 file->f_pos += read;
1252 ret = read;
1253 goto out;
1254 }
1255
1256 iter->buffer_idx = 0;
1257 }
1258
1259 while (cnt && !isspace(ch)) {
1260 if (iter->buffer_idx < FTRACE_BUFF_MAX)
1261 iter->buffer[iter->buffer_idx++] = ch;
1262 else {
1263 ret = -EINVAL;
1264 goto out;
1265 }
1266 ret = get_user(ch, ubuf++);
1267 if (ret)
1268 goto out;
1269 read++;
1270 cnt--;
1271 }
1272
1273 if (isspace(ch)) {
1274 iter->filtered++;
1275 iter->buffer[iter->buffer_idx] = 0;
1276 ftrace_match(iter->buffer, iter->buffer_idx, enable);
1277 iter->buffer_idx = 0;
1278 } else
1279 iter->flags |= FTRACE_ITER_CONT;
1280
1281
1282 file->f_pos += read;
1283
1284 ret = read;
1285 out:
1286 mutex_unlock(&ftrace_regex_lock);
1287
1288 return ret;
1289}
1290
1291static ssize_t
1292ftrace_filter_write(struct file *file, const char __user *ubuf,
1293 size_t cnt, loff_t *ppos)
1294{
1295 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
1296}
1297
1298static ssize_t
1299ftrace_notrace_write(struct file *file, const char __user *ubuf,
1300 size_t cnt, loff_t *ppos)
1301{
1302 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
1303}
1304
1305static void
1306ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
1307{
1308 if (unlikely(ftrace_disabled))
1309 return;
1310
1311 mutex_lock(&ftrace_regex_lock);
1312 if (reset)
1313 ftrace_filter_reset(enable);
1314 if (buf)
1315 ftrace_match(buf, len, enable);
1316 mutex_unlock(&ftrace_regex_lock);
1317}
1318
1319/**
1320 * ftrace_set_filter - set a function to filter on in ftrace
1321 * @buf - the string that holds the function filter text.
1322 * @len - the length of the string.
1323 * @reset - non zero to reset all filters before applying this filter.
1324 *
1325 * Filters denote which functions should be enabled when tracing is enabled.
1326 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
1327 */
1328void ftrace_set_filter(unsigned char *buf, int len, int reset)
1329{
1330 ftrace_set_regex(buf, len, reset, 1);
1331}
1332
1333/**
1334 * ftrace_set_notrace - set a function to not trace in ftrace
1335 * @buf - the string that holds the function notrace text.
1336 * @len - the length of the string.
1337 * @reset - non zero to reset all filters before applying this filter.
1338 *
1339 * Notrace Filters denote which functions should not be enabled when tracing
1340 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
1341 * for tracing.
1342 */
1343void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1344{
1345 ftrace_set_regex(buf, len, reset, 0);
1346}
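/*
 * A hypothetical in-kernel caller of the two helpers above, e.g. from a
 * tracer's init path (not part of this patch).  The symbol names are
 * only examples; any mcount-recorded, compiled-in function would do.
 */
static void example_setup_filters(void)
{
	/* trace only schedule(), dropping any previously set filter */
	ftrace_set_filter((unsigned char *)"schedule", strlen("schedule"), 1);

	/* additionally trace everything that starts with "sched_" */
	ftrace_set_filter((unsigned char *)"sched_*", strlen("sched_*"), 0);

	/* but never trace sched_clock(), even though it matches above */
	ftrace_set_notrace((unsigned char *)"sched_clock", strlen("sched_clock"), 1);
}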
1347
1348static int
1349ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1350{
1351 struct seq_file *m = (struct seq_file *)file->private_data;
1352 struct ftrace_iterator *iter;
1353
1354 mutex_lock(&ftrace_regex_lock);
1355 if (file->f_mode & FMODE_READ) {
1356 iter = m->private;
1357
1358 seq_release(inode, file);
1359 } else
1360 iter = file->private_data;
1361
1362 if (iter->buffer_idx) {
1363 iter->filtered++;
1364 iter->buffer[iter->buffer_idx] = 0;
1365 ftrace_match(iter->buffer, iter->buffer_idx, enable);
1366 }
1367
1368 mutex_lock(&ftrace_sysctl_lock);
1369 mutex_lock(&ftraced_lock);
1370 if (iter->filtered && ftraced_suspend && ftrace_enabled)
1371 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1372 mutex_unlock(&ftraced_lock);
1373 mutex_unlock(&ftrace_sysctl_lock);
1374
1375 kfree(iter);
1376 mutex_unlock(&ftrace_regex_lock);
1377 return 0;
1378}
1379
1380static int
1381ftrace_filter_release(struct inode *inode, struct file *file)
1382{
1383 return ftrace_regex_release(inode, file, 1);
1384}
1385
1386static int
1387ftrace_notrace_release(struct inode *inode, struct file *file)
1388{
1389 return ftrace_regex_release(inode, file, 0);
1390}
1391
1392static ssize_t
1393ftraced_read(struct file *filp, char __user *ubuf,
1394 size_t cnt, loff_t *ppos)
1395{
1396 /* don't worry about races */
1397 char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
1398 int r = strlen(buf);
1399
1400 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1401}
1402
1403static ssize_t
1404ftraced_write(struct file *filp, const char __user *ubuf,
1405 size_t cnt, loff_t *ppos)
1406{
1407 char buf[64];
1408 long val;
1409 int ret;
1410
1411 if (cnt >= sizeof(buf))
1412 return -EINVAL;
1413
1414 if (copy_from_user(&buf, ubuf, cnt))
1415 return -EFAULT;
1416
1417 if (strncmp(buf, "enable", 6) == 0)
1418 val = 1;
1419 else if (strncmp(buf, "disable", 7) == 0)
1420 val = 0;
1421 else {
1422 buf[cnt] = 0;
1423
1424 ret = strict_strtoul(buf, 10, &val);
1425 if (ret < 0)
1426 return ret;
1427
1428 val = !!val;
1429 }
1430
1431 if (val)
1432 ftrace_enable_daemon();
1433 else
1434 ftrace_disable_daemon();
1435
1436 filp->f_pos += cnt;
1437
1438 return cnt;
1439}
1440
1441static struct file_operations ftrace_avail_fops = {
1442 .open = ftrace_avail_open,
1443 .read = seq_read,
1444 .llseek = seq_lseek,
1445 .release = ftrace_avail_release,
1446};
1447
1448static struct file_operations ftrace_failures_fops = {
1449 .open = ftrace_failures_open,
1450 .read = seq_read,
1451 .llseek = seq_lseek,
1452 .release = ftrace_avail_release,
1453};
1454
1455static struct file_operations ftrace_filter_fops = {
1456 .open = ftrace_filter_open,
1457 .read = ftrace_regex_read,
1458 .write = ftrace_filter_write,
1459 .llseek = ftrace_regex_lseek,
1460 .release = ftrace_filter_release,
1461};
1462
1463static struct file_operations ftrace_notrace_fops = {
1464 .open = ftrace_notrace_open,
1465 .read = ftrace_regex_read,
1466 .write = ftrace_notrace_write,
1467 .llseek = ftrace_regex_lseek,
1468 .release = ftrace_notrace_release,
1469};
1470
1471static struct file_operations ftraced_fops = {
1472 .open = tracing_open_generic,
1473 .read = ftraced_read,
1474 .write = ftraced_write,
1475};
1476
1477/**
1478 * ftrace_force_update - force an update to all recording ftrace functions
1479 */
1480int ftrace_force_update(void)
1481{
1482 int ret = 0;
1483
1484 if (unlikely(ftrace_disabled))
1485 return -ENODEV;
1486
1487 mutex_lock(&ftrace_sysctl_lock);
1488 mutex_lock(&ftraced_lock);
1489
1490 /*
1491 * If ftraced_trigger is not set, then there is nothing
1492 * to update.
1493 */
1494 if (ftraced_trigger && !ftrace_update_code())
1495 ret = -EBUSY;
1496
1497 mutex_unlock(&ftraced_lock);
1498 mutex_unlock(&ftrace_sysctl_lock);
1499
1500 return ret;
1501}
1502
1503static void ftrace_force_shutdown(void)
1504{
1505 struct task_struct *task;
1506 int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
1507
1508 mutex_lock(&ftraced_lock);
1509 task = ftraced_task;
1510 ftraced_task = NULL;
1511 ftraced_suspend = -1;
1512 ftrace_run_update_code(command);
1513 mutex_unlock(&ftraced_lock);
1514
1515 if (task)
1516 kthread_stop(task);
1517}
1518
1519static __init int ftrace_init_debugfs(void)
1520{
1521 struct dentry *d_tracer;
1522 struct dentry *entry;
1523
1524 d_tracer = tracing_init_dentry();
1525
1526 entry = debugfs_create_file("available_filter_functions", 0444,
1527 d_tracer, NULL, &ftrace_avail_fops);
1528 if (!entry)
1529 pr_warning("Could not create debugfs "
1530 "'available_filter_functions' entry\n");
1531
1532 entry = debugfs_create_file("failures", 0444,
1533 d_tracer, NULL, &ftrace_failures_fops);
1534 if (!entry)
1535 pr_warning("Could not create debugfs 'failures' entry\n");
1536
1537 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
1538 NULL, &ftrace_filter_fops);
1539 if (!entry)
1540 pr_warning("Could not create debugfs "
1541 "'set_ftrace_filter' entry\n");
1542
1543 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
1544 NULL, &ftrace_notrace_fops);
1545 if (!entry)
1546 pr_warning("Could not create debugfs "
1547 "'set_ftrace_notrace' entry\n");
1548
1549 entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
1550 NULL, &ftraced_fops);
1551 if (!entry)
1552 pr_warning("Could not create debugfs "
1553 "'ftraced_enabled' entry\n");
1554 return 0;
1555}
1556
1557fs_initcall(ftrace_init_debugfs);
1558
1559static int __init ftrace_dynamic_init(void)
1560{
1561 struct task_struct *p;
1562 unsigned long addr;
1563 int ret;
1564
1565 addr = (unsigned long)ftrace_record_ip;
1566
1567 stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS);
1568
1569 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) {
1571 ret = (int)addr;
1572 goto failed;
1573 }
1574
1575 ret = ftrace_dyn_table_alloc();
1576 if (ret)
1577 goto failed;
1578
1579 p = kthread_run(ftraced, NULL, "ftraced");
1580 if (IS_ERR(p)) {
1581 ret = -1;
1582 goto failed;
1583 }
1584
1585 last_ftrace_enabled = ftrace_enabled = 1;
1586 ftraced_task = p;
1587
1588 return 0;
1589
1590 failed:
1591 ftrace_disabled = 1;
1592 return ret;
1593}
1594
1595core_initcall(ftrace_dynamic_init);
1596#else
1597# define ftrace_startup() do { } while (0)
1598# define ftrace_shutdown() do { } while (0)
1599# define ftrace_startup_sysctl() do { } while (0)
1600# define ftrace_shutdown_sysctl() do { } while (0)
1601# define ftrace_force_shutdown() do { } while (0)
1602#endif /* CONFIG_DYNAMIC_FTRACE */
1603
1604/**
1605 * ftrace_kill_atomic - kill ftrace from critical sections
1606 *
1607 * This function should be used by panic code. It stops ftrace
1608 * but in a not so nice way. If you need to simply kill ftrace
1609 * from a non-atomic section, use ftrace_kill.
1610 */
1611void ftrace_kill_atomic(void)
1612{
1613 ftrace_disabled = 1;
1614 ftrace_enabled = 0;
1615#ifdef CONFIG_DYNAMIC_FTRACE
1616 ftraced_suspend = -1;
1617#endif
1618 clear_ftrace_function();
1619}
1620
1621/**
1622 * ftrace_kill - totally shutdown ftrace
1623 *
1624 * This is a safety measure. If something was detected that seems
1625 * wrong, calling this function will keep ftrace from doing
1626 * any more modifications or updates.
1627 * It is used when something has gone wrong.
1628 */
1629void ftrace_kill(void)
1630{
1631 mutex_lock(&ftrace_sysctl_lock);
1632 ftrace_disabled = 1;
1633 ftrace_enabled = 0;
1634
1635 clear_ftrace_function();
1636 mutex_unlock(&ftrace_sysctl_lock);
1637
1638 /* Try to totally disable ftrace */
1639 ftrace_force_shutdown();
1640}
1641
1642/**
1643 * register_ftrace_function - register a function for profiling
1644 * @ops - ops structure that holds the function for profiling.
1645 *
1646 * Register a function to be called by all functions in the
1647 * kernel.
1648 *
1649 * Note: @ops->func and all the functions it calls must be labeled
1650 * with "notrace", otherwise it will go into a
1651 * recursive loop.
1652 */
1653int register_ftrace_function(struct ftrace_ops *ops)
1654{
1655 int ret;
1656
1657 if (unlikely(ftrace_disabled))
1658 return -1;
1659
1660 mutex_lock(&ftrace_sysctl_lock);
1661 ret = __register_ftrace_function(ops);
1662 ftrace_startup();
1663 mutex_unlock(&ftrace_sysctl_lock);
1664
1665 return ret;
1666}
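/*
 * Minimal sketch of a register_ftrace_function() user, modelled on the
 * way the function tracer in trace.c hooks in: a notrace callback plus
 * a struct ftrace_ops.  The callback body and the hit counter are
 * illustrative only.
 */
static unsigned long example_hits;	/* racy, which is fine for a sketch */

static void notrace example_func(unsigned long ip, unsigned long parent_ip)
{
	/* only call notrace code from here, or the callback recurses */
	example_hits++;
}

static struct ftrace_ops example_ops __read_mostly = {
	.func = example_func,
};

/*
 * register_ftrace_function(&example_ops) starts the callbacks;
 * unregister_ftrace_function(&example_ops) stops them again.
 */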
1667
1668/**
1669 * unregister_ftrace_function - unregister a function for profiling.
1670 * @ops - ops structure that holds the function to unregister
1671 *
1672 * Unregister a function that was added to be called by ftrace profiling.
1673 */
1674int unregister_ftrace_function(struct ftrace_ops *ops)
1675{
1676 int ret;
1677
1678 mutex_lock(&ftrace_sysctl_lock);
1679 ret = __unregister_ftrace_function(ops);
1680 ftrace_shutdown();
1681 mutex_unlock(&ftrace_sysctl_lock);
1682
1683 return ret;
1684}
1685
1686int
1687ftrace_enable_sysctl(struct ctl_table *table, int write,
1688 struct file *file, void __user *buffer, size_t *lenp,
1689 loff_t *ppos)
1690{
1691 int ret;
1692
1693 if (unlikely(ftrace_disabled))
1694 return -ENODEV;
1695
1696 mutex_lock(&ftrace_sysctl_lock);
1697
1698 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1699
1700 if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
1701 goto out;
1702
1703 last_ftrace_enabled = ftrace_enabled;
1704
1705 if (ftrace_enabled) {
1706
1707 ftrace_startup_sysctl();
1708
1709 /* we are starting ftrace again */
1710 if (ftrace_list != &ftrace_list_end) {
1711 if (ftrace_list->next == &ftrace_list_end)
1712 ftrace_trace_function = ftrace_list->func;
1713 else
1714 ftrace_trace_function = ftrace_list_func;
1715 }
1716
1717 } else {
1718 /* stopping ftrace calls (just send to ftrace_stub) */
1719 ftrace_trace_function = ftrace_stub;
1720
1721 ftrace_shutdown_sysctl();
1722 }
1723
1724 out:
1725 mutex_unlock(&ftrace_sysctl_lock);
1726 return ret;
1727}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..868e121c8e38
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3161 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally taken from the RT patch by:
8 * Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III
13 */
14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h>
16#include <linux/seq_file.h>
17#include <linux/debugfs.h>
18#include <linux/pagemap.h>
19#include <linux/hardirq.h>
20#include <linux/linkage.h>
21#include <linux/uaccess.h>
22#include <linux/ftrace.h>
23#include <linux/module.h>
24#include <linux/percpu.h>
25#include <linux/ctype.h>
26#include <linux/init.h>
27#include <linux/poll.h>
28#include <linux/gfp.h>
29#include <linux/fs.h>
30#include <linux/kprobes.h>
31#include <linux/writeback.h>
32
33#include <linux/stacktrace.h>
34
35#include "trace.h"
36
37unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
38unsigned long __read_mostly tracing_thresh;
39
40static unsigned long __read_mostly tracing_nr_buffers;
41static cpumask_t __read_mostly tracing_buffer_mask;
42
43#define for_each_tracing_cpu(cpu) \
44 for_each_cpu_mask(cpu, tracing_buffer_mask)
45
46static int trace_alloc_page(void);
47static int trace_free_page(void);
48
49static int tracing_disabled = 1;
50
51static unsigned long tracing_pages_allocated;
52
53long
54ns2usecs(cycle_t nsec)
55{
56 nsec += 500;
57 do_div(nsec, 1000);
58 return nsec;
59}
60
61cycle_t ftrace_now(int cpu)
62{
63 return cpu_clock(cpu);
64}
65
66/*
67 * The global_trace is the descriptor that holds the tracing
68 * buffers for the live tracing. For each CPU, it contains
69 * a linked list of pages that will store trace entries. The
70 * list is built from the page descriptors themselves: the
71 * lru member of each page descriptor links together the pages
72 * that make up that CPU's buffer.
73 *
74 * For each active CPU there is a data field that holds the
75 * pages for the buffer for that CPU. Each CPU has the same number
76 * of pages allocated for its buffer.
77 */
78static struct trace_array global_trace;
79
80static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
81
82/*
83 * The max_tr is used to snapshot the global_trace when a maximum
84 * latency is reached. Some tracers will use this to store a maximum
85 * trace while it continues examining live traces.
86 *
87 * The buffers for the max_tr are set up the same as the global_trace.
88 * When a snapshot is taken, the link list of the max_tr is swapped
89 * with the link list of the global_trace and the buffers are reset for
90 * the global_trace so the tracing can continue.
91 */
92static struct trace_array max_tr;
93
94static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
95
96/* tracer_enabled is used to toggle activation of a tracer */
97static int tracer_enabled = 1;
98
99/* function tracing enabled */
100int ftrace_function_enabled;
101
102/*
103 * trace_nr_entries is the number of entries that is allocated
104 * for a buffer. Note, the number of entries is always rounded
105 * up to a multiple of ENTRIES_PER_PAGE.
106 */
107static unsigned long trace_nr_entries = 65536UL;
108
109/* trace_types holds a link list of available tracers. */
110static struct tracer *trace_types __read_mostly;
111
112/* current_trace points to the tracer that is currently active */
113static struct tracer *current_trace __read_mostly;
114
115/*
116 * max_tracer_type_len is used to simplify the allocation of
117 * buffers to read userspace tracer names. We keep track of
118 * the longest tracer name registered.
119 */
120static int max_tracer_type_len;
121
122/*
123 * trace_types_lock is used to protect the trace_types list.
124 * This lock is also used to keep user access serialized.
125 * Accesses from userspace will grab this lock while userspace
126 * activities happen inside the kernel.
127 */
128static DEFINE_MUTEX(trace_types_lock);
129
130/* trace_wait is a waitqueue for tasks blocked on trace_poll */
131static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
132
133/* trace_flags holds iter_ctrl options */
134unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
135
136static notrace void no_trace_init(struct trace_array *tr)
137{
138 int cpu;
139
140 ftrace_function_enabled = 0;
141 if(tr->ctrl)
142 for_each_online_cpu(cpu)
143 tracing_reset(tr->data[cpu]);
144 tracer_enabled = 0;
145}
146
147/* dummy trace to disable tracing */
148static struct tracer no_tracer __read_mostly = {
149 .name = "none",
150 .init = no_trace_init
151};
152
153
154/**
155 * trace_wake_up - wake up tasks waiting for trace input
156 *
157 * Simply wakes up any task that is blocked on the trace_wait
158 * queue. This is used with trace_poll for tasks polling the trace.
159 */
160void trace_wake_up(void)
161{
162 /*
163 * The runqueue_is_locked() can fail, but this is the best we
164 * have for now:
165 */
166 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
167 wake_up(&trace_wait);
168}
169
170#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
171
172static int __init set_nr_entries(char *str)
173{
174 unsigned long nr_entries;
175 int ret;
176
177 if (!str)
178 return 0;
179 ret = strict_strtoul(str, 0, &nr_entries);
180 /* nr_entries can not be zero */
181 if (ret < 0 || nr_entries == 0)
182 return 0;
183 trace_nr_entries = nr_entries;
184 return 1;
185}
186__setup("trace_entries=", set_nr_entries);
187
188unsigned long nsecs_to_usecs(unsigned long nsecs)
189{
190 return nsecs / 1000;
191}
192
193/*
194 * trace_flag_type is an enumeration that holds different
195 * states when a trace occurs. These are:
196 * IRQS_OFF - interrupts were disabled
197 * NEED_RESCHED - reschedule is requested
198 * HARDIRQ - inside an interrupt handler
199 * SOFTIRQ - inside a softirq handler
200 */
201enum trace_flag_type {
202 TRACE_FLAG_IRQS_OFF = 0x01,
203 TRACE_FLAG_NEED_RESCHED = 0x02,
204 TRACE_FLAG_HARDIRQ = 0x04,
205 TRACE_FLAG_SOFTIRQ = 0x08,
206};
207
208/*
209 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
210 * control the output of kernel symbols.
211 */
212#define TRACE_ITER_SYM_MASK \
213 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
214
215/* These must match the bit positions in trace_iterator_flags */
216static const char *trace_options[] = {
217 "print-parent",
218 "sym-offset",
219 "sym-addr",
220 "verbose",
221 "raw",
222 "hex",
223 "bin",
224 "block",
225 "stacktrace",
226 "sched-tree",
227 NULL
228};
229
230/*
231 * ftrace_max_lock is used to protect the swapping of buffers
232 * when taking a max snapshot. The buffers themselves are
233 * protected by per_cpu spinlocks. But the action of the swap
234 * needs its own lock.
235 *
236 * This is defined as a raw_spinlock_t in order to help
237 * with performance when lockdep debugging is enabled.
238 */
239static raw_spinlock_t ftrace_max_lock =
240 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
241
242/*
243 * Copy the new maximum trace into the separate maximum-trace
244 * structure. (this way the maximum trace is permanently saved,
245 * for later retrieval via /debugfs/tracing/latency_trace)
246 */
247static void
248__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
249{
250 struct trace_array_cpu *data = tr->data[cpu];
251
252 max_tr.cpu = cpu;
253 max_tr.time_start = data->preempt_timestamp;
254
255 data = max_tr.data[cpu];
256 data->saved_latency = tracing_max_latency;
257
258 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
259 data->pid = tsk->pid;
260 data->uid = tsk->uid;
261 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
262 data->policy = tsk->policy;
263 data->rt_priority = tsk->rt_priority;
264
265	/* record this task's comm */
266 tracing_record_cmdline(current);
267}
268
269#define CHECK_COND(cond) \
270 if (unlikely(cond)) { \
271 tracing_disabled = 1; \
272 WARN_ON(1); \
273 return -1; \
274 }
275
276/**
277 * check_pages - integrity check of trace buffers
278 *
279 * As a safety measure we check to make sure the data pages have not
280 * been corrupted.
281 */
282int check_pages(struct trace_array_cpu *data)
283{
284 struct page *page, *tmp;
285
286 CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
287 CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
288
289 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
290 CHECK_COND(page->lru.next->prev != &page->lru);
291 CHECK_COND(page->lru.prev->next != &page->lru);
292 }
293
294 return 0;
295}
296
297/**
298 * head_page - page address of the first page in per_cpu buffer.
299 *
300 * head_page returns the page address of the first page in
301 * a per_cpu buffer. This also performs various consistency
302 * checks to make sure the buffer has not been corrupted.
303 */
304void *head_page(struct trace_array_cpu *data)
305{
306 struct page *page;
307
308 if (list_empty(&data->trace_pages))
309 return NULL;
310
311 page = list_entry(data->trace_pages.next, struct page, lru);
312 BUG_ON(&page->lru == &data->trace_pages);
313
314 return page_address(page);
315}
316
317/**
318 * trace_seq_printf - sequence printing of trace information
319 * @s: trace sequence descriptor
320 * @fmt: printf format string
321 *
322 * The tracer may use either sequence operations or its own
323 * copy to user routines. To simplify formatting of a trace
324 * trace_seq_printf is used to store strings into a special
325 * buffer (@s). Then the output may be either used by
326 * the sequencer or pulled into another buffer.
327 */
328int
329trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
330{
331 int len = (PAGE_SIZE - 1) - s->len;
332 va_list ap;
333 int ret;
334
335 if (!len)
336 return 0;
337
338 va_start(ap, fmt);
339 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
340 va_end(ap);
341
342 /* If we can't write it all, don't bother writing anything */
343 if (ret >= len)
344 return 0;
345
346 s->len += ret;
347
348 return len;
349}
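/*
 * Userspace sketch of the trace_seq idea documented above: formatted
 * output accumulates in a fixed buffer, and a write that would not fit
 * completely is dropped rather than truncated.  The 4096-byte size
 * stands in for PAGE_SIZE; the return convention is simplified.
 */
#include <stdarg.h>
#include <stdio.h>

#define SEQ_SIZE 4096

struct seq_buf {
	char buffer[SEQ_SIZE];
	int len;
};

static int seq_buf_printf(struct seq_buf *s, const char *fmt, ...)
{
	int room = (SEQ_SIZE - 1) - s->len;
	va_list ap;
	int ret;

	if (!room)
		return 0;

	va_start(ap, fmt);
	ret = vsnprintf(s->buffer + s->len, room, fmt, ap);
	va_end(ap);

	/* if it cannot be written in full, do not write anything */
	if (ret >= room)
		return 0;

	s->len += ret;
	return ret;
}

int main(void)
{
	struct seq_buf s = { .len = 0 };

	seq_buf_printf(&s, "%-12s %d\n", "entries:", 42);
	seq_buf_printf(&s, "%-12s %s\n", "tracer:", "function");
	fputs(s.buffer, stdout);
	return 0;
}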
350
351/**
352 * trace_seq_puts - trace sequence printing of simple string
353 * @s: trace sequence descriptor
354 * @str: simple string to record
355 *
356 * The tracer may use either the sequence operations or its own
357 * copy to user routines. This function records a simple string
358 * into a special buffer (@s) for later retrieval by a sequencer
359 * or other mechanism.
360 */
361static int
362trace_seq_puts(struct trace_seq *s, const char *str)
363{
364 int len = strlen(str);
365
366 if (len > ((PAGE_SIZE - 1) - s->len))
367 return 0;
368
369 memcpy(s->buffer + s->len, str, len);
370 s->len += len;
371
372 return len;
373}
374
375static int
376trace_seq_putc(struct trace_seq *s, unsigned char c)
377{
378 if (s->len >= (PAGE_SIZE - 1))
379 return 0;
380
381 s->buffer[s->len++] = c;
382
383 return 1;
384}
385
386static int
387trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
388{
389 if (len > ((PAGE_SIZE - 1) - s->len))
390 return 0;
391
392 memcpy(s->buffer + s->len, mem, len);
393 s->len += len;
394
395 return len;
396}
397
398#define HEX_CHARS 17
399static const char hex2asc[] = "0123456789abcdef";
400
401static int
402trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
403{
404 unsigned char hex[HEX_CHARS];
405 unsigned char *data = mem;
406 unsigned char byte;
407 int i, j;
408
409 BUG_ON(len >= HEX_CHARS);
410
411#ifdef __BIG_ENDIAN
412 for (i = 0, j = 0; i < len; i++) {
413#else
414 for (i = len-1, j = 0; i >= 0; i--) {
415#endif
416 byte = data[i];
417
418 hex[j++] = hex2asc[byte & 0x0f];
419 hex[j++] = hex2asc[byte >> 4];
420 }
421 hex[j++] = ' ';
422
423 return trace_seq_putmem(s, hex, j);
424}
425
426static void
427trace_seq_reset(struct trace_seq *s)
428{
429 s->len = 0;
430 s->readpos = 0;
431}
432
433ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
434{
435 int len;
436 int ret;
437
438 if (s->len <= s->readpos)
439 return -EBUSY;
440
441 len = s->len - s->readpos;
442 if (cnt > len)
443 cnt = len;
444 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
445 if (ret)
446 return -EFAULT;
447
448 s->readpos += len;
449 return cnt;
450}
451
452static void
453trace_print_seq(struct seq_file *m, struct trace_seq *s)
454{
455 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
456
457 s->buffer[len] = 0;
458 seq_puts(m, s->buffer);
459
460 trace_seq_reset(s);
461}
462
463/*
464 * flip the trace buffers between two trace descriptors.
465 * These are usually the buffers of the global_trace and
466 * the max_tr, used to record a snapshot of the current trace.
467 *
468 * The ftrace_max_lock must be held.
469 */
470static void
471flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
472{
473 struct list_head flip_pages;
474
475 INIT_LIST_HEAD(&flip_pages);
476
477 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
478 sizeof(struct trace_array_cpu) -
479 offsetof(struct trace_array_cpu, trace_head_idx));
480
481 check_pages(tr1);
482 check_pages(tr2);
483 list_splice_init(&tr1->trace_pages, &flip_pages);
484 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
485 list_splice_init(&flip_pages, &tr2->trace_pages);
486 BUG_ON(!list_empty(&flip_pages));
487 check_pages(tr1);
488 check_pages(tr2);
489}
490
491/**
492 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
493 * @tr: tracer
494 * @tsk: the task with the latency
495 * @cpu: The cpu that initiated the trace.
496 *
497 * Flip the buffers between the @tr and the max_tr and record information
498 * about which task was the cause of this latency.
499 */
500void
501update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
502{
503 struct trace_array_cpu *data;
504 int i;
505
506 WARN_ON_ONCE(!irqs_disabled());
507 __raw_spin_lock(&ftrace_max_lock);
508 /* clear out all the previous traces */
509 for_each_tracing_cpu(i) {
510 data = tr->data[i];
511 flip_trace(max_tr.data[i], data);
512 tracing_reset(data);
513 }
514
515 __update_max_tr(tr, tsk, cpu);
516 __raw_spin_unlock(&ftrace_max_lock);
517}
518
519/**
520 * update_max_tr_single - only copy one trace over, and reset the rest
521 * @tr - tracer
522 * @tsk - task with the latency
523 * @cpu - the cpu of the buffer to copy.
524 *
525 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
526 */
527void
528update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
529{
530 struct trace_array_cpu *data = tr->data[cpu];
531 int i;
532
533 WARN_ON_ONCE(!irqs_disabled());
534 __raw_spin_lock(&ftrace_max_lock);
535 for_each_tracing_cpu(i)
536 tracing_reset(max_tr.data[i]);
537
538 flip_trace(max_tr.data[cpu], data);
539 tracing_reset(data);
540
541 __update_max_tr(tr, tsk, cpu);
542 __raw_spin_unlock(&ftrace_max_lock);
543}
544
545/**
546 * register_tracer - register a tracer with the ftrace system.
547 * @type - the plugin for the tracer
548 *
549 * Register a new plugin tracer.
550 */
551int register_tracer(struct tracer *type)
552{
553 struct tracer *t;
554 int len;
555 int ret = 0;
556
557 if (!type->name) {
558 pr_info("Tracer must have a name\n");
559 return -1;
560 }
561
562 mutex_lock(&trace_types_lock);
563 for (t = trace_types; t; t = t->next) {
564 if (strcmp(type->name, t->name) == 0) {
565 /* already found */
566 pr_info("Trace %s already registered\n",
567 type->name);
568 ret = -1;
569 goto out;
570 }
571 }
572
573#ifdef CONFIG_FTRACE_STARTUP_TEST
574 if (type->selftest) {
575 struct tracer *saved_tracer = current_trace;
576 struct trace_array_cpu *data;
577 struct trace_array *tr = &global_trace;
578 int saved_ctrl = tr->ctrl;
579 int i;
580 /*
581 * Run a selftest on this tracer.
582 * Here we reset the trace buffer, and set the current
583 * tracer to be this tracer. The tracer can then run some
584 * internal tracing to verify that everything is in order.
585 * If we fail, we do not register this tracer.
586 */
587 for_each_tracing_cpu(i) {
588 data = tr->data[i];
589 if (!head_page(data))
590 continue;
591 tracing_reset(data);
592 }
593 current_trace = type;
594 tr->ctrl = 0;
595 /* the test is responsible for initializing and enabling */
596 pr_info("Testing tracer %s: ", type->name);
597 ret = type->selftest(type, tr);
598 /* the test is responsible for resetting too */
599 current_trace = saved_tracer;
600 tr->ctrl = saved_ctrl;
601 if (ret) {
602 printk(KERN_CONT "FAILED!\n");
603 goto out;
604 }
605 /* Only reset on passing, to avoid touching corrupted buffers */
606 for_each_tracing_cpu(i) {
607 data = tr->data[i];
608 if (!head_page(data))
609 continue;
610 tracing_reset(data);
611 }
612 printk(KERN_CONT "PASSED\n");
613 }
614#endif
615
616 type->next = trace_types;
617 trace_types = type;
618 len = strlen(type->name);
619 if (len > max_tracer_type_len)
620 max_tracer_type_len = len;
621
622 out:
623 mutex_unlock(&trace_types_lock);
624
625 return ret;
626}
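/*
 * Sketch of a hypothetical plugin registering itself with the helper
 * above.  Only fields demonstrated elsewhere in this file (.name and
 * .init) are filled in; a real tracer would also supply reset,
 * ctrl_update and (optionally) selftest hooks.
 */
static void example_trace_init(struct trace_array *tr)
{
	int cpu;

	if (tr->ctrl)
		for_each_online_cpu(cpu)
			tracing_reset(tr->data[cpu]);
}

static struct tracer example_tracer __read_mostly = {
	.name	= "example",
	.init	= example_trace_init,
};

/* an initcall would then do: register_tracer(&example_tracer); */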
627
628void unregister_tracer(struct tracer *type)
629{
630 struct tracer **t;
631 int len;
632
633 mutex_lock(&trace_types_lock);
634 for (t = &trace_types; *t; t = &(*t)->next) {
635 if (*t == type)
636 goto found;
637 }
638 pr_info("Trace %s not registered\n", type->name);
639 goto out;
640
641 found:
642 *t = (*t)->next;
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock);
654}
655
656void tracing_reset(struct trace_array_cpu *data)
657{
658 data->trace_idx = 0;
659 data->overrun = 0;
660 data->trace_head = data->trace_tail = head_page(data);
661 data->trace_head_idx = 0;
662 data->trace_tail_idx = 0;
663}
664
665#define SAVED_CMDLINES 128
666static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
667static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
668static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
669static int cmdline_idx;
670static DEFINE_SPINLOCK(trace_cmdline_lock);
671
672/* temporarily disable recording */
673atomic_t trace_record_cmdline_disabled __read_mostly;
674
675static void trace_init_cmdlines(void)
676{
677 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
678 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
679 cmdline_idx = 0;
680}
681
682void trace_stop_cmdline_recording(void);
683
684static void trace_save_cmdline(struct task_struct *tsk)
685{
686 unsigned map;
687 unsigned idx;
688
689 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
690 return;
691
692 /*
693 * It's not the end of the world if we don't get
694 * the lock, but we also don't want to spin
695 * nor do we want to disable interrupts,
696 * so if we miss here, then better luck next time.
697 */
698 if (!spin_trylock(&trace_cmdline_lock))
699 return;
700
701 idx = map_pid_to_cmdline[tsk->pid];
702 if (idx >= SAVED_CMDLINES) {
703 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
704
705 map = map_cmdline_to_pid[idx];
706 if (map <= PID_MAX_DEFAULT)
707 map_pid_to_cmdline[map] = (unsigned)-1;
708
709 map_pid_to_cmdline[tsk->pid] = idx;
710
711 cmdline_idx = idx;
712 }
713
714 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
715
716 spin_unlock(&trace_cmdline_lock);
717}
718
719static char *trace_find_cmdline(int pid)
720{
721 char *cmdline = "<...>";
722 unsigned map;
723
724 if (!pid)
725 return "<idle>";
726
727 if (pid > PID_MAX_DEFAULT)
728 goto out;
729
730 map = map_pid_to_cmdline[pid];
731 if (map >= SAVED_CMDLINES)
732 goto out;
733
734 cmdline = saved_cmdlines[map];
735
736 out:
737 return cmdline;
738}
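/*
 * Userspace sketch of the bounded pid -> comm cache implemented by
 * trace_save_cmdline()/trace_find_cmdline() above: a small ring of
 * saved names plus a pid-indexed table pointing into it, so recently
 * seen tasks keep their names while evicted pids fall back to "<...>".
 * The table sizes are scaled down for the example.
 */
#include <stdio.h>
#include <string.h>

#define MAX_PID		64
#define SAVED		4
#define COMM_LEN	16

static int pid_to_slot[MAX_PID + 1];
static int slot_to_pid[SAVED];
static char saved_comm[SAVED][COMM_LEN];
static int next_slot;

static void save_comm(int pid, const char *comm)
{
	int idx = pid_to_slot[pid];

	if (idx < 0) {				/* not cached: take the next slot */
		idx = next_slot;
		next_slot = (next_slot + 1) % SAVED;
		if (slot_to_pid[idx] >= 0)	/* evict the slot's previous owner */
			pid_to_slot[slot_to_pid[idx]] = -1;
		pid_to_slot[pid] = idx;
		slot_to_pid[idx] = pid;
	}
	snprintf(saved_comm[idx], COMM_LEN, "%s", comm);
}

static const char *find_comm(int pid)
{
	int idx = pid_to_slot[pid];

	return idx < 0 ? "<...>" : saved_comm[idx];
}

int main(void)
{
	memset(pid_to_slot, -1, sizeof(pid_to_slot));
	memset(slot_to_pid, -1, sizeof(slot_to_pid));

	save_comm(10, "bash");
	save_comm(11, "make");
	printf("%s %s %s\n", find_comm(10), find_comm(11), find_comm(12));
	return 0;
}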
739
740void tracing_record_cmdline(struct task_struct *tsk)
741{
742 if (atomic_read(&trace_record_cmdline_disabled))
743 return;
744
745 trace_save_cmdline(tsk);
746}
747
748static inline struct list_head *
749trace_next_list(struct trace_array_cpu *data, struct list_head *next)
750{
751 /*
752	 * Round-robin - but skip the head (which is not a real page):
753 */
754 next = next->next;
755 if (unlikely(next == &data->trace_pages))
756 next = next->next;
757 BUG_ON(next == &data->trace_pages);
758
759 return next;
760}
761
762static inline void *
763trace_next_page(struct trace_array_cpu *data, void *addr)
764{
765 struct list_head *next;
766 struct page *page;
767
768 page = virt_to_page(addr);
769
770 next = trace_next_list(data, &page->lru);
771 page = list_entry(next, struct page, lru);
772
773 return page_address(page);
774}
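/*
 * Userspace sketch of the "round-robin, but skip the head" walk that
 * trace_next_list()/trace_next_page() perform above: the buffer pages
 * sit on a circular list whose head node is only a sentinel, so
 * stepping past the last page lands on the first page again rather
 * than on the sentinel.
 */
#include <stdio.h>

struct pnode { int page_no; struct pnode *next; };

static struct pnode head = { -1, NULL };	/* sentinel, not a real page */

static struct pnode *next_page(struct pnode *n)
{
	n = n->next;
	if (n == &head)				/* skip the sentinel */
		n = n->next;
	return n;
}

int main(void)
{
	struct pnode p0 = { 0 }, p1 = { 1 }, p2 = { 2 };
	struct pnode *n;
	int i;

	/* head -> p0 -> p1 -> p2 -> head (circular) */
	head.next = &p0;
	p0.next = &p1;
	p1.next = &p2;
	p2.next = &head;

	for (n = &p0, i = 0; i < 5; i++, n = next_page(n))
		printf("page %d\n", n->page_no);
	return 0;
}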
775
776static inline struct trace_entry *
777tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
778{
779 unsigned long idx, idx_next;
780 struct trace_entry *entry;
781
782 data->trace_idx++;
783 idx = data->trace_head_idx;
784 idx_next = idx + 1;
785
786 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
787
788 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
789
790 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
791 data->trace_head = trace_next_page(data, data->trace_head);
792 idx_next = 0;
793 }
794
795 if (data->trace_head == data->trace_tail &&
796 idx_next == data->trace_tail_idx) {
797 /* overrun */
798 data->overrun++;
799 data->trace_tail_idx++;
800 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
801 data->trace_tail =
802 trace_next_page(data, data->trace_tail);
803 data->trace_tail_idx = 0;
804 }
805 }
806
807 data->trace_head_idx = idx_next;
808
809 return entry;
810}
811
812static inline void
813tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
814{
815 struct task_struct *tsk = current;
816 unsigned long pc;
817
818 pc = preempt_count();
819
820 entry->preempt_count = pc & 0xff;
821 entry->pid = (tsk) ? tsk->pid : 0;
822 entry->t = ftrace_now(raw_smp_processor_id());
823 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
824 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
825 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
826 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
827}
828
829void
830trace_function(struct trace_array *tr, struct trace_array_cpu *data,
831 unsigned long ip, unsigned long parent_ip, unsigned long flags)
832{
833 struct trace_entry *entry;
834 unsigned long irq_flags;
835
836 raw_local_irq_save(irq_flags);
837 __raw_spin_lock(&data->lock);
838 entry = tracing_get_trace_entry(tr, data);
839 tracing_generic_entry_update(entry, flags);
840 entry->type = TRACE_FN;
841 entry->fn.ip = ip;
842 entry->fn.parent_ip = parent_ip;
843 __raw_spin_unlock(&data->lock);
844 raw_local_irq_restore(irq_flags);
845}
846
847void
848ftrace(struct trace_array *tr, struct trace_array_cpu *data,
849 unsigned long ip, unsigned long parent_ip, unsigned long flags)
850{
851 if (likely(!atomic_read(&data->disabled)))
852 trace_function(tr, data, ip, parent_ip, flags);
853}
854
855#ifdef CONFIG_MMIOTRACE
856void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
857 struct mmiotrace_rw *rw)
858{
859 struct trace_entry *entry;
860 unsigned long irq_flags;
861
862 raw_local_irq_save(irq_flags);
863 __raw_spin_lock(&data->lock);
864
865 entry = tracing_get_trace_entry(tr, data);
866 tracing_generic_entry_update(entry, 0);
867 entry->type = TRACE_MMIO_RW;
868 entry->mmiorw = *rw;
869
870 __raw_spin_unlock(&data->lock);
871 raw_local_irq_restore(irq_flags);
872
873 trace_wake_up();
874}
875
876void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
877 struct mmiotrace_map *map)
878{
879 struct trace_entry *entry;
880 unsigned long irq_flags;
881
882 raw_local_irq_save(irq_flags);
883 __raw_spin_lock(&data->lock);
884
885 entry = tracing_get_trace_entry(tr, data);
886 tracing_generic_entry_update(entry, 0);
887 entry->type = TRACE_MMIO_MAP;
888 entry->mmiomap = *map;
889
890 __raw_spin_unlock(&data->lock);
891 raw_local_irq_restore(irq_flags);
892
893 trace_wake_up();
894}
895#endif
896
897void __trace_stack(struct trace_array *tr,
898 struct trace_array_cpu *data,
899 unsigned long flags,
900 int skip)
901{
902 struct trace_entry *entry;
903 struct stack_trace trace;
904
905 if (!(trace_flags & TRACE_ITER_STACKTRACE))
906 return;
907
908 entry = tracing_get_trace_entry(tr, data);
909 tracing_generic_entry_update(entry, flags);
910 entry->type = TRACE_STACK;
911
912 memset(&entry->stack, 0, sizeof(entry->stack));
913
914 trace.nr_entries = 0;
915 trace.max_entries = FTRACE_STACK_ENTRIES;
916 trace.skip = skip;
917 trace.entries = entry->stack.caller;
918
919 save_stack_trace(&trace);
920}
921
922void
923__trace_special(void *__tr, void *__data,
924 unsigned long arg1, unsigned long arg2, unsigned long arg3)
925{
926 struct trace_array_cpu *data = __data;
927 struct trace_array *tr = __tr;
928 struct trace_entry *entry;
929 unsigned long irq_flags;
930
931 raw_local_irq_save(irq_flags);
932 __raw_spin_lock(&data->lock);
933 entry = tracing_get_trace_entry(tr, data);
934 tracing_generic_entry_update(entry, 0);
935 entry->type = TRACE_SPECIAL;
936 entry->special.arg1 = arg1;
937 entry->special.arg2 = arg2;
938 entry->special.arg3 = arg3;
939 __trace_stack(tr, data, irq_flags, 4);
940 __raw_spin_unlock(&data->lock);
941 raw_local_irq_restore(irq_flags);
942
943 trace_wake_up();
944}
945
946void
947tracing_sched_switch_trace(struct trace_array *tr,
948 struct trace_array_cpu *data,
949 struct task_struct *prev,
950 struct task_struct *next,
951 unsigned long flags)
952{
953 struct trace_entry *entry;
954 unsigned long irq_flags;
955
956 raw_local_irq_save(irq_flags);
957 __raw_spin_lock(&data->lock);
958 entry = tracing_get_trace_entry(tr, data);
959 tracing_generic_entry_update(entry, flags);
960 entry->type = TRACE_CTX;
961 entry->ctx.prev_pid = prev->pid;
962 entry->ctx.prev_prio = prev->prio;
963 entry->ctx.prev_state = prev->state;
964 entry->ctx.next_pid = next->pid;
965 entry->ctx.next_prio = next->prio;
966 entry->ctx.next_state = next->state;
967 __trace_stack(tr, data, flags, 5);
968 __raw_spin_unlock(&data->lock);
969 raw_local_irq_restore(irq_flags);
970}
971
972void
973tracing_sched_wakeup_trace(struct trace_array *tr,
974 struct trace_array_cpu *data,
975 struct task_struct *wakee,
976 struct task_struct *curr,
977 unsigned long flags)
978{
979 struct trace_entry *entry;
980 unsigned long irq_flags;
981
982 raw_local_irq_save(irq_flags);
983 __raw_spin_lock(&data->lock);
984 entry = tracing_get_trace_entry(tr, data);
985 tracing_generic_entry_update(entry, flags);
986 entry->type = TRACE_WAKE;
987 entry->ctx.prev_pid = curr->pid;
988 entry->ctx.prev_prio = curr->prio;
989 entry->ctx.prev_state = curr->state;
990 entry->ctx.next_pid = wakee->pid;
991 entry->ctx.next_prio = wakee->prio;
992 entry->ctx.next_state = wakee->state;
993 __trace_stack(tr, data, flags, 6);
994 __raw_spin_unlock(&data->lock);
995 raw_local_irq_restore(irq_flags);
996
997 trace_wake_up();
998}
999
1000void
1001ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1002{
1003 struct trace_array *tr = &global_trace;
1004 struct trace_array_cpu *data;
1005 unsigned long flags;
1006 long disabled;
1007 int cpu;
1008
1009 if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
1010 return;
1011
1012 local_irq_save(flags);
1013 cpu = raw_smp_processor_id();
1014 data = tr->data[cpu];
1015 disabled = atomic_inc_return(&data->disabled);
1016
1017 if (likely(disabled == 1))
1018 __trace_special(tr, data, arg1, arg2, arg3);
1019
1020 atomic_dec(&data->disabled);
1021 local_irq_restore(flags);
1022}
1023
1024#ifdef CONFIG_FTRACE
1025static void
1026function_trace_call(unsigned long ip, unsigned long parent_ip)
1027{
1028 struct trace_array *tr = &global_trace;
1029 struct trace_array_cpu *data;
1030 unsigned long flags;
1031 long disabled;
1032 int cpu;
1033
1034 if (unlikely(!ftrace_function_enabled))
1035 return;
1036
1037 if (skip_trace(ip))
1038 return;
1039
1040 local_irq_save(flags);
1041 cpu = raw_smp_processor_id();
1042 data = tr->data[cpu];
1043 disabled = atomic_inc_return(&data->disabled);
1044
1045 if (likely(disabled == 1))
1046 trace_function(tr, data, ip, parent_ip, flags);
1047
1048 atomic_dec(&data->disabled);
1049 local_irq_restore(flags);
1050}
1051
1052static struct ftrace_ops trace_ops __read_mostly =
1053{
1054 .func = function_trace_call,
1055};
1056
1057void tracing_start_function_trace(void)
1058{
1059	ftrace_function_enabled = 0;	/* keep function tracing off while registering */
1060 register_ftrace_function(&trace_ops);
1061 if (tracer_enabled)
1062 ftrace_function_enabled = 1;
1063}
1064
1065void tracing_stop_function_trace(void)
1066{
1067 ftrace_function_enabled = 0;
1068 unregister_ftrace_function(&trace_ops);
1069}
1070#endif
1071
1072enum trace_file_type {
1073 TRACE_FILE_LAT_FMT = 1,
1074};
1075
1076static struct trace_entry *
1077trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1078 struct trace_iterator *iter, int cpu)
1079{
1080 struct page *page;
1081 struct trace_entry *array;
1082
1083 if (iter->next_idx[cpu] >= tr->entries ||
1084 iter->next_idx[cpu] >= data->trace_idx ||
1085 (data->trace_head == data->trace_tail &&
1086 data->trace_head_idx == data->trace_tail_idx))
1087 return NULL;
1088
1089 if (!iter->next_page[cpu]) {
1090 /* Initialize the iterator for this cpu trace buffer */
1091 WARN_ON(!data->trace_tail);
1092 page = virt_to_page(data->trace_tail);
1093 iter->next_page[cpu] = &page->lru;
1094 iter->next_page_idx[cpu] = data->trace_tail_idx;
1095 }
1096
1097 page = list_entry(iter->next_page[cpu], struct page, lru);
1098 BUG_ON(&data->trace_pages == &page->lru);
1099
1100 array = page_address(page);
1101
1102 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
1103 return &array[iter->next_page_idx[cpu]];
1104}
1105
1106static struct trace_entry *
1107find_next_entry(struct trace_iterator *iter, int *ent_cpu)
1108{
1109 struct trace_array *tr = iter->tr;
1110 struct trace_entry *ent, *next = NULL;
1111 int next_cpu = -1;
1112 int cpu;
1113
1114 for_each_tracing_cpu(cpu) {
1115 if (!head_page(tr->data[cpu]))
1116 continue;
1117 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
1118 /*
1119 * Pick the entry with the smallest timestamp:
1120 */
1121 if (ent && (!next || ent->t < next->t)) {
1122 next = ent;
1123 next_cpu = cpu;
1124 }
1125 }
1126
1127 if (ent_cpu)
1128 *ent_cpu = next_cpu;
1129
1130 return next;
1131}
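
find_next_entry() above is essentially a minimum-timestamp merge across the per-CPU buffers: each CPU offers its oldest unread entry and the globally oldest one wins. As a hedged, stand-alone sketch of just that selection step (the next_entry_on_cpu() helper and nr_cpus parameter are hypothetical stand-ins for the iterator bookkeeping):

struct trace_entry *oldest_entry(struct trace_entry *(*next_entry_on_cpu)(int),
                                 int nr_cpus, int *ent_cpu)
{
        struct trace_entry *ent, *next = NULL;
        int cpu;

        for (cpu = 0; cpu < nr_cpus; cpu++) {
                ent = next_entry_on_cpu(cpu);  /* oldest unread entry, or NULL */
                /* pick the entry with the smallest timestamp */
                if (ent && (!next || ent->t < next->t)) {
                        next = ent;
                        *ent_cpu = cpu;
                }
        }
        return next;
}
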
1132
1133static void trace_iterator_increment(struct trace_iterator *iter)
1134{
1135 iter->idx++;
1136 iter->next_idx[iter->cpu]++;
1137 iter->next_page_idx[iter->cpu]++;
1138
1139 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1140 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1141
1142 iter->next_page_idx[iter->cpu] = 0;
1143 iter->next_page[iter->cpu] =
1144 trace_next_list(data, iter->next_page[iter->cpu]);
1145 }
1146}
1147
1148static void trace_consume(struct trace_iterator *iter)
1149{
1150 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1151
1152 data->trace_tail_idx++;
1153 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
1154 data->trace_tail = trace_next_page(data, data->trace_tail);
1155 data->trace_tail_idx = 0;
1156 }
1157
1158	/* If we emptied the buffer, reset the index */
1159 if (data->trace_head == data->trace_tail &&
1160 data->trace_head_idx == data->trace_tail_idx)
1161 data->trace_idx = 0;
1162}
1163
1164static void *find_next_entry_inc(struct trace_iterator *iter)
1165{
1166 struct trace_entry *next;
1167 int next_cpu = -1;
1168
1169 next = find_next_entry(iter, &next_cpu);
1170
1171 iter->prev_ent = iter->ent;
1172 iter->prev_cpu = iter->cpu;
1173
1174 iter->ent = next;
1175 iter->cpu = next_cpu;
1176
1177 if (next)
1178 trace_iterator_increment(iter);
1179
1180 return next ? iter : NULL;
1181}
1182
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1184{
1185 struct trace_iterator *iter = m->private;
1186 void *last_ent = iter->ent;
1187 int i = (int)*pos;
1188 void *ent;
1189
1190 (*pos)++;
1191
1192 /* can't go backwards */
1193 if (iter->idx > i)
1194 return NULL;
1195
1196 if (iter->idx < 0)
1197 ent = find_next_entry_inc(iter);
1198 else
1199 ent = iter;
1200
1201 while (ent && iter->idx < i)
1202 ent = find_next_entry_inc(iter);
1203
1204 iter->pos = *pos;
1205
1206 if (last_ent && !ent)
1207 seq_puts(m, "\n\nvim:ft=help\n");
1208
1209 return ent;
1210}
1211
1212static void *s_start(struct seq_file *m, loff_t *pos)
1213{
1214 struct trace_iterator *iter = m->private;
1215 void *p = NULL;
1216 loff_t l = 0;
1217 int i;
1218
1219 mutex_lock(&trace_types_lock);
1220
1221 if (!current_trace || current_trace != iter->trace) {
1222 mutex_unlock(&trace_types_lock);
1223 return NULL;
1224 }
1225
1226 atomic_inc(&trace_record_cmdline_disabled);
1227
1228 /* let the tracer grab locks here if needed */
1229 if (current_trace->start)
1230 current_trace->start(iter);
1231
1232 if (*pos != iter->pos) {
1233 iter->ent = NULL;
1234 iter->cpu = 0;
1235 iter->idx = -1;
1236 iter->prev_ent = NULL;
1237 iter->prev_cpu = -1;
1238
1239 for_each_tracing_cpu(i) {
1240 iter->next_idx[i] = 0;
1241 iter->next_page[i] = NULL;
1242 }
1243
1244 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1245 ;
1246
1247 } else {
1248 l = *pos - 1;
1249 p = s_next(m, p, &l);
1250 }
1251
1252 return p;
1253}
1254
1255static void s_stop(struct seq_file *m, void *p)
1256{
1257 struct trace_iterator *iter = m->private;
1258
1259 atomic_dec(&trace_record_cmdline_disabled);
1260
1261 /* let the tracer release locks here if needed */
1262 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1263 iter->trace->stop(iter);
1264
1265 mutex_unlock(&trace_types_lock);
1266}
1267
1268#define KRETPROBE_MSG "[unknown/kretprobe'd]"
1269
1270#ifdef CONFIG_KRETPROBES
1271static inline int kretprobed(unsigned long addr)
1272{
1273 return addr == (unsigned long)kretprobe_trampoline;
1274}
1275#else
1276static inline int kretprobed(unsigned long addr)
1277{
1278 return 0;
1279}
1280#endif /* CONFIG_KRETPROBES */
1281
1282static int
1283seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1284{
1285#ifdef CONFIG_KALLSYMS
1286 char str[KSYM_SYMBOL_LEN];
1287
1288 kallsyms_lookup(address, NULL, NULL, NULL, str);
1289
1290 return trace_seq_printf(s, fmt, str);
1291#endif
1292 return 1;
1293}
1294
1295static int
1296seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1297 unsigned long address)
1298{
1299#ifdef CONFIG_KALLSYMS
1300 char str[KSYM_SYMBOL_LEN];
1301
1302 sprint_symbol(str, address);
1303 return trace_seq_printf(s, fmt, str);
1304#endif
1305 return 1;
1306}
1307
1308#ifndef CONFIG_64BIT
1309# define IP_FMT "%08lx"
1310#else
1311# define IP_FMT "%016lx"
1312#endif
1313
1314static int
1315seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1316{
1317 int ret;
1318
1319 if (!ip)
1320 return trace_seq_printf(s, "0");
1321
1322 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1323 ret = seq_print_sym_offset(s, "%s", ip);
1324 else
1325 ret = seq_print_sym_short(s, "%s", ip);
1326
1327 if (!ret)
1328 return 0;
1329
1330 if (sym_flags & TRACE_ITER_SYM_ADDR)
1331 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1332 return ret;
1333}
1334
1335static void print_lat_help_header(struct seq_file *m)
1336{
1337 seq_puts(m, "# _------=> CPU# \n");
1338 seq_puts(m, "# / _-----=> irqs-off \n");
1339 seq_puts(m, "# | / _----=> need-resched \n");
1340 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1341 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1342 seq_puts(m, "# |||| / \n");
1343 seq_puts(m, "# ||||| delay \n");
1344 seq_puts(m, "# cmd pid ||||| time | caller \n");
1345 seq_puts(m, "# \\ / ||||| \\ | / \n");
1346}
1347
1348static void print_func_help_header(struct seq_file *m)
1349{
1350 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1351 seq_puts(m, "# | | | | |\n");
1352}
1353
1354
1355static void
1356print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1357{
1358 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1359 struct trace_array *tr = iter->tr;
1360 struct trace_array_cpu *data = tr->data[tr->cpu];
1361 struct tracer *type = current_trace;
1362 unsigned long total = 0;
1363 unsigned long entries = 0;
1364 int cpu;
1365 const char *name = "preemption";
1366
1367 if (type)
1368 name = type->name;
1369
1370 for_each_tracing_cpu(cpu) {
1371 if (head_page(tr->data[cpu])) {
1372 total += tr->data[cpu]->trace_idx;
1373 if (tr->data[cpu]->trace_idx > tr->entries)
1374 entries += tr->entries;
1375 else
1376 entries += tr->data[cpu]->trace_idx;
1377 }
1378 }
1379
1380 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1381 name, UTS_RELEASE);
1382 seq_puts(m, "-----------------------------------"
1383 "---------------------------------\n");
1384 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
1385 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1386 nsecs_to_usecs(data->saved_latency),
1387 entries,
1388 total,
1389 tr->cpu,
1390#if defined(CONFIG_PREEMPT_NONE)
1391 "server",
1392#elif defined(CONFIG_PREEMPT_VOLUNTARY)
1393 "desktop",
1394#elif defined(CONFIG_PREEMPT)
1395 "preempt",
1396#else
1397 "unknown",
1398#endif
1399 /* These are reserved for later use */
1400 0, 0, 0, 0);
1401#ifdef CONFIG_SMP
1402 seq_printf(m, " #P:%d)\n", num_online_cpus());
1403#else
1404 seq_puts(m, ")\n");
1405#endif
1406 seq_puts(m, " -----------------\n");
1407 seq_printf(m, " | task: %.16s-%d "
1408 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1409 data->comm, data->pid, data->uid, data->nice,
1410 data->policy, data->rt_priority);
1411 seq_puts(m, " -----------------\n");
1412
1413 if (data->critical_start) {
1414 seq_puts(m, " => started at: ");
1415 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1416 trace_print_seq(m, &iter->seq);
1417 seq_puts(m, "\n => ended at: ");
1418 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1419 trace_print_seq(m, &iter->seq);
1420 seq_puts(m, "\n");
1421 }
1422
1423 seq_puts(m, "\n");
1424}
1425
1426static void
1427lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1428{
1429 int hardirq, softirq;
1430 char *comm;
1431
1432 comm = trace_find_cmdline(entry->pid);
1433
1434 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1435 trace_seq_printf(s, "%d", cpu);
1436 trace_seq_printf(s, "%c%c",
1437 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
1438 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1439
1440 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1441 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1442 if (hardirq && softirq) {
1443 trace_seq_putc(s, 'H');
1444 } else {
1445 if (hardirq) {
1446 trace_seq_putc(s, 'h');
1447 } else {
1448 if (softirq)
1449 trace_seq_putc(s, 's');
1450 else
1451 trace_seq_putc(s, '.');
1452 }
1453 }
1454
1455 if (entry->preempt_count)
1456 trace_seq_printf(s, "%x", entry->preempt_count);
1457 else
1458 trace_seq_puts(s, ".");
1459}
1460
1461unsigned long preempt_mark_thresh = 100;
1462
1463static void
1464lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1465 unsigned long rel_usecs)
1466{
1467 trace_seq_printf(s, " %4lldus", abs_usecs);
1468 if (rel_usecs > preempt_mark_thresh)
1469 trace_seq_puts(s, "!: ");
1470 else if (rel_usecs > 1)
1471 trace_seq_puts(s, "+: ");
1472 else
1473 trace_seq_puts(s, " : ");
1474}
1475
1476static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1477
1478static int
1479print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1480{
1481 struct trace_seq *s = &iter->seq;
1482 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1483 struct trace_entry *next_entry = find_next_entry(iter, NULL);
1484 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1485 struct trace_entry *entry = iter->ent;
1486 unsigned long abs_usecs;
1487 unsigned long rel_usecs;
1488 char *comm;
1489 int S, T;
1490 int i;
1491 unsigned state;
1492
1493 if (!next_entry)
1494 next_entry = entry;
1495 rel_usecs = ns2usecs(next_entry->t - entry->t);
1496 abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
1497
1498 if (verbose) {
1499 comm = trace_find_cmdline(entry->pid);
1500 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
1501 " %ld.%03ldms (+%ld.%03ldms): ",
1502 comm,
1503 entry->pid, cpu, entry->flags,
1504 entry->preempt_count, trace_idx,
1505 ns2usecs(entry->t),
1506 abs_usecs/1000,
1507 abs_usecs % 1000, rel_usecs/1000,
1508 rel_usecs % 1000);
1509 } else {
1510 lat_print_generic(s, entry, cpu);
1511 lat_print_timestamp(s, abs_usecs, rel_usecs);
1512 }
1513 switch (entry->type) {
1514 case TRACE_FN:
1515 seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1516 trace_seq_puts(s, " (");
1517 if (kretprobed(entry->fn.parent_ip))
1518 trace_seq_puts(s, KRETPROBE_MSG);
1519 else
1520 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
1521 trace_seq_puts(s, ")\n");
1522 break;
1523 case TRACE_CTX:
1524 case TRACE_WAKE:
1525 T = entry->ctx.next_state < sizeof(state_to_char) ?
1526 state_to_char[entry->ctx.next_state] : 'X';
1527
1528 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
1529 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1530 comm = trace_find_cmdline(entry->ctx.next_pid);
1531 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
1532 entry->ctx.prev_pid,
1533 entry->ctx.prev_prio,
1534 S, entry->type == TRACE_CTX ? "==>" : " +",
1535 entry->ctx.next_pid,
1536 entry->ctx.next_prio,
1537 T, comm);
1538 break;
1539 case TRACE_SPECIAL:
1540 trace_seq_printf(s, "# %ld %ld %ld\n",
1541 entry->special.arg1,
1542 entry->special.arg2,
1543 entry->special.arg3);
1544 break;
1545 case TRACE_STACK:
1546 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1547 if (i)
1548 trace_seq_puts(s, " <= ");
1549 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
1550 }
1551 trace_seq_puts(s, "\n");
1552 break;
1553 default:
1554 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1555 }
1556 return 1;
1557}
1558
1559static int print_trace_fmt(struct trace_iterator *iter)
1560{
1561 struct trace_seq *s = &iter->seq;
1562 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1563 struct trace_entry *entry;
1564 unsigned long usec_rem;
1565 unsigned long long t;
1566 unsigned long secs;
1567 char *comm;
1568 int ret;
1569 int S, T;
1570 int i;
1571
1572 entry = iter->ent;
1573
1574 comm = trace_find_cmdline(iter->ent->pid);
1575
1576 t = ns2usecs(entry->t);
1577 usec_rem = do_div(t, 1000000ULL);
1578 secs = (unsigned long)t;
1579
1580 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1581 if (!ret)
1582 return 0;
1583 ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
1584 if (!ret)
1585 return 0;
1586 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1587 if (!ret)
1588 return 0;
1589
1590 switch (entry->type) {
1591 case TRACE_FN:
1592 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1593 if (!ret)
1594 return 0;
1595 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1596 entry->fn.parent_ip) {
1597 ret = trace_seq_printf(s, " <-");
1598 if (!ret)
1599 return 0;
1600 if (kretprobed(entry->fn.parent_ip))
1601 ret = trace_seq_puts(s, KRETPROBE_MSG);
1602 else
1603 ret = seq_print_ip_sym(s, entry->fn.parent_ip,
1604 sym_flags);
1605 if (!ret)
1606 return 0;
1607 }
1608 ret = trace_seq_printf(s, "\n");
1609 if (!ret)
1610 return 0;
1611 break;
1612 case TRACE_CTX:
1613 case TRACE_WAKE:
1614 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1615 state_to_char[entry->ctx.prev_state] : 'X';
1616 T = entry->ctx.next_state < sizeof(state_to_char) ?
1617 state_to_char[entry->ctx.next_state] : 'X';
1618 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
1619 entry->ctx.prev_pid,
1620 entry->ctx.prev_prio,
1621 S,
1622 entry->type == TRACE_CTX ? "==>" : " +",
1623 entry->ctx.next_pid,
1624 entry->ctx.next_prio,
1625 T);
1626 if (!ret)
1627 return 0;
1628 break;
1629 case TRACE_SPECIAL:
1630 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1631 entry->special.arg1,
1632 entry->special.arg2,
1633 entry->special.arg3);
1634 if (!ret)
1635 return 0;
1636 break;
1637 case TRACE_STACK:
1638 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1639 if (i) {
1640 ret = trace_seq_puts(s, " <= ");
1641 if (!ret)
1642 return 0;
1643 }
1644 ret = seq_print_ip_sym(s, entry->stack.caller[i],
1645 sym_flags);
1646 if (!ret)
1647 return 0;
1648 }
1649 ret = trace_seq_puts(s, "\n");
1650 if (!ret)
1651 return 0;
1652 break;
1653 }
1654 return 1;
1655}
1656
1657static int print_raw_fmt(struct trace_iterator *iter)
1658{
1659 struct trace_seq *s = &iter->seq;
1660 struct trace_entry *entry;
1661 int ret;
1662 int S, T;
1663
1664 entry = iter->ent;
1665
1666 ret = trace_seq_printf(s, "%d %d %llu ",
1667 entry->pid, iter->cpu, entry->t);
1668 if (!ret)
1669 return 0;
1670
1671 switch (entry->type) {
1672 case TRACE_FN:
1673 ret = trace_seq_printf(s, "%x %x\n",
1674 entry->fn.ip, entry->fn.parent_ip);
1675 if (!ret)
1676 return 0;
1677 break;
1678 case TRACE_CTX:
1679 case TRACE_WAKE:
1680 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1681 state_to_char[entry->ctx.prev_state] : 'X';
1682 T = entry->ctx.next_state < sizeof(state_to_char) ?
1683 state_to_char[entry->ctx.next_state] : 'X';
1684 if (entry->type == TRACE_WAKE)
1685 S = '+';
1686 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
1687 entry->ctx.prev_pid,
1688 entry->ctx.prev_prio,
1689 S,
1690 entry->ctx.next_pid,
1691 entry->ctx.next_prio,
1692 T);
1693 if (!ret)
1694 return 0;
1695 break;
1696 case TRACE_SPECIAL:
1697 case TRACE_STACK:
1698 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1699 entry->special.arg1,
1700 entry->special.arg2,
1701 entry->special.arg3);
1702 if (!ret)
1703 return 0;
1704 break;
1705 }
1706 return 1;
1707}
1708
1709#define SEQ_PUT_FIELD_RET(s, x) \
1710do { \
1711 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
1712 return 0; \
1713} while (0)
1714
1715#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1716do { \
1717 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1718 return 0; \
1719} while (0)
1720
1721static int print_hex_fmt(struct trace_iterator *iter)
1722{
1723 struct trace_seq *s = &iter->seq;
1724 unsigned char newline = '\n';
1725 struct trace_entry *entry;
1726 int S, T;
1727
1728 entry = iter->ent;
1729
1730 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1731 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1732 SEQ_PUT_HEX_FIELD_RET(s, entry->t);
1733
1734 switch (entry->type) {
1735 case TRACE_FN:
1736 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
1737 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1738 break;
1739 case TRACE_CTX:
1740 case TRACE_WAKE:
1741 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1742 state_to_char[entry->ctx.prev_state] : 'X';
1743 T = entry->ctx.next_state < sizeof(state_to_char) ?
1744 state_to_char[entry->ctx.next_state] : 'X';
1745 if (entry->type == TRACE_WAKE)
1746 S = '+';
1747 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
1748 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
1749 SEQ_PUT_HEX_FIELD_RET(s, S);
1750 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
1751 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
1752 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1753 SEQ_PUT_HEX_FIELD_RET(s, T);
1754 break;
1755 case TRACE_SPECIAL:
1756 case TRACE_STACK:
1757 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
1758 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
1759 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
1760 break;
1761 }
1762 SEQ_PUT_FIELD_RET(s, newline);
1763
1764 return 1;
1765}
1766
1767static int print_bin_fmt(struct trace_iterator *iter)
1768{
1769 struct trace_seq *s = &iter->seq;
1770 struct trace_entry *entry;
1771
1772 entry = iter->ent;
1773
1774 SEQ_PUT_FIELD_RET(s, entry->pid);
1775 SEQ_PUT_FIELD_RET(s, entry->cpu);
1776 SEQ_PUT_FIELD_RET(s, entry->t);
1777
1778 switch (entry->type) {
1779 case TRACE_FN:
1780 SEQ_PUT_FIELD_RET(s, entry->fn.ip);
1781 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
1782 break;
1783 case TRACE_CTX:
1784 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
1785 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
1786 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
1787 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
1788 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
1789 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
1790 break;
1791 case TRACE_SPECIAL:
1792 case TRACE_STACK:
1793 SEQ_PUT_FIELD_RET(s, entry->special.arg1);
1794 SEQ_PUT_FIELD_RET(s, entry->special.arg2);
1795 SEQ_PUT_FIELD_RET(s, entry->special.arg3);
1796 break;
1797 }
1798 return 1;
1799}
1800
1801static int trace_empty(struct trace_iterator *iter)
1802{
1803 struct trace_array_cpu *data;
1804 int cpu;
1805
1806 for_each_tracing_cpu(cpu) {
1807 data = iter->tr->data[cpu];
1808
1809 if (head_page(data) && data->trace_idx &&
1810 (data->trace_tail != data->trace_head ||
1811 data->trace_tail_idx != data->trace_head_idx))
1812 return 0;
1813 }
1814 return 1;
1815}
1816
1817static int print_trace_line(struct trace_iterator *iter)
1818{
1819 if (iter->trace && iter->trace->print_line)
1820 return iter->trace->print_line(iter);
1821
1822 if (trace_flags & TRACE_ITER_BIN)
1823 return print_bin_fmt(iter);
1824
1825 if (trace_flags & TRACE_ITER_HEX)
1826 return print_hex_fmt(iter);
1827
1828 if (trace_flags & TRACE_ITER_RAW)
1829 return print_raw_fmt(iter);
1830
1831 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
1832 return print_lat_fmt(iter, iter->idx, iter->cpu);
1833
1834 return print_trace_fmt(iter);
1835}
1836
1837static int s_show(struct seq_file *m, void *v)
1838{
1839 struct trace_iterator *iter = v;
1840
1841 if (iter->ent == NULL) {
1842 if (iter->tr) {
1843 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1844 seq_puts(m, "#\n");
1845 }
1846 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1847 /* print nothing if the buffers are empty */
1848 if (trace_empty(iter))
1849 return 0;
1850 print_trace_header(m, iter);
1851 if (!(trace_flags & TRACE_ITER_VERBOSE))
1852 print_lat_help_header(m);
1853 } else {
1854 if (!(trace_flags & TRACE_ITER_VERBOSE))
1855 print_func_help_header(m);
1856 }
1857 } else {
1858 print_trace_line(iter);
1859 trace_print_seq(m, &iter->seq);
1860 }
1861
1862 return 0;
1863}
1864
1865static struct seq_operations tracer_seq_ops = {
1866 .start = s_start,
1867 .next = s_next,
1868 .stop = s_stop,
1869 .show = s_show,
1870};
1871
1872static struct trace_iterator *
1873__tracing_open(struct inode *inode, struct file *file, int *ret)
1874{
1875 struct trace_iterator *iter;
1876
1877 if (tracing_disabled) {
1878 *ret = -ENODEV;
1879 return NULL;
1880 }
1881
1882 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1883 if (!iter) {
1884 *ret = -ENOMEM;
1885 goto out;
1886 }
1887
1888 mutex_lock(&trace_types_lock);
1889 if (current_trace && current_trace->print_max)
1890 iter->tr = &max_tr;
1891 else
1892 iter->tr = inode->i_private;
1893 iter->trace = current_trace;
1894 iter->pos = -1;
1895
1896 /* TODO stop tracer */
1897 *ret = seq_open(file, &tracer_seq_ops);
1898 if (!*ret) {
1899 struct seq_file *m = file->private_data;
1900 m->private = iter;
1901
1902 /* stop the trace while dumping */
1903 if (iter->tr->ctrl) {
1904 tracer_enabled = 0;
1905 ftrace_function_enabled = 0;
1906 }
1907
1908 if (iter->trace && iter->trace->open)
1909 iter->trace->open(iter);
1910 } else {
1911 kfree(iter);
1912 iter = NULL;
1913 }
1914 mutex_unlock(&trace_types_lock);
1915
1916 out:
1917 return iter;
1918}
1919
1920int tracing_open_generic(struct inode *inode, struct file *filp)
1921{
1922 if (tracing_disabled)
1923 return -ENODEV;
1924
1925 filp->private_data = inode->i_private;
1926 return 0;
1927}
1928
1929int tracing_release(struct inode *inode, struct file *file)
1930{
1931 struct seq_file *m = (struct seq_file *)file->private_data;
1932 struct trace_iterator *iter = m->private;
1933
1934 mutex_lock(&trace_types_lock);
1935 if (iter->trace && iter->trace->close)
1936 iter->trace->close(iter);
1937
1938 /* reenable tracing if it was previously enabled */
1939 if (iter->tr->ctrl) {
1940 tracer_enabled = 1;
1941 /*
1942 * It is safe to enable function tracing even if it
1943 * isn't used
1944 */
1945 ftrace_function_enabled = 1;
1946 }
1947 mutex_unlock(&trace_types_lock);
1948
1949 seq_release(inode, file);
1950 kfree(iter);
1951 return 0;
1952}
1953
1954static int tracing_open(struct inode *inode, struct file *file)
1955{
1956 int ret;
1957
1958 __tracing_open(inode, file, &ret);
1959
1960 return ret;
1961}
1962
1963static int tracing_lt_open(struct inode *inode, struct file *file)
1964{
1965 struct trace_iterator *iter;
1966 int ret;
1967
1968 iter = __tracing_open(inode, file, &ret);
1969
1970 if (!ret)
1971 iter->iter_flags |= TRACE_FILE_LAT_FMT;
1972
1973 return ret;
1974}
1975
1976
1977static void *
1978t_next(struct seq_file *m, void *v, loff_t *pos)
1979{
1980 struct tracer *t = m->private;
1981
1982 (*pos)++;
1983
1984 if (t)
1985 t = t->next;
1986
1987 m->private = t;
1988
1989 return t;
1990}
1991
1992static void *t_start(struct seq_file *m, loff_t *pos)
1993{
1994 struct tracer *t = m->private;
1995 loff_t l = 0;
1996
1997 mutex_lock(&trace_types_lock);
1998 for (; t && l < *pos; t = t_next(m, t, &l))
1999 ;
2000
2001 return t;
2002}
2003
2004static void t_stop(struct seq_file *m, void *p)
2005{
2006 mutex_unlock(&trace_types_lock);
2007}
2008
2009static int t_show(struct seq_file *m, void *v)
2010{
2011 struct tracer *t = v;
2012
2013 if (!t)
2014 return 0;
2015
2016 seq_printf(m, "%s", t->name);
2017 if (t->next)
2018 seq_putc(m, ' ');
2019 else
2020 seq_putc(m, '\n');
2021
2022 return 0;
2023}
2024
2025static struct seq_operations show_traces_seq_ops = {
2026 .start = t_start,
2027 .next = t_next,
2028 .stop = t_stop,
2029 .show = t_show,
2030};
2031
2032static int show_traces_open(struct inode *inode, struct file *file)
2033{
2034 int ret;
2035
2036 if (tracing_disabled)
2037 return -ENODEV;
2038
2039 ret = seq_open(file, &show_traces_seq_ops);
2040 if (!ret) {
2041 struct seq_file *m = file->private_data;
2042 m->private = trace_types;
2043 }
2044
2045 return ret;
2046}
2047
2048static struct file_operations tracing_fops = {
2049 .open = tracing_open,
2050 .read = seq_read,
2051 .llseek = seq_lseek,
2052 .release = tracing_release,
2053};
2054
2055static struct file_operations tracing_lt_fops = {
2056 .open = tracing_lt_open,
2057 .read = seq_read,
2058 .llseek = seq_lseek,
2059 .release = tracing_release,
2060};
2061
2062static struct file_operations show_traces_fops = {
2063 .open = show_traces_open,
2064 .read = seq_read,
2065 .release = seq_release,
2066};
2067
2068/*
2069 * Only trace on a CPU if the bitmask is set:
2070 */
2071static cpumask_t tracing_cpumask = CPU_MASK_ALL;
2072
2073/*
2074 * When tracing/tracing_cpumask is modified, this holds
2075 * the new bitmask we are about to install:
2076 */
2077static cpumask_t tracing_cpumask_new;
2078
2079/*
2080 * The tracer itself will not take this lock, but still we want
2081 * to provide a consistent cpumask to user-space:
2082 */
2083static DEFINE_MUTEX(tracing_cpumask_update_lock);
2084
2085/*
2086 * Temporary storage for the character representation of the
2087 * CPU bitmask (and one more byte for the newline):
2088 */
2089static char mask_str[NR_CPUS + 1];
2090
2091static ssize_t
2092tracing_cpumask_read(struct file *filp, char __user *ubuf,
2093 size_t count, loff_t *ppos)
2094{
2095 int len;
2096
2097 mutex_lock(&tracing_cpumask_update_lock);
2098
2099 len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
2100 if (count - len < 2) {
2101 count = -EINVAL;
2102 goto out_err;
2103 }
2104 len += sprintf(mask_str + len, "\n");
2105 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
2106
2107out_err:
2108 mutex_unlock(&tracing_cpumask_update_lock);
2109
2110 return count;
2111}
2112
2113static ssize_t
2114tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2115 size_t count, loff_t *ppos)
2116{
2117 int err, cpu;
2118
2119 mutex_lock(&tracing_cpumask_update_lock);
2120 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2121 if (err)
2122 goto err_unlock;
2123
2124 raw_local_irq_disable();
2125 __raw_spin_lock(&ftrace_max_lock);
2126 for_each_tracing_cpu(cpu) {
2127 /*
2128 * Increase/decrease the disabled counter if we are
2129 * about to flip a bit in the cpumask:
2130 */
2131 if (cpu_isset(cpu, tracing_cpumask) &&
2132 !cpu_isset(cpu, tracing_cpumask_new)) {
2133 atomic_inc(&global_trace.data[cpu]->disabled);
2134 }
2135 if (!cpu_isset(cpu, tracing_cpumask) &&
2136 cpu_isset(cpu, tracing_cpumask_new)) {
2137 atomic_dec(&global_trace.data[cpu]->disabled);
2138 }
2139 }
2140 __raw_spin_unlock(&ftrace_max_lock);
2141 raw_local_irq_enable();
2142
2143 tracing_cpumask = tracing_cpumask_new;
2144
2145 mutex_unlock(&tracing_cpumask_update_lock);
2146
2147 return count;
2148
2149err_unlock:
2150 mutex_unlock(&tracing_cpumask_update_lock);
2151
2152 return err;
2153}
2154
2155static struct file_operations tracing_cpumask_fops = {
2156 .open = tracing_open_generic,
2157 .read = tracing_cpumask_read,
2158 .write = tracing_cpumask_write,
2159};
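
As a hedged illustration of driving this interface from user space (the /debug mount point follows the mini-HOWTO further down, and cpumask_parse_user() takes the usual hexadecimal cpumask text format), restricting tracing to CPUs 0 and 1 could look like:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/debug/tracing/tracing_cpumask", O_WRONLY);

        if (fd < 0) {
                perror("tracing_cpumask");
                return 1;
        }
        /* hex mask 0x3 selects CPUs 0 and 1 */
        if (write(fd, "3\n", 2) != 2)
                perror("write");
        close(fd);
        return 0;
}
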
2160
2161static ssize_t
2162tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2163 size_t cnt, loff_t *ppos)
2164{
2165 char *buf;
2166 int r = 0;
2167 int len = 0;
2168 int i;
2169
2170	/* calculate max size */
2171 for (i = 0; trace_options[i]; i++) {
2172 len += strlen(trace_options[i]);
2173 len += 3; /* "no" and space */
2174 }
2175
2176 /* +2 for \n and \0 */
2177 buf = kmalloc(len + 2, GFP_KERNEL);
2178 if (!buf)
2179 return -ENOMEM;
2180
2181 for (i = 0; trace_options[i]; i++) {
2182 if (trace_flags & (1 << i))
2183 r += sprintf(buf + r, "%s ", trace_options[i]);
2184 else
2185 r += sprintf(buf + r, "no%s ", trace_options[i]);
2186 }
2187
2188 r += sprintf(buf + r, "\n");
2189 WARN_ON(r >= len + 2);
2190
2191 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2192
2193 kfree(buf);
2194
2195 return r;
2196}
2197
2198static ssize_t
2199tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2200 size_t cnt, loff_t *ppos)
2201{
2202 char buf[64];
2203 char *cmp = buf;
2204 int neg = 0;
2205 int i;
2206
2207 if (cnt >= sizeof(buf))
2208 return -EINVAL;
2209
2210 if (copy_from_user(&buf, ubuf, cnt))
2211 return -EFAULT;
2212
2213 buf[cnt] = 0;
2214
2215 if (strncmp(buf, "no", 2) == 0) {
2216 neg = 1;
2217 cmp += 2;
2218 }
2219
2220 for (i = 0; trace_options[i]; i++) {
2221 int len = strlen(trace_options[i]);
2222
2223 if (strncmp(cmp, trace_options[i], len) == 0) {
2224 if (neg)
2225 trace_flags &= ~(1 << i);
2226 else
2227 trace_flags |= (1 << i);
2228 break;
2229 }
2230 }
2231 /*
2232 * If no option could be set, return an error:
2233 */
2234 if (!trace_options[i])
2235 return -EINVAL;
2236
2237 filp->f_pos += cnt;
2238
2239 return cnt;
2240}
2241
2242static struct file_operations tracing_iter_fops = {
2243 .open = tracing_open_generic,
2244 .read = tracing_iter_ctrl_read,
2245 .write = tracing_iter_ctrl_write,
2246};
2247
2248static const char readme_msg[] =
2249 "tracing mini-HOWTO:\n\n"
2250 "# mkdir /debug\n"
2251 "# mount -t debugfs nodev /debug\n\n"
2252 "# cat /debug/tracing/available_tracers\n"
2253 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
2254 "# cat /debug/tracing/current_tracer\n"
2255 "none\n"
2256 "# echo sched_switch > /debug/tracing/current_tracer\n"
2257 "# cat /debug/tracing/current_tracer\n"
2258 "sched_switch\n"
2259 "# cat /debug/tracing/iter_ctrl\n"
2260 "noprint-parent nosym-offset nosym-addr noverbose\n"
2261 "# echo print-parent > /debug/tracing/iter_ctrl\n"
2262 "# echo 1 > /debug/tracing/tracing_enabled\n"
2263 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2264	"# echo 0 > /debug/tracing/tracing_enabled\n"
2265;
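
For completeness, the consuming interface (trace_pipe, implemented by tracing_read_pipe() further down) can be read with a few lines of user-space C. This is only a sketch, and again assumes debugfs is mounted at /debug as in the HOWTO above:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[4096];
        ssize_t n;
        int fd = open("/debug/tracing/trace_pipe", O_RDONLY);

        if (fd < 0) {
                perror("trace_pipe");
                return 1;
        }
        /* blocks until entries arrive; gives EOF once we have read
         * something and tracing is disabled again */
        while ((n = read(fd, buf, sizeof(buf))) > 0)
                write(STDOUT_FILENO, buf, n);
        close(fd);
        return 0;
}
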
2266
2267static ssize_t
2268tracing_readme_read(struct file *filp, char __user *ubuf,
2269 size_t cnt, loff_t *ppos)
2270{
2271 return simple_read_from_buffer(ubuf, cnt, ppos,
2272 readme_msg, strlen(readme_msg));
2273}
2274
2275static struct file_operations tracing_readme_fops = {
2276 .open = tracing_open_generic,
2277 .read = tracing_readme_read,
2278};
2279
2280static ssize_t
2281tracing_ctrl_read(struct file *filp, char __user *ubuf,
2282 size_t cnt, loff_t *ppos)
2283{
2284 struct trace_array *tr = filp->private_data;
2285 char buf[64];
2286 int r;
2287
2288 r = sprintf(buf, "%ld\n", tr->ctrl);
2289 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2290}
2291
2292static ssize_t
2293tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2294 size_t cnt, loff_t *ppos)
2295{
2296 struct trace_array *tr = filp->private_data;
2297 char buf[64];
2298 long val;
2299 int ret;
2300
2301 if (cnt >= sizeof(buf))
2302 return -EINVAL;
2303
2304 if (copy_from_user(&buf, ubuf, cnt))
2305 return -EFAULT;
2306
2307 buf[cnt] = 0;
2308
2309 ret = strict_strtoul(buf, 10, &val);
2310 if (ret < 0)
2311 return ret;
2312
2313 val = !!val;
2314
2315 mutex_lock(&trace_types_lock);
2316 if (tr->ctrl ^ val) {
2317 if (val)
2318 tracer_enabled = 1;
2319 else
2320 tracer_enabled = 0;
2321
2322 tr->ctrl = val;
2323
2324 if (current_trace && current_trace->ctrl_update)
2325 current_trace->ctrl_update(tr);
2326 }
2327 mutex_unlock(&trace_types_lock);
2328
2329 filp->f_pos += cnt;
2330
2331 return cnt;
2332}
2333
2334static ssize_t
2335tracing_set_trace_read(struct file *filp, char __user *ubuf,
2336 size_t cnt, loff_t *ppos)
2337{
2338 char buf[max_tracer_type_len+2];
2339 int r;
2340
2341 mutex_lock(&trace_types_lock);
2342 if (current_trace)
2343 r = sprintf(buf, "%s\n", current_trace->name);
2344 else
2345 r = sprintf(buf, "\n");
2346 mutex_unlock(&trace_types_lock);
2347
2348 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2349}
2350
2351static ssize_t
2352tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2353 size_t cnt, loff_t *ppos)
2354{
2355 struct trace_array *tr = &global_trace;
2356 struct tracer *t;
2357 char buf[max_tracer_type_len+1];
2358 int i;
2359
2360 if (cnt > max_tracer_type_len)
2361 cnt = max_tracer_type_len;
2362
2363 if (copy_from_user(&buf, ubuf, cnt))
2364 return -EFAULT;
2365
2366 buf[cnt] = 0;
2367
2368 /* strip ending whitespace. */
2369 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2370 buf[i] = 0;
2371
2372 mutex_lock(&trace_types_lock);
2373 for (t = trace_types; t; t = t->next) {
2374 if (strcmp(t->name, buf) == 0)
2375 break;
2376 }
2377 if (!t || t == current_trace)
2378 goto out;
2379
2380 if (current_trace && current_trace->reset)
2381 current_trace->reset(tr);
2382
2383 current_trace = t;
2384 if (t->init)
2385 t->init(tr);
2386
2387 out:
2388 mutex_unlock(&trace_types_lock);
2389
2390 filp->f_pos += cnt;
2391
2392 return cnt;
2393}
2394
2395static ssize_t
2396tracing_max_lat_read(struct file *filp, char __user *ubuf,
2397 size_t cnt, loff_t *ppos)
2398{
2399 unsigned long *ptr = filp->private_data;
2400 char buf[64];
2401 int r;
2402
2403 r = snprintf(buf, sizeof(buf), "%ld\n",
2404 *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
2405 if (r > sizeof(buf))
2406 r = sizeof(buf);
2407 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2408}
2409
2410static ssize_t
2411tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2412 size_t cnt, loff_t *ppos)
2413{
2414 long *ptr = filp->private_data;
2415 char buf[64];
2416 long val;
2417 int ret;
2418
2419 if (cnt >= sizeof(buf))
2420 return -EINVAL;
2421
2422 if (copy_from_user(&buf, ubuf, cnt))
2423 return -EFAULT;
2424
2425 buf[cnt] = 0;
2426
2427 ret = strict_strtoul(buf, 10, &val);
2428 if (ret < 0)
2429 return ret;
2430
2431 *ptr = val * 1000;
2432
2433 return cnt;
2434}
2435
2436static atomic_t tracing_reader;
2437
2438static int tracing_open_pipe(struct inode *inode, struct file *filp)
2439{
2440 struct trace_iterator *iter;
2441
2442 if (tracing_disabled)
2443 return -ENODEV;
2444
2445	/* We only allow a single reader of the pipe */
2446 if (atomic_inc_return(&tracing_reader) != 1) {
2447 atomic_dec(&tracing_reader);
2448 return -EBUSY;
2449 }
2450
2451 /* create a buffer to store the information to pass to userspace */
2452 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2453 if (!iter)
2454 return -ENOMEM;
2455
2456 mutex_lock(&trace_types_lock);
2457 iter->tr = &global_trace;
2458 iter->trace = current_trace;
2459 filp->private_data = iter;
2460
2461 if (iter->trace->pipe_open)
2462 iter->trace->pipe_open(iter);
2463 mutex_unlock(&trace_types_lock);
2464
2465 return 0;
2466}
2467
2468static int tracing_release_pipe(struct inode *inode, struct file *file)
2469{
2470 struct trace_iterator *iter = file->private_data;
2471
2472 kfree(iter);
2473 atomic_dec(&tracing_reader);
2474
2475 return 0;
2476}
2477
2478static unsigned int
2479tracing_poll_pipe(struct file *filp, poll_table *poll_table)
2480{
2481 struct trace_iterator *iter = filp->private_data;
2482
2483 if (trace_flags & TRACE_ITER_BLOCK) {
2484 /*
2485 * Always select as readable when in blocking mode
2486 */
2487 return POLLIN | POLLRDNORM;
2488 } else {
2489 if (!trace_empty(iter))
2490 return POLLIN | POLLRDNORM;
2491 poll_wait(filp, &trace_wait, poll_table);
2492 if (!trace_empty(iter))
2493 return POLLIN | POLLRDNORM;
2494
2495 return 0;
2496 }
2497}
2498
2499/*
2500 * Consumer reader.
2501 */
2502static ssize_t
2503tracing_read_pipe(struct file *filp, char __user *ubuf,
2504 size_t cnt, loff_t *ppos)
2505{
2506 struct trace_iterator *iter = filp->private_data;
2507 struct trace_array_cpu *data;
2508 static cpumask_t mask;
2509 unsigned long flags;
2510#ifdef CONFIG_FTRACE
2511 int ftrace_save;
2512#endif
2513 int cpu;
2514 ssize_t sret;
2515
2516 /* return any leftover data */
2517 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2518 if (sret != -EBUSY)
2519 return sret;
2520 sret = 0;
2521
2522 trace_seq_reset(&iter->seq);
2523
2524 mutex_lock(&trace_types_lock);
2525 if (iter->trace->read) {
2526 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2527 if (sret)
2528 goto out;
2529 }
2530
2531 while (trace_empty(iter)) {
2532
2533 if ((filp->f_flags & O_NONBLOCK)) {
2534 sret = -EAGAIN;
2535 goto out;
2536 }
2537
2538 /*
2539 * This is a make-shift waitqueue. The reason we don't use
2540 * an actual wait queue is because:
2541 * 1) we only ever have one waiter
2542	 * 2) the tracer traces all functions, and we don't want
2543	 *    the overhead of calling wake_up and friends
2544	 *    (and tracing them too)
2545	 * Anyway, this really is a very primitive wakeup.
2546 */
2547 set_current_state(TASK_INTERRUPTIBLE);
2548 iter->tr->waiter = current;
2549
2550 mutex_unlock(&trace_types_lock);
2551
2552 /* sleep for 100 msecs, and try again. */
2553 schedule_timeout(HZ/10);
2554
2555 mutex_lock(&trace_types_lock);
2556
2557 iter->tr->waiter = NULL;
2558
2559 if (signal_pending(current)) {
2560 sret = -EINTR;
2561 goto out;
2562 }
2563
2564 if (iter->trace != current_trace)
2565 goto out;
2566
2567 /*
2568	 * We block until we read something. If tracing is disabled
2569	 * but we have never read anything, we keep blocking.
2570	 * This allows a user to cat this file, and
2571	 * then enable tracing. But after we have read something,
2572	 * we give an EOF when tracing is disabled again.
2573 *
2574 * iter->pos will be 0 if we haven't read anything.
2575 */
2576 if (!tracer_enabled && iter->pos)
2577 break;
2578
2579 continue;
2580 }
2581
2582 /* stop when tracing is finished */
2583 if (trace_empty(iter))
2584 goto out;
2585
2586 if (cnt >= PAGE_SIZE)
2587 cnt = PAGE_SIZE - 1;
2588
2589 /* reset all but tr, trace, and overruns */
2590 memset(&iter->seq, 0,
2591 sizeof(struct trace_iterator) -
2592 offsetof(struct trace_iterator, seq));
2593 iter->pos = -1;
2594
2595 /*
2596	 * We need to stop all tracing on all CPUs to read the
2597	 * next buffer. This is a bit expensive, but is
2598	 * not done often. We read everything we can,
2599 * and then release the locks again.
2600 */
2601
2602 cpus_clear(mask);
2603 local_irq_save(flags);
2604#ifdef CONFIG_FTRACE
2605 ftrace_save = ftrace_enabled;
2606 ftrace_enabled = 0;
2607#endif
2608 smp_wmb();
2609 for_each_tracing_cpu(cpu) {
2610 data = iter->tr->data[cpu];
2611
2612 if (!head_page(data) || !data->trace_idx)
2613 continue;
2614
2615 atomic_inc(&data->disabled);
2616 cpu_set(cpu, mask);
2617 }
2618
2619 for_each_cpu_mask(cpu, mask) {
2620 data = iter->tr->data[cpu];
2621 __raw_spin_lock(&data->lock);
2622
2623 if (data->overrun > iter->last_overrun[cpu])
2624 iter->overrun[cpu] +=
2625 data->overrun - iter->last_overrun[cpu];
2626 iter->last_overrun[cpu] = data->overrun;
2627 }
2628
2629 while (find_next_entry_inc(iter) != NULL) {
2630 int ret;
2631 int len = iter->seq.len;
2632
2633 ret = print_trace_line(iter);
2634 if (!ret) {
2635 /* don't print partial lines */
2636 iter->seq.len = len;
2637 break;
2638 }
2639
2640 trace_consume(iter);
2641
2642 if (iter->seq.len >= cnt)
2643 break;
2644 }
2645
2646 for_each_cpu_mask(cpu, mask) {
2647 data = iter->tr->data[cpu];
2648 __raw_spin_unlock(&data->lock);
2649 }
2650
2651 for_each_cpu_mask(cpu, mask) {
2652 data = iter->tr->data[cpu];
2653 atomic_dec(&data->disabled);
2654 }
2655#ifdef CONFIG_FTRACE
2656 ftrace_enabled = ftrace_save;
2657#endif
2658 local_irq_restore(flags);
2659
2660 /* Now copy what we have to the user */
2661 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2662 if (iter->seq.readpos >= iter->seq.len)
2663 trace_seq_reset(&iter->seq);
2664 if (sret == -EBUSY)
2665 sret = 0;
2666
2667out:
2668 mutex_unlock(&trace_types_lock);
2669
2670 return sret;
2671}
2672
2673static ssize_t
2674tracing_entries_read(struct file *filp, char __user *ubuf,
2675 size_t cnt, loff_t *ppos)
2676{
2677 struct trace_array *tr = filp->private_data;
2678 char buf[64];
2679 int r;
2680
2681 r = sprintf(buf, "%lu\n", tr->entries);
2682 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2683}
2684
2685static ssize_t
2686tracing_entries_write(struct file *filp, const char __user *ubuf,
2687 size_t cnt, loff_t *ppos)
2688{
2689 unsigned long val;
2690 char buf[64];
2691 int i, ret;
2692
2693 if (cnt >= sizeof(buf))
2694 return -EINVAL;
2695
2696 if (copy_from_user(&buf, ubuf, cnt))
2697 return -EFAULT;
2698
2699 buf[cnt] = 0;
2700
2701 ret = strict_strtoul(buf, 10, &val);
2702 if (ret < 0)
2703 return ret;
2704
2705 /* must have at least 1 entry */
2706 if (!val)
2707 return -EINVAL;
2708
2709 mutex_lock(&trace_types_lock);
2710
2711 if (current_trace != &no_tracer) {
2712 cnt = -EBUSY;
2713 pr_info("ftrace: set current_tracer to none"
2714 " before modifying buffer size\n");
2715 goto out;
2716 }
2717
2718 if (val > global_trace.entries) {
2719 long pages_requested;
2720 unsigned long freeable_pages;
2721
2722 /* make sure we have enough memory before mapping */
2723 pages_requested =
2724 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
2725
2726 /* account for each buffer (and max_tr) */
2727 pages_requested *= tracing_nr_buffers * 2;
2728
2729 /* Check for overflow */
2730 if (pages_requested < 0) {
2731 cnt = -ENOMEM;
2732 goto out;
2733 }
2734
2735 freeable_pages = determine_dirtyable_memory();
2736
2737		/* we only allow requests up to 1/4 of the usable memory */
2738 if (pages_requested >
2739 ((freeable_pages + tracing_pages_allocated) / 4)) {
2740 cnt = -ENOMEM;
2741 goto out;
2742 }
2743
2744 while (global_trace.entries < val) {
2745 if (trace_alloc_page()) {
2746 cnt = -ENOMEM;
2747 goto out;
2748 }
2749 /* double check that we don't go over the known pages */
2750 if (tracing_pages_allocated > pages_requested)
2751 break;
2752 }
2753
2754 } else {
2755		/* shrink to val entries, rounded up to a whole page of entries */
2756 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2757 trace_free_page();
2758 }
2759
2760 /* check integrity */
2761 for_each_tracing_cpu(i)
2762 check_pages(global_trace.data[i]);
2763
2764 filp->f_pos += cnt;
2765
2766 /* If check pages failed, return ENOMEM */
2767 if (tracing_disabled)
2768 cnt = -ENOMEM;
2769 out:
2770 max_tr.entries = global_trace.entries;
2771 mutex_unlock(&trace_types_lock);
2772
2773 return cnt;
2774}
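
To make the sizing arithmetic in tracing_entries_write() concrete, with purely illustrative numbers: assuming 4 KiB pages and a 64-byte trace entry, ENTRIES_PER_PAGE would be 64, so a request of val = 65536 entries gives pages_requested = (65536 + 63) / 64 = 1024 pages per buffer; multiplying by tracing_nr_buffers * 2 (a live buffer and a max_tr buffer per CPU) on a 4-CPU machine yields 1024 * 4 * 2 = 8192 pages, i.e. 32 MiB, which is rejected with -ENOMEM unless it stays within (freeable_pages + tracing_pages_allocated) / 4.
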
2775
2776static struct file_operations tracing_max_lat_fops = {
2777 .open = tracing_open_generic,
2778 .read = tracing_max_lat_read,
2779 .write = tracing_max_lat_write,
2780};
2781
2782static struct file_operations tracing_ctrl_fops = {
2783 .open = tracing_open_generic,
2784 .read = tracing_ctrl_read,
2785 .write = tracing_ctrl_write,
2786};
2787
2788static struct file_operations set_tracer_fops = {
2789 .open = tracing_open_generic,
2790 .read = tracing_set_trace_read,
2791 .write = tracing_set_trace_write,
2792};
2793
2794static struct file_operations tracing_pipe_fops = {
2795 .open = tracing_open_pipe,
2796 .poll = tracing_poll_pipe,
2797 .read = tracing_read_pipe,
2798 .release = tracing_release_pipe,
2799};
2800
2801static struct file_operations tracing_entries_fops = {
2802 .open = tracing_open_generic,
2803 .read = tracing_entries_read,
2804 .write = tracing_entries_write,
2805};
2806
2807#ifdef CONFIG_DYNAMIC_FTRACE
2808
2809static ssize_t
2810tracing_read_long(struct file *filp, char __user *ubuf,
2811 size_t cnt, loff_t *ppos)
2812{
2813 unsigned long *p = filp->private_data;
2814 char buf[64];
2815 int r;
2816
2817 r = sprintf(buf, "%ld\n", *p);
2818
2819 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2820}
2821
2822static struct file_operations tracing_read_long_fops = {
2823 .open = tracing_open_generic,
2824 .read = tracing_read_long,
2825};
2826#endif
2827
2828static struct dentry *d_tracer;
2829
2830struct dentry *tracing_init_dentry(void)
2831{
2832 static int once;
2833
2834 if (d_tracer)
2835 return d_tracer;
2836
2837 d_tracer = debugfs_create_dir("tracing", NULL);
2838
2839 if (!d_tracer && !once) {
2840 once = 1;
2841 pr_warning("Could not create debugfs directory 'tracing'\n");
2842 return NULL;
2843 }
2844
2845 return d_tracer;
2846}
2847
2848#ifdef CONFIG_FTRACE_SELFTEST
2849/* Let selftest have access to static functions in this file */
2850#include "trace_selftest.c"
2851#endif
2852
2853static __init void tracer_init_debugfs(void)
2854{
2855 struct dentry *d_tracer;
2856 struct dentry *entry;
2857
2858 d_tracer = tracing_init_dentry();
2859
2860 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
2861 &global_trace, &tracing_ctrl_fops);
2862 if (!entry)
2863 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2864
2865 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
2866 NULL, &tracing_iter_fops);
2867 if (!entry)
2868 pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
2869
2870 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2871 NULL, &tracing_cpumask_fops);
2872 if (!entry)
2873 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
2874
2875 entry = debugfs_create_file("latency_trace", 0444, d_tracer,
2876 &global_trace, &tracing_lt_fops);
2877 if (!entry)
2878 pr_warning("Could not create debugfs 'latency_trace' entry\n");
2879
2880 entry = debugfs_create_file("trace", 0444, d_tracer,
2881 &global_trace, &tracing_fops);
2882 if (!entry)
2883 pr_warning("Could not create debugfs 'trace' entry\n");
2884
2885 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2886 &global_trace, &show_traces_fops);
2887 if (!entry)
2888		pr_warning("Could not create debugfs 'available_tracers' entry\n");
2889
2890 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2891 &global_trace, &set_tracer_fops);
2892 if (!entry)
2893		pr_warning("Could not create debugfs 'current_tracer' entry\n");
2894
2895 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2896 &tracing_max_latency,
2897 &tracing_max_lat_fops);
2898 if (!entry)
2899 pr_warning("Could not create debugfs "
2900 "'tracing_max_latency' entry\n");
2901
2902 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
2903 &tracing_thresh, &tracing_max_lat_fops);
2904 if (!entry)
2905 pr_warning("Could not create debugfs "
2906			   "'tracing_thresh' entry\n");
2907 entry = debugfs_create_file("README", 0644, d_tracer,
2908 NULL, &tracing_readme_fops);
2909 if (!entry)
2910 pr_warning("Could not create debugfs 'README' entry\n");
2911
2912 entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
2913 NULL, &tracing_pipe_fops);
2914 if (!entry)
2915 pr_warning("Could not create debugfs "
2916			   "'trace_pipe' entry\n");
2917
2918 entry = debugfs_create_file("trace_entries", 0644, d_tracer,
2919 &global_trace, &tracing_entries_fops);
2920 if (!entry)
2921 pr_warning("Could not create debugfs "
2922			   "'trace_entries' entry\n");
2923
2924#ifdef CONFIG_DYNAMIC_FTRACE
2925 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2926 &ftrace_update_tot_cnt,
2927 &tracing_read_long_fops);
2928 if (!entry)
2929 pr_warning("Could not create debugfs "
2930 "'dyn_ftrace_total_info' entry\n");
2931#endif
2932#ifdef CONFIG_SYSPROF_TRACER
2933 init_tracer_sysprof_debugfs(d_tracer);
2934#endif
2935}
2936
2937static int trace_alloc_page(void)
2938{
2939 struct trace_array_cpu *data;
2940 struct page *page, *tmp;
2941 LIST_HEAD(pages);
2942 void *array;
2943 unsigned pages_allocated = 0;
2944 int i;
2945
2946 /* first allocate a page for each CPU */
2947 for_each_tracing_cpu(i) {
2948 array = (void *)__get_free_page(GFP_KERNEL);
2949 if (array == NULL) {
2950 printk(KERN_ERR "tracer: failed to allocate page"
2951			       " for trace buffer!\n");
2952 goto free_pages;
2953 }
2954
2955 pages_allocated++;
2956 page = virt_to_page(array);
2957 list_add(&page->lru, &pages);
2958
2959/* Only allocate if we are actually using the max trace */
2960#ifdef CONFIG_TRACER_MAX_TRACE
2961 array = (void *)__get_free_page(GFP_KERNEL);
2962 if (array == NULL) {
2963 printk(KERN_ERR "tracer: failed to allocate page"
2964			       " for trace buffer!\n");
2965 goto free_pages;
2966 }
2967 pages_allocated++;
2968 page = virt_to_page(array);
2969 list_add(&page->lru, &pages);
2970#endif
2971 }
2972
2973	/* Now that we successfully allocated a page per CPU, add them */
2974 for_each_tracing_cpu(i) {
2975 data = global_trace.data[i];
2976 page = list_entry(pages.next, struct page, lru);
2977 list_del_init(&page->lru);
2978 list_add_tail(&page->lru, &data->trace_pages);
2979 ClearPageLRU(page);
2980
2981#ifdef CONFIG_TRACER_MAX_TRACE
2982 data = max_tr.data[i];
2983 page = list_entry(pages.next, struct page, lru);
2984 list_del_init(&page->lru);
2985 list_add_tail(&page->lru, &data->trace_pages);
2986 SetPageLRU(page);
2987#endif
2988 }
2989 tracing_pages_allocated += pages_allocated;
2990 global_trace.entries += ENTRIES_PER_PAGE;
2991
2992 return 0;
2993
2994 free_pages:
2995 list_for_each_entry_safe(page, tmp, &pages, lru) {
2996 list_del_init(&page->lru);
2997 __free_page(page);
2998 }
2999 return -ENOMEM;
3000}
3001
3002static int trace_free_page(void)
3003{
3004 struct trace_array_cpu *data;
3005 struct page *page;
3006 struct list_head *p;
3007 int i;
3008 int ret = 0;
3009
3010 /* free one page from each buffer */
3011 for_each_tracing_cpu(i) {
3012 data = global_trace.data[i];
3013 p = data->trace_pages.next;
3014 if (p == &data->trace_pages) {
3015 /* should never happen */
3016 WARN_ON(1);
3017 tracing_disabled = 1;
3018 ret = -1;
3019 break;
3020 }
3021 page = list_entry(p, struct page, lru);
3022 ClearPageLRU(page);
3023 list_del(&page->lru);
3024		tracing_pages_allocated--;
3025		tracing_pages_allocated--; /* this page plus the max_tr page freed below */
3026 __free_page(page);
3027
3028 tracing_reset(data);
3029
3030#ifdef CONFIG_TRACER_MAX_TRACE
3031 data = max_tr.data[i];
3032 p = data->trace_pages.next;
3033 if (p == &data->trace_pages) {
3034 /* should never happen */
3035 WARN_ON(1);
3036 tracing_disabled = 1;
3037 ret = -1;
3038 break;
3039 }
3040 page = list_entry(p, struct page, lru);
3041 ClearPageLRU(page);
3042 list_del(&page->lru);
3043 __free_page(page);
3044
3045 tracing_reset(data);
3046#endif
3047 }
3048 global_trace.entries -= ENTRIES_PER_PAGE;
3049
3050 return ret;
3051}
3052
3053__init static int tracer_alloc_buffers(void)
3054{
3055 struct trace_array_cpu *data;
3056 void *array;
3057 struct page *page;
3058 int pages = 0;
3059 int ret = -ENOMEM;
3060 int i;
3061
3062	/* TODO: make the number of buffers hot-pluggable with CPUs */
3063 tracing_nr_buffers = num_possible_cpus();
3064 tracing_buffer_mask = cpu_possible_map;
3065
3066 /* Allocate the first page for all buffers */
3067 for_each_tracing_cpu(i) {
3068 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3069 max_tr.data[i] = &per_cpu(max_data, i);
3070
3071 array = (void *)__get_free_page(GFP_KERNEL);
3072 if (array == NULL) {
3073 printk(KERN_ERR "tracer: failed to allocate page"
3074			       " for trace buffer!\n");
3075 goto free_buffers;
3076 }
3077
3078 /* set the array to the list */
3079 INIT_LIST_HEAD(&data->trace_pages);
3080 page = virt_to_page(array);
3081 list_add(&page->lru, &data->trace_pages);
3082 /* use the LRU flag to differentiate the two buffers */
3083 ClearPageLRU(page);
3084
3085 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3086 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3087
3088/* Only allocate if we are actually using the max trace */
3089#ifdef CONFIG_TRACER_MAX_TRACE
3090 array = (void *)__get_free_page(GFP_KERNEL);
3091 if (array == NULL) {
3092 printk(KERN_ERR "tracer: failed to allocate page"
3093			       " for trace buffer!\n");
3094 goto free_buffers;
3095 }
3096
3097 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
3098 page = virt_to_page(array);
3099 list_add(&page->lru, &max_tr.data[i]->trace_pages);
3100 SetPageLRU(page);
3101#endif
3102 }
3103
3104 /*
3105 * Since we allocate by orders of pages, we may be able to
3106 * round up a bit.
3107 */
3108 global_trace.entries = ENTRIES_PER_PAGE;
3109 pages++;
3110
3111 while (global_trace.entries < trace_nr_entries) {
3112 if (trace_alloc_page())
3113 break;
3114 pages++;
3115 }
3116 max_tr.entries = global_trace.entries;
3117
3118 pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
3119 pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
3120 pr_info(" actual entries %ld\n", global_trace.entries);
3121
3122 tracer_init_debugfs();
3123
3124 trace_init_cmdlines();
3125
3126 register_tracer(&no_tracer);
3127 current_trace = &no_tracer;
3128
3129 /* All seems OK, enable tracing */
3130 global_trace.ctrl = tracer_enabled;
3131 tracing_disabled = 0;
3132
3133 return 0;
3134
3135 free_buffers:
3136 for (i-- ; i >= 0; i--) {
3137 struct page *page, *tmp;
3138 struct trace_array_cpu *data = global_trace.data[i];
3139
3140 if (data) {
3141 list_for_each_entry_safe(page, tmp,
3142 &data->trace_pages, lru) {
3143 list_del_init(&page->lru);
3144 __free_page(page);
3145 }
3146 }
3147
3148#ifdef CONFIG_TRACER_MAX_TRACE
3149 data = max_tr.data[i];
3150 if (data) {
3151 list_for_each_entry_safe(page, tmp,
3152 &data->trace_pages, lru) {
3153 list_del_init(&page->lru);
3154 __free_page(page);
3155 }
3156 }
3157#endif
3158 }
3159 return ret;
3160}
3161fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..f69f86788c2b
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,339 @@
1#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H
3
4#include <linux/fs.h>
5#include <asm/atomic.h>
6#include <linux/sched.h>
7#include <linux/clocksource.h>
8#include <linux/mmiotrace.h>
9
10enum trace_type {
11 __TRACE_FIRST_TYPE = 0,
12
13 TRACE_FN,
14 TRACE_CTX,
15 TRACE_WAKE,
16 TRACE_STACK,
17 TRACE_SPECIAL,
18 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP,
20
21 __TRACE_LAST_TYPE
22};
23
24/*
25 * Function trace entry - function address and parent function address:
26 */
27struct ftrace_entry {
28 unsigned long ip;
29 unsigned long parent_ip;
30};
31
32/*
33 * Context switch trace entry - which task (and prio) we switched from/to:
34 */
35struct ctx_switch_entry {
36 unsigned int prev_pid;
37 unsigned char prev_prio;
38 unsigned char prev_state;
39 unsigned int next_pid;
40 unsigned char next_prio;
41 unsigned char next_state;
42};
43
44/*
45 * Special (free-form) trace entry:
46 */
47struct special_entry {
48 unsigned long arg1;
49 unsigned long arg2;
50 unsigned long arg3;
51};
52
53/*
54 * Stack-trace entry:
55 */
56
57#define FTRACE_STACK_ENTRIES 8
58
59struct stack_entry {
60 unsigned long caller[FTRACE_STACK_ENTRIES];
61};
62
63/*
64 * The trace entry - the most basic unit of tracing. This is what
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */
69struct trace_entry {
70 char type;
71 char cpu;
72 char flags;
73 char preempt_count;
74 int pid;
75 cycle_t t;
76 union {
77 struct ftrace_entry fn;
78 struct ctx_switch_entry ctx;
79 struct special_entry special;
80 struct stack_entry stack;
81 struct mmiotrace_rw mmiorw;
82 struct mmiotrace_map mmiomap;
83 };
84};
85
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry)
87
88/*
89 * The CPU trace array - it consists of thousands of trace entries
90 * plus some other descriptor data (for example, which task started
91 * the trace):
92 */
93struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98
99 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx;
105 unsigned long overrun;
106 unsigned long saved_latency;
107 unsigned long critical_start;
108 unsigned long critical_end;
109 unsigned long critical_sequence;
110 unsigned long nice;
111 unsigned long policy;
112 unsigned long rt_priority;
113 cycle_t preempt_timestamp;
114 pid_t pid;
115 uid_t uid;
116 char comm[TASK_COMM_LEN];
117};
118
119struct trace_iterator;
120
121/*
122 * The trace array - an array of per-CPU trace arrays. This is the
123 * highest level data structure that individual tracers deal with.
124 * It carries the tracer's on/off state as well:
125 */
126struct trace_array {
127 unsigned long entries;
128 long ctrl;
129 int cpu;
130 cycle_t time_start;
131 struct task_struct *waiter;
132 struct trace_array_cpu *data[NR_CPUS];
133};
134
135/*
136 * A specific tracer, represented by methods that operate on a trace array:
137 */
138struct tracer {
139 const char *name;
140 void (*init)(struct trace_array *tr);
141 void (*reset)(struct trace_array *tr);
142 void (*open)(struct trace_iterator *iter);
143 void (*pipe_open)(struct trace_iterator *iter);
144 void (*close)(struct trace_iterator *iter);
145 void (*start)(struct trace_iterator *iter);
146 void (*stop)(struct trace_iterator *iter);
147 ssize_t (*read)(struct trace_iterator *iter,
148 struct file *filp, char __user *ubuf,
149 size_t cnt, loff_t *ppos);
150 void (*ctrl_update)(struct trace_array *tr);
151#ifdef CONFIG_FTRACE_STARTUP_TEST
152 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr);
154#endif
155 int (*print_line)(struct trace_iterator *iter);
156 struct tracer *next;
157 int print_max;
158};
159
160struct trace_seq {
161 unsigned char buffer[PAGE_SIZE];
162 unsigned int len;
163 unsigned int readpos;
164};
165
166/*
167 * Trace iterator - used by the printout routines that present trace
168 * results to users; these routines might sleep:
169 */
170struct trace_iterator {
171 struct trace_array *tr;
172 struct tracer *trace;
173 void *private;
174 long last_overrun[NR_CPUS];
175 long overrun[NR_CPUS];
176
177 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq;
179 struct trace_entry *ent;
180 int cpu;
181
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184
185 unsigned long iter_flags;
186 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx;
191};
192
193void tracing_reset(struct trace_array_cpu *data);
194int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void);
196void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
197
198void ftrace(struct trace_array *tr,
199 struct trace_array_cpu *data,
200 unsigned long ip,
201 unsigned long parent_ip,
202 unsigned long flags);
203void tracing_sched_switch_trace(struct trace_array *tr,
204 struct trace_array_cpu *data,
205 struct task_struct *prev,
206 struct task_struct *next,
207 unsigned long flags);
208void tracing_record_cmdline(struct task_struct *tsk);
209
210void tracing_sched_wakeup_trace(struct trace_array *tr,
211 struct trace_array_cpu *data,
212 struct task_struct *wakee,
213 struct task_struct *cur,
214 unsigned long flags);
215void trace_special(struct trace_array *tr,
216 struct trace_array_cpu *data,
217 unsigned long arg1,
218 unsigned long arg2,
219 unsigned long arg3);
220void trace_function(struct trace_array *tr,
221 struct trace_array_cpu *data,
222 unsigned long ip,
223 unsigned long parent_ip,
224 unsigned long flags);
225
226void tracing_start_cmdline_record(void);
227void tracing_stop_cmdline_record(void);
228int register_tracer(struct tracer *type);
229void unregister_tracer(struct tracer *type);
230
231extern unsigned long nsecs_to_usecs(unsigned long nsecs);
232
233extern unsigned long tracing_max_latency;
234extern unsigned long tracing_thresh;
235
236void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
237void update_max_tr_single(struct trace_array *tr,
238 struct task_struct *tsk, int cpu);
239
240extern cycle_t ftrace_now(int cpu);
241
242#ifdef CONFIG_FTRACE
243void tracing_start_function_trace(void);
244void tracing_stop_function_trace(void);
245#else
246# define tracing_start_function_trace() do { } while (0)
247# define tracing_stop_function_trace() do { } while (0)
248#endif
249
250#ifdef CONFIG_CONTEXT_SWITCH_TRACER
251typedef void
252(*tracer_switch_func_t)(void *private,
253 void *__rq,
254 struct task_struct *prev,
255 struct task_struct *next);
256
257struct tracer_switch_ops {
258 tracer_switch_func_t func;
259 void *private;
260 struct tracer_switch_ops *next;
261};
262
263#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
264
265#ifdef CONFIG_DYNAMIC_FTRACE
266extern unsigned long ftrace_update_tot_cnt;
267#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
268extern int DYN_FTRACE_TEST_NAME(void);
269#endif
270
271#ifdef CONFIG_MMIOTRACE
272extern void __trace_mmiotrace_rw(struct trace_array *tr,
273 struct trace_array_cpu *data,
274 struct mmiotrace_rw *rw);
275extern void __trace_mmiotrace_map(struct trace_array *tr,
276 struct trace_array_cpu *data,
277 struct mmiotrace_map *map);
278#endif
279
280#ifdef CONFIG_FTRACE_STARTUP_TEST
281#ifdef CONFIG_FTRACE
282extern int trace_selftest_startup_function(struct tracer *trace,
283 struct trace_array *tr);
284#endif
285#ifdef CONFIG_IRQSOFF_TRACER
286extern int trace_selftest_startup_irqsoff(struct tracer *trace,
287 struct trace_array *tr);
288#endif
289#ifdef CONFIG_PREEMPT_TRACER
290extern int trace_selftest_startup_preemptoff(struct tracer *trace,
291 struct trace_array *tr);
292#endif
293#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
294extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
295 struct trace_array *tr);
296#endif
297#ifdef CONFIG_SCHED_TRACER
298extern int trace_selftest_startup_wakeup(struct tracer *trace,
299 struct trace_array *tr);
300#endif
301#ifdef CONFIG_CONTEXT_SWITCH_TRACER
302extern int trace_selftest_startup_sched_switch(struct tracer *trace,
303 struct trace_array *tr);
304#endif
305#ifdef CONFIG_SYSPROF_TRACER
306extern int trace_selftest_startup_sysprof(struct tracer *trace,
307 struct trace_array *tr);
308#endif
309#endif /* CONFIG_FTRACE_STARTUP_TEST */
310
311extern void *head_page(struct trace_array_cpu *data);
312extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
313extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
314 size_t cnt);
315extern long ns2usecs(cycle_t nsec);
316
317extern unsigned long trace_flags;
318
319/*
320 * trace_iterator_flags is an enumeration that defines bit
321 * positions into trace_flags that control the output.
322 *
323 * NOTE: These bits must match the trace_options array in
324 * trace.c.
325 */
326enum trace_iterator_flags {
327 TRACE_ITER_PRINT_PARENT = 0x01,
328 TRACE_ITER_SYM_OFFSET = 0x02,
329 TRACE_ITER_SYM_ADDR = 0x04,
330 TRACE_ITER_VERBOSE = 0x08,
331 TRACE_ITER_RAW = 0x10,
332 TRACE_ITER_HEX = 0x20,
333 TRACE_ITER_BIN = 0x40,
334 TRACE_ITER_BLOCK = 0x80,
335 TRACE_ITER_STACKTRACE = 0x100,
336 TRACE_ITER_SCHED_TREE = 0x200,
337};
338
339#endif /* _LINUX_KERNEL_TRACE_H */
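
A tracer plugin is just a struct tracer filled with the callbacks declared above and handed to register_tracer(); trace_functions.c, which follows, is the smallest in-tree example. A stripped-down skeleton of the same pattern, with hypothetical "example_" names (everything else comes from trace.h), looks like this:

#include <linux/init.h>
#include "trace.h"

static void example_trace_start(struct trace_array *tr)
{
	/* reset buffers and hook whatever events this tracer needs */
}

static void example_trace_stop(struct trace_array *tr)
{
	/* unhook events */
}

static void example_trace_init(struct trace_array *tr)
{
	if (tr->ctrl)			/* user currently has tracing on */
		example_trace_start(tr);
}

static void example_trace_reset(struct trace_array *tr)
{
	if (tr->ctrl)
		example_trace_stop(tr);
}

static void example_trace_ctrl_update(struct trace_array *tr)
{
	if (tr->ctrl)
		example_trace_start(tr);
	else
		example_trace_stop(tr);
}

static struct tracer example_tracer __read_mostly =
{
	.name		= "example",
	.init		= example_trace_init,
	.reset		= example_trace_reset,
	.ctrl_update	= example_trace_ctrl_update,
};

static __init int init_example_tracer(void)
{
	return register_tracer(&example_tracer);
}
device_initcall(init_example_tracer);
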
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..312144897970
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,81 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/debugfs.h>
13#include <linux/uaccess.h>
14#include <linux/ftrace.h>
15#include <linux/fs.h>
16
17#include "trace.h"
18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]);
27}
28
29static void start_function_trace(struct trace_array *tr)
30{
31 tr->cpu = get_cpu();
32 function_reset(tr);
33 put_cpu();
34
35 tracing_start_cmdline_record();
36 tracing_start_function_trace();
37}
38
39static void stop_function_trace(struct trace_array *tr)
40{
41 tracing_stop_function_trace();
42 tracing_stop_cmdline_record();
43}
44
45static void function_trace_init(struct trace_array *tr)
46{
47 if (tr->ctrl)
48 start_function_trace(tr);
49}
50
51static void function_trace_reset(struct trace_array *tr)
52{
53 if (tr->ctrl)
54 stop_function_trace(tr);
55}
56
57static void function_trace_ctrl_update(struct trace_array *tr)
58{
59 if (tr->ctrl)
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63}
64
65static struct tracer function_trace __read_mostly =
66{
67 .name = "ftrace",
68 .init = function_trace_init,
69 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update,
71#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function,
73#endif
74};
75
76static __init int init_function_trace(void)
77{
78 return register_tracer(&function_trace);
79}
80
81device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..421d6fe3650e
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,486 @@
1/*
2 * trace irqs off critical timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * From code in the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/ftrace.h>
17#include <linux/fs.h>
18
19#include "trace.h"
20
21static struct trace_array *irqsoff_trace __read_mostly;
22static int tracer_enabled __read_mostly;
23
24static DEFINE_PER_CPU(int, tracing_cpu);
25
26static DEFINE_SPINLOCK(max_trace_lock);
27
28enum {
29 TRACER_IRQS_OFF = (1 << 1),
30 TRACER_PREEMPT_OFF = (1 << 2),
31};
32
33static int trace_type __read_mostly;
34
35#ifdef CONFIG_PREEMPT_TRACER
36static inline int
37preempt_trace(void)
38{
39 return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
40}
41#else
42# define preempt_trace() (0)
43#endif
44
45#ifdef CONFIG_IRQSOFF_TRACER
46static inline int
47irq_trace(void)
48{
49 return ((trace_type & TRACER_IRQS_OFF) &&
50 irqs_disabled());
51}
52#else
53# define irq_trace() (0)
54#endif
55
56/*
57 * Sequence count - we record it when starting a measurement and
58 * skip the latency if the sequence has changed - some other section
59 * did a maximum and could disturb our measurement with serial console
60 * printouts, etc. Truly coinciding maximum latencies should be rare
61 * and what happens together happens separately as well, so this doesn't
62 * decrease the validity of the maximum found:
63 */
64static __cacheline_aligned_in_smp unsigned long max_sequence;
65
66#ifdef CONFIG_FTRACE
67/*
68 * irqsoff uses its own tracer function to keep the overhead down:
69 */
70static void
71irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
72{
73 struct trace_array *tr = irqsoff_trace;
74 struct trace_array_cpu *data;
75 unsigned long flags;
76 long disabled;
77 int cpu;
78
79 /*
80 * Does not matter if we preempt. We test the flags
81 * afterward, to see if irqs are disabled or not.
82 * If we preempt and get a false positive, the flags
83 * test will fail.
84 */
85 cpu = raw_smp_processor_id();
86 if (likely(!per_cpu(tracing_cpu, cpu)))
87 return;
88
89 local_save_flags(flags);
90 /* slight chance to get a false positive on tracing_cpu */
91 if (!irqs_disabled_flags(flags))
92 return;
93
94 data = tr->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96
97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags);
99
100 atomic_dec(&data->disabled);
101}
102
103static struct ftrace_ops trace_ops __read_mostly =
104{
105 .func = irqsoff_tracer_call,
106};
107#endif /* CONFIG_FTRACE */
108
109/*
110 * Should this new latency be reported/recorded?
111 */
112static int report_latency(cycle_t delta)
113{
114 if (tracing_thresh) {
115 if (delta < tracing_thresh)
116 return 0;
117 } else {
118 if (delta <= tracing_max_latency)
119 return 0;
120 }
121 return 1;
122}
123
124static void
125check_critical_timing(struct trace_array *tr,
126 struct trace_array_cpu *data,
127 unsigned long parent_ip,
128 int cpu)
129{
130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta;
132 unsigned long flags;
133
134 /*
135 * usecs conversion is slow so we try to delay the conversion
136 * as long as possible:
137 */
138 T0 = data->preempt_timestamp;
139 T1 = ftrace_now(cpu);
140 delta = T1-T0;
141
142 local_save_flags(flags);
143
144 if (!report_latency(delta))
145 goto out;
146
147 spin_lock_irqsave(&max_trace_lock, flags);
148
149 /* check if we are still the max latency */
150 if (!report_latency(delta))
151 goto out_unlock;
152
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
154
155 latency = nsecs_to_usecs(delta);
156
157 if (data->critical_sequence != max_sequence)
158 goto out_unlock;
159
160 tracing_max_latency = delta;
161 t0 = nsecs_to_usecs(T0);
162 t1 = nsecs_to_usecs(T1);
163
164 data->critical_end = parent_ip;
165
166 update_max_tr_single(tr, current, cpu);
167
168 max_sequence++;
169
170out_unlock:
171 spin_unlock_irqrestore(&max_trace_lock, flags);
172
173out:
174 data->critical_sequence = max_sequence;
175 data->preempt_timestamp = ftrace_now(cpu);
176 tracing_reset(data);
177 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
178}
179
180static inline void
181start_critical_timing(unsigned long ip, unsigned long parent_ip)
182{
183 int cpu;
184 struct trace_array *tr = irqsoff_trace;
185 struct trace_array_cpu *data;
186 unsigned long flags;
187
188 if (likely(!tracer_enabled))
189 return;
190
191 cpu = raw_smp_processor_id();
192
193 if (per_cpu(tracing_cpu, cpu))
194 return;
195
196 data = tr->data[cpu];
197
198 if (unlikely(!data) || atomic_read(&data->disabled))
199 return;
200
201 atomic_inc(&data->disabled);
202
203 data->critical_sequence = max_sequence;
204 data->preempt_timestamp = ftrace_now(cpu);
205 data->critical_start = parent_ip ? : ip;
206 tracing_reset(data);
207
208 local_save_flags(flags);
209
210 trace_function(tr, data, ip, parent_ip, flags);
211
212 per_cpu(tracing_cpu, cpu) = 1;
213
214 atomic_dec(&data->disabled);
215}
216
217static inline void
218stop_critical_timing(unsigned long ip, unsigned long parent_ip)
219{
220 int cpu;
221 struct trace_array *tr = irqsoff_trace;
222 struct trace_array_cpu *data;
223 unsigned long flags;
224
225 cpu = raw_smp_processor_id();
226 /* Always clear the tracing cpu on stopping the trace */
227 if (unlikely(per_cpu(tracing_cpu, cpu)))
228 per_cpu(tracing_cpu, cpu) = 0;
229 else
230 return;
231
232 if (!tracer_enabled)
233 return;
234
235 data = tr->data[cpu];
236
237 if (unlikely(!data) || unlikely(!head_page(data)) ||
238 !data->critical_start || atomic_read(&data->disabled))
239 return;
240
241 atomic_inc(&data->disabled);
242
243 local_save_flags(flags);
244 trace_function(tr, data, ip, parent_ip, flags);
245 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
246 data->critical_start = 0;
247 atomic_dec(&data->disabled);
248}
249
250/* start and stop critical timings, used around intentional stoppage (in idle) */
251void start_critical_timings(void)
252{
253 if (preempt_trace() || irq_trace())
254 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
255}
256
257void stop_critical_timings(void)
258{
259 if (preempt_trace() || irq_trace())
260 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
261}
262
263#ifdef CONFIG_IRQSOFF_TRACER
264#ifdef CONFIG_PROVE_LOCKING
265void time_hardirqs_on(unsigned long a0, unsigned long a1)
266{
267 if (!preempt_trace() && irq_trace())
268 stop_critical_timing(a0, a1);
269}
270
271void time_hardirqs_off(unsigned long a0, unsigned long a1)
272{
273 if (!preempt_trace() && irq_trace())
274 start_critical_timing(a0, a1);
275}
276
277#else /* !CONFIG_PROVE_LOCKING */
278
279/*
280 * Stubs:
281 */
282
283void early_boot_irqs_off(void)
284{
285}
286
287void early_boot_irqs_on(void)
288{
289}
290
291void trace_softirqs_on(unsigned long ip)
292{
293}
294
295void trace_softirqs_off(unsigned long ip)
296{
297}
298
299inline void print_irqtrace_events(struct task_struct *curr)
300{
301}
302
303/*
304 * We are only interested in hardirq on/off events:
305 */
306void trace_hardirqs_on(void)
307{
308 if (!preempt_trace() && irq_trace())
309 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
310}
311EXPORT_SYMBOL(trace_hardirqs_on);
312
313void trace_hardirqs_off(void)
314{
315 if (!preempt_trace() && irq_trace())
316 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
317}
318EXPORT_SYMBOL(trace_hardirqs_off);
319
320void trace_hardirqs_on_caller(unsigned long caller_addr)
321{
322 if (!preempt_trace() && irq_trace())
323 stop_critical_timing(CALLER_ADDR0, caller_addr);
324}
325EXPORT_SYMBOL(trace_hardirqs_on_caller);
326
327void trace_hardirqs_off_caller(unsigned long caller_addr)
328{
329 if (!preempt_trace() && irq_trace())
330 start_critical_timing(CALLER_ADDR0, caller_addr);
331}
332EXPORT_SYMBOL(trace_hardirqs_off_caller);
333
334#endif /* CONFIG_PROVE_LOCKING */
335#endif /* CONFIG_IRQSOFF_TRACER */
336
337#ifdef CONFIG_PREEMPT_TRACER
338void trace_preempt_on(unsigned long a0, unsigned long a1)
339{
340 stop_critical_timing(a0, a1);
341}
342
343void trace_preempt_off(unsigned long a0, unsigned long a1)
344{
345 start_critical_timing(a0, a1);
346}
347#endif /* CONFIG_PREEMPT_TRACER */
348
349static void start_irqsoff_tracer(struct trace_array *tr)
350{
351 register_ftrace_function(&trace_ops);
352 tracer_enabled = 1;
353}
354
355static void stop_irqsoff_tracer(struct trace_array *tr)
356{
357 tracer_enabled = 0;
358 unregister_ftrace_function(&trace_ops);
359}
360
361static void __irqsoff_tracer_init(struct trace_array *tr)
362{
363 irqsoff_trace = tr;
364 /* make sure that the tracer is visible */
365 smp_wmb();
366
367 if (tr->ctrl)
368 start_irqsoff_tracer(tr);
369}
370
371static void irqsoff_tracer_reset(struct trace_array *tr)
372{
373 if (tr->ctrl)
374 stop_irqsoff_tracer(tr);
375}
376
377static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
378{
379 if (tr->ctrl)
380 start_irqsoff_tracer(tr);
381 else
382 stop_irqsoff_tracer(tr);
383}
384
385static void irqsoff_tracer_open(struct trace_iterator *iter)
386{
387 /* stop the trace while dumping */
388 if (iter->tr->ctrl)
389 stop_irqsoff_tracer(iter->tr);
390}
391
392static void irqsoff_tracer_close(struct trace_iterator *iter)
393{
394 if (iter->tr->ctrl)
395 start_irqsoff_tracer(iter->tr);
396}
397
398#ifdef CONFIG_IRQSOFF_TRACER
399static void irqsoff_tracer_init(struct trace_array *tr)
400{
401 trace_type = TRACER_IRQS_OFF;
402
403 __irqsoff_tracer_init(tr);
404}
405static struct tracer irqsoff_tracer __read_mostly =
406{
407 .name = "irqsoff",
408 .init = irqsoff_tracer_init,
409 .reset = irqsoff_tracer_reset,
410 .open = irqsoff_tracer_open,
411 .close = irqsoff_tracer_close,
412 .ctrl_update = irqsoff_tracer_ctrl_update,
413 .print_max = 1,
414#ifdef CONFIG_FTRACE_SELFTEST
415 .selftest = trace_selftest_startup_irqsoff,
416#endif
417};
418# define register_irqsoff(trace) register_tracer(&trace)
419#else
420# define register_irqsoff(trace) do { } while (0)
421#endif
422
423#ifdef CONFIG_PREEMPT_TRACER
424static void preemptoff_tracer_init(struct trace_array *tr)
425{
426 trace_type = TRACER_PREEMPT_OFF;
427
428 __irqsoff_tracer_init(tr);
429}
430
431static struct tracer preemptoff_tracer __read_mostly =
432{
433 .name = "preemptoff",
434 .init = preemptoff_tracer_init,
435 .reset = irqsoff_tracer_reset,
436 .open = irqsoff_tracer_open,
437 .close = irqsoff_tracer_close,
438 .ctrl_update = irqsoff_tracer_ctrl_update,
439 .print_max = 1,
440#ifdef CONFIG_FTRACE_SELFTEST
441 .selftest = trace_selftest_startup_preemptoff,
442#endif
443};
444# define register_preemptoff(trace) register_tracer(&trace)
445#else
446# define register_preemptoff(trace) do { } while (0)
447#endif
448
449#if defined(CONFIG_IRQSOFF_TRACER) && \
450 defined(CONFIG_PREEMPT_TRACER)
451
452static void preemptirqsoff_tracer_init(struct trace_array *tr)
453{
454 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
455
456 __irqsoff_tracer_init(tr);
457}
458
459static struct tracer preemptirqsoff_tracer __read_mostly =
460{
461 .name = "preemptirqsoff",
462 .init = preemptirqsoff_tracer_init,
463 .reset = irqsoff_tracer_reset,
464 .open = irqsoff_tracer_open,
465 .close = irqsoff_tracer_close,
466 .ctrl_update = irqsoff_tracer_ctrl_update,
467 .print_max = 1,
468#ifdef CONFIG_FTRACE_SELFTEST
469 .selftest = trace_selftest_startup_preemptirqsoff,
470#endif
471};
472
473# define register_preemptirqsoff(trace) register_tracer(&trace)
474#else
475# define register_preemptirqsoff(trace) do { } while (0)
476#endif
477
478__init static int init_irqsoff_tracer(void)
479{
480 register_irqsoff(irqsoff_tracer);
481 register_preemptoff(preemptoff_tracer);
482 register_preemptirqsoff(preemptirqsoff_tracer);
483
484 return 0;
485}
486device_initcall(init_irqsoff_tracer);
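
Both latency tracers in this patch (this one and the wakeup tracer later on) use the same report_latency() rule: with tracing_thresh set, every critical section longer than the threshold is recorded; with it unset, only a section longer than the current tracing_max_latency is. A compact, runnable restatement of that decision with hypothetical names:

#include <stdio.h>

/*
 * Sketch of the report_latency() rule above; 'thresh' models
 * tracing_thresh and 'cur_max' models tracing_max_latency, in the same
 * time unit as 'delta'.
 */
static int report_latency_sketch(unsigned long long delta,
				 unsigned long long thresh,
				 unsigned long long cur_max)
{
	if (thresh)
		return delta >= thresh;		/* threshold mode   */
	return delta > cur_max;			/* new-maximum mode */
}

int main(void)
{
	printf("%d\n", report_latency_sketch(150, 100, 0)); /* 1: over the threshold */
	printf("%d\n", report_latency_sketch(150, 0, 200)); /* 0: not a new maximum  */
	printf("%d\n", report_latency_sketch(250, 0, 200)); /* 1: new maximum        */
	return 0;
}
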
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
1/*
2 * Memory mapped I/O tracing
3 *
4 * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
5 */
6
7#define DEBUG 1
8
9#include <linux/kernel.h>
10#include <linux/mmiotrace.h>
11#include <linux/pci.h>
12
13#include "trace.h"
14
15struct header_iter {
16 struct pci_dev *dev;
17};
18
19static struct trace_array *mmio_trace_array;
20static bool overrun_detected;
21
22static void mmio_reset_data(struct trace_array *tr)
23{
24 int cpu;
25
26 overrun_detected = false;
27 tr->time_start = ftrace_now(tr->cpu);
28
29 for_each_online_cpu(cpu)
30 tracing_reset(tr->data[cpu]);
31}
32
33static void mmio_trace_init(struct trace_array *tr)
34{
35 pr_debug("in %s\n", __func__);
36 mmio_trace_array = tr;
37 if (tr->ctrl) {
38 mmio_reset_data(tr);
39 enable_mmiotrace();
40 }
41}
42
43static void mmio_trace_reset(struct trace_array *tr)
44{
45 pr_debug("in %s\n", __func__);
46 if (tr->ctrl)
47 disable_mmiotrace();
48 mmio_reset_data(tr);
49 mmio_trace_array = NULL;
50}
51
52static void mmio_trace_ctrl_update(struct trace_array *tr)
53{
54 pr_debug("in %s\n", __func__);
55 if (tr->ctrl) {
56 mmio_reset_data(tr);
57 enable_mmiotrace();
58 } else {
59 disable_mmiotrace();
60 }
61}
62
63static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
64{
65 int ret = 0;
66 int i;
67 resource_size_t start, end;
68 const struct pci_driver *drv = pci_dev_driver(dev);
69
70 /* XXX: incomplete checks for trace_seq_printf() return value */
71 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
72 dev->bus->number, dev->devfn,
73 dev->vendor, dev->device, dev->irq);
74 /*
75 * XXX: is pci_resource_to_user() appropriate, since we are
76 * supposed to interpret the __ioremap() phys_addr argument based on
77 * these printed values?
78 */
79 for (i = 0; i < 7; i++) {
80 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
81 ret += trace_seq_printf(s, " %llx",
82 (unsigned long long)(start |
83 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
84 }
85 for (i = 0; i < 7; i++) {
86 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
87 ret += trace_seq_printf(s, " %llx",
88 dev->resource[i].start < dev->resource[i].end ?
89 (unsigned long long)(end - start) + 1 : 0);
90 }
91 if (drv)
92 ret += trace_seq_printf(s, " %s\n", drv->name);
93 else
94 ret += trace_seq_printf(s, " \n");
95 return ret;
96}
97
98static void destroy_header_iter(struct header_iter *hiter)
99{
100 if (!hiter)
101 return;
102 pci_dev_put(hiter->dev);
103 kfree(hiter);
104}
105
106static void mmio_pipe_open(struct trace_iterator *iter)
107{
108 struct header_iter *hiter;
109 struct trace_seq *s = &iter->seq;
110
111 trace_seq_printf(s, "VERSION 20070824\n");
112
113 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
114 if (!hiter)
115 return;
116
117 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
118 iter->private = hiter;
119}
120
121/* XXX: This is not called when the pipe is closed! */
122static void mmio_close(struct trace_iterator *iter)
123{
124 struct header_iter *hiter = iter->private;
125 destroy_header_iter(hiter);
126 iter->private = NULL;
127}
128
129static unsigned long count_overruns(struct trace_iterator *iter)
130{
131 int cpu;
132 unsigned long cnt = 0;
133 for_each_online_cpu(cpu) {
134 cnt += iter->overrun[cpu];
135 iter->overrun[cpu] = 0;
136 }
137 return cnt;
138}
139
140static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
141 char __user *ubuf, size_t cnt, loff_t *ppos)
142{
143 ssize_t ret;
144 struct header_iter *hiter = iter->private;
145 struct trace_seq *s = &iter->seq;
146 unsigned long n;
147
148 n = count_overruns(iter);
149 if (n) {
150 /* XXX: This is later than where events were lost. */
151 trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
152 if (!overrun_detected)
153 pr_warning("mmiotrace has lost events.\n");
154 overrun_detected = true;
155 goto print_out;
156 }
157
158 if (!hiter)
159 return 0;
160
161 mmio_print_pcidev(s, hiter->dev);
162 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
163
164 if (!hiter->dev) {
165 destroy_header_iter(hiter);
166 iter->private = NULL;
167 }
168
169print_out:
170 ret = trace_seq_to_user(s, ubuf, cnt);
171 return (ret == -EBUSY) ? 0 : ret;
172}
173
174static int mmio_print_rw(struct trace_iterator *iter)
175{
176 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw;
178 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t);
180 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t;
182 int ret = 1;
183
184 switch (entry->mmiorw.opcode) {
185 case MMIO_READ:
186 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
188 rw->width, secs, usec_rem, rw->map_id,
189 (unsigned long long)rw->phys,
190 rw->value, rw->pc, 0);
191 break;
192 case MMIO_WRITE:
193 ret = trace_seq_printf(s,
194 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
195 rw->width, secs, usec_rem, rw->map_id,
196 (unsigned long long)rw->phys,
197 rw->value, rw->pc, 0);
198 break;
199 case MMIO_UNKNOWN_OP:
200 ret = trace_seq_printf(s,
201 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
202 secs, usec_rem, rw->map_id,
203 (unsigned long long)rw->phys,
204 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
205 (rw->value >> 0) & 0xff, rw->pc, 0);
206 break;
207 default:
208 ret = trace_seq_printf(s, "rw what?\n");
209 break;
210 }
211 if (ret)
212 return 1;
213 return 0;
214}
215
216static int mmio_print_map(struct trace_iterator *iter)
217{
218 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap;
220 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t);
222 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t;
224 int ret = 1;
225
226 switch (entry->mmiorw.opcode) {
227 case MMIO_PROBE:
228 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
230 secs, usec_rem, m->map_id,
231 (unsigned long long)m->phys, m->virt, m->len,
232 0UL, 0);
233 break;
234 case MMIO_UNPROBE:
235 ret = trace_seq_printf(s,
236 "UNMAP %lu.%06lu %d 0x%lx %d\n",
237 secs, usec_rem, m->map_id, 0UL, 0);
238 break;
239 default:
240 ret = trace_seq_printf(s, "map what?\n");
241 break;
242 }
243 if (ret)
244 return 1;
245 return 0;
246}
247
248/* return 0 to abort printing without consuming current entry in pipe mode */
249static int mmio_print_line(struct trace_iterator *iter)
250{
251 switch (iter->ent->type) {
252 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter);
256 default:
257 return 1; /* ignore unknown entries */
258 }
259}
260
261static struct tracer mmio_tracer __read_mostly =
262{
263 .name = "mmiotrace",
264 .init = mmio_trace_init,
265 .reset = mmio_trace_reset,
266 .pipe_open = mmio_pipe_open,
267 .close = mmio_close,
268 .read = mmio_read,
269 .ctrl_update = mmio_trace_ctrl_update,
270 .print_line = mmio_print_line,
271};
272
273__init static int init_mmio_trace(void)
274{
275 return register_tracer(&mmio_tracer);
276}
277device_initcall(init_mmio_trace);
278
279void mmio_trace_rw(struct mmiotrace_rw *rw)
280{
281 struct trace_array *tr = mmio_trace_array;
282 struct trace_array_cpu *data = tr->data[smp_processor_id()];
283 __trace_mmiotrace_rw(tr, data, rw);
284}
285
286void mmio_trace_mapping(struct mmiotrace_map *map)
287{
288 struct trace_array *tr = mmio_trace_array;
289 struct trace_array_cpu *data;
290
291 preempt_disable();
292 data = tr->data[smp_processor_id()];
293 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable();
295}
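
The timestamp handling in mmio_print_rw()/mmio_print_map() first converts the entry time to microseconds with ns2usecs() and then uses do_div() to split it into whole seconds plus a microsecond remainder for the "%lu.%06lu" output. A small userspace sketch of that split, reusing the example timestamp from the trace.h comment above:

#include <stdio.h>

int main(void)
{
	/* 235.197585 s expressed in microseconds, as ns2usecs() would return it */
	unsigned long long t = 235197585ULL;
	unsigned long usec_rem = (unsigned long)(t % 1000000ULL); /* do_div() remainder */

	t /= 1000000ULL;                                          /* do_div() quotient  */
	printf("%llu.%06lu\n", t, usec_rem);                      /* prints 235.197585  */
	return 0;
}
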
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..cb817a209aa0
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,286 @@
1/*
2 * trace context switch
3 *
4 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h>
11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h>
14
15#include "trace.h"
16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref;
20
21static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev,
23 struct task_struct *next)
24{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data;
28 unsigned long flags;
29 long disabled;
30 int cpu;
31
32 tracing_record_cmdline(prev);
33 tracing_record_cmdline(next);
34
35 if (!tracer_enabled)
36 return;
37
38 local_irq_save(flags);
39 cpu = raw_smp_processor_id();
40 data = tr->data[cpu];
41 disabled = atomic_inc_return(&data->disabled);
42
43 if (likely(disabled == 1))
44 tracing_sched_switch_trace(tr, data, prev, next, flags);
45
46 atomic_dec(&data->disabled);
47 local_irq_restore(flags);
48}
49
50static notrace void
51sched_switch_callback(void *probe_data, void *call_data,
52 const char *format, va_list *args)
53{
54 struct task_struct *prev;
55 struct task_struct *next;
56 struct rq *__rq;
57
58 if (!atomic_read(&sched_ref))
59 return;
60
61 /* skip prev_pid %d next_pid %d prev_state %ld */
62 (void)va_arg(*args, int);
63 (void)va_arg(*args, int);
64 (void)va_arg(*args, long);
65 __rq = va_arg(*args, typeof(__rq));
66 prev = va_arg(*args, typeof(prev));
67 next = va_arg(*args, typeof(next));
68
69 /*
70 * If tracer_switch_func only points to the local
71 * switch func, it still needs the ptr passed to it.
72 */
73 sched_switch_func(probe_data, __rq, prev, next);
74}
75
76static void
77wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
78 task_struct *curr)
79{
80 struct trace_array **ptr = private;
81 struct trace_array *tr = *ptr;
82 struct trace_array_cpu *data;
83 unsigned long flags;
84 long disabled;
85 int cpu;
86
87 if (!tracer_enabled)
88 return;
89
90 tracing_record_cmdline(curr);
91
92 local_irq_save(flags);
93 cpu = raw_smp_processor_id();
94 data = tr->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96
97 if (likely(disabled == 1))
98 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
99
100 atomic_dec(&data->disabled);
101 local_irq_restore(flags);
102}
103
104static notrace void
105wake_up_callback(void *probe_data, void *call_data,
106 const char *format, va_list *args)
107{
108 struct task_struct *curr;
109 struct task_struct *task;
110 struct rq *__rq;
111
112 if (likely(!tracer_enabled))
113 return;
114
115 /* Skip pid %d state %ld */
116 (void)va_arg(*args, int);
117 (void)va_arg(*args, long);
118 /* now get the meat: "rq %p task %p rq->curr %p" */
119 __rq = va_arg(*args, typeof(__rq));
120 task = va_arg(*args, typeof(task));
121 curr = va_arg(*args, typeof(curr));
122
123 tracing_record_cmdline(task);
124 tracing_record_cmdline(curr);
125
126 wakeup_func(probe_data, __rq, task, curr);
127}
128
129static void sched_switch_reset(struct trace_array *tr)
130{
131 int cpu;
132
133 tr->time_start = ftrace_now(tr->cpu);
134
135 for_each_online_cpu(cpu)
136 tracing_reset(tr->data[cpu]);
137}
138
139static int tracing_sched_register(void)
140{
141 int ret;
142
143 ret = marker_probe_register("kernel_sched_wakeup",
144 "pid %d state %ld ## rq %p task %p rq->curr %p",
145 wake_up_callback,
146 &ctx_trace);
147 if (ret) {
148 pr_info("wakeup trace: Couldn't add marker"
149 " probe to kernel_sched_wakeup\n");
150 return ret;
151 }
152
153 ret = marker_probe_register("kernel_sched_wakeup_new",
154 "pid %d state %ld ## rq %p task %p rq->curr %p",
155 wake_up_callback,
156 &ctx_trace);
157 if (ret) {
158 pr_info("wakeup trace: Couldn't add marker"
159 " probe to kernel_sched_wakeup_new\n");
160 goto fail_deprobe;
161 }
162
163 ret = marker_probe_register("kernel_sched_schedule",
164 "prev_pid %d next_pid %d prev_state %ld "
165 "## rq %p prev %p next %p",
166 sched_switch_callback,
167 &ctx_trace);
168 if (ret) {
169 pr_info("sched trace: Couldn't add marker"
170 " probe to kernel_sched_schedule\n");
171 goto fail_deprobe_wake_new;
172 }
173
174 return ret;
175fail_deprobe_wake_new:
176 marker_probe_unregister("kernel_sched_wakeup_new",
177 wake_up_callback,
178 &ctx_trace);
179fail_deprobe:
180 marker_probe_unregister("kernel_sched_wakeup",
181 wake_up_callback,
182 &ctx_trace);
183 return ret;
184}
185
186static void tracing_sched_unregister(void)
187{
188 marker_probe_unregister("kernel_sched_schedule",
189 sched_switch_callback,
190 &ctx_trace);
191 marker_probe_unregister("kernel_sched_wakeup_new",
192 wake_up_callback,
193 &ctx_trace);
194 marker_probe_unregister("kernel_sched_wakeup",
195 wake_up_callback,
196 &ctx_trace);
197}
198
199static void tracing_start_sched_switch(void)
200{
201 long ref;
202
203 ref = atomic_inc_return(&sched_ref);
204 if (ref == 1)
205 tracing_sched_register();
206}
207
208static void tracing_stop_sched_switch(void)
209{
210 long ref;
211
212 ref = atomic_dec_and_test(&sched_ref);
213 if (ref)
214 tracing_sched_unregister();
215}
216
217void tracing_start_cmdline_record(void)
218{
219 tracing_start_sched_switch();
220}
221
222void tracing_stop_cmdline_record(void)
223{
224 tracing_stop_sched_switch();
225}
226
227static void start_sched_trace(struct trace_array *tr)
228{
229 sched_switch_reset(tr);
230 tracing_start_cmdline_record();
231 tracer_enabled = 1;
232}
233
234static void stop_sched_trace(struct trace_array *tr)
235{
236 tracer_enabled = 0;
237 tracing_stop_cmdline_record();
238}
239
240static void sched_switch_trace_init(struct trace_array *tr)
241{
242 ctx_trace = tr;
243
244 if (tr->ctrl)
245 start_sched_trace(tr);
246}
247
248static void sched_switch_trace_reset(struct trace_array *tr)
249{
250 if (tr->ctrl)
251 stop_sched_trace(tr);
252}
253
254static void sched_switch_trace_ctrl_update(struct trace_array *tr)
255{
256 /* When starting a new trace, reset the buffers */
257 if (tr->ctrl)
258 start_sched_trace(tr);
259 else
260 stop_sched_trace(tr);
261}
262
263static struct tracer sched_switch_trace __read_mostly =
264{
265 .name = "sched_switch",
266 .init = sched_switch_trace_init,
267 .reset = sched_switch_trace_reset,
268 .ctrl_update = sched_switch_trace_ctrl_update,
269#ifdef CONFIG_FTRACE_SELFTEST
270 .selftest = trace_selftest_startup_sched_switch,
271#endif
272};
273
274__init static int init_sched_switch_trace(void)
275{
276 int ret = 0;
277
278 if (atomic_read(&sched_ref))
279 ret = tracing_sched_register();
280 if (ret) {
281 pr_info("error registering scheduler trace\n");
282 return ret;
283 }
284 return register_tracer(&sched_switch_trace);
285}
286device_initcall(init_sched_switch_trace);
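
tracing_start_cmdline_record()/tracing_stop_cmdline_record() nest: only the first start registers the three markers (sched_ref reaching 1) and only the last matching stop unregisters them. A userspace model of that reference counting, with hypothetical names:

#include <stdio.h>

static int sched_ref;	/* models the atomic_t sched_ref above */

static void start_record(void)
{
	if (++sched_ref == 1)			/* first user registers the probes */
		printf("register markers\n");
}

static void stop_record(void)
{
	if (--sched_ref == 0)			/* last user unregisters them */
		printf("unregister markers\n");
}

int main(void)
{
	start_record();
	start_record();		/* nested start: no second registration */
	stop_record();
	stop_record();		/* final stop: markers removed          */
	return 0;
}
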
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..3c8d61df4474
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,448 @@
1/*
2 * trace task wakeup timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h>
16#include <linux/uaccess.h>
17#include <linux/ftrace.h>
18#include <linux/marker.h>
19
20#include "trace.h"
21
22static struct trace_array *wakeup_trace;
23static int __read_mostly tracer_enabled;
24
25static struct task_struct *wakeup_task;
26static int wakeup_cpu;
27static unsigned wakeup_prio = -1;
28
29static DEFINE_SPINLOCK(wakeup_lock);
30
31static void __wakeup_reset(struct trace_array *tr);
32
33#ifdef CONFIG_FTRACE
34/*
35 * The wakeup tracer uses its own tracer function to keep the overhead down:
36 */
37static void
38wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
39{
40 struct trace_array *tr = wakeup_trace;
41 struct trace_array_cpu *data;
42 unsigned long flags;
43 long disabled;
44 int resched;
45 int cpu;
46
47 if (likely(!wakeup_task))
48 return;
49
50 resched = need_resched();
51 preempt_disable_notrace();
52
53 cpu = raw_smp_processor_id();
54 data = tr->data[cpu];
55 disabled = atomic_inc_return(&data->disabled);
56 if (unlikely(disabled != 1))
57 goto out;
58
59 spin_lock_irqsave(&wakeup_lock, flags);
60
61 if (unlikely(!wakeup_task))
62 goto unlock;
63
64 /*
65 * The task can't disappear because it needs to
66 * wake up first, and we have the wakeup_lock.
67 */
68 if (task_cpu(wakeup_task) != cpu)
69 goto unlock;
70
71 trace_function(tr, data, ip, parent_ip, flags);
72
73 unlock:
74 spin_unlock_irqrestore(&wakeup_lock, flags);
75
76 out:
77 atomic_dec(&data->disabled);
78
79 /*
80 * To prevent recursion from the scheduler, if the
81 * resched flag was set before we entered, then
82 * don't reschedule.
83 */
84 if (resched)
85 preempt_enable_no_resched_notrace();
86 else
87 preempt_enable_notrace();
88}
89
90static struct ftrace_ops trace_ops __read_mostly =
91{
92 .func = wakeup_tracer_call,
93};
94#endif /* CONFIG_FTRACE */
95
96/*
97 * Should this new latency be reported/recorded?
98 */
99static int report_latency(cycle_t delta)
100{
101 if (tracing_thresh) {
102 if (delta < tracing_thresh)
103 return 0;
104 } else {
105 if (delta <= tracing_max_latency)
106 return 0;
107 }
108 return 1;
109}
110
111static void notrace
112wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
113 struct task_struct *next)
114{
115 unsigned long latency = 0, t0 = 0, t1 = 0;
116 struct trace_array **ptr = private;
117 struct trace_array *tr = *ptr;
118 struct trace_array_cpu *data;
119 cycle_t T0, T1, delta;
120 unsigned long flags;
121 long disabled;
122 int cpu;
123
124 if (unlikely(!tracer_enabled))
125 return;
126
127 /*
128 * When we start a new trace, we set wakeup_task to NULL
129 * and then set tracer_enabled = 1. We want to make sure
130	 * that another CPU does not see tracer_enabled = 1 together
131	 * with a stale wakeup_task, which might actually be the
132	 * same as next.
133 */
134 smp_rmb();
135
136 if (next != wakeup_task)
137 return;
138
139 /* The task we are waiting for is waking up */
140 data = tr->data[wakeup_cpu];
141
142 /* disable local data, not wakeup_cpu data */
143 cpu = raw_smp_processor_id();
144 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
145 if (likely(disabled != 1))
146 goto out;
147
148 spin_lock_irqsave(&wakeup_lock, flags);
149
150 /* We could race with grabbing wakeup_lock */
151 if (unlikely(!tracer_enabled || next != wakeup_task))
152 goto out_unlock;
153
154 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
155
156 /*
157 * usecs conversion is slow so we try to delay the conversion
158 * as long as possible:
159 */
160 T0 = data->preempt_timestamp;
161 T1 = ftrace_now(cpu);
162 delta = T1-T0;
163
164 if (!report_latency(delta))
165 goto out_unlock;
166
167 latency = nsecs_to_usecs(delta);
168
169 tracing_max_latency = delta;
170 t0 = nsecs_to_usecs(T0);
171 t1 = nsecs_to_usecs(T1);
172
173 update_max_tr(tr, wakeup_task, wakeup_cpu);
174
175out_unlock:
176 __wakeup_reset(tr);
177 spin_unlock_irqrestore(&wakeup_lock, flags);
178out:
179 atomic_dec(&tr->data[cpu]->disabled);
180}
181
182static notrace void
183sched_switch_callback(void *probe_data, void *call_data,
184 const char *format, va_list *args)
185{
186 struct task_struct *prev;
187 struct task_struct *next;
188 struct rq *__rq;
189
190 /* skip prev_pid %d next_pid %d prev_state %ld */
191 (void)va_arg(*args, int);
192 (void)va_arg(*args, int);
193 (void)va_arg(*args, long);
194 __rq = va_arg(*args, typeof(__rq));
195 prev = va_arg(*args, typeof(prev));
196 next = va_arg(*args, typeof(next));
197
198 tracing_record_cmdline(prev);
199
200 /*
201 * If tracer_switch_func only points to the local
202 * switch func, it still needs the ptr passed to it.
203 */
204 wakeup_sched_switch(probe_data, __rq, prev, next);
205}
206
207static void __wakeup_reset(struct trace_array *tr)
208{
209 struct trace_array_cpu *data;
210 int cpu;
211
212 assert_spin_locked(&wakeup_lock);
213
214 for_each_possible_cpu(cpu) {
215 data = tr->data[cpu];
216 tracing_reset(data);
217 }
218
219 wakeup_cpu = -1;
220 wakeup_prio = -1;
221
222 if (wakeup_task)
223 put_task_struct(wakeup_task);
224
225 wakeup_task = NULL;
226}
227
228static void wakeup_reset(struct trace_array *tr)
229{
230 unsigned long flags;
231
232 spin_lock_irqsave(&wakeup_lock, flags);
233 __wakeup_reset(tr);
234 spin_unlock_irqrestore(&wakeup_lock, flags);
235}
236
237static void
238wakeup_check_start(struct trace_array *tr, struct task_struct *p,
239 struct task_struct *curr)
240{
241 int cpu = smp_processor_id();
242 unsigned long flags;
243 long disabled;
244
245 if (likely(!rt_task(p)) ||
246 p->prio >= wakeup_prio ||
247 p->prio >= curr->prio)
248 return;
249
250 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
251 if (unlikely(disabled != 1))
252 goto out;
253
254 /* interrupts should be off from try_to_wake_up */
255 spin_lock(&wakeup_lock);
256
257 /* check for races. */
258 if (!tracer_enabled || p->prio >= wakeup_prio)
259 goto out_locked;
260
261 /* reset the trace */
262 __wakeup_reset(tr);
263
264 wakeup_cpu = task_cpu(p);
265 wakeup_prio = p->prio;
266
267 wakeup_task = p;
268 get_task_struct(wakeup_task);
269
270 local_save_flags(flags);
271
272 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
273 trace_function(tr, tr->data[wakeup_cpu],
274 CALLER_ADDR1, CALLER_ADDR2, flags);
275
276out_locked:
277 spin_unlock(&wakeup_lock);
278out:
279 atomic_dec(&tr->data[cpu]->disabled);
280}
281
282static notrace void
283wake_up_callback(void *probe_data, void *call_data,
284 const char *format, va_list *args)
285{
286 struct trace_array **ptr = probe_data;
287 struct trace_array *tr = *ptr;
288 struct task_struct *curr;
289 struct task_struct *task;
290 struct rq *__rq;
291
292 if (likely(!tracer_enabled))
293 return;
294
295 /* Skip pid %d state %ld */
296 (void)va_arg(*args, int);
297 (void)va_arg(*args, long);
298 /* now get the meat: "rq %p task %p rq->curr %p" */
299 __rq = va_arg(*args, typeof(__rq));
300 task = va_arg(*args, typeof(task));
301 curr = va_arg(*args, typeof(curr));
302
303 tracing_record_cmdline(task);
304 tracing_record_cmdline(curr);
305
306 wakeup_check_start(tr, task, curr);
307}
308
309static void start_wakeup_tracer(struct trace_array *tr)
310{
311 int ret;
312
313 ret = marker_probe_register("kernel_sched_wakeup",
314 "pid %d state %ld ## rq %p task %p rq->curr %p",
315 wake_up_callback,
316 &wakeup_trace);
317 if (ret) {
318 pr_info("wakeup trace: Couldn't add marker"
319 " probe to kernel_sched_wakeup\n");
320 return;
321 }
322
323 ret = marker_probe_register("kernel_sched_wakeup_new",
324 "pid %d state %ld ## rq %p task %p rq->curr %p",
325 wake_up_callback,
326 &wakeup_trace);
327 if (ret) {
328 pr_info("wakeup trace: Couldn't add marker"
329 " probe to kernel_sched_wakeup_new\n");
330 goto fail_deprobe;
331 }
332
333 ret = marker_probe_register("kernel_sched_schedule",
334 "prev_pid %d next_pid %d prev_state %ld "
335 "## rq %p prev %p next %p",
336 sched_switch_callback,
337 &wakeup_trace);
338 if (ret) {
339 pr_info("sched trace: Couldn't add marker"
340 " probe to kernel_sched_schedule\n");
341 goto fail_deprobe_wake_new;
342 }
343
344 wakeup_reset(tr);
345
346 /*
347 * Don't let the tracer_enabled = 1 show up before
348 * the wakeup_task is reset. This may be overkill since
349 * wakeup_reset does a spin_unlock after setting the
350 * wakeup_task to NULL, but I want to be safe.
351 * This is a slow path anyway.
352 */
353 smp_wmb();
354
355 register_ftrace_function(&trace_ops);
356
357 tracer_enabled = 1;
358
359 return;
360fail_deprobe_wake_new:
361 marker_probe_unregister("kernel_sched_wakeup_new",
362 wake_up_callback,
363 &wakeup_trace);
364fail_deprobe:
365 marker_probe_unregister("kernel_sched_wakeup",
366 wake_up_callback,
367 &wakeup_trace);
368}
369
370static void stop_wakeup_tracer(struct trace_array *tr)
371{
372 tracer_enabled = 0;
373 unregister_ftrace_function(&trace_ops);
374 marker_probe_unregister("kernel_sched_schedule",
375 sched_switch_callback,
376 &wakeup_trace);
377 marker_probe_unregister("kernel_sched_wakeup_new",
378 wake_up_callback,
379 &wakeup_trace);
380 marker_probe_unregister("kernel_sched_wakeup",
381 wake_up_callback,
382 &wakeup_trace);
383}
384
385static void wakeup_tracer_init(struct trace_array *tr)
386{
387 wakeup_trace = tr;
388
389 if (tr->ctrl)
390 start_wakeup_tracer(tr);
391}
392
393static void wakeup_tracer_reset(struct trace_array *tr)
394{
395 if (tr->ctrl) {
396 stop_wakeup_tracer(tr);
397 /* make sure we put back any tasks we are tracing */
398 wakeup_reset(tr);
399 }
400}
401
402static void wakeup_tracer_ctrl_update(struct trace_array *tr)
403{
404 if (tr->ctrl)
405 start_wakeup_tracer(tr);
406 else
407 stop_wakeup_tracer(tr);
408}
409
410static void wakeup_tracer_open(struct trace_iterator *iter)
411{
412 /* stop the trace while dumping */
413 if (iter->tr->ctrl)
414 stop_wakeup_tracer(iter->tr);
415}
416
417static void wakeup_tracer_close(struct trace_iterator *iter)
418{
419 /* forget about any processes we were recording */
420 if (iter->tr->ctrl)
421 start_wakeup_tracer(iter->tr);
422}
423
424static struct tracer wakeup_tracer __read_mostly =
425{
426 .name = "wakeup",
427 .init = wakeup_tracer_init,
428 .reset = wakeup_tracer_reset,
429 .open = wakeup_tracer_open,
430 .close = wakeup_tracer_close,
431 .ctrl_update = wakeup_tracer_ctrl_update,
432 .print_max = 1,
433#ifdef CONFIG_FTRACE_SELFTEST
434 .selftest = trace_selftest_startup_wakeup,
435#endif
436};
437
438__init static int init_wakeup_tracer(void)
439{
440 int ret;
441
442 ret = register_tracer(&wakeup_tracer);
443 if (ret)
444 return ret;
445
446 return 0;
447}
448device_initcall(init_wakeup_tracer);
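
wakeup_check_start() only arms the tracer for a wakeup that matters: the woken task must be an RT task whose priority beats both the wakeup currently being traced (wakeup_prio) and the currently running task (curr->prio); a lower numeric prio means higher priority, and wakeup_prio starts out at (unsigned)-1, so that comparison never rejects an RT wakeup until something is already being traced. A boolean restatement of that filter, with hypothetical names:

#include <stdio.h>

/* Sketch of the filter at the top of wakeup_check_start(); 'is_rt'
 * stands in for rt_task(p).
 */
static int wakeup_worth_tracing(int is_rt, unsigned p_prio,
				unsigned wakeup_prio, unsigned curr_prio)
{
	if (!is_rt)
		return 0;		/* only RT wakeups are interesting    */
	if (p_prio >= wakeup_prio)
		return 0;		/* not better than the traced wakeup  */
	if (p_prio >= curr_prio)
		return 0;		/* would not preempt the current task */
	return 1;
}

int main(void)
{
	printf("%d\n", wakeup_worth_tracing(1, 5, (unsigned)-1, 120)); /* 1          */
	printf("%d\n", wakeup_worth_tracing(0, 5, (unsigned)-1, 120)); /* 0: not RT  */
	printf("%d\n", wakeup_worth_tracing(1, 50, 10, 120));          /* 0: weaker  */
	return 0;
}
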
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..0911b7e073bf
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,563 @@
1/* Included from trace.c */
2
3#include <linux/kthread.h>
4#include <linux/delay.h>
5
6static inline int trace_valid_entry(struct trace_entry *entry)
7{
8 switch (entry->type) {
9 case TRACE_FN:
10 case TRACE_CTX:
11 case TRACE_WAKE:
12 case TRACE_STACK:
13 case TRACE_SPECIAL:
14 return 1;
15 }
16 return 0;
17}
18
19static int
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{
22 struct trace_entry *entries;
23 struct page *page;
24 int idx = 0;
25 int i;
26
27 BUG_ON(list_empty(&data->trace_pages));
28 page = list_entry(data->trace_pages.next, struct page, lru);
29 entries = page_address(page);
30
31 check_pages(data);
32 if (head_page(data) != entries)
33 goto failed;
34
35 /*
36 * The starting trace buffer always has valid elements,
37 * if any element exists.
38 */
39 entries = head_page(data);
40
41 for (i = 0; i < tr->entries; i++) {
42
43 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
44 printk(KERN_CONT ".. invalid entry %d ",
45 entries[idx].type);
46 goto failed;
47 }
48
49 idx++;
50 if (idx >= ENTRIES_PER_PAGE) {
51 page = virt_to_page(entries);
52 if (page->lru.next == &data->trace_pages) {
53 if (i != tr->entries - 1) {
54 printk(KERN_CONT ".. entries buffer mismatch");
55 goto failed;
56 }
57 } else {
58 page = list_entry(page->lru.next, struct page, lru);
59 entries = page_address(page);
60 }
61 idx = 0;
62 }
63 }
64
65 page = virt_to_page(entries);
66 if (page->lru.next != &data->trace_pages) {
67 printk(KERN_CONT ".. too many entries");
68 goto failed;
69 }
70
71 return 0;
72
73 failed:
74 /* disable tracing */
75 tracing_disabled = 1;
76 printk(KERN_CONT ".. corrupted trace buffer .. ");
77 return -1;
78}
79
80/*
81 * Test the trace buffer to see if all the elements
82 * are still sane.
83 */
84static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
85{
86 unsigned long flags, cnt = 0;
87 int cpu, ret = 0;
88
89 /* Don't allow flipping of max traces now */
90 raw_local_irq_save(flags);
91 __raw_spin_lock(&ftrace_max_lock);
92 for_each_possible_cpu(cpu) {
93 if (!head_page(tr->data[cpu]))
94 continue;
95
96 cnt += tr->data[cpu]->trace_idx;
97
98 ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
99 if (ret)
100 break;
101 }
102 __raw_spin_unlock(&ftrace_max_lock);
103 raw_local_irq_restore(flags);
104
105 if (count)
106 *count = cnt;
107
108 return ret;
109}
110
111#ifdef CONFIG_FTRACE
112
113#ifdef CONFIG_DYNAMIC_FTRACE
114
115#define __STR(x) #x
116#define STR(x) __STR(x)
117
118/* Test dynamic code modification and ftrace filters */
119int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
120 struct trace_array *tr,
121 int (*func)(void))
122{
123 unsigned long count;
124 int ret;
125 int save_ftrace_enabled = ftrace_enabled;
126 int save_tracer_enabled = tracer_enabled;
127 char *func_name;
128
129 /* The ftrace test PASSED */
130 printk(KERN_CONT "PASSED\n");
131 pr_info("Testing dynamic ftrace: ");
132
133 /* enable tracing, and record the filter function */
134 ftrace_enabled = 1;
135 tracer_enabled = 1;
136
137	/* passed in as a parameter to keep gcc from optimizing it away */
138 func();
139
140 /* update the records */
141 ret = ftrace_force_update();
142 if (ret) {
143 printk(KERN_CONT ".. ftraced failed .. ");
144 return ret;
145 }
146
147 /*
148	 * Some archs *cough*PowerPC*cough* add characters to the
149	 * start of the function names. We simply put a '*' to
150	 * accommodate them.
151 */
152 func_name = "*" STR(DYN_FTRACE_TEST_NAME);
153
154 /* filter only on our function */
155 ftrace_set_filter(func_name, strlen(func_name), 1);
156
157 /* enable tracing */
158 tr->ctrl = 1;
159 trace->init(tr);
160	/* Sleep for 1/10 of a second */
161 msleep(100);
162
163 /* we should have nothing in the buffer */
164 ret = trace_test_buffer(tr, &count);
165 if (ret)
166 goto out;
167
168 if (count) {
169 ret = -1;
170 printk(KERN_CONT ".. filter did not filter .. ");
171 goto out;
172 }
173
174 /* call our function again */
175 func();
176
177 /* sleep again */
178 msleep(100);
179
180 /* stop the tracing. */
181 tr->ctrl = 0;
182 trace->ctrl_update(tr);
183 ftrace_enabled = 0;
184
185 /* check the trace buffer */
186 ret = trace_test_buffer(tr, &count);
187 trace->reset(tr);
188
189 /* we should only have one item */
190 if (!ret && count != 1) {
191 printk(KERN_CONT ".. filter failed count=%ld ..", count);
192 ret = -1;
193 goto out;
194 }
195 out:
196 ftrace_enabled = save_ftrace_enabled;
197 tracer_enabled = save_tracer_enabled;
198
199 /* Enable tracing on all functions again */
200 ftrace_set_filter(NULL, 0, 1);
201
202 return ret;
203}
204#else
205# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
206#endif /* CONFIG_DYNAMIC_FTRACE */
207/*
208 * Simple verification test of ftrace function tracer.
209 * Enable ftrace, sleep 1/10 second, and then read the trace
210 * buffer to see if all is in order.
211 */
212int
213trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
214{
215 unsigned long count;
216 int ret;
217 int save_ftrace_enabled = ftrace_enabled;
218 int save_tracer_enabled = tracer_enabled;
219
220 /* make sure msleep has been recorded */
221 msleep(1);
222
223 /* force the recorded functions to be traced */
224 ret = ftrace_force_update();
225 if (ret) {
226 printk(KERN_CONT ".. ftraced failed .. ");
227 return ret;
228 }
229
230 /* start the tracing */
231 ftrace_enabled = 1;
232 tracer_enabled = 1;
233
234 tr->ctrl = 1;
235 trace->init(tr);
236	/* Sleep for 1/10 of a second */
237 msleep(100);
238 /* stop the tracing. */
239 tr->ctrl = 0;
240 trace->ctrl_update(tr);
241 ftrace_enabled = 0;
242
243 /* check the trace buffer */
244 ret = trace_test_buffer(tr, &count);
245 trace->reset(tr);
246
247 if (!ret && !count) {
248 printk(KERN_CONT ".. no entries found ..");
249 ret = -1;
250 goto out;
251 }
252
253 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
254 DYN_FTRACE_TEST_NAME);
255
256 out:
257 ftrace_enabled = save_ftrace_enabled;
258 tracer_enabled = save_tracer_enabled;
259
260 /* kill ftrace totally if we failed */
261 if (ret)
262 ftrace_kill();
263
264 return ret;
265}
266#endif /* CONFIG_FTRACE */
267
268#ifdef CONFIG_IRQSOFF_TRACER
269int
270trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
271{
272 unsigned long save_max = tracing_max_latency;
273 unsigned long count;
274 int ret;
275
276 /* start the tracing */
277 tr->ctrl = 1;
278 trace->init(tr);
279 /* reset the max latency */
280 tracing_max_latency = 0;
281 /* disable interrupts for a bit */
282 local_irq_disable();
283 udelay(100);
284 local_irq_enable();
285 /* stop the tracing. */
286 tr->ctrl = 0;
287 trace->ctrl_update(tr);
288 /* check both trace buffers */
289 ret = trace_test_buffer(tr, NULL);
290 if (!ret)
291 ret = trace_test_buffer(&max_tr, &count);
292 trace->reset(tr);
293
294 if (!ret && !count) {
295 printk(KERN_CONT ".. no entries found ..");
296 ret = -1;
297 }
298
299 tracing_max_latency = save_max;
300
301 return ret;
302}
303#endif /* CONFIG_IRQSOFF_TRACER */
304
305#ifdef CONFIG_PREEMPT_TRACER
306int
307trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
308{
309 unsigned long save_max = tracing_max_latency;
310 unsigned long count;
311 int ret;
312
313 /* start the tracing */
314 tr->ctrl = 1;
315 trace->init(tr);
316 /* reset the max latency */
317 tracing_max_latency = 0;
318 /* disable preemption for a bit */
319 preempt_disable();
320 udelay(100);
321 preempt_enable();
322 /* stop the tracing. */
323 tr->ctrl = 0;
324 trace->ctrl_update(tr);
325 /* check both trace buffers */
326 ret = trace_test_buffer(tr, NULL);
327 if (!ret)
328 ret = trace_test_buffer(&max_tr, &count);
329 trace->reset(tr);
330
331 if (!ret && !count) {
332 printk(KERN_CONT ".. no entries found ..");
333 ret = -1;
334 }
335
336 tracing_max_latency = save_max;
337
338 return ret;
339}
340#endif /* CONFIG_PREEMPT_TRACER */
341
342#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
343int
344trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
345{
346 unsigned long save_max = tracing_max_latency;
347 unsigned long count;
348 int ret;
349
350 /* start the tracing */
351 tr->ctrl = 1;
352 trace->init(tr);
353
354 /* reset the max latency */
355 tracing_max_latency = 0;
356
357 /* disable preemption and interrupts for a bit */
358 preempt_disable();
359 local_irq_disable();
360 udelay(100);
361 preempt_enable();
362 /* reverse the order of preempt vs irqs */
363 local_irq_enable();
364
365 /* stop the tracing. */
366 tr->ctrl = 0;
367 trace->ctrl_update(tr);
368 /* check both trace buffers */
369 ret = trace_test_buffer(tr, NULL);
370 if (ret)
371 goto out;
372
373 ret = trace_test_buffer(&max_tr, &count);
374 if (ret)
375 goto out;
376
377 if (!ret && !count) {
378 printk(KERN_CONT ".. no entries found ..");
379 ret = -1;
380 goto out;
381 }
382
383	/* run the sequence a second time, with a fresh max-latency measurement */
384 tracing_max_latency = 0;
385 tr->ctrl = 1;
386 trace->ctrl_update(tr);
387 preempt_disable();
388 local_irq_disable();
389 udelay(100);
390 preempt_enable();
391 /* reverse the order of preempt vs irqs */
392 local_irq_enable();
393
394 /* stop the tracing. */
395 tr->ctrl = 0;
396 trace->ctrl_update(tr);
397 /* check both trace buffers */
398 ret = trace_test_buffer(tr, NULL);
399 if (ret)
400 goto out;
401
402 ret = trace_test_buffer(&max_tr, &count);
403
404 if (!ret && !count) {
405 printk(KERN_CONT ".. no entries found ..");
406 ret = -1;
407 goto out;
408 }
409
410 out:
411 trace->reset(tr);
412 tracing_max_latency = save_max;
413
414 return ret;
415}
416#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
417
418#ifdef CONFIG_SCHED_TRACER
419static int trace_wakeup_test_thread(void *data)
420{
421	/* Make this an RT thread; the priority doesn't need to be high */
422 struct sched_param param = { .sched_priority = 5 };
423 struct completion *x = data;
424
425 sched_setscheduler(current, SCHED_FIFO, &param);
426
427	/* Let the waiting self-test know we now run at RT prio */
428 complete(x);
429
430 /* now go to sleep and let the test wake us up */
431 set_current_state(TASK_INTERRUPTIBLE);
432 schedule();
433
434	/* we are awake, now wait until we are told to stop */
435 while (!kthread_should_stop()) {
436 /*
437 * This is an RT task, do short sleeps to let
438 * others run.
439 */
440 msleep(100);
441 }
442
443 return 0;
444}
445
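/*
 * Self-test for the wakeup tracer: create an RT kthread, let it go to
 * sleep, then wake it while tracing so the wakeup latency ends up in
 * the max-latency snapshot.
 */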
446int
447trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448{
449 unsigned long save_max = tracing_max_latency;
450 struct task_struct *p;
451 struct completion isrt;
452 unsigned long count;
453 int ret;
454
455 init_completion(&isrt);
456
457 /* create a high prio thread */
458 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
459 if (IS_ERR(p)) {
460 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
461 return -1;
462 }
463
464 /* make sure the thread is running at an RT prio */
465 wait_for_completion(&isrt);
466
467 /* start the tracing */
468 tr->ctrl = 1;
469 trace->init(tr);
470 /* reset the max latency */
471 tracing_max_latency = 0;
472
473 /* sleep to let the RT thread sleep too */
474 msleep(100);
475
476 /*
477	 * Yes, this is slightly racy. It is possible that, for some
478	 * strange reason, the RT thread we created did not call
479	 * schedule for 100ms after doing the completion, and we do
480	 * a wakeup on a task that is already awake. But that is
481	 * extremely unlikely, and the worst thing that happens in
482	 * such a case is that we disable tracing. Honestly, if this
483	 * race does happen, something is horribly wrong with the
484	 * system.
485 */
486
487 wake_up_process(p);
488
489 /* stop the tracing. */
490 tr->ctrl = 0;
491 trace->ctrl_update(tr);
492 /* check both trace buffers */
493 ret = trace_test_buffer(tr, NULL);
494 if (!ret)
495 ret = trace_test_buffer(&max_tr, &count);
496
497
498 trace->reset(tr);
499
500 tracing_max_latency = save_max;
501
502 /* kill the thread */
503 kthread_stop(p);
504
505 if (!ret && !count) {
506 printk(KERN_CONT ".. no entries found ..");
507 ret = -1;
508 }
509
510 return ret;
511}
512#endif /* CONFIG_SCHED_TRACER */
513
514#ifdef CONFIG_CONTEXT_SWITCH_TRACER
515int
516trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
517{
518 unsigned long count;
519 int ret;
520
521 /* start the tracing */
522 tr->ctrl = 1;
523 trace->init(tr);
524	/* Sleep for 1/10 of a second */
525 msleep(100);
526 /* stop the tracing. */
527 tr->ctrl = 0;
528 trace->ctrl_update(tr);
529 /* check the trace buffer */
530 ret = trace_test_buffer(tr, &count);
531 trace->reset(tr);
532
533 if (!ret && !count) {
534 printk(KERN_CONT ".. no entries found ..");
535 ret = -1;
536 }
537
538 return ret;
539}
540#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
541
542#ifdef CONFIG_SYSPROF_TRACER
543int
544trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
545{
546 unsigned long count;
547 int ret;
548
549 /* start the tracing */
550 tr->ctrl = 1;
551 trace->init(tr);
552	/* Sleep for 1/10 of a second */
553 msleep(100);
554 /* stop the tracing. */
555 tr->ctrl = 0;
556 trace->ctrl_update(tr);
557 /* check the trace buffer */
558 ret = trace_test_buffer(tr, &count);
559 trace->reset(tr);
560
561 return ret;
562}
563#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
1#include "trace.h"
2
3int DYN_FTRACE_TEST_NAME(void)
4{
5 /* used to call mcount */
6 return 0;
7}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 000000000000..2301e1e7c606
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,363 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
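/*
 * Copy one user-space stack frame. This runs from hrtimer (IRQ) context,
 * so page faults stay disabled; a frame that is not resident in memory
 * simply ends the user stack walk.
 */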
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
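/* Called by dump_trace() for each return address found on the kernel stack */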
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96};
97
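/*
 * Record the kernel-side stack: log the interrupted ip, then walk the
 * stack with dump_trace() and emit each reliable return address as a
 * special trace entry. Returns the number of entries recorded so the
 * caller knows how much of sample_max_depth remains for the user stack.
 */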
98static int
99trace_kernel(struct pt_regs *regs, struct trace_array *tr,
100 struct trace_array_cpu *data)
101{
102 struct backtrace_info info;
103 unsigned long bp;
104 char *stack;
105
106 info.tr = tr;
107 info.data = data;
108 info.pos = 1;
109
110 __trace_special(info.tr, info.data, 1, regs->ip, 0);
111
112 stack = ((char *)regs + sizeof(struct pt_regs));
113#ifdef CONFIG_FRAME_POINTER
114 bp = regs->bp;
115#else
116 bp = 0;
117#endif
118
119 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
120
121 return info.pos;
122}
123
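/*
 * Per-sample worker, called from the per-CPU hrtimer: emit a start
 * marker for the current pid, record the kernel stack unless we
 * interrupted user mode, then follow the saved frame pointers to record
 * the user stack, and finish with an end marker carrying the depth.
 */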
124static void timer_notify(struct pt_regs *regs, int cpu)
125{
126 struct trace_array_cpu *data;
127 struct stack_frame frame;
128 struct trace_array *tr;
129 const void __user *fp;
130 int is_user;
131 int i;
132
133 if (!regs)
134 return;
135
136 tr = sysprof_trace;
137 data = tr->data[cpu];
138 is_user = user_mode(regs);
139
140 if (!current || current->pid == 0)
141 return;
142
143 if (is_user && current->state != TASK_RUNNING)
144 return;
145
146 __trace_special(tr, data, 0, 0, current->pid);
147
148 if (!is_user)
149 i = trace_kernel(regs, tr, data);
150 else
151 i = 0;
152
153 /*
154 * Trace user stack if we are not a kernel thread
155 */
156 if (current->mm && i < sample_max_depth) {
157 regs = (struct pt_regs *)current->thread.sp0 - 1;
158
159 fp = (void __user *)regs->bp;
160
161 __trace_special(tr, data, 2, regs->ip, 0);
162
163 while (i < sample_max_depth) {
164			frame.next_fp = NULL;
165 frame.return_address = 0;
166 if (!copy_stack_frame(fp, &frame))
167 break;
168 if ((unsigned long)fp < regs->sp)
169 break;
170
171 __trace_special(tr, data, 2, frame.return_address,
172 (unsigned long)fp);
173 fp = frame.next_fp;
174
175 i++;
176 }
177
178 }
179
180 /*
181 * Special trace entry if we overflow the max depth:
182 */
183 if (i == sample_max_depth)
184 __trace_special(tr, data, -1, -1, -1);
185
186 __trace_special(tr, data, 3, current->pid, i);
187}
188
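/* hrtimer callback: take one sample, then re-arm one sample_period later */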
189static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
190{
191 /* trace here */
192 timer_notify(get_irq_regs(), smp_processor_id());
193
194 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
195
196 return HRTIMER_RESTART;
197}
198
199static void start_stack_timer(int cpu)
200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208}
209
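/*
 * hrtimer_start() arms the timer on the CPU it is called from, so bind
 * the current task to each online CPU in turn and restore the original
 * affinity mask when done.
 */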
210static void start_stack_timers(void)
211{
212 cpumask_t saved_mask = current->cpus_allowed;
213 int cpu;
214
215 for_each_online_cpu(cpu) {
216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217 start_stack_timer(cpu);
218 }
219 set_cpus_allowed_ptr(current, &saved_mask);
220}
221
222static void stop_stack_timer(int cpu)
223{
224 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
225
226 hrtimer_cancel(hrtimer);
227}
228
229static void stop_stack_timers(void)
230{
231 int cpu;
232
233 for_each_online_cpu(cpu)
234 stop_stack_timer(cpu);
235}
236
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr->data[cpu]);
245}
246
247static void start_stack_trace(struct trace_array *tr)
248{
249 mutex_lock(&sample_timer_lock);
250 stack_reset(tr);
251 start_stack_timers();
252 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock);
254}
255
256static void stop_stack_trace(struct trace_array *tr)
257{
258 mutex_lock(&sample_timer_lock);
259 stop_stack_timers();
260 tracer_enabled = 0;
261 mutex_unlock(&sample_timer_lock);
262}
263
264static void stack_trace_init(struct trace_array *tr)
265{
266 sysprof_trace = tr;
267
268 if (tr->ctrl)
269 start_stack_trace(tr);
270}
271
272static void stack_trace_reset(struct trace_array *tr)
273{
274 if (tr->ctrl)
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285}
286
287static struct tracer stack_trace __read_mostly =
288{
289 .name = "sysprof",
290 .init = stack_trace_init,
291 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof,
295#endif
296};
297
298__init static int init_stack_trace(void)
299{
300 return register_tracer(&stack_trace);
301}
302device_initcall(init_stack_trace);
303
304#define MAX_LONG_DIGITS 22
305
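/*
 * debugfs read/write handlers for the sampling period. The value is
 * exposed in microseconds while sample_period itself is kept in
 * nanoseconds: writing "500" selects a 500 usec interval, and 100 usec
 * is enforced as the minimum.
 */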
306static ssize_t
307sysprof_sample_read(struct file *filp, char __user *ubuf,
308 size_t cnt, loff_t *ppos)
309{
310 char buf[MAX_LONG_DIGITS];
311 int r;
312
313 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
314
315 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
316}
317
318static ssize_t
319sysprof_sample_write(struct file *filp, const char __user *ubuf,
320 size_t cnt, loff_t *ppos)
321{
322 char buf[MAX_LONG_DIGITS];
323 unsigned long val;
324
325 if (cnt > MAX_LONG_DIGITS-1)
326 cnt = MAX_LONG_DIGITS-1;
327
328	if (copy_from_user(buf, ubuf, cnt))
329 return -EFAULT;
330
331 buf[cnt] = 0;
332
333 val = simple_strtoul(buf, NULL, 10);
334 /*
335 * Enforce a minimum sample period of 100 usecs:
336 */
337 if (val < 100)
338 val = 100;
339
340 mutex_lock(&sample_timer_lock);
341 stop_stack_timers();
342 sample_period = val * 1000;
343 start_stack_timers();
344 mutex_unlock(&sample_timer_lock);
345
346 return cnt;
347}
348
349static struct file_operations sysprof_sample_fops = {
350 .read = sysprof_sample_read,
351 .write = sysprof_sample_write,
352};
353
354void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
355{
356 struct dentry *entry;
357
358 entry = debugfs_create_file("sysprof_sample_period", 0644,
359 d_tracer, NULL, &sysprof_sample_fops);
360 if (entry)
361 return;
362	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
363}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029c..ce7799540c91 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -13,7 +13,7 @@
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *
16 * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>. 16 * Made to use alloc_percpu by Christoph Lameter.
17 */ 17 */
18 18
19#include <linux/module.h> 19#include <linux/module.h>