Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile                        1
-rw-r--r--  kernel/exit.c                          5
-rw-r--r--  kernel/fork.c                         21
-rw-r--r--  kernel/futex.c                         7
-rw-r--r--  kernel/hrtimer.c                      55
-rw-r--r--  kernel/hung_task.c                   217
-rw-r--r--  kernel/irq/devres.c                   16
-rw-r--r--  kernel/irq/handle.c                   50
-rw-r--r--  kernel/irq/manage.c                  189
-rw-r--r--  kernel/irq/numa_migrate.c              1
-rw-r--r--  kernel/kprobes.c                     281
-rw-r--r--  kernel/kthread.c                      26
-rw-r--r--  kernel/lockdep.c                       5
-rw-r--r--  kernel/module.c                        6
-rw-r--r--  kernel/mutex.c                         3
-rw-r--r--  kernel/panic.c                        12
-rw-r--r--  kernel/posix-cpu-timers.c              9
-rw-r--r--  kernel/power/disk.c                    8
-rw-r--r--  kernel/power/user.c                    9
-rw-r--r--  kernel/ptrace.c                       23
-rw-r--r--  kernel/rcuclassic.c                   23
-rw-r--r--  kernel/rcupreempt.c                   48
-rw-r--r--  kernel/rcutree.c                      20
-rw-r--r--  kernel/rcutree.h                      10
-rw-r--r--  kernel/rcutree_trace.c                 2
-rw-r--r--  kernel/sched.c                       174
-rw-r--r--  kernel/sched_cpupri.c                  5
-rw-r--r--  kernel/sched_rt.c                     15
-rw-r--r--  kernel/softirq.c                       2
-rw-r--r--  kernel/softlockup.c                  100
-rw-r--r--  kernel/sys.c                          24
-rw-r--r--  kernel/sysctl.c                       62
-rw-r--r--  kernel/timer.c                         7
-rw-r--r--  kernel/trace/Kconfig                   4
-rw-r--r--  kernel/trace/blktrace.c               17
-rw-r--r--  kernel/trace/kmemtrace.c             319
-rw-r--r--  kernel/trace/trace.c                  57
-rw-r--r--  kernel/trace/trace.h                   8
-rw-r--r--  kernel/trace/trace_events.c           12
-rw-r--r--  kernel/trace/trace_events_filter.c    14
-rw-r--r--  kernel/trace/trace_events_stage_2.h    4
-rw-r--r--  kernel/trace/trace_export.c            2
-rw-r--r--  kernel/trace/trace_output.c            2
-rw-r--r--  kernel/trace/trace_sched_switch.c      3
-rw-r--r--  kernel/trace/trace_sched_wakeup.c      8
-rw-r--r--  kernel/trace/trace_syscalls.c          2
-rw-r--r--  kernel/workqueue.c                    36
47 files changed, 1428 insertions, 496 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e9..42423665660a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/exit.c b/kernel/exit.c
index 6686ed1e4aa3..abf9cf3b95c6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
          */
         if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
             (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-             tsk->self_exec_id != tsk->parent_exec_id) &&
-            !capable(CAP_KILL))
+             tsk->self_exec_id != tsk->parent_exec_id))
                 tsk->exit_signal = SIGCHLD;
 
         signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@ -924,6 +923,8 @@ NORET_TYPE void do_exit(long code)
                 schedule();
         }
 
+        exit_irq_thread();
+
         exit_signals(tsk);  /* sets PF_EXITING */
         /*
          * tsk->flags are checked in the futex code to protect against
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765bc..b9e2edd00726 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -645,6 +645,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
         tsk->min_flt = tsk->maj_flt = 0;
         tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+        tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
 
         tsk->mm = NULL;
         tsk->active_mm = NULL;
@@ -797,6 +800,12 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
         sig->cputime_expires.virt_exp = cputime_zero;
         sig->cputime_expires.sched_exp = 0;
 
+        if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) {
+                sig->cputime_expires.prof_exp =
+                        secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
+                sig->cputimer.running = 1;
+        }
+
         /* The timer lists. */
         INIT_LIST_HEAD(&sig->cpu_timers[0]);
         INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -812,11 +821,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
                 atomic_inc(&current->signal->live);
                 return 0;
         }
-        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
-
-        if (sig)
-                posix_cpu_timers_init_group(sig);
 
+        sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
         tsk->signal = sig;
         if (!sig)
                 return -ENOMEM;
@@ -856,6 +862,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
         task_unlock(current->group_leader);
 
+        posix_cpu_timers_init_group(sig);
+
         acct_init_pacct(&sig->pacct);
 
         tty_audit_fork(sig);
@@ -1032,11 +1040,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
         p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_SOFTLOCKUP
-        p->last_switch_count = 0;
-        p->last_switch_timestamp = 0;
-#endif
-
         task_io_accounting_init(&p->ioac);
         acct_clear_integrals(p);
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a024bca2..eef8cd26b5e5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -883,7 +883,12 @@ retry_private:
 out_unlock:
         double_unlock_hb(hb1, hb2);
 
-        /* drop_futex_key_refs() must be called outside the spinlocks. */
+        /*
+         * drop_futex_key_refs() must be called outside the spinlocks. During
+         * the requeue we moved futex_q's from the hash bucket at key1 to the
+         * one at key2 and updated their key pointer. We no longer need to
+         * hold the references to key1.
+         */
         while (--drop_count >= 0)
                 drop_futex_key_refs(&key1);
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a42ca3..cb8a15c19583 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
  * and expiry check is done in the hrtimer_interrupt or in the softirq.
  */
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-                                            struct hrtimer_clock_base *base)
+                                            struct hrtimer_clock_base *base,
+                                            int wakeup)
 {
         if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-                spin_unlock(&base->cpu_base->lock);
-                raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-                spin_lock(&base->cpu_base->lock);
+                if (wakeup) {
+                        spin_unlock(&base->cpu_base->lock);
+                        raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+                        spin_lock(&base->cpu_base->lock);
+                } else
+                        __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+
                 return 1;
         }
+
         return 0;
 }
 
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-                                            struct hrtimer_clock_base *base)
+                                            struct hrtimer_clock_base *base,
+                                            int wakeup)
 {
         return 0;
 }
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
         return 0;
 }
 
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:      the timer to be added
- * @tim:        expiry time
- * @delta_ns:   "slack" range for the timer
- * @mode:       expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
-                       const enum hrtimer_mode mode)
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                unsigned long delta_ns, const enum hrtimer_mode mode,
+                int wakeup)
 {
         struct hrtimer_clock_base *base, *new_base;
         unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
          * XXX send_remote_softirq() ?
          */
         if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
-                hrtimer_enqueue_reprogram(timer, new_base);
+                hrtimer_enqueue_reprogram(timer, new_base, wakeup);
 
         unlock_hrtimer_base(timer, &flags);
 
         return ret;
 }
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:      the timer to be added
+ * @tim:        expiry time
+ * @delta_ns:   "slack" range for the timer
+ * @mode:       expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+                unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+        return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
-        return hrtimer_start_range_ns(timer, tim, 0, mode);
+        return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
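The new wakeup argument lets a caller that cannot afford to drop base->cpu_base->lock, or to wake ksoftirqd from its current context, merely mark HRTIMER_SOFTIRQ pending instead. A minimal sketch of a hypothetical caller under that assumption (the function name and expiry values are invented; ordinary callers keep using hrtimer_start_range_ns(), which passes wakeup == 1):

/* Hypothetical caller sketch: restart a timer from a context that must
 * not unlock the hrtimer base or wake ksoftirqd, hence wakeup == 0. */
static void requeue_timer_locked(struct hrtimer *timer, ktime_t soft, ktime_t hard)
{
        unsigned long range_ns = ktime_to_ns(ktime_sub(hard, soft));

        __hrtimer_start_range_ns(timer, soft, range_ns, HRTIMER_MODE_ABS, 0);
}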
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..022a4927b785
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,217 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * The number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+#define HUNG_TASK_BATCHING 1024
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+                                CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+        sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+        return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+        did_panic = 1;
+
+        return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+        .notifier_call = hung_task_panic,
+};
+
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
+{
+        unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+        /*
+         * Ensure the task is not frozen.
+         * Also, when a freshly created task is scheduled once, changes
+         * its state to TASK_UNINTERRUPTIBLE without having ever been
+         * switched out once, it mustn't be checked.
+         */
+        if (unlikely(t->flags & PF_FROZEN || !switch_count))
+                return;
+
+        if (switch_count != t->last_switch_count) {
+                t->last_switch_count = switch_count;
+                return;
+        }
+        if (!sysctl_hung_task_warnings)
+                return;
+        sysctl_hung_task_warnings--;
+
+        /*
+         * Ok, the task did not get scheduled for more than 2 minutes,
+         * complain:
+         */
+        printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+                        "%ld seconds.\n", t->comm, t->pid, timeout);
+        printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+                        " disables this message.\n");
+        sched_show_task(t);
+        __debug_show_held_locks(t);
+
+        touch_nmi_watchdog();
+
+        if (sysctl_hung_task_panic)
+                panic("hung_task: blocked tasks");
+}
+
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * to exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+        get_task_struct(g);
+        get_task_struct(t);
+        rcu_read_unlock();
+        cond_resched();
+        rcu_read_lock();
+        put_task_struct(t);
+        put_task_struct(g);
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
+{
+        int max_count = sysctl_hung_task_check_count;
+        int batch_count = HUNG_TASK_BATCHING;
+        struct task_struct *g, *t;
+
+        /*
+         * If the system crashed already then all bets are off,
+         * do not report extra hung tasks:
+         */
+        if (test_taint(TAINT_DIE) || did_panic)
+                return;
+
+        rcu_read_lock();
+        do_each_thread(g, t) {
+                if (!--max_count)
+                        goto unlock;
+                if (!--batch_count) {
+                        batch_count = HUNG_TASK_BATCHING;
+                        rcu_lock_break(g, t);
+                        /* Exit if t or g was unhashed during refresh. */
+                        if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+                                goto unlock;
+                }
+                /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+                if (t->state == TASK_UNINTERRUPTIBLE)
+                        check_hung_task(t, timeout);
+        } while_each_thread(g, t);
+ unlock:
+        rcu_read_unlock();
+}
+
+static unsigned long timeout_jiffies(unsigned long timeout)
+{
+        /* timeout of 0 will disable the watchdog */
+        return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+                                  struct file *filp, void __user *buffer,
+                                  size_t *lenp, loff_t *ppos)
+{
+        int ret;
+
+        ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+        if (ret || !write)
+                goto out;
+
+        wake_up_process(watchdog_task);
+
+ out:
+        return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+        set_user_nice(current, 0);
+
+        for ( ; ; ) {
+                unsigned long timeout = sysctl_hung_task_timeout_secs;
+
+                while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+                        timeout = sysctl_hung_task_timeout_secs;
+
+                check_hung_uninterruptible_tasks(timeout);
+        }
+
+        return 0;
+}
+
+static int __init hung_task_init(void)
+{
+        atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+        watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+        return 0;
+}
+
+module_init(hung_task_init);
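proc_dohung_task_timeout_secs() above is meant to be referenced from a sysctl table entry so that a write to /proc/sys/kernel/hung_task_timeout_secs wakes khungtaskd immediately. The actual wiring lives in kernel/sysctl.c (listed in the diffstat but not shown in this section); a hedged sketch of what such an entry looks like:

/* Sketch only - the real table entry is in kernel/sysctl.c. */
static struct ctl_table hung_task_sketch[] = {
        {
                .procname       = "hung_task_timeout_secs",
                .data           = &sysctl_hung_task_timeout_secs,
                .maxlen         = sizeof(unsigned long),
                .mode           = 0644,
                .proc_handler   = &proc_dohung_task_timeout_secs,
        },
        { }
};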
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bff..d06df9c41cba 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 }
 
 /**
- * devm_request_irq - allocate an interrupt line for a managed device
+ * devm_request_threaded_irq - allocate an interrupt line for a managed device
  * @dev: device to request interrupt for
  * @irq: Interrupt line to allocate
  * @handler: Function to be called when the IRQ occurs
+ * @thread_fn: function to be called in a threaded interrupt context. NULL
+ *             for devices which handle everything in @handler
  * @irqflags: Interrupt type flags
  * @devname: An ascii name for the claiming device
  * @dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
  * If an IRQ allocated with this function needs to be freed
  * separately, dev_free_irq() must be used.
  */
-int devm_request_irq(struct device *dev, unsigned int irq,
-                     irq_handler_t handler, unsigned long irqflags,
-                     const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+                              irq_handler_t handler, irq_handler_t thread_fn,
+                              unsigned long irqflags, const char *devname,
+                              void *dev_id)
 {
         struct irq_devres *dr;
         int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
         if (!dr)
                 return -ENOMEM;
 
-        rc = request_irq(irq, handler, irqflags, devname, dev_id);
+        rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+                                  dev_id);
         if (rc) {
                 devres_free(dr);
                 return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 
         return 0;
 }
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
 
 /**
  * devm_free_irq - free an interrupt
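For managed devices nothing changes as long as NULL is passed for the new thread_fn argument; as the diff above shows, request_threaded_irq() only creates an irq thread when thread_fn is set, and freeing is still handled by devres on driver detach. A hedged sketch (the wrapper function, handler and cookie names are invented):

/* Sketch: a purely hardirq-context user of the managed API. */
static int foo_setup_managed_irq(struct device *dev, unsigned int irq,
                                 irq_handler_t handler, void *cookie)
{
        /* thread_fn == NULL: everything runs in the primary handler. */
        return devm_request_threaded_irq(dev, irq, handler, NULL,
                                         IRQF_SHARED, dev_name(dev), cookie);
}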
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 343acecae629..d82142be8dd2 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -339,6 +339,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
         return IRQ_NONE;
 }
 
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+        if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+                return;
+
+        printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+               "but no thread function available.", irq, action->name);
+}
+
 DEFINE_TRACE(irq_handler_entry);
 DEFINE_TRACE(irq_handler_exit);
 
@@ -363,8 +372,47 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
                 trace_irq_handler_entry(irq, action);
                 ret = action->handler(irq, action->dev_id);
                 trace_irq_handler_exit(irq, action, ret);
-                if (ret == IRQ_HANDLED)
+
+                switch (ret) {
+                case IRQ_WAKE_THREAD:
+                        /*
+                         * Set result to handled so the spurious check
+                         * does not trigger.
+                         */
+                        ret = IRQ_HANDLED;
+
+                        /*
+                         * Catch drivers which return WAKE_THREAD but
+                         * did not set up a thread function
+                         */
+                        if (unlikely(!action->thread_fn)) {
+                                warn_no_thread(irq, action);
+                                break;
+                        }
+
+                        /*
+                         * Wake up the handler thread for this
+                         * action. In case the thread crashed and was
+                         * killed we just pretend that we handled the
+                         * interrupt. The hardirq handler above has
+                         * disabled the device interrupt, so no irq
+                         * storm is lurking.
+                         */
+                        if (likely(!test_bit(IRQTF_DIED,
+                                             &action->thread_flags))) {
+                                set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+                                wake_up_process(action->thread);
+                        }
+
+                        /* Fall through to add to randomness */
+                case IRQ_HANDLED:
                         status |= action->flags;
+                        break;
+
+                default:
+                        break;
+                }
+
                 retval |= ret;
                 action = action->next;
         } while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1516ab77355c..7e2e7dd4cd2f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
  */
 
 #include <linux/irq.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include "internals.h"
 
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-cpumask_var_t irq_default_affinity;
-
 /**
  * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  * @irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
 
                 /* Oops, that failed? */
         } while (status & IRQ_INPROGRESS);
+
+        /*
+         * We made sure that no hardirq handler is running. Now verify
+         * that no threaded handlers are active.
+         */
+        wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
 }
 EXPORT_SYMBOL(synchronize_irq);
 
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
 /**
  * irq_can_set_affinity - Check if the affinity of a given irq can be set
  * @irq: Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
         return 1;
 }
 
+static void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+{
+        struct irqaction *action = desc->action;
+
+        while (action) {
+                if (action->thread)
+                        set_cpus_allowed_ptr(action->thread, cpumask);
+                action = action->next;
+        }
+}
+
 /**
  * irq_set_affinity - Set the irq affinity of a given irq
  * @irq: Interrupt to set affinity
@@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
         cpumask_copy(desc->affinity, cpumask);
         desc->chip->set_affinity(irq, cpumask);
 #endif
+        irq_set_thread_affinity(desc, cpumask);
         desc->status |= IRQ_AFFINITY_SET;
         spin_unlock_irqrestore(&desc->lock, flags);
         return 0;
@@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq)
 
         spin_lock_irqsave(&desc->lock, flags);
         ret = setup_affinity(irq, desc);
+        if (!ret)
+                irq_set_thread_affinity(desc, desc->affinity);
         spin_unlock_irqrestore(&desc->lock, flags);
 
         return ret;
@@ -401,6 +424,90 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
         return ret;
 }
 
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+        while (!kthread_should_stop()) {
+                set_current_state(TASK_INTERRUPTIBLE);
+
+                if (test_and_clear_bit(IRQTF_RUNTHREAD,
+                                       &action->thread_flags)) {
+                        __set_current_state(TASK_RUNNING);
+                        return 0;
+                }
+                schedule();
+        }
+        return -1;
+}
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+        struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+        struct irqaction *action = data;
+        struct irq_desc *desc = irq_to_desc(action->irq);
+        int wake;
+
+        sched_setscheduler(current, SCHED_FIFO, &param);
+        current->irqaction = action;
+
+        while (!irq_wait_for_interrupt(action)) {
+
+                atomic_inc(&desc->threads_active);
+
+                spin_lock_irq(&desc->lock);
+                if (unlikely(desc->status & IRQ_DISABLED)) {
+                        /*
+                         * CHECKME: We might need a dedicated
+                         * IRQ_THREAD_PENDING flag here, which
+                         * retriggers the thread in check_irq_resend()
+                         * but AFAICT IRQ_PENDING should be fine as it
+                         * retriggers the interrupt itself --- tglx
+                         */
+                        desc->status |= IRQ_PENDING;
+                        spin_unlock_irq(&desc->lock);
+                } else {
+                        spin_unlock_irq(&desc->lock);
+
+                        action->thread_fn(action->irq, action->dev_id);
+                }
+
+                wake = atomic_dec_and_test(&desc->threads_active);
+
+                if (wake && waitqueue_active(&desc->wait_for_threads))
+                        wake_up(&desc->wait_for_threads);
+        }
+
+        /*
+         * Clear irqaction. Otherwise exit_irq_thread() would make
+         * fuzz about an active irq thread going into nirvana.
+         */
+        current->irqaction = NULL;
+        return 0;
+}
+
+/*
+ * Called from do_exit()
+ */
+void exit_irq_thread(void)
+{
+        struct task_struct *tsk = current;
+
+        if (!tsk->irqaction)
+                return;
+
+        printk(KERN_ERR
+               "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+               tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+
+        /*
+         * Set the THREAD DIED flag to prevent further wakeups of the
+         * soon to be gone threaded handler.
+         */
+        set_bit(IRQTF_DIED, &tsk->irqaction->flags);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -437,6 +544,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
         }
 
         /*
+         * Threaded handler ?
+         */
+        if (new->thread_fn) {
+                struct task_struct *t;
+
+                t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+                                   new->name);
+                if (IS_ERR(t))
+                        return PTR_ERR(t);
+                /*
+                 * We keep the reference to the task struct even if
+                 * the thread dies to avoid that the interrupt code
+                 * references an already freed task_struct.
+                 */
+                get_task_struct(t);
+                new->thread = t;
+                wake_up_process(t);
+        }
+
+        /*
          * The following block of code has to be executed atomically
          */
         spin_lock_irqsave(&desc->lock, flags);
@@ -473,15 +600,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
         if (!shared) {
                 irq_chip_set_defaults(desc->chip);
 
+                init_waitqueue_head(&desc->wait_for_threads);
+
                 /* Setup the type (level, edge polarity) if configured: */
                 if (new->flags & IRQF_TRIGGER_MASK) {
                         ret = __irq_set_trigger(desc, irq,
                                         new->flags & IRQF_TRIGGER_MASK);
 
-                        if (ret) {
-                                spin_unlock_irqrestore(&desc->lock, flags);
-                                return ret;
-                        }
+                        if (ret)
+                                goto out_thread;
                 } else
                         compat_irq_chip_set_default_handler(desc);
 #if defined(CONFIG_IRQ_PER_CPU)
@@ -549,8 +676,19 @@ mismatch:
                 dump_stack();
         }
 #endif
+        ret = -EBUSY;
+
+out_thread:
         spin_unlock_irqrestore(&desc->lock, flags);
-        return -EBUSY;
+        if (new->thread) {
+                struct task_struct *t = new->thread;
+
+                new->thread = NULL;
+                if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+                        kthread_stop(t);
+                put_task_struct(t);
+        }
+        return ret;
 }
 
 /**
@@ -576,6 +714,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
         struct irq_desc *desc = irq_to_desc(irq);
         struct irqaction *action, **action_ptr;
+        struct task_struct *irqthread;
         unsigned long flags;
 
         WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -622,6 +761,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
                 else
                         desc->chip->disable(irq);
         }
+
+        irqthread = action->thread;
+        action->thread = NULL;
+
         spin_unlock_irqrestore(&desc->lock, flags);
 
         unregister_handler_proc(irq, action);
@@ -629,6 +772,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
         /* Make sure it's not being used on another CPU: */
         synchronize_irq(irq);
 
+        if (irqthread) {
+                if (!test_bit(IRQTF_DIED, &action->thread_flags))
+                        kthread_stop(irqthread);
+                put_task_struct(irqthread);
+        }
+
 #ifdef CONFIG_DEBUG_SHIRQ
         /*
          * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -681,9 +830,12 @@ void free_irq(unsigned int irq, void *dev_id)
 EXPORT_SYMBOL(free_irq);
 
 /**
- * request_irq - allocate an interrupt line
+ * request_threaded_irq - allocate an interrupt line
  * @irq: Interrupt line to allocate
- * @handler: Function to be called when the IRQ occurs
+ * @handler: Function to be called when the IRQ occurs.
+ *           Primary handler for threaded interrupts
+ * @thread_fn: Function called from the irq handler thread
+ *             If NULL, no irq thread is created
  * @irqflags: Interrupt type flags
  * @devname: An ascii name for the claiming device
  * @dev_id: A cookie passed back to the handler function
@@ -695,6 +847,15 @@ EXPORT_SYMBOL(free_irq);
  * raises, you must take care both to initialise your hardware
  * and to set up the interrupt handler in the right order.
  *
+ * If you want to set up a threaded irq handler for your device
+ * then you need to supply @handler and @thread_fn. @handler is
+ * still called in hard interrupt context and has to check
+ * whether the interrupt originates from the device. If yes it
+ * needs to disable the interrupt on the device and return
+ * IRQ_WAKE_THREAD which will wake up the handler thread and run
+ * @thread_fn. This split handler design is necessary to support
+ * shared interrupts.
+ *
  * Dev_id must be globally unique. Normally the address of the
  * device data structure is used as the cookie. Since the handler
  * receives this value it makes sense to use it.
@@ -710,8 +871,9 @@ EXPORT_SYMBOL(free_irq);
  * IRQF_TRIGGER_*       Specify active edge(s) or level
  *
  */
-int request_irq(unsigned int irq, irq_handler_t handler,
-                unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+                         irq_handler_t thread_fn, unsigned long irqflags,
+                         const char *devname, void *dev_id)
 {
         struct irqaction *action;
         struct irq_desc *desc;
@@ -759,6 +921,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
                 return -ENOMEM;
 
         action->handler = handler;
+        action->thread_fn = thread_fn;
         action->flags = irqflags;
         action->name = devname;
         action->dev_id = dev_id;
@@ -788,4 +951,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 #endif
         return retval;
 }
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
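The kerneldoc above describes the split handler design; here is a minimal, hypothetical driver sketch of how a primary handler defers the heavy lifting to the irq thread. The device structure, register offsets, bit masks and "foo" names are invented for illustration only:

struct foo_device {
        void __iomem *regs;
        int irq;
};

/* Hard interrupt context: check and mask the device, then defer. */
static irqreturn_t foo_quick_check(int irq, void *dev_id)
{
        struct foo_device *foo = dev_id;

        if (!(readl(foo->regs + 0x00) & 0x1))   /* not ours (shared line) */
                return IRQ_NONE;

        writel(0x0, foo->regs + 0x04);          /* mask the device interrupt */
        return IRQ_WAKE_THREAD;                 /* run foo_thread_fn() */
}

/* Runs in the "irq/<nr>-foo" kernel thread and may sleep. */
static irqreturn_t foo_thread_fn(int irq, void *dev_id)
{
        struct foo_device *foo = dev_id;

        /* ... long-running handling, possibly sleeping ... */
        writel(0x1, foo->regs + 0x04);          /* unmask again */
        return IRQ_HANDLED;
}

static int foo_setup_irq(struct foo_device *foo)
{
        return request_threaded_irq(foo->irq, foo_quick_check, foo_thread_fn,
                                    IRQF_SHARED, "foo", foo);
}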
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 243d6121e50e..44bbdcbaf8d2 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -54,6 +54,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 {
         free_kstat_irqs(old_desc, desc);
+        free_desc_masks(old_desc, desc);
         arch_free_chip_data(old_desc, desc);
 }
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5016bfb682b9..a5e74ddee0e2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 /* NOTE: change this value only with kprobe_mutex held */
-static bool kprobe_enabled;
+static bool kprobes_all_disarmed;
 
 static DEFINE_MUTEX(kprobe_mutex);      /* Protects kprobe_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
         struct kprobe *kp;
 
         list_for_each_entry_rcu(kp, &p->list, list) {
-                if (kp->pre_handler && !kprobe_gone(kp)) {
+                if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
                         set_kprobe_instance(kp);
                         if (kp->pre_handler(kp, regs))
                                 return 1;
@@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
         struct kprobe *kp;
 
         list_for_each_entry_rcu(kp, &p->list, list) {
-                if (kp->post_handler && !kprobe_gone(kp)) {
+                if (kp->post_handler && likely(!kprobe_disabled(kp))) {
                         set_kprobe_instance(kp);
                         kp->post_handler(kp, regs, flags);
                         reset_kprobe_instance();
@@ -518,20 +518,28 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 }
 
 /*
-* Add the new probe to old_p->list. Fail if this is the
+* Add the new probe to ap->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+        BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
         if (p->break_handler) {
-                if (old_p->break_handler)
+                if (ap->break_handler)
                         return -EEXIST;
-                list_add_tail_rcu(&p->list, &old_p->list);
-                old_p->break_handler = aggr_break_handler;
+                list_add_tail_rcu(&p->list, &ap->list);
+                ap->break_handler = aggr_break_handler;
         } else
-                list_add_rcu(&p->list, &old_p->list);
-        if (p->post_handler && !old_p->post_handler)
-                old_p->post_handler = aggr_post_handler;
+                list_add_rcu(&p->list, &ap->list);
+        if (p->post_handler && !ap->post_handler)
+                ap->post_handler = aggr_post_handler;
+
+        if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+                ap->flags &= ~KPROBE_FLAG_DISABLED;
+                if (!kprobes_all_disarmed)
+                        /* Arm the breakpoint again. */
+                        arch_arm_kprobe(ap);
+        }
         return 0;
 }
 
@@ -544,6 +552,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
         copy_kprobe(p, ap);
         flush_insn_slot(ap);
         ap->addr = p->addr;
+        ap->flags = p->flags;
         ap->pre_handler = aggr_pre_handler;
         ap->fault_handler = aggr_fault_handler;
         /* We don't care the kprobe which has gone. */
@@ -566,44 +575,59 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
                                           struct kprobe *p)
 {
         int ret = 0;
-        struct kprobe *ap;
+        struct kprobe *ap = old_p;
 
-        if (kprobe_gone(old_p)) {
+        if (old_p->pre_handler != aggr_pre_handler) {
+                /* If old_p is not an aggr_probe, create new aggr_kprobe. */
+                ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+                if (!ap)
+                        return -ENOMEM;
+                add_aggr_kprobe(ap, old_p);
+        }
+
+        if (kprobe_gone(ap)) {
                 /*
                  * Attempting to insert new probe at the same location that
                  * had a probe in the module vaddr area which already
                  * freed. So, the instruction slot has already been
                  * released. We need a new slot for the new probe.
                  */
-                ret = arch_prepare_kprobe(old_p);
+                ret = arch_prepare_kprobe(ap);
                 if (ret)
+                        /*
+                         * Even if fail to allocate new slot, don't need to
+                         * free aggr_probe. It will be used next time, or
+                         * freed by unregister_kprobe.
+                         */
                         return ret;
-        }
-        if (old_p->pre_handler == aggr_pre_handler) {
-                copy_kprobe(old_p, p);
-                ret = add_new_kprobe(old_p, p);
-                ap = old_p;
-        } else {
-                ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
-                if (!ap) {
-                        if (kprobe_gone(old_p))
-                                arch_remove_kprobe(old_p);
-                        return -ENOMEM;
-                }
-                add_aggr_kprobe(ap, old_p);
-                copy_kprobe(ap, p);
-                ret = add_new_kprobe(ap, p);
-        }
-        if (kprobe_gone(old_p)) {
+
                 /*
-                 * If the old_p has gone, its breakpoint has been disarmed.
-                 * We have to arm it again after preparing real kprobes.
+                 * Clear gone flag to prevent allocating new slot again, and
+                 * set disabled flag because it is not armed yet.
                  */
-                ap->flags &= ~KPROBE_FLAG_GONE;
-                if (kprobe_enabled)
-                        arch_arm_kprobe(ap);
+                ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+                            | KPROBE_FLAG_DISABLED;
         }
-        return ret;
+
+        copy_kprobe(ap, p);
+        return add_new_kprobe(ap, p);
+}
+
+/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
+static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
+{
+        struct kprobe *kp;
+
+        list_for_each_entry_rcu(kp, &p->list, list) {
+                if (!kprobe_disabled(kp))
+                        /*
+                         * There is an active probe on the list.
+                         * We can't disable aggr_kprobe.
+                         */
+                        return 0;
+        }
+        p->flags |= KPROBE_FLAG_DISABLED;
+        return 1;
 }
 
 static int __kprobes in_kprobes_functions(unsigned long addr)
@@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
                 return -EINVAL;
         }
 
-        p->flags = 0;
+        /* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+        p->flags &= KPROBE_FLAG_DISABLED;
+
         /*
          * Check if are we probing a module.
          */
@@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p)
         hlist_add_head_rcu(&p->hlist,
                        &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-        if (kprobe_enabled)
+        if (!kprobes_all_disarmed && !kprobe_disabled(p))
                 arch_arm_kprobe(p);
 
 out_unlock_text:
@@ -722,26 +748,39 @@ out:
 
         return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobe);
 
-/*
- * Unregister a kprobe without a scheduler synchronization.
- */
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
 {
         struct kprobe *old_p, *list_p;
 
         old_p = get_kprobe(p->addr);
         if (unlikely(!old_p))
-                return -EINVAL;
+                return NULL;
 
         if (p != old_p) {
                 list_for_each_entry_rcu(list_p, &old_p->list, list)
                         if (list_p == p)
                         /* kprobe p is a valid probe */
-                                goto valid_p;
-                return -EINVAL;
+                                goto valid;
+                return NULL;
         }
-valid_p:
+valid:
+        return old_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+        struct kprobe *old_p, *list_p;
+
+        old_p = __get_valid_kprobe(p);
+        if (old_p == NULL)
+                return -EINVAL;
+
         if (old_p == p ||
             (old_p->pre_handler == aggr_pre_handler &&
              list_is_singular(&old_p->list))) {
@@ -750,7 +789,7 @@ valid_p:
                  * enabled and not gone - otherwise, the breakpoint would
                  * already have been removed. We save on flushing icache.
                  */
-                if (kprobe_enabled && !kprobe_gone(old_p)) {
+                if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
                         mutex_lock(&text_mutex);
                         arch_disarm_kprobe(p);
                         mutex_unlock(&text_mutex);
@@ -768,6 +807,11 @@ valid_p:
                 }
 noclean:
                 list_del_rcu(&p->list);
+                if (!kprobe_disabled(old_p)) {
+                        try_to_disable_aggr_kprobe(old_p);
+                        if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+                                arch_disarm_kprobe(old_p);
+                }
         }
         return 0;
 }
@@ -803,11 +847,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
         }
         return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobes);
 
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
         unregister_kprobes(&p, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobe);
 
 void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 {
@@ -826,6 +872,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
                 if (kps[i]->addr)
                         __unregister_kprobe_bottom(kps[i]);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobes);
 
 static struct notifier_block kprobe_exceptions_nb = {
         .notifier_call = kprobe_exceptions_notify,
@@ -865,16 +912,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
         }
         return ret;
 }
+EXPORT_SYMBOL_GPL(register_jprobes);
 
 int __kprobes register_jprobe(struct jprobe *jp)
 {
         return register_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(register_jprobe);
 
 void __kprobes unregister_jprobe(struct jprobe *jp)
 {
         unregister_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_jprobe);
 
 void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 {
@@ -894,6 +944,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
                         __unregister_kprobe_bottom(&jps[i]->kp);
         }
 }
+EXPORT_SYMBOL_GPL(unregister_jprobes);
 
 #ifdef CONFIG_KRETPROBES
 /*
@@ -987,6 +1038,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
                 free_rp_inst(rp);
         return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1004,11 +1056,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
         }
         return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
 
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
         unregister_kretprobes(&rp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1030,24 +1084,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
                 }
         }
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 #else /* CONFIG_KRETPROBES */
 int __kprobes register_kretprobe(struct kretprobe *rp)
 {
         return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
         return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 static int __kprobes pre_handler_kretprobe(struct kprobe *p,
                                            struct pt_regs *regs)
@@ -1061,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 static void __kprobes kill_kprobe(struct kprobe *p)
 {
         struct kprobe *kp;
+
         p->flags |= KPROBE_FLAG_GONE;
         if (p->pre_handler == aggr_pre_handler) {
                 /*
@@ -1173,8 +1234,8 @@ static int __init init_kprobes(void)
                 }
         }
 
-        /* By default, kprobes are enabled */
-        kprobe_enabled = true;
+        /* By default, kprobes are armed */
+        kprobes_all_disarmed = false;
 
         err = arch_init_kprobes();
         if (!err)
@@ -1202,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
         else
                 kprobe_type = "k";
         if (sym)
-                seq_printf(pi, "%p %s %s+0x%x %s %s\n", p->addr, kprobe_type,
-                        sym, offset, (modname ? modname : " "),
-                        (kprobe_gone(p) ? "[GONE]" : ""));
+                seq_printf(pi, "%p %s %s+0x%x %s %s%s\n",
+                        p->addr, kprobe_type, sym, offset,
+                        (modname ? modname : " "),
+                        (kprobe_gone(p) ? "[GONE]" : ""),
+                        ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+                         "[DISABLED]" : ""));
         else
-                seq_printf(pi, "%p %s %p %s\n", p->addr, kprobe_type, p->addr,
-                        (kprobe_gone(p) ? "[GONE]" : ""));
+                seq_printf(pi, "%p %s %p %s%s\n",
+                        p->addr, kprobe_type, p->addr,
+                        (kprobe_gone(p) ? "[GONE]" : ""),
+                        ((kprobe_disabled(p) && !kprobe_gone(p)) ?
+                         "[DISABLED]" : ""));
 }
 
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1272,7 +1339,72 @@ static struct file_operations debugfs_kprobes_operations = {
         .release        = seq_release,
 };
 
-static void __kprobes enable_all_kprobes(void)
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+        int ret = 0;
+        struct kprobe *p;
+
+        mutex_lock(&kprobe_mutex);
+
+        /* Check whether specified probe is valid. */
+        p = __get_valid_kprobe(kp);
+        if (unlikely(p == NULL)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        /* If the probe is already disabled (or gone), just return */
+        if (kprobe_disabled(kp))
+                goto out;
+
+        kp->flags |= KPROBE_FLAG_DISABLED;
+        if (p != kp)
+                /* When kp != p, p is always enabled. */
+                try_to_disable_aggr_kprobe(p);
+
+        if (!kprobes_all_disarmed && kprobe_disabled(p))
+                arch_disarm_kprobe(p);
+out:
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+        int ret = 0;
+        struct kprobe *p;
+
+        mutex_lock(&kprobe_mutex);
+
+        /* Check whether specified probe is valid. */
+        p = __get_valid_kprobe(kp);
+        if (unlikely(p == NULL)) {
+                ret = -EINVAL;
+                goto out;
+        }
+
+        if (kprobe_gone(kp)) {
+                /* This kprobe has gone, we couldn't enable it. */
+                ret = -EINVAL;
+                goto out;
+        }
+
+        if (!kprobes_all_disarmed && kprobe_disabled(p))
+                arch_arm_kprobe(p);
+
+        p->flags &= ~KPROBE_FLAG_DISABLED;
+        if (p != kp)
+                kp->flags &= ~KPROBE_FLAG_DISABLED;
+out:
+        mutex_unlock(&kprobe_mutex);
+        return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
+static void __kprobes arm_all_kprobes(void)
 {
         struct hlist_head *head;
         struct hlist_node *node;
@@ -1281,20 +1413,20 @@ static void __kprobes enable_all_kprobes(void)
 
         mutex_lock(&kprobe_mutex);
 
-        /* If kprobes are already enabled, just return */
-        if (kprobe_enabled)
+        /* If kprobes are armed, just return */
+        if (!kprobes_all_disarmed)
                 goto already_enabled;
 
         mutex_lock(&text_mutex);
         for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                 head = &kprobe_table[i];
                 hlist_for_each_entry_rcu(p, node, head, hlist)
-                        if (!kprobe_gone(p))
+                        if (!kprobe_disabled(p))
                                 arch_arm_kprobe(p);
         }
         mutex_unlock(&text_mutex);
 
-        kprobe_enabled = true;
+        kprobes_all_disarmed = false;
         printk(KERN_INFO "Kprobes globally enabled\n");
 
 already_enabled:
@@ -1302,7 +1434,7 @@ already_enabled:
         return;
 }
 
-static void __kprobes disable_all_kprobes(void)
+static void __kprobes disarm_all_kprobes(void)
 {
         struct hlist_head *head;
         struct hlist_node *node;
@@ -1311,17 +1443,17 @@ static void __kprobes disable_all_kprobes(void)
 
         mutex_lock(&kprobe_mutex);
 
-        /* If kprobes are already disabled, just return */
-        if (!kprobe_enabled)
+        /* If kprobes are already disarmed, just return */
+        if (kprobes_all_disarmed)
                 goto already_disabled;
 
-        kprobe_enabled = false;
+        kprobes_all_disarmed = true;
         printk(KERN_INFO "Kprobes globally disabled\n");
         mutex_lock(&text_mutex);
         for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
                 head = &kprobe_table[i];
                 hlist_for_each_entry_rcu(p, node, head, hlist) {
-                        if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
+                        if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
                                 arch_disarm_kprobe(p);
                 }
         }
@@ -1347,7 +1479,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
 {
         char buf[3];
 
-        if (kprobe_enabled)
+        if (!kprobes_all_disarmed)
                 buf[0] = '1';
         else
                 buf[0] = '0';
@@ -1370,12 +1502,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
         case 'y':
         case 'Y':
         case '1':
-                enable_all_kprobes();
+                arm_all_kprobes();
                 break;
         case 'n':
         case 'N':
         case '0':
-                disable_all_kprobes();
+                disarm_all_kprobes();
                 break;
         }
 
@@ -1418,16 +1550,5 @@ late_initcall(debugfs_kprobe_init);
1418 1550
1419module_init(init_kprobes); 1551module_init(init_kprobes);
1420 1552
1421EXPORT_SYMBOL_GPL(register_kprobe); 1553/* defined in arch/.../kernel/kprobes.c */
1422EXPORT_SYMBOL_GPL(unregister_kprobe);
1423EXPORT_SYMBOL_GPL(register_kprobes);
1424EXPORT_SYMBOL_GPL(unregister_kprobes);
1425EXPORT_SYMBOL_GPL(register_jprobe);
1426EXPORT_SYMBOL_GPL(unregister_jprobe);
1427EXPORT_SYMBOL_GPL(register_jprobes);
1428EXPORT_SYMBOL_GPL(unregister_jprobes);
1429EXPORT_SYMBOL_GPL(jprobe_return); 1554EXPORT_SYMBOL_GPL(jprobe_return);
1430EXPORT_SYMBOL_GPL(register_kretprobe);
1431EXPORT_SYMBOL_GPL(unregister_kretprobe);
1432EXPORT_SYMBOL_GPL(register_kretprobes);
1433EXPORT_SYMBOL_GPL(unregister_kretprobes);
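
The kprobes rework above replaces the global kprobe_enabled flag with kprobes_all_disarmed plus per-probe KPROBE_FLAG_DISABLED state, reachable through enable_kprobe() and the debugfs 'enabled' file. A minimal sketch of the per-probe interface follows; it assumes the disable_kprobe() counterpart added in the same series, and the probed symbol and module names are illustrative only, not taken from this diff.

#include <linux/module.h>
#include <linux/kprobes.h>

static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe demo_kp = {
	.symbol_name	= "do_fork",		/* illustrative probe point */
	.pre_handler	= demo_pre_handler,
};

static int __init demo_init(void)
{
	int ret = register_kprobe(&demo_kp);

	if (ret < 0)
		return ret;
	disable_kprobe(&demo_kp);	/* parked: stays registered, not armed */
	enable_kprobe(&demo_kp);	/* re-armed unless kprobes_all_disarmed */
	return 0;
}

static void __exit demo_exit(void)
{
	unregister_kprobe(&demo_kp);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

Writing '0' or '1' to /sys/kernel/debug/kprobes/enabled still toggles every registered probe at once, now via disarm_all_kprobes()/arm_all_kprobes().
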
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 84bbadd4d021..4ebaf8519abf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -76,6 +76,7 @@ static int kthread(void *_create)
76 76
77 /* OK, tell user we're spawned, wait for stop or wakeup */ 77 /* OK, tell user we're spawned, wait for stop or wakeup */
78 __set_current_state(TASK_UNINTERRUPTIBLE); 78 __set_current_state(TASK_UNINTERRUPTIBLE);
79 create->result = current;
79 complete(&create->started); 80 complete(&create->started);
80 schedule(); 81 schedule();
81 82
@@ -96,22 +97,10 @@ static void create_kthread(struct kthread_create_info *create)
96 97
97 /* We want our own signal handler (we take no signals by default). */ 98 /* We want our own signal handler (we take no signals by default). */
98 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD); 99 pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
99 if (pid < 0) { 100 if (pid < 0)
100 create->result = ERR_PTR(pid); 101 create->result = ERR_PTR(pid);
101 } else { 102 else
102 struct sched_param param = { .sched_priority = 0 };
103 wait_for_completion(&create->started); 103 wait_for_completion(&create->started);
104 read_lock(&tasklist_lock);
105 create->result = find_task_by_pid_ns(pid, &init_pid_ns);
106 read_unlock(&tasklist_lock);
107 /*
108 * root may have changed our (kthreadd's) priority or CPU mask.
109 * The kernel thread should not inherit these properties.
110 */
111 sched_setscheduler(create->result, SCHED_NORMAL, &param);
112 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
113 set_cpus_allowed_ptr(create->result, cpu_all_mask);
114 }
115 complete(&create->done); 104 complete(&create->done);
116} 105}
117 106
@@ -154,11 +143,20 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
154 wait_for_completion(&create.done); 143 wait_for_completion(&create.done);
155 144
156 if (!IS_ERR(create.result)) { 145 if (!IS_ERR(create.result)) {
146 struct sched_param param = { .sched_priority = 0 };
157 va_list args; 147 va_list args;
148
158 va_start(args, namefmt); 149 va_start(args, namefmt);
159 vsnprintf(create.result->comm, sizeof(create.result->comm), 150 vsnprintf(create.result->comm, sizeof(create.result->comm),
160 namefmt, args); 151 namefmt, args);
161 va_end(args); 152 va_end(args);
153 /*
154 * root may have changed our (kthreadd's) priority or CPU mask.
155 * The kernel thread should not inherit these properties.
156 */
157 sched_setscheduler_nocheck(create.result, SCHED_NORMAL, &param);
158 set_user_nice(create.result, KTHREAD_NICE_LEVEL);
159 set_cpus_allowed_ptr(create.result, cpu_all_mask);
162 } 160 }
163 return create.result; 161 return create.result;
164} 162}
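
With the policy, nice-level and CPU-mask normalization moved from create_kthread() into kthread_create() itself (and switched to sched_setscheduler_nocheck()), callers keep using the same pattern as before. A generic sketch; the thread function and names are assumptions of this example:

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* periodic work */
		msleep(1000);
	}
	return 0;
}

static int demo_start(void)
{
	/*
	 * The new thread now gets SCHED_NORMAL, the default nice level and
	 * a full CPU mask regardless of how kthreadd itself was tuned.
	 */
	demo_task = kthread_create(demo_thread_fn, NULL, "demo_thread");
	if (IS_ERR(demo_task))
		return PTR_ERR(demo_task);
	wake_up_process(demo_task);
	return 0;
}

static void demo_stop(void)
{
	kthread_stop(demo_task);
}
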
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81b5f33970b8..b0f011866969 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -793,6 +793,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
793 793
794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); 794 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
795 printk("turning off the locking correctness validator.\n"); 795 printk("turning off the locking correctness validator.\n");
796 dump_stack();
796 return NULL; 797 return NULL;
797 } 798 }
798 class = lock_classes + nr_lock_classes++; 799 class = lock_classes + nr_lock_classes++;
@@ -856,6 +857,7 @@ static struct lock_list *alloc_list_entry(void)
856 857
857 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); 858 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
858 printk("turning off the locking correctness validator.\n"); 859 printk("turning off the locking correctness validator.\n");
860 dump_stack();
859 return NULL; 861 return NULL;
860 } 862 }
861 return list_entries + nr_list_entries++; 863 return list_entries + nr_list_entries++;
@@ -1682,6 +1684,7 @@ cache_hit:
1682 1684
1683 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); 1685 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1684 printk("turning off the locking correctness validator.\n"); 1686 printk("turning off the locking correctness validator.\n");
1687 dump_stack();
1685 return 0; 1688 return 0;
1686 } 1689 }
1687 chain = lock_chains + nr_lock_chains++; 1690 chain = lock_chains + nr_lock_chains++;
@@ -2541,6 +2544,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2541 debug_locks_off(); 2544 debug_locks_off();
2542 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n"); 2545 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
2543 printk("turning off the locking correctness validator.\n"); 2546 printk("turning off the locking correctness validator.\n");
2547 dump_stack();
2544 return 0; 2548 return 0;
2545 } 2549 }
2546 2550
@@ -2637,6 +2641,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2637 debug_locks_off(); 2641 debug_locks_off();
2638 printk("BUG: MAX_LOCK_DEPTH too low!\n"); 2642 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2639 printk("turning off the locking correctness validator.\n"); 2643 printk("turning off the locking correctness validator.\n");
2644 dump_stack();
2640 return 0; 2645 return 0;
2641 } 2646 }
2642 2647
diff --git a/kernel/module.c b/kernel/module.c
index c268a771595c..e797812a4d95 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod,
1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit")) 1952 if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC; 1953 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1954#endif 1954#endif
1955 /* Don't keep __versions around; it's just for loading. */
1956 if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
1957 sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
1958 } 1955 }
1959 1956
1960 modindex = find_sec(hdr, sechdrs, secstrings, 1957 modindex = find_sec(hdr, sechdrs, secstrings,
@@ -2391,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
2391 blocking_notifier_call_chain(&module_notify_list, 2388 blocking_notifier_call_chain(&module_notify_list,
2392 MODULE_STATE_LIVE, mod); 2389 MODULE_STATE_LIVE, mod);
2393 2390
2391 /* We need to finish all async code before the module init sequence is done */
2392 async_synchronize_full();
2393
2394 mutex_lock(&module_mutex); 2394 mutex_lock(&module_mutex);
2395 /* Drop initial reference. */ 2395 /* Drop initial reference. */
2396 module_put(mod); 2396 module_put(mod);
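
The async_synchronize_full() call added to the init_module() path makes sure work queued with async_schedule() during a module's init has finished before the initial reference is dropped and __init memory can go away. A hedged sketch of the pattern it protects; the driver and function names are invented for illustration:

#include <linux/module.h>
#include <linux/async.h>

/* May live in .init.text, which is exactly why init_module() must wait. */
static void __init demo_async_probe(void *data, async_cookie_t cookie)
{
	/* slow hardware scan running concurrently with the rest of init */
}

static int __init demo_driver_init(void)
{
	async_schedule(demo_async_probe, NULL);
	return 0;	/* init_module() now runs async_synchronize_full() */
}

static void __exit demo_driver_exit(void)
{
}

module_init(demo_driver_init);
module_exit(demo_driver_exit);
MODULE_LICENSE("GPL");
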
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a3..507cf2b5e9f1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
148 148
149 preempt_disable(); 149 preempt_disable();
150 mutex_acquire(&lock->dep_map, subclass, 0, ip); 150 mutex_acquire(&lock->dep_map, subclass, 0, ip);
151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) 151#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \
152 !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES)
152 /* 153 /*
153 * Optimistic spinning. 154 * Optimistic spinning.
154 * 155 *
diff --git a/kernel/panic.c b/kernel/panic.c
index 3fd8c5bf8b39..934fb377f4b3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -213,8 +213,16 @@ unsigned long get_taint(void)
213 213
214void add_taint(unsigned flag) 214void add_taint(unsigned flag)
215{ 215{
216 /* can't trust the integrity of the kernel anymore: */ 216 /*
217 debug_locks = 0; 217 * Can't trust the integrity of the kernel anymore.
 218 * We don't call debug_locks_off() directly because the issue
 219 * is not necessarily serious enough to set oops_in_progress to 1.
 220 * Also we want to keep lockdep enabled for staging development and
 221 * the post-warning case.
222 */
223 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off())
224 printk(KERN_WARNING "Disabling lockdep due to kernel taint\n");
225
218 set_bit(flag, &tainted_mask); 226 set_bit(flag, &tainted_mask);
219} 227}
220EXPORT_SYMBOL(add_taint); 228EXPORT_SYMBOL(add_taint);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 8e5d9a68b022..c9dcf98b4463 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -18,7 +18,7 @@ void update_rlimit_cpu(unsigned long rlim_new)
18 18
19 cputime = secs_to_cputime(rlim_new); 19 cputime = secs_to_cputime(rlim_new);
20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) || 20 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
21 cputime_lt(current->signal->it_prof_expires, cputime)) { 21 cputime_gt(current->signal->it_prof_expires, cputime)) {
22 spin_lock_irq(&current->sighand->siglock); 22 spin_lock_irq(&current->sighand->siglock);
23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL); 23 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
24 spin_unlock_irq(&current->sighand->siglock); 24 spin_unlock_irq(&current->sighand->siglock);
@@ -224,7 +224,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
224 cpu->cpu = virt_ticks(p); 224 cpu->cpu = virt_ticks(p);
225 break; 225 break;
226 case CPUCLOCK_SCHED: 226 case CPUCLOCK_SCHED:
227 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p); 227 cpu->sched = task_sched_runtime(p);
228 break; 228 break;
229 } 229 }
230 return 0; 230 return 0;
@@ -305,18 +305,19 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
305{ 305{
306 struct task_cputime cputime; 306 struct task_cputime cputime;
307 307
308 thread_group_cputime(p, &cputime);
309 switch (CPUCLOCK_WHICH(which_clock)) { 308 switch (CPUCLOCK_WHICH(which_clock)) {
310 default: 309 default:
311 return -EINVAL; 310 return -EINVAL;
312 case CPUCLOCK_PROF: 311 case CPUCLOCK_PROF:
312 thread_group_cputime(p, &cputime);
313 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 313 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
314 break; 314 break;
315 case CPUCLOCK_VIRT: 315 case CPUCLOCK_VIRT:
316 thread_group_cputime(p, &cputime);
316 cpu->cpu = cputime.utime; 317 cpu->cpu = cputime.utime;
317 break; 318 break;
318 case CPUCLOCK_SCHED: 319 case CPUCLOCK_SCHED:
319 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p); 320 cpu->sched = thread_group_sched_runtime(p);
320 break; 321 break;
321 } 322 }
322 return 0; 323 return 0;
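
cpu_clock_sample() and cpu_clock_sample_group() now take their CPUCLOCK_SCHED values from task_sched_runtime() and thread_group_sched_runtime() (added in the sched.c part of this series) instead of adding task_delta_exec() by hand. From userspace these samplers sit behind the per-thread and per-process CPU-time clocks; a small illustrative reader, which on older glibc needs -lrt:

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec proc_ts, thread_ts;

	/*
	 * Process-wide and per-thread CPU clocks, served by the group and
	 * per-task samplers respectively.
	 */
	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &proc_ts);
	clock_gettime(CLOCK_THREAD_CPUTIME_ID, &thread_ts);

	printf("process %ld.%09ld s, thread %ld.%09ld s\n",
	       (long)proc_ts.tv_sec, proc_ts.tv_nsec,
	       (long)thread_ts.tv_sec, thread_ts.tv_nsec);
	return 0;
}
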
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 5f21ab2bbcdf..0854770b63b9 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
22#include <linux/console.h> 22#include <linux/console.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <scsi/scsi_scan.h>
25#include <asm/suspend.h> 26#include <asm/suspend.h>
26 27
27#include "power.h" 28#include "power.h"
@@ -645,6 +646,13 @@ static int software_resume(void)
645 return 0; 646 return 0;
646 647
647 /* 648 /*
 649 * We can't depend on SCSI devices being available after loading one of
 650 * their modules if scsi_complete_async_scans() is not called, and the
 651 * resume device is usually a SCSI one.
652 */
653 scsi_complete_async_scans();
654
655 /*
648 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs 656 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
649 * is configured into the kernel. Since the regular hibernate 657 * is configured into the kernel. Since the regular hibernate
650 * trigger path is via sysfs which takes a buffer mutex before 658 * trigger path is via sysfs which takes a buffer mutex before
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 6c85359364f2..ed97375daae9 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -24,6 +24,7 @@
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h> 26#include <linux/smp_lock.h>
27#include <scsi/scsi_scan.h>
27 28
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
29 30
@@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp)
92 filp->private_data = data; 93 filp->private_data = data;
93 memset(&data->handle, 0, sizeof(struct snapshot_handle)); 94 memset(&data->handle, 0, sizeof(struct snapshot_handle));
94 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { 95 if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
96 /* Hibernating. The image device should be accessible. */
95 data->swap = swsusp_resume_device ? 97 data->swap = swsusp_resume_device ?
96 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 98 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
97 data->mode = O_RDONLY; 99 data->mode = O_RDONLY;
@@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp)
99 if (error) 101 if (error)
100 pm_notifier_call_chain(PM_POST_HIBERNATION); 102 pm_notifier_call_chain(PM_POST_HIBERNATION);
101 } else { 103 } else {
104 /*
105 * Resuming. We may need to wait for the image device to
106 * appear.
107 */
108 wait_for_device_probe();
109 scsi_complete_async_scans();
110
102 data->swap = -1; 111 data->swap = -1;
103 data->mode = O_WRONLY; 112 data->mode = O_WRONLY;
104 error = pm_notifier_call_chain(PM_RESTORE_PREPARE); 113 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec34194..dfcd83ceee3b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -21,9 +21,7 @@
21#include <linux/audit.h> 21#include <linux/audit.h>
22#include <linux/pid_namespace.h> 22#include <linux/pid_namespace.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24 24#include <linux/uaccess.h>
25#include <asm/pgtable.h>
26#include <asm/uaccess.h>
27 25
28 26
29/* 27/*
@@ -48,7 +46,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
48 list_add(&child->ptrace_entry, &new_parent->ptraced); 46 list_add(&child->ptrace_entry, &new_parent->ptraced);
49 child->parent = new_parent; 47 child->parent = new_parent;
50} 48}
51 49
52/* 50/*
53 * Turn a tracing stop into a normal stop now, since with no tracer there 51 * Turn a tracing stop into a normal stop now, since with no tracer there
54 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a 52 * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
@@ -173,7 +171,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
173 task_lock(task); 171 task_lock(task);
174 err = __ptrace_may_access(task, mode); 172 err = __ptrace_may_access(task, mode);
175 task_unlock(task); 173 task_unlock(task);
176 return (!err ? true : false); 174 return !err;
177} 175}
178 176
179int ptrace_attach(struct task_struct *task) 177int ptrace_attach(struct task_struct *task)
@@ -358,7 +356,7 @@ int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst
358 copied += retval; 356 copied += retval;
359 src += retval; 357 src += retval;
360 dst += retval; 358 dst += retval;
361 len -= retval; 359 len -= retval;
362 } 360 }
363 return copied; 361 return copied;
364} 362}
@@ -383,7 +381,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
383 copied += retval; 381 copied += retval;
384 src += retval; 382 src += retval;
385 dst += retval; 383 dst += retval;
386 len -= retval; 384 len -= retval;
387 } 385 }
388 return copied; 386 return copied;
389} 387}
@@ -496,9 +494,9 @@ static int ptrace_resume(struct task_struct *child, long request, long data)
496 if (unlikely(!arch_has_single_step())) 494 if (unlikely(!arch_has_single_step()))
497 return -EIO; 495 return -EIO;
498 user_enable_single_step(child); 496 user_enable_single_step(child);
499 } 497 } else {
500 else
501 user_disable_single_step(child); 498 user_disable_single_step(child);
499 }
502 500
503 child->exit_code = data; 501 child->exit_code = data;
504 wake_up_process(child); 502 wake_up_process(child);
@@ -606,10 +604,11 @@ repeat:
606 ret = security_ptrace_traceme(current->parent); 604 ret = security_ptrace_traceme(current->parent);
607 605
608 /* 606 /*
609 * Set the ptrace bit in the process ptrace flags. 607 * Check PF_EXITING to ensure ->real_parent has not passed
610 * Then link us on our parent's ptraced list. 608 * exit_ptrace(). Otherwise we don't report the error but
609 * pretend ->real_parent untraces us right after return.
611 */ 610 */
612 if (!ret) { 611 if (!ret && !(current->real_parent->flags & PF_EXITING)) {
613 current->ptrace |= PT_PTRACED; 612 current->ptrace |= PT_PTRACED;
614 __ptrace_link(current, current->real_parent); 613 __ptrace_link(current, current->real_parent);
615 } 614 }
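
The extra PF_EXITING test closes a race in PTRACE_TRACEME where the real parent is already past exit_ptrace(). The handshake it affects looks like this from userspace (a plain illustrative sketch, not code from this patch):

#include <unistd.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		/* silently ignored now if the real parent is exiting */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);			/* hand control to the parent */
		_exit(0);
	}

	waitpid(child, NULL, 0);		/* observe the SIGSTOP stop */
	ptrace(PTRACE_CONT, child, NULL, NULL);	/* let the child run on */
	waitpid(child, NULL, 0);		/* reap its exit */
	return 0;
}
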
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640a6b9c..0f2b0b311304 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
66 .cpumask = CPU_BITS_NONE, 66 .cpumask = CPU_BITS_NONE,
67}; 67};
68
68static struct rcu_ctrlblk rcu_bh_ctrlblk = { 69static struct rcu_ctrlblk rcu_bh_ctrlblk = {
69 .cur = -300, 70 .cur = -300,
70 .completed = -300, 71 .completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
73 .cpumask = CPU_BITS_NONE, 74 .cpumask = CPU_BITS_NONE,
74}; 75};
75 76
76DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 77static DEFINE_PER_CPU(struct rcu_data, rcu_data);
77DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; 78static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
79
80/*
81 * Increment the quiescent state counter.
 82 * The counter is somewhat degenerate: we do not need to know
 83 * how many quiescent states passed, only whether there was at least
 84 * one since the start of the grace period. Thus it is just a flag.
85 */
86void rcu_qsctr_inc(int cpu)
87{
88 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
89 rdp->passed_quiesc = 1;
90}
91
92void rcu_bh_qsctr_inc(int cpu)
93{
94 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
95 rdp->passed_quiesc = 1;
96}
78 97
79static int blimit = 10; 98static int blimit = 10;
80static int qhimark = 10000; 99static int qhimark = 10000;
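
rcu_qsctr_inc() and rcu_bh_qsctr_inc() move out of the header so the per-CPU rcu_data structures can become static; callers in the scheduler and softirq paths still just flag "this CPU passed a quiescent state". For context, a generic sketch of the read/update pattern those quiescent states ultimately serve (ordinary RCU usage, not code from this patch; a single updater is assumed):

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct demo_cfg {
	int threshold;
};

static struct demo_cfg *demo_cfg_ptr;

static int demo_read_threshold(void)
{
	struct demo_cfg *c;
	int val = 0;

	rcu_read_lock();		/* readers never block ... */
	c = rcu_dereference(demo_cfg_ptr);
	if (c)
		val = c->threshold;
	rcu_read_unlock();		/* ... so each CPU soon reports a QS */
	return val;
}

static void demo_update_threshold(int v)
{
	struct demo_cfg *newc = kmalloc(sizeof(*newc), GFP_KERNEL);
	struct demo_cfg *old = demo_cfg_ptr;

	if (!newc)
		return;
	newc->threshold = v;
	rcu_assign_pointer(demo_cfg_ptr, newc);
	synchronize_rcu();		/* grace period: every CPU passes through
					   rcu_qsctr_inc() at least once */
	kfree(old);
}
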
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e850fb71..ce97a4df64d3 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */ 147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
148}; 148};
149 149
150struct rcu_dyntick_sched {
151 int dynticks;
152 int dynticks_snap;
153 int sched_qs;
154 int sched_qs_snap;
155 int sched_dynticks_snap;
156};
157
158static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
159 .dynticks = 1,
160};
161
162void rcu_qsctr_inc(int cpu)
163{
164 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
165
166 rdssp->sched_qs++;
167}
168
169#ifdef CONFIG_NO_HZ
170
171void rcu_enter_nohz(void)
172{
173 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
174
175 smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
176 __get_cpu_var(rcu_dyntick_sched).dynticks++;
177 WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
178}
179
180void rcu_exit_nohz(void)
181{
182 static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
183
184 __get_cpu_var(rcu_dyntick_sched).dynticks++;
185 smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
186 WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
187 &rs);
188}
189
190#endif /* CONFIG_NO_HZ */
191
192
150static DEFINE_PER_CPU(struct rcu_data, rcu_data); 193static DEFINE_PER_CPU(struct rcu_data, rcu_data);
194
151static struct rcu_ctrlblk rcu_ctrlblk = { 195static struct rcu_ctrlblk rcu_ctrlblk = {
152 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 196 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
153 .completed = 0, 197 .completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
427 } 471 }
428} 472}
429 473
430DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
431 .dynticks = 1,
432};
433
434#ifdef CONFIG_NO_HZ 474#ifdef CONFIG_NO_HZ
435static DEFINE_PER_CPU(int, rcu_update_flag); 475static DEFINE_PER_CPU(int, rcu_update_flag);
436 476
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce31579ec0..7f3266922572 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -78,6 +78,26 @@ DEFINE_PER_CPU(struct rcu_data, rcu_data);
78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 78struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 79DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
80 80
81/*
82 * Increment the quiescent state counter.
 83 * The counter is somewhat degenerate: we do not need to know
 84 * how many quiescent states passed, only whether there was at least
 85 * one since the start of the grace period. Thus it is just a flag.
86 */
87void rcu_qsctr_inc(int cpu)
88{
89 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
90 rdp->passed_quiesc = 1;
91 rdp->passed_quiesc_completed = rdp->completed;
92}
93
94void rcu_bh_qsctr_inc(int cpu)
95{
96 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
97 rdp->passed_quiesc = 1;
98 rdp->passed_quiesc_completed = rdp->completed;
99}
100
81#ifdef CONFIG_NO_HZ 101#ifdef CONFIG_NO_HZ
82DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 102DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
83 .dynticks_nesting = 1, 103 .dynticks_nesting = 1,
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 000000000000..5e872bbf07f5
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
1
2/*
3 * RCU implementation internal declarations:
4 */
5extern struct rcu_state rcu_state;
6DECLARE_PER_CPU(struct rcu_data, rcu_data);
7
8extern struct rcu_state rcu_bh_state;
9DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
10
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e837826..4ee954f6a8d5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/seq_file.h> 44#include <linux/seq_file.h>
45 45
46#include "rcutree.h"
47
46static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) 48static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
47{ 49{
48 if (!rdp->beenonline) 50 if (!rdp->beenonline)
diff --git a/kernel/sched.c b/kernel/sched.c
index bec249885e17..5724508c3b66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
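
The bandwidth-timer hunk open-codes hrtimer_start_expires() as __hrtimer_start_range_ns() with a trailing 0, which (as I read it) arms the timer without the remote-wakeup step that is unsafe while holding the runqueue lock. The same range interface in its ordinary driver-facing form looks like this; the callback, period and slack are assumptions of the sketch:

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;

static enum hrtimer_restart demo_timer_fn(struct hrtimer *t)
{
	/* periodic work, then re-arm 10 ms ahead */
	hrtimer_forward_now(t, ktime_set(0, 10 * NSEC_PER_MSEC));
	return HRTIMER_RESTART;
}

static void demo_timer_start(void)
{
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	demo_timer.function = demo_timer_fn;
	/* fire in 10 ms, tolerate up to 1 ms of slack for coalescing */
	hrtimer_start_range_ns(&demo_timer, ktime_set(0, 10 * NSEC_PER_MSEC),
			       NSEC_PER_MSEC, HRTIMER_MODE_REL);
}
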
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
1146 */ 1153 */
1147static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1148{ 1155{
1149 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1150} 1158}
1151 1159
1152static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1410,10 +1418,22 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1410 struct rq_iterator *iterator); 1418 struct rq_iterator *iterator);
1411#endif 1419#endif
1412 1420
1421/* Time spent by the tasks of the cpu accounting group executing in ... */
1422enum cpuacct_stat_index {
1423 CPUACCT_STAT_USER, /* ... user mode */
1424 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1425
1426 CPUACCT_STAT_NSTATS,
1427};
1428
1413#ifdef CONFIG_CGROUP_CPUACCT 1429#ifdef CONFIG_CGROUP_CPUACCT
1414static void cpuacct_charge(struct task_struct *tsk, u64 cputime); 1430static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1431static void cpuacct_update_stats(struct task_struct *tsk,
1432 enum cpuacct_stat_index idx, cputime_t val);
1415#else 1433#else
1416static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} 1434static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1435static inline void cpuacct_update_stats(struct task_struct *tsk,
1436 enum cpuacct_stat_index idx, cputime_t val) {}
1417#endif 1437#endif
1418 1438
1419static inline void inc_cpu_load(struct rq *rq, unsigned long load) 1439static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@ -4503,9 +4523,25 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4503EXPORT_PER_CPU_SYMBOL(kstat); 4523EXPORT_PER_CPU_SYMBOL(kstat);
4504 4524
4505/* 4525/*
4506 * Return any ns on the sched_clock that have not yet been banked in 4526 * Return any ns on the sched_clock that have not yet been accounted in
4507 * @p in case that task is currently running. 4527 * @p in case that task is currently running.
4528 *
4529 * Called with task_rq_lock() held on @rq.
4508 */ 4530 */
4531static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
4532{
4533 u64 ns = 0;
4534
4535 if (task_current(rq, p)) {
4536 update_rq_clock(rq);
4537 ns = rq->clock - p->se.exec_start;
4538 if ((s64)ns < 0)
4539 ns = 0;
4540 }
4541
4542 return ns;
4543}
4544
4509unsigned long long task_delta_exec(struct task_struct *p) 4545unsigned long long task_delta_exec(struct task_struct *p)
4510{ 4546{
4511 unsigned long flags; 4547 unsigned long flags;
@@ -4513,16 +4549,49 @@ unsigned long long task_delta_exec(struct task_struct *p)
4513 u64 ns = 0; 4549 u64 ns = 0;
4514 4550
4515 rq = task_rq_lock(p, &flags); 4551 rq = task_rq_lock(p, &flags);
4552 ns = do_task_delta_exec(p, rq);
4553 task_rq_unlock(rq, &flags);
4516 4554
4517 if (task_current(rq, p)) { 4555 return ns;
4518 u64 delta_exec; 4556}
4519 4557
4520 update_rq_clock(rq); 4558/*
4521 delta_exec = rq->clock - p->se.exec_start; 4559 * Return accounted runtime for the task.
4522 if ((s64)delta_exec > 0) 4560 * In case the task is currently running, return the runtime plus current's
4523 ns = delta_exec; 4561 * pending runtime that have not been accounted yet.
4524 } 4562 */
4563unsigned long long task_sched_runtime(struct task_struct *p)
4564{
4565 unsigned long flags;
4566 struct rq *rq;
4567 u64 ns = 0;
4568
4569 rq = task_rq_lock(p, &flags);
4570 ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
4571 task_rq_unlock(rq, &flags);
4572
4573 return ns;
4574}
4525 4575
4576/*
4577 * Return sum_exec_runtime for the thread group.
4578 * In case the task is currently running, return the sum plus current's
 4579 * pending runtime that has not been accounted yet.
4580 *
4581 * Note that the thread group might have other running tasks as well,
 4582 * so the return value does not include pending runtime that other
 4583 * running tasks might have.
4584 */
4585unsigned long long thread_group_sched_runtime(struct task_struct *p)
4586{
4587 struct task_cputime totals;
4588 unsigned long flags;
4589 struct rq *rq;
4590 u64 ns;
4591
4592 rq = task_rq_lock(p, &flags);
4593 thread_group_cputime(p, &totals);
4594 ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
4526 task_rq_unlock(rq, &flags); 4595 task_rq_unlock(rq, &flags);
4527 4596
4528 return ns; 4597 return ns;
@@ -4551,6 +4620,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
4551 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4620 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4552 else 4621 else
4553 cpustat->user = cputime64_add(cpustat->user, tmp); 4622 cpustat->user = cputime64_add(cpustat->user, tmp);
4623
4624 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
4554 /* Account for user time used */ 4625 /* Account for user time used */
4555 acct_update_integrals(p); 4626 acct_update_integrals(p);
4556} 4627}
@@ -4612,6 +4683,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4612 else 4683 else
4613 cpustat->system = cputime64_add(cpustat->system, tmp); 4684 cpustat->system = cputime64_add(cpustat->system, tmp);
4614 4685
4686 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
4687
4615 /* Account for system time used */ 4688 /* Account for system time used */
4616 acct_update_integrals(p); 4689 acct_update_integrals(p);
4617} 4690}
@@ -7294,7 +7367,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
7294 cpumask_or(groupmask, groupmask, sched_group_cpus(group)); 7367 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
7295 7368
7296 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 7369 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
7297 printk(KERN_CONT " %s", str); 7370 printk(KERN_CONT " %s (__cpu_power = %d)", str,
7371 group->__cpu_power);
7298 7372
7299 group = group->next; 7373 group = group->next;
7300 } while (group != sd->groups); 7374 } while (group != sd->groups);
@@ -9917,6 +9991,7 @@ struct cpuacct {
9917 struct cgroup_subsys_state css; 9991 struct cgroup_subsys_state css;
9918 /* cpuusage holds pointer to a u64-type object on every cpu */ 9992 /* cpuusage holds pointer to a u64-type object on every cpu */
9919 u64 *cpuusage; 9993 u64 *cpuusage;
9994 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9920 struct cpuacct *parent; 9995 struct cpuacct *parent;
9921}; 9996};
9922 9997
@@ -9941,20 +10016,32 @@ static struct cgroup_subsys_state *cpuacct_create(
9941 struct cgroup_subsys *ss, struct cgroup *cgrp) 10016 struct cgroup_subsys *ss, struct cgroup *cgrp)
9942{ 10017{
9943 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 10018 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
10019 int i;
9944 10020
9945 if (!ca) 10021 if (!ca)
9946 return ERR_PTR(-ENOMEM); 10022 goto out;
9947 10023
9948 ca->cpuusage = alloc_percpu(u64); 10024 ca->cpuusage = alloc_percpu(u64);
9949 if (!ca->cpuusage) { 10025 if (!ca->cpuusage)
9950 kfree(ca); 10026 goto out_free_ca;
9951 return ERR_PTR(-ENOMEM); 10027
9952 } 10028 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10029 if (percpu_counter_init(&ca->cpustat[i], 0))
10030 goto out_free_counters;
9953 10031
9954 if (cgrp->parent) 10032 if (cgrp->parent)
9955 ca->parent = cgroup_ca(cgrp->parent); 10033 ca->parent = cgroup_ca(cgrp->parent);
9956 10034
9957 return &ca->css; 10035 return &ca->css;
10036
10037out_free_counters:
10038 while (--i >= 0)
10039 percpu_counter_destroy(&ca->cpustat[i]);
10040 free_percpu(ca->cpuusage);
10041out_free_ca:
10042 kfree(ca);
10043out:
10044 return ERR_PTR(-ENOMEM);
9958} 10045}
9959 10046
9960/* destroy an existing cpu accounting group */ 10047/* destroy an existing cpu accounting group */
@@ -9962,7 +10049,10 @@ static void
9962cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 10049cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9963{ 10050{
9964 struct cpuacct *ca = cgroup_ca(cgrp); 10051 struct cpuacct *ca = cgroup_ca(cgrp);
10052 int i;
9965 10053
10054 for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
10055 percpu_counter_destroy(&ca->cpustat[i]);
9966 free_percpu(ca->cpuusage); 10056 free_percpu(ca->cpuusage);
9967 kfree(ca); 10057 kfree(ca);
9968} 10058}
@@ -10049,6 +10139,25 @@ static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
10049 return 0; 10139 return 0;
10050} 10140}
10051 10141
10142static const char *cpuacct_stat_desc[] = {
10143 [CPUACCT_STAT_USER] = "user",
10144 [CPUACCT_STAT_SYSTEM] = "system",
10145};
10146
10147static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
10148 struct cgroup_map_cb *cb)
10149{
10150 struct cpuacct *ca = cgroup_ca(cgrp);
10151 int i;
10152
10153 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
10154 s64 val = percpu_counter_read(&ca->cpustat[i]);
10155 val = cputime64_to_clock_t(val);
10156 cb->fill(cb, cpuacct_stat_desc[i], val);
10157 }
10158 return 0;
10159}
10160
10052static struct cftype files[] = { 10161static struct cftype files[] = {
10053 { 10162 {
10054 .name = "usage", 10163 .name = "usage",
@@ -10059,7 +10168,10 @@ static struct cftype files[] = {
10059 .name = "usage_percpu", 10168 .name = "usage_percpu",
10060 .read_seq_string = cpuacct_percpu_seq_read, 10169 .read_seq_string = cpuacct_percpu_seq_read,
10061 }, 10170 },
10062 10171 {
10172 .name = "stat",
10173 .read_map = cpuacct_stats_show,
10174 },
10063}; 10175};
10064 10176
10065static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) 10177static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@ -10081,12 +10193,38 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10081 return; 10193 return;
10082 10194
10083 cpu = task_cpu(tsk); 10195 cpu = task_cpu(tsk);
10196
10197 rcu_read_lock();
10198
10084 ca = task_ca(tsk); 10199 ca = task_ca(tsk);
10085 10200
10086 for (; ca; ca = ca->parent) { 10201 for (; ca; ca = ca->parent) {
10087 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 10202 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
10088 *cpuusage += cputime; 10203 *cpuusage += cputime;
10089 } 10204 }
10205
10206 rcu_read_unlock();
10207}
10208
10209/*
10210 * Charge the system/user time to the task's accounting group.
10211 */
10212static void cpuacct_update_stats(struct task_struct *tsk,
10213 enum cpuacct_stat_index idx, cputime_t val)
10214{
10215 struct cpuacct *ca;
10216
10217 if (unlikely(!cpuacct_subsys.active))
10218 return;
10219
10220 rcu_read_lock();
10221 ca = task_ca(tsk);
10222
10223 do {
10224 percpu_counter_add(&ca->cpustat[idx], val);
10225 ca = ca->parent;
10226 } while (ca);
10227 rcu_read_unlock();
10090} 10228}
10091 10229
10092struct cgroup_subsys cpuacct_subsys = { 10230struct cgroup_subsys cpuacct_subsys = {
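
Besides the runtime helpers, the cpuacct controller now exports a per-cgroup "cpuacct.stat" file backed by the new percpu_counters, with "user" and "system" lines reported in USER_HZ ticks via cputime64_to_clock_t(). A small illustrative reader; the /cgroup mount point and group name are assumptions of this sketch:

#include <stdio.h>

int main(void)
{
	char key[16];
	long long ticks;
	FILE *f = fopen("/cgroup/mygroup/cpuacct.stat", "r");

	if (!f)
		return 1;
	/* two lines: "user <ticks>" and "system <ticks>" */
	while (fscanf(f, "%15s %lld", key, &ticks) == 2)
		printf("%-6s %lld ticks\n", key, ticks);
	fclose(f);
	return 0;
}
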
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 1e00bfacf9b8..cdd3c89574cd 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -55,7 +55,7 @@ static int convert_prio(int prio)
55 * cpupri_find - find the best (lowest-pri) CPU in the system 55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context 56 * @cp: The cpupri context
57 * @p: The task 57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs 58 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
59 * 59 *
60 * Note: This function returns the recommended CPUs as calculated during the 60 * Note: This function returns the recommended CPUs as calculated during the
 61 * current invocation. By the time the call returns, the CPUs may have in 61 * current invocation. By the time the call returns, the CPUs may have in
@@ -81,7 +81,8 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 81 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
82 continue; 82 continue;
83 83
84 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask); 84 if (lowest_mask)
85 cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);
85 return 1; 86 return 1;
86 } 87 }
87 88
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 299d012b4394..f2c66f8f9712 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -948,20 +948,15 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
948 948
949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 949static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
950{ 950{
951 cpumask_var_t mask;
952
953 if (rq->curr->rt.nr_cpus_allowed == 1) 951 if (rq->curr->rt.nr_cpus_allowed == 1)
954 return; 952 return;
955 953
956 if (!alloc_cpumask_var(&mask, GFP_ATOMIC))
957 return;
958
959 if (p->rt.nr_cpus_allowed != 1 954 if (p->rt.nr_cpus_allowed != 1
960 && cpupri_find(&rq->rd->cpupri, p, mask)) 955 && cpupri_find(&rq->rd->cpupri, p, NULL))
961 goto free; 956 return;
962 957
963 if (!cpupri_find(&rq->rd->cpupri, rq->curr, mask)) 958 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
964 goto free; 959 return;
965 960
966 /* 961 /*
967 * There appears to be other cpus that can accept 962 * There appears to be other cpus that can accept
@@ -970,8 +965,6 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
970 */ 965 */
971 requeue_task_rt(rq, p, 1); 966 requeue_task_rt(rq, p, 1);
972 resched_task(rq->curr); 967 resched_task(rq->curr);
973free:
974 free_cpumask_var(mask);
975} 968}
976 969
977#endif /* CONFIG_SMP */ 970#endif /* CONFIG_SMP */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d105a82543d0..2fecefacdc5b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,7 +65,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
65 * to the pending events, so lets the scheduler to balance 65 * to the pending events, so lets the scheduler to balance
66 * the softirq load for us. 66 * the softirq load for us.
67 */ 67 */
68static inline void wakeup_softirqd(void) 68void wakeup_softirqd(void)
69{ 69{
70 /* Interrupts are disabled: no need to stop preemption */ 70 /* Interrupts are disabled: no need to stop preemption */
71 struct task_struct *tsk = __get_cpu_var(ksoftirqd); 71 struct task_struct *tsk = __get_cpu_var(ksoftirqd);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a2455103..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -166,97 +166,11 @@ void softlockup_tick(void)
166} 166}
167 167
168/* 168/*
169 * Have a reasonable limit on the number of tasks checked:
170 */
171unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
172
173/*
174 * Zero means infinite timeout - no checking done:
175 */
176unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
177
178unsigned long __read_mostly sysctl_hung_task_warnings = 10;
179
180/*
181 * Only do the hung-tasks check on one CPU:
182 */
183static int check_cpu __read_mostly = -1;
184
185static void check_hung_task(struct task_struct *t, unsigned long now)
186{
187 unsigned long switch_count = t->nvcsw + t->nivcsw;
188
189 if (t->flags & PF_FROZEN)
190 return;
191
192 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
193 t->last_switch_count = switch_count;
194 t->last_switch_timestamp = now;
195 return;
196 }
197 if ((long)(now - t->last_switch_timestamp) <
198 sysctl_hung_task_timeout_secs)
199 return;
200 if (!sysctl_hung_task_warnings)
201 return;
202 sysctl_hung_task_warnings--;
203
204 /*
205 * Ok, the task did not get scheduled for more than 2 minutes,
206 * complain:
207 */
208 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
209 "%ld seconds.\n", t->comm, t->pid,
210 sysctl_hung_task_timeout_secs);
211 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
212 " disables this message.\n");
213 sched_show_task(t);
214 __debug_show_held_locks(t);
215
216 t->last_switch_timestamp = now;
217 touch_nmi_watchdog();
218
219 if (softlockup_panic)
220 panic("softlockup: blocked tasks");
221}
222
223/*
224 * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
225 * a really long time (120 seconds). If that happens, print out
226 * a warning.
227 */
228static void check_hung_uninterruptible_tasks(int this_cpu)
229{
230 int max_count = sysctl_hung_task_check_count;
231 unsigned long now = get_timestamp(this_cpu);
232 struct task_struct *g, *t;
233
234 /*
235 * If the system crashed already then all bets are off,
236 * do not report extra hung tasks:
237 */
238 if (test_taint(TAINT_DIE) || did_panic)
239 return;
240
241 read_lock(&tasklist_lock);
242 do_each_thread(g, t) {
243 if (!--max_count)
244 goto unlock;
245 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
246 if (t->state == TASK_UNINTERRUPTIBLE)
247 check_hung_task(t, now);
248 } while_each_thread(g, t);
249 unlock:
250 read_unlock(&tasklist_lock);
251}
252
253/*
254 * The watchdog thread - runs every second and touches the timestamp. 169 * The watchdog thread - runs every second and touches the timestamp.
255 */ 170 */
256static int watchdog(void *__bind_cpu) 171static int watchdog(void *__bind_cpu)
257{ 172{
258 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 173 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
259 int this_cpu = (long)__bind_cpu;
260 174
261 sched_setscheduler(current, SCHED_FIFO, &param); 175 sched_setscheduler(current, SCHED_FIFO, &param);
262 176
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
276 if (kthread_should_stop()) 190 if (kthread_should_stop())
277 break; 191 break;
278 192
279 if (this_cpu == check_cpu) {
280 if (sysctl_hung_task_timeout_secs)
281 check_hung_uninterruptible_tasks(this_cpu);
282 }
283
284 set_current_state(TASK_INTERRUPTIBLE); 193 set_current_state(TASK_INTERRUPTIBLE);
285 } 194 }
286 __set_current_state(TASK_RUNNING); 195 __set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
312 break; 221 break;
313 case CPU_ONLINE: 222 case CPU_ONLINE:
314 case CPU_ONLINE_FROZEN: 223 case CPU_ONLINE_FROZEN:
315 check_cpu = cpumask_any(cpu_online_mask);
316 wake_up_process(per_cpu(watchdog_task, hotcpu)); 224 wake_up_process(per_cpu(watchdog_task, hotcpu));
317 break; 225 break;
318#ifdef CONFIG_HOTPLUG_CPU 226#ifdef CONFIG_HOTPLUG_CPU
319 case CPU_DOWN_PREPARE:
320 case CPU_DOWN_PREPARE_FROZEN:
321 if (hotcpu == check_cpu) {
322 /* Pick any other online cpu. */
323 check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
324 }
325 break;
326
327 case CPU_UP_CANCELED: 227 case CPU_UP_CANCELED:
328 case CPU_UP_CANCELED_FROZEN: 228 case CPU_UP_CANCELED_FROZEN:
329 if (!per_cpu(watchdog_task, hotcpu)) 229 if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sys.c b/kernel/sys.c
index 51dbb55604e8..e7998cf31498 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -360,6 +360,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
360 void __user *, arg) 360 void __user *, arg)
361{ 361{
362 char buffer[256]; 362 char buffer[256];
363 int ret = 0;
363 364
364 /* We only trust the superuser with rebooting the system. */ 365 /* We only trust the superuser with rebooting the system. */
365 if (!capable(CAP_SYS_BOOT)) 366 if (!capable(CAP_SYS_BOOT))
@@ -397,7 +398,7 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
397 kernel_halt(); 398 kernel_halt();
398 unlock_kernel(); 399 unlock_kernel();
399 do_exit(0); 400 do_exit(0);
400 break; 401 panic("cannot halt");
401 402
402 case LINUX_REBOOT_CMD_POWER_OFF: 403 case LINUX_REBOOT_CMD_POWER_OFF:
403 kernel_power_off(); 404 kernel_power_off();
@@ -417,29 +418,22 @@ SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
417 418
418#ifdef CONFIG_KEXEC 419#ifdef CONFIG_KEXEC
419 case LINUX_REBOOT_CMD_KEXEC: 420 case LINUX_REBOOT_CMD_KEXEC:
420 { 421 ret = kernel_kexec();
421 int ret; 422 break;
422 ret = kernel_kexec();
423 unlock_kernel();
424 return ret;
425 }
426#endif 423#endif
427 424
428#ifdef CONFIG_HIBERNATION 425#ifdef CONFIG_HIBERNATION
429 case LINUX_REBOOT_CMD_SW_SUSPEND: 426 case LINUX_REBOOT_CMD_SW_SUSPEND:
430 { 427 ret = hibernate();
431 int ret = hibernate(); 428 break;
432 unlock_kernel();
433 return ret;
434 }
435#endif 429#endif
436 430
437 default: 431 default:
438 unlock_kernel(); 432 ret = -EINVAL;
439 return -EINVAL; 433 break;
440 } 434 }
441 unlock_kernel(); 435 unlock_kernel();
442 return 0; 436 return ret;
443} 437}
444 438
445static void deferred_cad(struct work_struct *dummy) 439static void deferred_cad(struct work_struct *dummy)
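
With the reboot syscall now funnelling every command through a single unlock_kernel()/return path, the value from kernel_kexec() or hibernate() reaches the caller instead of being masked. A userspace sketch of what that looks like for the hibernate case (needs CAP_SYS_BOOT and CONFIG_HIBERNATION; purely illustrative):

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/reboot.h>

int main(void)
{
	long ret = syscall(SYS_reboot, LINUX_REBOOT_MAGIC1,
			   LINUX_REBOOT_MAGIC2,
			   LINUX_REBOOT_CMD_SW_SUSPEND, NULL);

	if (ret < 0)	/* hibernate()'s error code is now what we see */
		fprintf(stderr, "SW_SUSPEND failed: %s\n", strerror(errno));
	return ret < 0;
}
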
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82350f8f04f6..e3d2c7dd59b9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,10 +97,11 @@ static int neg_one = -1;
97#endif 97#endif
98 98
99static int zero; 99static int zero;
100static int one = 1; 100static int __maybe_unused one = 1;
101static int two = 2; 101static int __maybe_unused two = 2;
102static unsigned long one_ul = 1; 102static unsigned long one_ul = 1;
103static int one_hundred = 100; 103static int one_hundred = 100;
104static int one_thousand = 1000;
104 105
105/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 106/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
106static int maxolduid = 65535; 107static int maxolduid = 65535;
@@ -813,6 +814,19 @@ static struct ctl_table kern_table[] = {
813 .extra1 = &neg_one, 814 .extra1 = &neg_one,
814 .extra2 = &sixty, 815 .extra2 = &sixty,
815 }, 816 },
817#endif
818#ifdef CONFIG_DETECT_HUNG_TASK
819 {
820 .ctl_name = CTL_UNNUMBERED,
821 .procname = "hung_task_panic",
822 .data = &sysctl_hung_task_panic,
823 .maxlen = sizeof(int),
824 .mode = 0644,
825 .proc_handler = &proc_dointvec_minmax,
826 .strategy = &sysctl_intvec,
827 .extra1 = &zero,
828 .extra2 = &one,
829 },
816 { 830 {
817 .ctl_name = CTL_UNNUMBERED, 831 .ctl_name = CTL_UNNUMBERED,
818 .procname = "hung_task_check_count", 832 .procname = "hung_task_check_count",
@@ -828,7 +842,7 @@ static struct ctl_table kern_table[] = {
828 .data = &sysctl_hung_task_timeout_secs, 842 .data = &sysctl_hung_task_timeout_secs,
829 .maxlen = sizeof(unsigned long), 843 .maxlen = sizeof(unsigned long),
830 .mode = 0644, 844 .mode = 0644,
831 .proc_handler = &proc_doulongvec_minmax, 845 .proc_handler = &proc_dohung_task_timeout_secs,
832 .strategy = &sysctl_intvec, 846 .strategy = &sysctl_intvec,
833 }, 847 },
834 { 848 {
@@ -888,16 +902,6 @@ static struct ctl_table kern_table[] = {
888 .proc_handler = &proc_dointvec, 902 .proc_handler = &proc_dointvec,
889 }, 903 },
890#endif 904#endif
891#ifdef CONFIG_UNEVICTABLE_LRU
892 {
893 .ctl_name = CTL_UNNUMBERED,
894 .procname = "scan_unevictable_pages",
895 .data = &scan_unevictable_pages,
896 .maxlen = sizeof(scan_unevictable_pages),
897 .mode = 0644,
898 .proc_handler = &scan_unevictable_handler,
899 },
900#endif
901#ifdef CONFIG_SLOW_WORK 905#ifdef CONFIG_SLOW_WORK
902 { 906 {
903 .ctl_name = CTL_UNNUMBERED, 907 .ctl_name = CTL_UNNUMBERED,
@@ -1027,6 +1031,28 @@ static struct ctl_table vm_table[] = {
1027 .proc_handler = &proc_dointvec, 1031 .proc_handler = &proc_dointvec,
1028 }, 1032 },
1029 { 1033 {
1034 .ctl_name = CTL_UNNUMBERED,
1035 .procname = "nr_pdflush_threads_min",
1036 .data = &nr_pdflush_threads_min,
1037 .maxlen = sizeof nr_pdflush_threads_min,
1038 .mode = 0644 /* read-write */,
1039 .proc_handler = &proc_dointvec_minmax,
1040 .strategy = &sysctl_intvec,
1041 .extra1 = &one,
1042 .extra2 = &nr_pdflush_threads_max,
1043 },
1044 {
1045 .ctl_name = CTL_UNNUMBERED,
1046 .procname = "nr_pdflush_threads_max",
1047 .data = &nr_pdflush_threads_max,
1048 .maxlen = sizeof nr_pdflush_threads_max,
1049 .mode = 0644 /* read-write */,
1050 .proc_handler = &proc_dointvec_minmax,
1051 .strategy = &sysctl_intvec,
1052 .extra1 = &nr_pdflush_threads_min,
1053 .extra2 = &one_thousand,
1054 },
1055 {
1030 .ctl_name = VM_SWAPPINESS, 1056 .ctl_name = VM_SWAPPINESS,
1031 .procname = "swappiness", 1057 .procname = "swappiness",
1032 .data = &vm_swappiness, 1058 .data = &vm_swappiness,
@@ -1266,6 +1292,16 @@ static struct ctl_table vm_table[] = {
1266 .extra2 = &one, 1292 .extra2 = &one,
1267 }, 1293 },
1268#endif 1294#endif
1295#ifdef CONFIG_UNEVICTABLE_LRU
1296 {
1297 .ctl_name = CTL_UNNUMBERED,
1298 .procname = "scan_unevictable_pages",
1299 .data = &scan_unevictable_pages,
1300 .maxlen = sizeof(scan_unevictable_pages),
1301 .mode = 0644,
1302 .proc_handler = &scan_unevictable_handler,
1303 },
1304#endif
1269/* 1305/*
1270 * NOTE: do not add new entries to this table unless you have read 1306 * NOTE: do not add new entries to this table unless you have read
1271 * Documentation/sysctl/ctl_unnumbered.txt 1307 * Documentation/sysctl/ctl_unnumbered.txt
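
All of the new entries above are ordinary integer knobs, so they are tuned the usual way through /proc/sys. A small illustrative helper for the hung-task ones (the paths follow the procnames registered above):

#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* panic instead of only warning when a hung task is found */
	write_sysctl("/proc/sys/kernel/hung_task_panic", "1");
	/* allow tasks five minutes in D state before reporting them */
	write_sysctl("/proc/sys/kernel/hung_task_timeout_secs", "300");
	return 0;
}
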
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4..cffffad01c31 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -531,10 +531,13 @@ static void __init_timer(struct timer_list *timer,
531} 531}
532 532
533/** 533/**
534 * init_timer - initialize a timer. 534 * init_timer_key - initialize a timer
535 * @timer: the timer to be initialized 535 * @timer: the timer to be initialized
536 * @name: name of the timer
537 * @key: lockdep class key of the fake lock used for tracking timer
538 * sync lock dependencies
536 * 539 *
 537 * init_timer() must be done to a timer prior to calling *any* of the 540 * init_timer_key() must be done to a timer prior to calling *any* of the
538 * other timer functions. 541 * other timer functions.
539 */ 542 */
540void init_timer_key(struct timer_list *timer, 543void init_timer_key(struct timer_list *timer,
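
init_timer() is now a thin wrapper that passes a per-site lockdep key to init_timer_key(); existing callers do not change. For completeness, the usual pattern those callers follow (a sketch with invented names):

#include <linux/timer.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timeout(unsigned long data)
{
	/* runs when the timer expires */
}

static void demo_arm(void)
{
	/* setup_timer()/init_timer() expand to init_timer_key() here */
	setup_timer(&demo_timer, demo_timeout, 0);
	mod_timer(&demo_timer, jiffies + HZ);	/* one second from now */
}

static void demo_cancel(void)
{
	del_timer_sync(&demo_timer);
}
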
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2246141bda4d..417d1985e299 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -312,7 +312,7 @@ config KMEMTRACE
312 and profile kernel code. 312 and profile kernel code.
313 313
 314 This requires a userspace application to use. See 314 This requires a userspace application to use. See
315 Documentation/vm/kmemtrace.txt for more information. 315 Documentation/trace/kmemtrace.txt for more information.
316 316
317 Saying Y will make the kernel somewhat larger and slower. However, 317 Saying Y will make the kernel somewhat larger and slower. However,
318 if you disable kmemtrace at run-time or boot-time, the performance 318 if you disable kmemtrace at run-time or boot-time, the performance
@@ -403,7 +403,7 @@ config MMIOTRACE
403 implementation and works via page faults. Tracing is disabled by 403 implementation and works via page faults. Tracing is disabled by
404 default and can be enabled at run-time. 404 default and can be enabled at run-time.
405 405
406 See Documentation/tracers/mmiotrace.txt. 406 See Documentation/trace/mmiotrace.txt.
407 If you are not helping to develop drivers, say N. 407 If you are not helping to develop drivers, say N.
408 408
409config MMIOTRACE_TEST 409config MMIOTRACE_TEST
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 947c5b3f90c4..921ef5d1f0ba 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
327 char *msg; 327 char *msg;
328 struct blk_trace *bt; 328 struct blk_trace *bt;
329 329
330 if (count > BLK_TN_MAX_MSG) 330 if (count >= BLK_TN_MAX_MSG)
331 return -EINVAL; 331 return -EINVAL;
332 332
333 msg = kmalloc(count, GFP_KERNEL); 333 msg = kmalloc(count + 1, GFP_KERNEL);
334 if (msg == NULL) 334 if (msg == NULL)
335 return -ENOMEM; 335 return -ENOMEM;
336 336
@@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
339 return -EFAULT; 339 return -EFAULT;
340 } 340 }
341 341
342 msg[count] = '\0';
342 bt = filp->private_data; 343 bt = filp->private_data;
343 __trace_note_message(bt, "%s", msg); 344 __trace_note_message(bt, "%s", msg);
344 kfree(msg); 345 kfree(msg);
@@ -642,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
642 if (blk_pc_request(rq)) { 643 if (blk_pc_request(rq)) {
643 what |= BLK_TC_ACT(BLK_TC_PC); 644 what |= BLK_TC_ACT(BLK_TC_PC);
644 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, 645 __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
645 sizeof(rq->cmd), rq->cmd); 646 rq->cmd_len, rq->cmd);
646 } else { 647 } else {
647 what |= BLK_TC_ACT(BLK_TC_FS); 648 what |= BLK_TC_ACT(BLK_TC_FS);
648 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, 649 __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
@@ -1376,12 +1377,12 @@ static int blk_trace_str2mask(const char *str)
1376{ 1377{
1377 int i; 1378 int i;
1378 int mask = 0; 1379 int mask = 0;
1379 char *s, *token; 1380 char *buf, *s, *token;
1380 1381
1381 s = kstrdup(str, GFP_KERNEL); 1382 buf = kstrdup(str, GFP_KERNEL);
1382 if (s == NULL) 1383 if (buf == NULL)
1383 return -ENOMEM; 1384 return -ENOMEM;
1384 s = strstrip(s); 1385 s = strstrip(buf);
1385 1386
1386 while (1) { 1387 while (1) {
1387 token = strsep(&s, ","); 1388 token = strsep(&s, ",");
@@ -1402,7 +1403,7 @@ static int blk_trace_str2mask(const char *str)
1402 break; 1403 break;
1403 } 1404 }
1404 } 1405 }
1405 kfree(s); 1406 kfree(buf);
1406 1407
1407 return mask; 1408 return mask;
1408} 1409}
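
blk_msg_write() now leaves room for, and adds, a terminating NUL before handing the user's message to __trace_note_message(). That path serves the per-device debugfs "msg" file used to drop annotations into a running blktrace; a hedged userspace sketch, assuming the usual debugfs mount point and an sda trace already set up:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/block/sda/msg", "w");

	if (!f)
		return 1;
	/* anything shorter than BLK_TN_MAX_MSG is accepted */
	fputs("benchmark phase 2 starts here", f);
	fclose(f);
	return 0;
}
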
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae201b3eda89..5011f4d91e37 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -6,14 +6,16 @@
6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com> 6 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
7 */ 7 */
8 8
9#include <linux/dcache.h> 9#include <linux/tracepoint.h>
10#include <linux/seq_file.h>
10#include <linux/debugfs.h> 11#include <linux/debugfs.h>
12#include <linux/dcache.h>
11#include <linux/fs.h> 13#include <linux/fs.h>
12#include <linux/seq_file.h> 14
13#include <trace/kmemtrace.h> 15#include <trace/kmemtrace.h>
14 16
15#include "trace.h"
16#include "trace_output.h" 17#include "trace_output.h"
18#include "trace.h"
17 19
18/* Select an alternative, minimalistic output than the original one */ 20/* Select an alternative, minimalistic output than the original one */
19#define TRACE_KMEM_OPT_MINIMAL 0x1 21#define TRACE_KMEM_OPT_MINIMAL 0x1
@@ -25,14 +27,156 @@ static struct tracer_opt kmem_opts[] = {
25}; 27};
26 28
27static struct tracer_flags kmem_tracer_flags = { 29static struct tracer_flags kmem_tracer_flags = {
28 .val = 0, 30 .val = 0,
29 .opts = kmem_opts 31 .opts = kmem_opts
30}; 32};
31 33
32
33static bool kmem_tracing_enabled __read_mostly;
34static struct trace_array *kmemtrace_array; 34static struct trace_array *kmemtrace_array;
35 35
36/* Trace allocations */
37static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
38 unsigned long call_site,
39 const void *ptr,
40 size_t bytes_req,
41 size_t bytes_alloc,
42 gfp_t gfp_flags,
43 int node)
44{
45 struct trace_array *tr = kmemtrace_array;
46 struct kmemtrace_alloc_entry *entry;
47 struct ring_buffer_event *event;
48
49 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
50 if (!event)
51 return;
52
53 entry = ring_buffer_event_data(event);
54 tracing_generic_entry_update(&entry->ent, 0, 0);
55
56 entry->ent.type = TRACE_KMEM_ALLOC;
57 entry->type_id = type_id;
58 entry->call_site = call_site;
59 entry->ptr = ptr;
60 entry->bytes_req = bytes_req;
61 entry->bytes_alloc = bytes_alloc;
62 entry->gfp_flags = gfp_flags;
63 entry->node = node;
64
65 ring_buffer_unlock_commit(tr->buffer, event);
66
67 trace_wake_up();
68}
69
70static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
71 unsigned long call_site,
72 const void *ptr)
73{
74 struct trace_array *tr = kmemtrace_array;
75 struct kmemtrace_free_entry *entry;
76 struct ring_buffer_event *event;
77
78 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
79 if (!event)
80 return;
81 entry = ring_buffer_event_data(event);
82 tracing_generic_entry_update(&entry->ent, 0, 0);
83
84 entry->ent.type = TRACE_KMEM_FREE;
85 entry->type_id = type_id;
86 entry->call_site = call_site;
87 entry->ptr = ptr;
88
89 ring_buffer_unlock_commit(tr->buffer, event);
90
91 trace_wake_up();
92}
93
94static void kmemtrace_kmalloc(unsigned long call_site,
95 const void *ptr,
96 size_t bytes_req,
97 size_t bytes_alloc,
98 gfp_t gfp_flags)
99{
100 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
101 bytes_req, bytes_alloc, gfp_flags, -1);
102}
103
104static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
105 const void *ptr,
106 size_t bytes_req,
107 size_t bytes_alloc,
108 gfp_t gfp_flags)
109{
110 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
111 bytes_req, bytes_alloc, gfp_flags, -1);
112}
113
114static void kmemtrace_kmalloc_node(unsigned long call_site,
115 const void *ptr,
116 size_t bytes_req,
117 size_t bytes_alloc,
118 gfp_t gfp_flags,
119 int node)
120{
121 kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
122 bytes_req, bytes_alloc, gfp_flags, node);
123}
124
125static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
126 const void *ptr,
127 size_t bytes_req,
128 size_t bytes_alloc,
129 gfp_t gfp_flags,
130 int node)
131{
132 kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
133 bytes_req, bytes_alloc, gfp_flags, node);
134}
135
136static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
137{
138 kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
139}
140
141static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
142{
143 kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
144}
145
146static int kmemtrace_start_probes(void)
147{
148 int err;
149
150 err = register_trace_kmalloc(kmemtrace_kmalloc);
151 if (err)
152 return err;
153 err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
154 if (err)
155 return err;
156 err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
157 if (err)
158 return err;
159 err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
160 if (err)
161 return err;
162 err = register_trace_kfree(kmemtrace_kfree);
163 if (err)
164 return err;
165 err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
166
167 return err;
168}
169
170static void kmemtrace_stop_probes(void)
171{
172 unregister_trace_kmalloc(kmemtrace_kmalloc);
173 unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
174 unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
175 unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
176 unregister_trace_kfree(kmemtrace_kfree);
177 unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
178}
179
36static int kmem_trace_init(struct trace_array *tr) 180static int kmem_trace_init(struct trace_array *tr)
37{ 181{
38 int cpu; 182 int cpu;
@@ -41,14 +185,14 @@ static int kmem_trace_init(struct trace_array *tr)
41 for_each_cpu_mask(cpu, cpu_possible_map) 185 for_each_cpu_mask(cpu, cpu_possible_map)
42 tracing_reset(tr, cpu); 186 tracing_reset(tr, cpu);
43 187
44 kmem_tracing_enabled = true; 188 kmemtrace_start_probes();
45 189
46 return 0; 190 return 0;
47} 191}
48 192
49static void kmem_trace_reset(struct trace_array *tr) 193static void kmem_trace_reset(struct trace_array *tr)
50{ 194{
51 kmem_tracing_enabled = false; 195 kmemtrace_stop_probes();
52} 196}
53 197
54static void kmemtrace_headers(struct seq_file *s) 198static void kmemtrace_headers(struct seq_file *s)
@@ -66,47 +210,84 @@ static void kmemtrace_headers(struct seq_file *s)
66} 210}
67 211
68/* 212/*
69 * The two following functions give the original output from kmemtrace, 213 * The following functions give the original output from kmemtrace,
70 * or something close to....perhaps they need some missing things 214 * plus the origin CPU, since reordering occurs in-kernel now.
71 */ 215 */
216
217#define KMEMTRACE_USER_ALLOC 0
218#define KMEMTRACE_USER_FREE 1
219
220struct kmemtrace_user_event {
221 u8 event_id;
222 u8 type_id;
223 u16 event_size;
224 u32 cpu;
225 u64 timestamp;
226 unsigned long call_site;
227 unsigned long ptr;
228};
229
230struct kmemtrace_user_event_alloc {
231 size_t bytes_req;
232 size_t bytes_alloc;
233 unsigned gfp_flags;
234 int node;
235};
236
72static enum print_line_t 237static enum print_line_t
73kmemtrace_print_alloc_original(struct trace_iterator *iter, 238kmemtrace_print_alloc_user(struct trace_iterator *iter,
74 struct kmemtrace_alloc_entry *entry) 239 struct kmemtrace_alloc_entry *entry)
75{ 240{
241 struct kmemtrace_user_event_alloc *ev_alloc;
76 struct trace_seq *s = &iter->seq; 242 struct trace_seq *s = &iter->seq;
77 int ret; 243 struct kmemtrace_user_event *ev;
244
245 ev = trace_seq_reserve(s, sizeof(*ev));
246 if (!ev)
247 return TRACE_TYPE_PARTIAL_LINE;
78 248
79 /* Taken from the old linux/kmemtrace.h */ 249 ev->event_id = KMEMTRACE_USER_ALLOC;
80 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu " 250 ev->type_id = entry->type_id;
81 "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n", 251 ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
82 entry->type_id, entry->call_site, (unsigned long) entry->ptr, 252 ev->cpu = iter->cpu;
83 (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc, 253 ev->timestamp = iter->ts;
84 (unsigned long) entry->gfp_flags, entry->node); 254 ev->call_site = entry->call_site;
255 ev->ptr = (unsigned long)entry->ptr;
85 256
86 if (!ret) 257 ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
258 if (!ev_alloc)
87 return TRACE_TYPE_PARTIAL_LINE; 259 return TRACE_TYPE_PARTIAL_LINE;
88 260
261 ev_alloc->bytes_req = entry->bytes_req;
262 ev_alloc->bytes_alloc = entry->bytes_alloc;
263 ev_alloc->gfp_flags = entry->gfp_flags;
264 ev_alloc->node = entry->node;
265
89 return TRACE_TYPE_HANDLED; 266 return TRACE_TYPE_HANDLED;
90} 267}
91 268
92static enum print_line_t 269static enum print_line_t
93kmemtrace_print_free_original(struct trace_iterator *iter, 270kmemtrace_print_free_user(struct trace_iterator *iter,
94 struct kmemtrace_free_entry *entry) 271 struct kmemtrace_free_entry *entry)
95{ 272{
96 struct trace_seq *s = &iter->seq; 273 struct trace_seq *s = &iter->seq;
97 int ret; 274 struct kmemtrace_user_event *ev;
98 275
99 /* Taken from the old linux/kmemtrace.h */ 276 ev = trace_seq_reserve(s, sizeof(*ev));
100 ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n", 277 if (!ev)
101 entry->type_id, entry->call_site, (unsigned long) entry->ptr);
102
103 if (!ret)
104 return TRACE_TYPE_PARTIAL_LINE; 278 return TRACE_TYPE_PARTIAL_LINE;
105 279
280 ev->event_id = KMEMTRACE_USER_FREE;
281 ev->type_id = entry->type_id;
282 ev->event_size = sizeof(*ev);
283 ev->cpu = iter->cpu;
284 ev->timestamp = iter->ts;
285 ev->call_site = entry->call_site;
286 ev->ptr = (unsigned long)entry->ptr;
287
106 return TRACE_TYPE_HANDLED; 288 return TRACE_TYPE_HANDLED;
107} 289}
108 290
109
110/* The two other following provide a more minimalistic output */ 291/* The two other following provide a more minimalistic output */
111static enum print_line_t 292static enum print_line_t
112kmemtrace_print_alloc_compress(struct trace_iterator *iter, 293kmemtrace_print_alloc_compress(struct trace_iterator *iter,
@@ -178,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
178 359
179static enum print_line_t 360static enum print_line_t
180kmemtrace_print_free_compress(struct trace_iterator *iter, 361kmemtrace_print_free_compress(struct trace_iterator *iter,
181 struct kmemtrace_free_entry *entry) 362 struct kmemtrace_free_entry *entry)
182{ 363{
183 struct trace_seq *s = &iter->seq; 364 struct trace_seq *s = &iter->seq;
184 int ret; 365 int ret;
@@ -239,20 +420,22 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
239 switch (entry->type) { 420 switch (entry->type) {
240 case TRACE_KMEM_ALLOC: { 421 case TRACE_KMEM_ALLOC: {
241 struct kmemtrace_alloc_entry *field; 422 struct kmemtrace_alloc_entry *field;
423
242 trace_assign_type(field, entry); 424 trace_assign_type(field, entry);
243 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) 425 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
244 return kmemtrace_print_alloc_compress(iter, field); 426 return kmemtrace_print_alloc_compress(iter, field);
245 else 427 else
246 return kmemtrace_print_alloc_original(iter, field); 428 return kmemtrace_print_alloc_user(iter, field);
247 } 429 }
248 430
249 case TRACE_KMEM_FREE: { 431 case TRACE_KMEM_FREE: {
250 struct kmemtrace_free_entry *field; 432 struct kmemtrace_free_entry *field;
433
251 trace_assign_type(field, entry); 434 trace_assign_type(field, entry);
252 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL) 435 if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
253 return kmemtrace_print_free_compress(iter, field); 436 return kmemtrace_print_free_compress(iter, field);
254 else 437 else
255 return kmemtrace_print_free_original(iter, field); 438 return kmemtrace_print_free_user(iter, field);
256 } 439 }
257 440
258 default: 441 default:
@@ -260,70 +443,13 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
260 } 443 }
261} 444}
262 445
263/* Trace allocations */
264void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
265 unsigned long call_site,
266 const void *ptr,
267 size_t bytes_req,
268 size_t bytes_alloc,
269 gfp_t gfp_flags,
270 int node)
271{
272 struct ring_buffer_event *event;
273 struct kmemtrace_alloc_entry *entry;
274 struct trace_array *tr = kmemtrace_array;
275
276 if (!kmem_tracing_enabled)
277 return;
278
279 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
280 sizeof(*entry), 0, 0);
281 if (!event)
282 return;
283 entry = ring_buffer_event_data(event);
284
285 entry->call_site = call_site;
286 entry->ptr = ptr;
287 entry->bytes_req = bytes_req;
288 entry->bytes_alloc = bytes_alloc;
289 entry->gfp_flags = gfp_flags;
290 entry->node = node;
291
292 trace_buffer_unlock_commit(tr, event, 0, 0);
293}
294EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
295
296void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
297 unsigned long call_site,
298 const void *ptr)
299{
300 struct ring_buffer_event *event;
301 struct kmemtrace_free_entry *entry;
302 struct trace_array *tr = kmemtrace_array;
303
304 if (!kmem_tracing_enabled)
305 return;
306
307 event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
308 sizeof(*entry), 0, 0);
309 if (!event)
310 return;
311 entry = ring_buffer_event_data(event);
312 entry->type_id = type_id;
313 entry->call_site = call_site;
314 entry->ptr = ptr;
315
316 trace_buffer_unlock_commit(tr, event, 0, 0);
317}
318EXPORT_SYMBOL(kmemtrace_mark_free);
319
320static struct tracer kmem_tracer __read_mostly = { 446static struct tracer kmem_tracer __read_mostly = {
321 .name = "kmemtrace", 447 .name = "kmemtrace",
322 .init = kmem_trace_init, 448 .init = kmem_trace_init,
323 .reset = kmem_trace_reset, 449 .reset = kmem_trace_reset,
324 .print_line = kmemtrace_print_line, 450 .print_line = kmemtrace_print_line,
325 .print_header = kmemtrace_headers, 451 .print_header = kmemtrace_headers,
326 .flags = &kmem_tracer_flags 452 .flags = &kmem_tracer_flags
327}; 453};
328 454
329void kmemtrace_init(void) 455void kmemtrace_init(void)
@@ -335,5 +461,4 @@ static int __init init_kmem_tracer(void)
335{ 461{
336 return register_tracer(&kmem_tracer); 462 return register_tracer(&kmem_tracer);
337} 463}
338
339device_initcall(init_kmem_tracer); 464device_initcall(init_kmem_tracer);
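
The kmemtrace rework above drops the exported kmemtrace_mark_alloc_node()/kmemtrace_mark_free() hooks and the kmem_tracing_enabled flag in favour of tracepoint probes that kmem_trace_init() attaches via register_trace_kmalloc() and friends and kmem_trace_reset() detaches again. A minimal user-space sketch of that attach/detach pattern follows; probe_register(), probe_unregister() and trace_alloc() are illustrative names only and are not the kernel tracepoint API:

#include <stddef.h>
#include <stdio.h>

typedef void (*alloc_probe_t)(unsigned long call_site, size_t bytes);

static alloc_probe_t alloc_probe;	/* at most one probe in this toy */

static int probe_register(alloc_probe_t fn)
{
	if (alloc_probe)
		return -1;		/* slot already taken */
	alloc_probe = fn;
	return 0;
}

static void probe_unregister(alloc_probe_t fn)
{
	if (alloc_probe == fn)
		alloc_probe = NULL;
}

static void trace_alloc(unsigned long call_site, size_t bytes)
{
	if (alloc_probe)		/* cheap no-op while nothing is attached */
		alloc_probe(call_site, bytes);
}

static void my_probe(unsigned long call_site, size_t bytes)
{
	printf("alloc of %zu bytes from %#lx\n", bytes, call_site);
}

int main(void)
{
	probe_register(my_probe);	/* kmem_trace_init() analogue */
	trace_alloc(0x1234, 64);
	probe_unregister(my_probe);	/* kmem_trace_reset() analogue */
	trace_alloc(0x5678, 128);	/* no longer traced */
	return 0;
}
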
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0174a40c563..1ce5dc6372b8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
30#include <linux/percpu.h> 30#include <linux/percpu.h>
31#include <linux/splice.h> 31#include <linux/splice.h>
32#include <linux/kdebug.h> 32#include <linux/kdebug.h>
33#include <linux/string.h>
33#include <linux/ctype.h> 34#include <linux/ctype.h>
34#include <linux/init.h> 35#include <linux/init.h>
35#include <linux/poll.h> 36#include <linux/poll.h>
@@ -147,8 +148,7 @@ static int __init set_ftrace_dump_on_oops(char *str)
147} 148}
148__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); 149__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
149 150
150long 151unsigned long long ns2usecs(cycle_t nsec)
151ns2usecs(cycle_t nsec)
152{ 152{
153 nsec += 500; 153 nsec += 500;
154 do_div(nsec, 1000); 154 do_div(nsec, 1000);
@@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
1632 return; 1632 return;
1633 1633
1634 cpumask_set_cpu(iter->cpu, iter->started); 1634 cpumask_set_cpu(iter->cpu, iter->started);
1635 trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu); 1635
1636 /* Don't print started cpu buffer for the first entry of the trace */
1637 if (iter->idx > 1)
1638 trace_seq_printf(s, "##### CPU %u buffer started ####\n",
1639 iter->cpu);
1636} 1640}
1637 1641
1638static enum print_line_t print_trace_fmt(struct trace_iterator *iter) 1642static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file)
1867 if (current_trace) 1871 if (current_trace)
1868 *iter->trace = *current_trace; 1872 *iter->trace = *current_trace;
1869 1873
1874 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
1875 goto fail;
1876
1877 cpumask_clear(iter->started);
1878
1870 if (current_trace && current_trace->print_max) 1879 if (current_trace && current_trace->print_max)
1871 iter->tr = &max_tr; 1880 iter->tr = &max_tr;
1872 else 1881 else
@@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file)
1917 if (iter->buffer_iter[cpu]) 1926 if (iter->buffer_iter[cpu])
1918 ring_buffer_read_finish(iter->buffer_iter[cpu]); 1927 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1919 } 1928 }
1929 free_cpumask_var(iter->started);
1920 fail: 1930 fail:
1921 mutex_unlock(&trace_types_lock); 1931 mutex_unlock(&trace_types_lock);
1922 kfree(iter->trace); 1932 kfree(iter->trace);
@@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file)
1960 1970
1961 seq_release(inode, file); 1971 seq_release(inode, file);
1962 mutex_destroy(&iter->mutex); 1972 mutex_destroy(&iter->mutex);
1973 free_cpumask_var(iter->started);
1963 kfree(iter->trace); 1974 kfree(iter->trace);
1964 kfree(iter); 1975 kfree(iter);
1965 return 0; 1976 return 0;
@@ -2358,9 +2369,9 @@ static const char readme_msg[] =
2358 "# mkdir /debug\n" 2369 "# mkdir /debug\n"
2359 "# mount -t debugfs nodev /debug\n\n" 2370 "# mount -t debugfs nodev /debug\n\n"
2360 "# cat /debug/tracing/available_tracers\n" 2371 "# cat /debug/tracing/available_tracers\n"
2361 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n" 2372 "wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
2362 "# cat /debug/tracing/current_tracer\n" 2373 "# cat /debug/tracing/current_tracer\n"
2363 "none\n" 2374 "nop\n"
2364 "# echo sched_switch > /debug/tracing/current_tracer\n" 2375 "# echo sched_switch > /debug/tracing/current_tracer\n"
2365 "# cat /debug/tracing/current_tracer\n" 2376 "# cat /debug/tracing/current_tracer\n"
2366 "sched_switch\n" 2377 "sched_switch\n"
@@ -3266,19 +3277,13 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
3266 3277
3267 info->tr = &global_trace; 3278 info->tr = &global_trace;
3268 info->cpu = cpu; 3279 info->cpu = cpu;
3269 info->spare = ring_buffer_alloc_read_page(info->tr->buffer); 3280 info->spare = NULL;
3270 /* Force reading ring buffer for first read */ 3281 /* Force reading ring buffer for first read */
3271 info->read = (unsigned int)-1; 3282 info->read = (unsigned int)-1;
3272 if (!info->spare)
3273 goto out;
3274 3283
3275 filp->private_data = info; 3284 filp->private_data = info;
3276 3285
3277 return 0; 3286 return nonseekable_open(inode, filp);
3278
3279 out:
3280 kfree(info);
3281 return -ENOMEM;
3282} 3287}
3283 3288
3284static ssize_t 3289static ssize_t
@@ -3293,6 +3298,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3293 if (!count) 3298 if (!count)
3294 return 0; 3299 return 0;
3295 3300
3301 if (!info->spare)
3302 info->spare = ring_buffer_alloc_read_page(info->tr->buffer);
3303 if (!info->spare)
3304 return -ENOMEM;
3305
3296 /* Do we have previous read data to read? */ 3306 /* Do we have previous read data to read? */
3297 if (info->read < PAGE_SIZE) 3307 if (info->read < PAGE_SIZE)
3298 goto read; 3308 goto read;
@@ -3331,7 +3341,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
3331{ 3341{
3332 struct ftrace_buffer_info *info = file->private_data; 3342 struct ftrace_buffer_info *info = file->private_data;
3333 3343
3334 ring_buffer_free_read_page(info->tr->buffer, info->spare); 3344 if (info->spare)
3345 ring_buffer_free_read_page(info->tr->buffer, info->spare);
3335 kfree(info); 3346 kfree(info);
3336 3347
3337 return 0; 3348 return 0;
@@ -3417,14 +3428,19 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3417 int size, i; 3428 int size, i;
3418 size_t ret; 3429 size_t ret;
3419 3430
3420 /* 3431 if (*ppos & (PAGE_SIZE - 1)) {
3421 * We can't seek on a buffer input 3432 WARN_ONCE(1, "Ftrace: previous read must page-align\n");
3422 */ 3433 return -EINVAL;
3423 if (unlikely(*ppos)) 3434 }
3424 return -ESPIPE;
3425 3435
3436 if (len & (PAGE_SIZE - 1)) {
3437 WARN_ONCE(1, "Ftrace: splice_read should page-align\n");
3438 if (len < PAGE_SIZE)
3439 return -EINVAL;
3440 len &= PAGE_MASK;
3441 }
3426 3442
3427 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) { 3443 for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
3428 struct page *page; 3444 struct page *page;
3429 int r; 3445 int r;
3430 3446
@@ -3463,6 +3479,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
3463 spd.partial[i].offset = 0; 3479 spd.partial[i].offset = 0;
3464 spd.partial[i].private = (unsigned long)ref; 3480 spd.partial[i].private = (unsigned long)ref;
3465 spd.nr_pages++; 3481 spd.nr_pages++;
3482 *ppos += PAGE_SIZE;
3466 } 3483 }
3467 3484
3468 spd.nr_pages = i; 3485 spd.nr_pages = i;
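
tracing_buffers_splice_read() now insists that *ppos is page aligned, rounds a misaligned len down to whole pages (rejecting anything shorter than PAGE_SIZE), and advances *ppos by PAGE_SIZE for every page spliced. A stand-alone sketch of just that alignment arithmetic, with PAGE_SIZE/PAGE_MASK defined locally rather than taken from the kernel headers:

#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))

/* Returns the usable length, or -1 where the kernel would return -EINVAL. */
static long check_splice_args(unsigned long ppos, unsigned long len)
{
	if (ppos & (PAGE_SIZE - 1))
		return -1;		/* previous read must have been page aligned */

	if (len & (PAGE_SIZE - 1)) {
		if (len < PAGE_SIZE)
			return -1;	/* cannot produce even one full page */
		len &= PAGE_MASK;	/* round down to whole pages */
	}
	return (long)len;
}

int main(void)
{
	printf("%ld\n", check_splice_args(0, 10000));	/* -> 8192 */
	printf("%ld\n", check_splice_args(4096, 4096));	/* -> 4096 */
	printf("%ld\n", check_splice_args(100, 4096));	/* -> -1   */
	return 0;
}
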
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3fc36d3..e685ac2b2ba1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -182,6 +182,12 @@ struct trace_power {
182 struct power_trace state_data; 182 struct power_trace state_data;
183}; 183};
184 184
185enum kmemtrace_type_id {
186 KMEMTRACE_TYPE_KMALLOC = 0, /* kmalloc() or kfree(). */
187 KMEMTRACE_TYPE_CACHE, /* kmem_cache_*(). */
188 KMEMTRACE_TYPE_PAGES, /* __get_free_pages() and friends. */
189};
190
185struct kmemtrace_alloc_entry { 191struct kmemtrace_alloc_entry {
186 struct trace_entry ent; 192 struct trace_entry ent;
187 enum kmemtrace_type_id type_id; 193 enum kmemtrace_type_id type_id;
@@ -596,7 +602,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
596#endif /* CONFIG_FTRACE_STARTUP_TEST */ 602#endif /* CONFIG_FTRACE_STARTUP_TEST */
597 603
598extern void *head_page(struct trace_array_cpu *data); 604extern void *head_page(struct trace_array_cpu *data);
599extern long ns2usecs(cycle_t nsec); 605extern unsigned long long ns2usecs(cycle_t nsec);
600extern int 606extern int
601trace_vbprintk(unsigned long ip, const char *fmt, va_list args); 607trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
602extern int 608extern int
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ffb..576f4fa2af0d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
503 503
504 if (copy_from_user(&buf, ubuf, cnt)) 504 if (copy_from_user(&buf, ubuf, cnt))
505 return -EFAULT; 505 return -EFAULT;
506 buf[cnt] = '\0';
506 507
507 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 508 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
508 if (!pred) 509 if (!pred)
@@ -520,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
520 return cnt; 521 return cnt;
521 } 522 }
522 523
523 if (filter_add_pred(call, pred)) { 524 err = filter_add_pred(call, pred);
525 if (err < 0) {
524 filter_free_pred(pred); 526 filter_free_pred(pred);
525 return -EINVAL; 527 return err;
526 } 528 }
527 529
528 *ppos += cnt; 530 *ppos += cnt;
@@ -569,6 +571,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
569 571
570 if (copy_from_user(&buf, ubuf, cnt)) 572 if (copy_from_user(&buf, ubuf, cnt))
571 return -EFAULT; 573 return -EFAULT;
574 buf[cnt] = '\0';
572 575
573 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 576 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
574 if (!pred) 577 if (!pred)
@@ -586,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
586 return cnt; 589 return cnt;
587 } 590 }
588 591
589 if (filter_add_subsystem_pred(system, pred)) { 592 err = filter_add_subsystem_pred(system, pred);
593 if (err < 0) {
590 filter_free_subsystem_preds(system); 594 filter_free_subsystem_preds(system);
591 filter_free_pred(pred); 595 filter_free_pred(pred);
592 return -EINVAL; 596 return err;
593 } 597 }
594 598
595 *ppos += cnt; 599 *ppos += cnt;
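
Both event_filter_write() and subsystem_filter_write() gain a buf[cnt] = '\0' because copy_from_user() copies raw bytes and does not terminate the string before the filter parser walks it. A user-space analogue of the pattern; fake_copy_from_user() below is a stand-in for the real helper, not kernel code:

#include <stdio.h>
#include <string.h>

#define MAX_INPUT	128

static int fake_copy_from_user(char *dst, const char *src, size_t cnt)
{
	memcpy(dst, src, cnt);		/* copies bytes only, no terminator */
	return 0;
}

int main(void)
{
	const char user_data[] = { 'p', 'i', 'd', ' ', '=', '=', ' ', '1' };
	char buf[MAX_INPUT + 1];
	size_t cnt = sizeof(user_data);

	if (cnt > MAX_INPUT)
		cnt = MAX_INPUT;

	fake_copy_from_user(buf, user_data, cnt);
	buf[cnt] = '\0';		/* without this, parsing reads stack garbage */

	printf("filter string: \"%s\"\n", buf);
	return 0;
}
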
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f356..e03cbf1e38f3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
215 } 215 }
216 } 216 }
217 217
218 return -ENOMEM; 218 return -ENOSPC;
219} 219}
220 220
221static int is_string_field(const char *type) 221static int is_string_field(const char *type)
@@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
319 } 319 }
320 320
321 if (i == MAX_FILTER_PRED) 321 if (i == MAX_FILTER_PRED)
322 return -EINVAL; 322 return -ENOSPC;
323 323
324 events_for_each(call) { 324 events_for_each(call) {
325 int err; 325 int err;
@@ -410,16 +410,22 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
410 } 410 }
411 } 411 }
412 412
413 if (!val_str) {
414 pred->field_name = NULL;
415 return -EINVAL;
416 }
417
413 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); 418 pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
414 if (!pred->field_name) 419 if (!pred->field_name)
415 return -ENOMEM; 420 return -ENOMEM;
416 421
417 pred->val = simple_strtoull(val_str, &tmp, 10); 422 pred->val = simple_strtoull(val_str, &tmp, 0);
418 if (tmp == val_str) { 423 if (tmp == val_str) {
419 pred->str_val = kstrdup(val_str, GFP_KERNEL); 424 pred->str_val = kstrdup(val_str, GFP_KERNEL);
420 if (!pred->str_val) 425 if (!pred->str_val)
421 return -ENOMEM; 426 return -ENOMEM;
422 } 427 } else if (*tmp != '\0')
428 return -EINVAL;
423 429
424 return 0; 430 return 0;
425} 431}
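
The filter_parse() change switches simple_strtoull() to base 0 (so hex such as 0x2a parses) and adds the *tmp != '\0' check so operands like "123abc" are rejected rather than silently truncated, while fully non-numeric operands still fall back to a string match. A user-space sketch of that classification, with strtoull() standing in for simple_strtoull():

#include <stdio.h>
#include <stdlib.h>

static void classify(const char *val_str)
{
	char *tmp;
	unsigned long long val = strtoull(val_str, &tmp, 0);

	if (tmp == val_str)
		printf("\"%s\" -> string operand\n", val_str);
	else if (*tmp != '\0')
		printf("\"%s\" -> invalid (trailing characters)\n", val_str);
	else
		printf("\"%s\" -> numeric operand %llu\n", val_str, val);
}

int main(void)
{
	classify("42");		/* numeric */
	classify("0x2a");	/* numeric, thanks to base 0 */
	classify("bash");	/* string */
	classify("123abc");	/* rejected */
	return 0;
}
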
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d4110..d363c6672c6c 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \
105 return 0; 105 return 0;
106 106
107#undef __entry 107#undef __entry
108#define __entry "REC" 108#define __entry REC
109 109
110#undef TP_printk 110#undef TP_printk
111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args 111#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
112 112
113#undef TP_fast_assign 113#undef TP_fast_assign
114#define TP_fast_assign(args...) args 114#define TP_fast_assign(args...) args
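
The TP_printk change replaces #args with __stringify(args) so that macros inside the arguments (notably __entry, now defined as the bare token REC rather than the string "REC") are expanded before being stringized. A self-contained demo of the two-level stringification idiom; the macros below mirror <linux/stringify.h> but are redefined locally for the example:

#include <stdio.h>

#define __stringify_1(x...)	#x
#define __stringify(x...)	__stringify_1(x)

#define REC	rec->field

int main(void)
{
	/* direct stringize: keeps the literal macro name */
	printf("%s\n", __stringify_1(REC));	/* prints: REC */
	/* two-level stringize: expands REC first */
	printf("%s\n", __stringify(REC));	/* prints: rec->field */
	return 0;
}
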
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4d9952d3df50..07a22c33ebf3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -40,7 +40,7 @@
40 40
41#undef TRACE_FIELD_ZERO_CHAR 41#undef TRACE_FIELD_ZERO_CHAR
42#define TRACE_FIELD_ZERO_CHAR(item) \ 42#define TRACE_FIELD_ZERO_CHAR(item) \
43 ret = trace_seq_printf(s, "\tfield: char " #item ";\t" \ 43 ret = trace_seq_printf(s, "\tfield:char " #item ";\t" \
44 "offset:%u;\tsize:0;\n", \ 44 "offset:%u;\tsize:0;\n", \
45 (unsigned int)offsetof(typeof(field), item)); \ 45 (unsigned int)offsetof(typeof(field), item)); \
46 if (!ret) \ 46 if (!ret) \
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d72b9a63b247..64b54a59c55b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
423 423
424 trace_find_cmdline(entry->pid, comm); 424 trace_find_cmdline(entry->pid, comm);
425 425
426 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]" 426 ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
427 " %ld.%03ldms (+%ld.%03ldms): ", comm, 427 " %ld.%03ldms (+%ld.%03ldms): ", comm,
428 entry->pid, iter->cpu, entry->flags, 428 entry->pid, iter->cpu, entry->flags,
429 entry->preempt_count, iter->idx, 429 entry->preempt_count, iter->idx,
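
The [%08lx] -> [%08llx] fix is in line with the ns2usecs() widening to unsigned long long elsewhere in this patch: with a 32-bit long, a %lx conversion consumes only half of a 64-bit vararg and skews every later field. A user-space illustration of printing a 64-bit value with a matching conversion, where PRIx64 plays the role of the ll length modifier:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	uint64_t ts = 0x123456789abcULL;

	/* the conversion width matches the 64-bit argument it consumes */
	printf("[%08" PRIx64 "]\n", ts);
	return 0;
}
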
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index de35f200abd3..9117cea6f1ae 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
62 pc = preempt_count(); 62 pc = preempt_count();
63 tracing_record_cmdline(current); 63 tracing_record_cmdline(current);
64 64
65 if (sched_stopped)
66 return;
67
65 local_irq_save(flags); 68 local_irq_save(flags);
66 cpu = raw_smp_processor_id(); 69 cpu = raw_smp_processor_id();
67 data = ctx_trace->data[cpu]; 70 data = ctx_trace->data[cpu];
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c5ad6b2ec84..5bc00e8f153e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
154 if (unlikely(!tracer_enabled || next != wakeup_task)) 154 if (unlikely(!tracer_enabled || next != wakeup_task))
155 goto out_unlock; 155 goto out_unlock;
156 156
157 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 157 trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); 158 tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
159 159
160 /* 160 /*
@@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
257 data = wakeup_trace->data[wakeup_cpu]; 257 data = wakeup_trace->data[wakeup_cpu];
258 data->preempt_timestamp = ftrace_now(cpu); 258 data->preempt_timestamp = ftrace_now(cpu);
259 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); 259 tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
260
261 /*
262 * We must be careful in using CALLER_ADDR2. But since wake_up
263 * is not called by an assembly function (where as schedule is)
264 * it should be safe to use it here.
265 */
260 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); 266 trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
261 267
262out_locked: 268out_locked:
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index a2a3af29c943..5e579645ac86 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,5 +1,5 @@
1#include <trace/syscall.h>
1#include <linux/kernel.h> 2#include <linux/kernel.h>
2#include <linux/ftrace.h>
3#include <asm/syscall.h> 3#include <asm/syscall.h>
4 4
5#include "trace_output.h" 5#include "trace_output.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b6b966ce1451..f71fb2a08950 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -966,20 +966,20 @@ undo:
966} 966}
967 967
968#ifdef CONFIG_SMP 968#ifdef CONFIG_SMP
969static struct workqueue_struct *work_on_cpu_wq __read_mostly;
970 969
971struct work_for_cpu { 970struct work_for_cpu {
972 struct work_struct work; 971 struct completion completion;
973 long (*fn)(void *); 972 long (*fn)(void *);
974 void *arg; 973 void *arg;
975 long ret; 974 long ret;
976}; 975};
977 976
978static void do_work_for_cpu(struct work_struct *w) 977static int do_work_for_cpu(void *_wfc)
979{ 978{
980 struct work_for_cpu *wfc = container_of(w, struct work_for_cpu, work); 979 struct work_for_cpu *wfc = _wfc;
981
982 wfc->ret = wfc->fn(wfc->arg); 980 wfc->ret = wfc->fn(wfc->arg);
981 complete(&wfc->completion);
982 return 0;
983} 983}
984 984
985/** 985/**
@@ -990,17 +990,23 @@ static void do_work_for_cpu(struct work_struct *w)
990 * 990 *
991 * This will return the value @fn returns. 991 * This will return the value @fn returns.
992 * It is up to the caller to ensure that the cpu doesn't go offline. 992 * It is up to the caller to ensure that the cpu doesn't go offline.
993 * The caller must not hold any locks which would prevent @fn from completing.
993 */ 994 */
994long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) 995long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
995{ 996{
996 struct work_for_cpu wfc; 997 struct task_struct *sub_thread;
997 998 struct work_for_cpu wfc = {
998 INIT_WORK(&wfc.work, do_work_for_cpu); 999 .completion = COMPLETION_INITIALIZER_ONSTACK(wfc.completion),
999 wfc.fn = fn; 1000 .fn = fn,
1000 wfc.arg = arg; 1001 .arg = arg,
1001 queue_work_on(cpu, work_on_cpu_wq, &wfc.work); 1002 };
1002 flush_work(&wfc.work); 1003
1003 1004 sub_thread = kthread_create(do_work_for_cpu, &wfc, "work_for_cpu");
1005 if (IS_ERR(sub_thread))
1006 return PTR_ERR(sub_thread);
1007 kthread_bind(sub_thread, cpu);
1008 wake_up_process(sub_thread);
1009 wait_for_completion(&wfc.completion);
1004 return wfc.ret; 1010 return wfc.ret;
1005} 1011}
1006EXPORT_SYMBOL_GPL(work_on_cpu); 1012EXPORT_SYMBOL_GPL(work_on_cpu);
@@ -1016,8 +1022,4 @@ void __init init_workqueues(void)
1016 hotcpu_notifier(workqueue_cpu_callback, 0); 1022 hotcpu_notifier(workqueue_cpu_callback, 0);
1017 keventd_wq = create_workqueue("events"); 1023 keventd_wq = create_workqueue("events");
1018 BUG_ON(!keventd_wq); 1024 BUG_ON(!keventd_wq);
1019#ifdef CONFIG_SMP
1020 work_on_cpu_wq = create_workqueue("work_on_cpu");
1021 BUG_ON(!work_on_cpu_wq);
1022#endif
1023} 1025}
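
work_on_cpu() above is reimplemented without the dedicated work_on_cpu_wq: it creates a kthread, binds it to the target CPU before waking it, runs fn(arg) there and waits on an on-stack completion for the result; the new kerneldoc line notes that the caller must not hold any locks that would prevent fn from completing. A user-space analogue using pthreads follows; run_on_cpu(), the affinity attribute, and pthread_join() as the completion are illustrative stand-ins, not the kernel API (build with -pthread):

#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

struct work_for_cpu {
	long (*fn)(void *);
	void *arg;
	long ret;
};

static void *do_work_for_cpu(void *_wfc)
{
	struct work_for_cpu *wfc = _wfc;

	wfc->ret = wfc->fn(wfc->arg);
	return NULL;
}

static long run_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
{
	struct work_for_cpu wfc = { .fn = fn, .arg = arg, .ret = -1 };
	pthread_attr_t attr;
	pthread_t helper;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);

	pthread_attr_init(&attr);
	/* pin before the thread ever runs, like kthread_bind() before wake_up_process() */
	pthread_attr_setaffinity_np(&attr, sizeof(set), &set);

	if (pthread_create(&helper, &attr, do_work_for_cpu, &wfc)) {
		pthread_attr_destroy(&attr);
		return -1;
	}
	pthread_attr_destroy(&attr);

	pthread_join(helper, NULL);	/* stands in for wait_for_completion() */
	return wfc.ret;
}

static long say_hello(void *arg)
{
	printf("running \"%s\" on cpu %d\n", (char *)arg, sched_getcpu());
	return 0;
}

int main(void)
{
	return (int)run_on_cpu(0, say_hello, "work_for_cpu demo");
}
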