Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c                        8
-rw-r--r--  kernel/exit.c                          4
-rw-r--r--  kernel/fork.c                          2
-rw-r--r--  kernel/futex.c                        20
-rw-r--r--  kernel/irq/spurious.c                  2
-rw-r--r--  kernel/kprobes.c                      68
-rw-r--r--  kernel/kthread.c                      23
-rw-r--r--  kernel/lockdep.c                       2
-rw-r--r--  kernel/notifier.c                      2
-rw-r--r--  kernel/params.c                       17
-rw-r--r--  kernel/perf_event.c                   91
-rw-r--r--  kernel/power/hibernate.c              11
-rw-r--r--  kernel/power/suspend_test.c            5
-rw-r--r--  kernel/power/swap.c                   43
-rw-r--r--  kernel/rcutree.c                      60
-rw-r--r--  kernel/rcutree.h                      17
-rw-r--r--  kernel/rcutree_plugin.h               46
-rw-r--r--  kernel/sched.c                        65
-rw-r--r--  kernel/sched_fair.c                   74
-rw-r--r--  kernel/sys.c                          25
-rw-r--r--  kernel/sysctl_check.c                  2
-rw-r--r--  kernel/trace/Kconfig                  17
-rw-r--r--  kernel/trace/Makefile                  1
-rw-r--r--  kernel/trace/ftrace.c                  8
-rw-r--r--  kernel/trace/ring_buffer.c            14
-rw-r--r--  kernel/trace/trace.c                   8
-rw-r--r--  kernel/trace/trace.h                  23
-rw-r--r--  kernel/trace/trace_event_profile.c    45
-rw-r--r--  kernel/trace/trace_events.c          144
-rw-r--r--  kernel/trace/trace_events_filter.c     2
-rw-r--r--  kernel/trace/trace_export.c           14
-rw-r--r--  kernel/trace/trace_kprobe.c         1513
-rw-r--r--  kernel/trace/trace_output.c            5
-rw-r--r--  kernel/trace/trace_syscalls.c         60
-rw-r--r--  kernel/user.c                          2
-rw-r--r--  kernel/workqueue.c                    23
36 files changed, 2151 insertions(+), 315 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba19..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
                return -EFAULT;
 
        buffer[nbytes] = 0;     /* nul-terminate */
-       strstrip(buffer);
        if (cft->write_u64) {
-               u64 val = simple_strtoull(buffer, &end, 0);
+               u64 val = simple_strtoull(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_u64(cgrp, cft, val);
        } else {
-               s64 val = simple_strtoll(buffer, &end, 0);
+               s64 val = simple_strtoll(strstrip(buffer), &end, 0);
                if (*end)
                        return -EINVAL;
                retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
        }
 
        buffer[nbytes] = 0;     /* nul-terminate */
-       strstrip(buffer);
-       retval = cft->write_string(cgrp, cft, buffer);
+       retval = cft->write_string(cgrp, cft, strstrip(buffer));
        if (!retval)
                retval = nbytes;
 out:
diff --git a/kernel/exit.c b/kernel/exit.c
index 266f8920628a..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -360,10 +360,8 @@ void __set_special_pids(struct pid *pid)
 {
        struct task_struct *curr = current->group_leader;
 
-       if (task_session(curr) != pid) {
+       if (task_session(curr) != pid)
                change_pid(curr, PIDTYPE_SID, pid);
-               proc_sid_connector(curr);
-       }
 
        if (task_pgrp(curr) != pid)
                change_pid(curr, PIDTYPE_PGID, pid);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13a..166b8c49257c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
        int cpu;
        int total = 0;
 
-       for_each_online_cpu(cpu)
+       for_each_possible_cpu(cpu)
                total += per_cpu(process_counts, cpu);
 
        return total;
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88d..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-       return (key1->both.word == key2->both.word
+       return (key1 && key2
+               && key1->both.word == key2->both.word
                && key1->both.ptr == key2->both.ptr
                && key1->both.offset == key2->both.offset);
 }
@@ -1028,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
                           struct futex_hash_bucket *hb)
 {
-       drop_futex_key_refs(&q->key);
        get_futex_key_refs(key);
        q->key = *key;
 
@@ -1226,6 +1226,7 @@ retry_private:
                 */
                if (ret == 1) {
                        WARN_ON(pi_state);
+                       drop_count++;
                        task_count++;
                        ret = get_futex_value_locked(&curval2, uaddr2);
                        if (!ret)
@@ -1304,6 +1305,7 @@ retry_private:
                        if (ret == 1) {
                                /* We got the lock. */
                                requeue_pi_wake_futex(this, &key2, hb2);
+                               drop_count++;
                                continue;
                        } else if (ret) {
                                /* -EDEADLK */
@@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                                             current->timer_slack_ns);
        }
 
+retry:
        /* Prepare to wait on uaddr. */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
        if (ret)
@@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                goto out_put_key;
 
        /*
-        * We expect signal_pending(current), but another thread may
-        * have handled it for us already.
+        * We expect signal_pending(current), but we might be the
+        * victim of a spurious wakeup as well.
         */
+       if (!signal_pending(current)) {
+               put_futex_key(fshared, &q.key);
+               goto retry;
+       }
+
        ret = -ERESTARTSYS;
        if (!abs_time)
                goto out_put_key;
@@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
                 */
                plist_del(&q->list, &q->list.plist);
 
+               /* Handle spurious wakeups gracefully */
+               ret = -EWOULDBLOCK;
                if (timeout && !timeout->task)
                        ret = -ETIMEDOUT;
-               else
+               else if (signal_pending(current))
                        ret = -ERESTARTNOINTR;
        }
        return ret;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..bd7273e6282e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void)
                if (!(status & IRQ_SPURIOUS_DISABLED))
                        continue;
 
+               local_irq_disable();
                try_one_irq(i, desc);
+               local_irq_enable();
        }
 }
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..84495958e703 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
        {"preempt_schedule",},
+       {"native_get_debugreg",},
+       {"irq_entries_start",},
+       {"common_interrupt",},
        {NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
        return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+       struct kprobe *old_p, *list_p;
+
+       old_p = get_kprobe(p->addr);
+       if (unlikely(!old_p))
+               return NULL;
+
+       if (p != old_p) {
+               list_for_each_entry_rcu(list_p, &old_p->list, list)
+                       if (list_p == p)
+                       /* kprobe p is a valid probe */
+                               goto valid;
+               return NULL;
+       }
+valid:
+       return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+       int ret = 0;
+       struct kprobe *old_p;
+
+       mutex_lock(&kprobe_mutex);
+       old_p = __get_valid_kprobe(p);
+       if (old_p)
+               ret = -EINVAL;
+       mutex_unlock(&kprobe_mutex);
+       return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
        int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
                return -EINVAL;
        p->addr = addr;
 
+       ret = check_kprobe_rereg(p);
+       if (ret)
+               return ret;
+
        preempt_disable();
        if (!kernel_text_address((unsigned long) p->addr) ||
            in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-       struct kprobe *old_p, *list_p;
-
-       old_p = get_kprobe(p->addr);
-       if (unlikely(!old_p))
-               return NULL;
-
-       if (p != old_p) {
-               list_for_each_entry_rcu(list_p, &old_p->list, list)
-                       if (list_p == p)
-                       /* kprobe p is a valid probe */
-                               goto valid;
-               return NULL;
-       }
-valid:
-       return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
        arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+       printk(KERN_WARNING "Dumping kprobe:\n");
+       printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+              kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
                                             unsigned long val, void *data)
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 EXPORT_SYMBOL(kthread_create);
 
 /**
- * kthread_bind - bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (i.e., just returned from kthread_create()).
- */
-void kthread_bind(struct task_struct *k, unsigned int cpu)
-{
-       /* Must have done schedule() in kthread() before we set_task_cpu */
-       if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
-               WARN_ON(1);
-               return;
-       }
-       set_task_cpu(k, cpu);
-       k->cpus_allowed = cpumask_of_cpu(cpu);
-       k->rt.nr_cpus_allowed = 1;
-       k->flags |= PF_THREAD_BOUND;
-}
-EXPORT_SYMBOL(kthread_bind);
-
-/**
  * kthread_stop - stop a thread created by kthread_create().
  * @k: thread created by kthread_create().
  *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
               struct pt_regs *regs, long err, int trap, int sig)
 {
        struct die_args args = {
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb2..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
                return -ENOSPC;
        }
 
-       if (kp->flags & KPARAM_KMALLOCED)
-               kfree(*(char **)kp->arg);
-
        /* This is a hack.  We can't need to strdup in early boot, and we
         * don't need to; this mangled commandline is preserved. */
        if (slab_is_available()) {
-               kp->flags |= KPARAM_KMALLOCED;
                *(char **)kp->arg = kstrdup(val, GFP_KERNEL);
-               if (!kp->arg)
+               if (!*(char **)kp->arg)
                        return -ENOMEM;
        } else
                *(const char **)kp->arg = val;
@@ -304,6 +300,7 @@ static int param_array(const char *name,
                       unsigned int min, unsigned int max,
                       void *elem, int elemsize,
                       int (*set)(const char *, struct kernel_param *kp),
+                      u16 flags,
                       unsigned int *num)
 {
        int ret;
@@ -313,6 +310,7 @@ static int param_array(const char *name,
        /* Get the name right for errors. */
        kp.name = name;
        kp.arg = elem;
+       kp.flags = flags;
 
        /* No equals sign? */
        if (!val) {
@@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
        unsigned int temp_num;
 
        return param_array(kp->name, val, 1, arr->max, arr->elem,
-                          arr->elemsize, arr->set, arr->num ?: &temp_num);
+                          arr->elemsize, arr->set, kp->flags,
+                          arr->num ?: &temp_num);
 }
 
 int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
 
 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-       unsigned int i;
-
-       for (i = 0; i < num; i++)
-               if (params[i].flags & KPARAM_KMALLOCED)
-                       kfree(*(char **)params[i].arg);
+       /* FIXME: This should free kmalloced charp parameters.  It doesn't. */
 }
 
 static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 98dc56b2ebe4..3852e2656bb0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1357,7 +1357,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
        u64 interrupts, freq;
 
        spin_lock(&ctx->lock);
-       list_for_each_entry(event, &ctx->group_list, group_entry) {
+       list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
                if (event->state != PERF_EVENT_STATE_ACTIVE)
                        continue;
 
@@ -2696,20 +2696,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 static void perf_output_lock(struct perf_output_handle *handle)
 {
        struct perf_mmap_data *data = handle->data;
-       int cpu;
+       int cur, cpu = get_cpu();
 
        handle->locked = 0;
 
-       local_irq_save(handle->flags);
-       cpu = smp_processor_id();
-
-       if (in_nmi() && atomic_read(&data->lock) == cpu)
-               return;
+       for (;;) {
+               cur = atomic_cmpxchg(&data->lock, -1, cpu);
+               if (cur == -1) {
+                       handle->locked = 1;
+                       break;
+               }
+               if (cur == cpu)
+                       break;
 
-       while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
                cpu_relax();
-
-       handle->locked = 1;
+       }
 }
 
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2755,7 +2756,7 @@ again:
        if (atomic_xchg(&data->wakeup, 0))
                perf_output_wakeup(handle);
 out:
-       local_irq_restore(handle->flags);
+       put_cpu();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
@@ -3998,8 +3999,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        regs = task_pt_regs(current);
 
        if (regs) {
-               if (perf_event_overflow(event, 0, &data, regs))
-                       ret = HRTIMER_NORESTART;
+               if (!(event->attr.exclude_idle && current->pid == 0))
+                       if (perf_event_overflow(event, 0, &data, regs))
+                               ret = HRTIMER_NORESTART;
        }
 
        period = max_t(u64, 10000, event->hw.sample_period);
@@ -4008,6 +4010,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
        return ret;
 }
 
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+       hwc->hrtimer.function = perf_swevent_hrtimer;
+       if (hwc->sample_period) {
+               u64 period;
+
+               if (hwc->remaining) {
+                       if (hwc->remaining < 0)
+                               period = 10000;
+                       else
+                               period = hwc->remaining;
+                       hwc->remaining = 0;
+               } else {
+                       period = max_t(u64, 10000, hwc->sample_period);
+               }
+               __hrtimer_start_range_ns(&hwc->hrtimer,
+                               ns_to_ktime(period), 0,
+                               HRTIMER_MODE_REL, 0);
+       }
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+       struct hw_perf_event *hwc = &event->hw;
+
+       if (hwc->sample_period) {
+               ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+               hwc->remaining = ktime_to_ns(remaining);
+
+               hrtimer_cancel(&hwc->hrtimer);
+       }
+}
+
 /*
  * Software event: cpu wall time clock
  */
@@ -4030,22 +4068,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
        int cpu = raw_smp_processor_id();
 
        atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
+       perf_swevent_start_hrtimer(event);
 
        return 0;
 }
 
 static void cpu_clock_perf_event_disable(struct perf_event *event)
 {
-       if (event->hw.sample_period)
-               hrtimer_cancel(&event->hw.hrtimer);
+       perf_swevent_cancel_hrtimer(event);
        cpu_clock_perf_event_update(event);
 }
 
@@ -4082,22 +4112,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
        now = event->ctx->time;
 
        atomic64_set(&hwc->prev_count, now);
-       hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-       hwc->hrtimer.function = perf_swevent_hrtimer;
-       if (hwc->sample_period) {
-               u64 period = max_t(u64, 10000, hwc->sample_period);
-               __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(period), 0,
-                               HRTIMER_MODE_REL, 0);
-       }
+
+       perf_swevent_start_hrtimer(event);
 
        return 0;
 }
 
 static void task_clock_perf_event_disable(struct perf_event *event)
 {
-       if (event->hw.sample_period)
-               hrtimer_cancel(&event->hw.hrtimer);
+       perf_swevent_cancel_hrtimer(event);
        task_clock_perf_event_update(event, event->ctx->time);
 
 }
@@ -4319,6 +4342,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
        case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
        case PERF_COUNT_SW_CONTEXT_SWITCHES:
        case PERF_COUNT_SW_CPU_MIGRATIONS:
+       case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+       case PERF_COUNT_SW_EMULATION_FAULTS:
                if (!event->parent) {
                        atomic_inc(&perf_swevent_enabled[event_id]);
                        event->destroy = sw_perf_event_destroy;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
        /* The snapshot device should not be opened while we're running */
        if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
                error = -EBUSY;
+               swsusp_close(FMODE_READ);
                goto Unlock;
        }
 
        pm_prepare_console();
        error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
        if (error)
-               goto Finish;
+               goto close_finish;
 
        error = usermodehelper_disable();
        if (error)
-               goto Finish;
+               goto close_finish;
 
        error = create_basic_memory_bitmaps();
        if (error)
-               goto Finish;
+               goto close_finish;
 
        pr_debug("PM: Preparing processes for restore.\n");
        error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
        pr_debug("PM: Reading hibernation image.\n");
 
        error = swsusp_read(&flags);
+       swsusp_close(FMODE_READ);
        if (!error)
                hibernation_restore(flags & SF_PLATFORM_MODE);
 
@@ -737,6 +739,9 @@ static int software_resume(void)
        mutex_unlock(&pm_mutex);
        pr_debug("PM: Resume from disk failed.\n");
        return error;
+ close_finish:
+       swsusp_close(FMODE_READ);
+       goto Finish;
 }
 
 late_initcall(software_resume);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
  * The time it takes is system-specific though, so when we test this
  * during system bootup we allow a LOT of time.
  */
-#define TEST_SUSPEND_SECONDS   5
+#define TEST_SUSPEND_SECONDS   10
 
 static unsigned long suspend_test_start_time;
 
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
         * has some performance issues.  The stack dump of a WARN_ON
         * is more likely to get the right attention than a printk...
         */
-       WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+       WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+            "Component: %s, time: %u\n", label, msec);
 }
 
 /*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3f..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
 {
        unsigned int m;
        int ret;
-       int error = 0;
        int nr_pages;
        int err2;
        struct bio *bio;
@@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
        nr_pages = 0;
        bio = NULL;
        do_gettimeofday(&start);
-       do {
+       while (1) {
                ret = snapshot_read_next(snapshot, PAGE_SIZE);
-               if (ret > 0) {
-                       error = swap_write_page(handle, data_of(*snapshot),
-                                               &bio);
-                       if (error)
-                               break;
-                       if (!(nr_pages % m))
-                               printk("\b\b\b\b%3d%%", nr_pages / m);
-                       nr_pages++;
-               }
-       } while (ret > 0);
+               if (ret <= 0)
+                       break;
+               ret = swap_write_page(handle, data_of(*snapshot), &bio);
+               if (ret)
+                       break;
+               if (!(nr_pages % m))
+                       printk("\b\b\b\b%3d%%", nr_pages / m);
+               nr_pages++;
+       }
        err2 = wait_on_bio_chain(&bio);
        do_gettimeofday(&stop);
-       if (!error)
-               error = err2;
-       if (!error)
+       if (!ret)
+               ret = err2;
+       if (!ret)
                printk("\b\b\b\bdone\n");
+       else
+               printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-       return error;
+       return ret;
 }
 
 /**
@@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
                snapshot_write_finalize(snapshot);
                if (!snapshot_image_loaded(snapshot))
                        error = -ENODATA;
-       }
+       } else
+               printk("\n");
        swsusp_show_speed(&start, &stop, nr_to_read, "Read");
        return error;
 }
@@ -572,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
        error = load_image(&handle, &snapshot, header->pages - 1);
        release_swap_reader(&handle);
 
-       blkdev_put(resume_bdev, FMODE_READ);
-
        if (!error)
                pr_debug("PM: Image successfully loaded\n");
        else
@@ -596,7 +595,7 @@ int swsusp_check(void)
                error = bio_read_page(swsusp_resume_block,
                                        swsusp_header, NULL);
                if (error)
-                       return error;
+                       goto put;
 
                if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
                        memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +603,10 @@ int swsusp_check(void)
                        error = bio_write_page(swsusp_resume_block,
                                                swsusp_header, NULL);
                } else {
-                       return -EINVAL;
+                       error = -EINVAL;
                }
+
+put:
        if (error)
                blkdev_put(resume_bdev, FMODE_READ);
        else
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac7433..f3077c0ab181 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -59,7 +59,7 @@
                NUM_RCU_LVL_2, \
                NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
        }, \
-       .signaled = RCU_SIGNAL_INIT, \
+       .signaled = RCU_GP_IDLE, \
        .gpnum = -300, \
        .completed = -300, \
        .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
@@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         * irqs disabled.
         */
        rcu_for_each_node_breadth_first(rsp, rnp) {
                spin_lock(&rnp->lock);  /* irqs already disabled. */
                rcu_preempt_check_blocked_tasks(rnp);
                rnp->qsmask = rnp->qsmaskinit;
                rnp->gpnum = rsp->gpnum;
-               spin_unlock(&rnp->lock); /* irqs already disabled. */
+               spin_unlock(&rnp->lock); /* irqs remain disabled. */
        }
 
+       rnp = rcu_get_root(rsp);
+       spin_lock(&rnp->lock);          /* irqs already disabled. */
        rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+       spin_unlock(&rnp->lock);        /* irqs remain disabled. */
        spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
@@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 {
        WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
        rsp->completed = rsp->gpnum;
+       rsp->signaled = RCU_GP_IDLE;
        rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
        rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
@@ -913,7 +917,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
                        spin_unlock(&rnp->lock); /* irqs remain disabled. */
                        break;
                }
-               rcu_preempt_offline_tasks(rsp, rnp, rdp);
+
+               /*
+                * If there was a task blocking the current grace period,
+                * and if all CPUs have checked in, we need to propagate
+                * the quiescent state up the rcu_node hierarchy.  But that
+                * is inconvenient at the moment due to deadlock issues if
+                * this should end the current grace period.  So set the
+                * offlined CPU's bit in ->qsmask in order to force the
+                * next force_quiescent_state() invocation to clean up this
+                * mess in a deadlock-free manner.
+                */
+               if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+                       rnp->qsmask |= mask;
+
                mask = rnp->grpmask;
                spin_unlock(&rnp->lock); /* irqs remain disabled. */
                rnp = rnp->parent;
@@ -958,7 +975,7 @@ static void rcu_offline_cpu(int cpu)
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Thottle as specified by rdp->blimit.
  */
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
        unsigned long flags;
        struct rcu_head *next, *list, **tail;
@@ -1011,6 +1028,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
        if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
                rdp->blimit = blimit;
 
+       /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+       if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+               rdp->qlen_last_fqs_check = 0;
+               rdp->n_force_qs_snap = rsp->n_force_qs;
+       } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+               rdp->qlen_last_fqs_check = rdp->qlen;
+
        local_irq_restore(flags);
 
        /* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1142,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
        }
        spin_unlock(&rnp->lock);
        switch (signaled) {
+       case RCU_GP_IDLE:
        case RCU_GP_INIT:
 
-               break; /* grace period still initializing, ignore. */
+               break; /* grace period idle or initializing, ignore. */
 
        case RCU_SAVE_DYNTICK:
 
@@ -1158,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 
                /* Update state, record completion counter. */
                spin_lock(&rnp->lock);
-               if (lastcomp == rsp->completed) {
+               if (lastcomp == rsp->completed &&
+                   rsp->signaled == RCU_SAVE_DYNTICK) {
                        rsp->signaled = RCU_FORCE_QS;
                        dyntick_record_completed(rsp, lastcomp);
                }
@@ -1224,7 +1250,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
        }
 
        /* If there are callbacks ready, invoke them. */
-       rcu_do_batch(rdp);
+       rcu_do_batch(rsp, rdp);
 }
 
 /*
@@ -1288,10 +1314,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
                rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
        }
 
-       /* Force the grace period if too many callbacks or too long waiting. */
-       if (unlikely(++rdp->qlen > qhimark)) {
+       /*
+        * Force the grace period if too many callbacks or too long waiting.
+        * Enforce hysteresis, and don't invoke force_quiescent_state()
+        * if some other CPU has recently done so.  Also, don't bother
+        * invoking force_quiescent_state() if the newly enqueued callback
+        * is the only one waiting for a grace period to complete.
+        */
+       if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
                rdp->blimit = LONG_MAX;
-               force_quiescent_state(rsp, 0);
+               if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+                   *rdp->nxttail[RCU_DONE_TAIL] != head)
+                       force_quiescent_state(rsp, 0);
+               rdp->n_force_qs_snap = rsp->n_force_qs;
+               rdp->qlen_last_fqs_check = rdp->qlen;
        } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
                force_quiescent_state(rsp, 1);
        local_irq_restore(flags);
@@ -1523,6 +1559,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
        rdp->beenonline = 1;     /* We have now been online. */
        rdp->preemptable = preemptable;
        rdp->passed_quiesc_completed = lastcomp - 1;
+       rdp->qlen_last_fqs_check = 0;
+       rdp->n_force_qs_snap = rsp->n_force_qs;
        rdp->blimit = blimit;
        spin_unlock(&rnp->lock);                /* irqs remain disabled. */
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac5706040..1899023b0962 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
        struct rcu_head *nxtlist;
        struct rcu_head **nxttail[RCU_NEXT_SIZE];
        long            qlen;           /* # of queued callbacks */
+       long            qlen_last_fqs_check;
+                                       /* qlen at last check for QS forcing */
+       unsigned long   n_force_qs_snap;
+                                       /* did other CPU force QS recently? */
        long            blimit;         /* Upper limit on a processed batch */
 
 #ifdef CONFIG_NO_HZ
@@ -197,9 +201,10 @@ struct rcu_data {
 };
 
 /* Values for signaled field in struct rcu_state. */
-#define RCU_GP_INIT            0       /* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK       1       /* Need to scan dyntick state. */
-#define RCU_FORCE_QS           2       /* Need to force quiescent state. */
+#define RCU_GP_IDLE            0       /* No grace period in progress. */
+#define RCU_GP_INIT            1       /* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK       2       /* Need to scan dyntick state. */
+#define RCU_FORCE_QS           3       /* Need to force quiescent state. */
 #ifdef CONFIG_NO_HZ
 #define RCU_SIGNAL_INIT                RCU_SAVE_DYNTICK
 #else /* #ifdef CONFIG_NO_HZ */
@@ -302,9 +307,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-                                     struct rcu_node *rnp,
-                                     struct rcu_data *rdp);
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+                                    struct rcu_node *rnp,
+                                    struct rcu_data *rdp);
 static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16a..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  * parent is to remove the need for rcu_read_unlock_special() to
  * make more than two attempts to acquire the target rcu_node's lock.
  *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
  * The caller must hold rnp->lock with irqs disabled.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
                                     struct rcu_node *rnp,
                                     struct rcu_data *rdp)
 {
        int i;
        struct list_head *lp;
        struct list_head *lp_root;
+       int retval = rcu_preempted_readers(rnp);
        struct rcu_node *rnp_root = rcu_get_root(rsp);
        struct task_struct *tp;
 
        if (rnp == rnp_root) {
                WARN_ONCE(1, "Last CPU thought to be offlined?");
-               return;  /* Shouldn't happen: at least one CPU online. */
+               return 0;  /* Shouldn't happen: at least one CPU online. */
        }
        WARN_ON_ONCE(rnp != rdp->mynode &&
                     (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
                        spin_unlock(&rnp_root->lock); /* irqs remain disabled */
                }
        }
+
+       return retval;
 }
 
 /*
@@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 
 /*
+ * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * grace period, but this is the crude slow compatability hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+       synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
  * Check to see if there is any immediate preemptable-RCU-related work
  * to be done.
  */
@@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
 /*
  * Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
                                     struct rcu_node *rnp,
                                     struct rcu_data *rdp)
 {
+       return 0;
 }
 
 /*
@@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 
 /*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+       synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
 * Because preemptable RCU does not exist, it never has any work to do.
 */
 static int rcu_preempt_pending(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c
index e88689522e66..3c11ae0a948d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 #ifdef CONFIG_SMP
 static int root_task_group_empty(void)
 {
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
 }
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD  (2*NICE_0_LOAD)
 #else /* !CONFIG_USER_SCHED */
@@ -1564,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-struct update_shares_data {
-       unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1578,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
                                    unsigned long sd_shares,
                                    unsigned long sd_rq_weight,
-                                   struct update_shares_data *usd)
+                                   unsigned long *usd_rq_weight)
 {
        unsigned long shares, rq_weight;
        int boost = 0;
 
-       rq_weight = usd->rq_weight[cpu];
+       rq_weight = usd_rq_weight[cpu];
        if (!rq_weight) {
                boost = 1;
                rq_weight = NICE_0_LOAD;
@@ -1618,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
        unsigned long weight, rq_weight = 0, shares = 0;
-       struct update_shares_data *usd;
+       unsigned long *usd_rq_weight;
        struct sched_domain *sd = data;
        unsigned long flags;
        int i;
@@ -1627,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
                return 0;
 
        local_irq_save(flags);
-       usd = &__get_cpu_var(update_shares_data);
+       usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
 
        for_each_cpu(i, sched_domain_span(sd)) {
                weight = tg->cfs_rq[i]->load.weight;
-               usd->rq_weight[i] = weight;
+               usd_rq_weight[i] = weight;
 
                /*
                 * If there are currently no tasks on the cpu pretend there
@@ -1652,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
                shares = tg->shares;
 
        for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+               update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
 
        local_irq_restore(flags);
 
@@ -1996,6 +1993,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
                p->sched_class->prio_changed(rq, p, oldprio, running);
 }
 
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @p: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ *
+ * Function lives here instead of kthread.c because it messes with
+ * scheduler internals which require locking.
+ */
+void kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags;
+
+       /* Must have done schedule() in kthread() before we set_task_cpu */
+       if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
+               WARN_ON(1);
+               return;
+       }
+
+       spin_lock_irqsave(&rq->lock, flags);
+       set_task_cpu(p, cpu);
+       p->cpus_allowed = cpumask_of_cpu(cpu);
+       p->rt.nr_cpus_allowed = 1;
+       p->flags |= PF_THREAD_BOUND;
+       spin_unlock_irqrestore(&rq->lock, flags);
+}
+EXPORT_SYMBOL(kthread_bind);
+
 #ifdef CONFIG_SMP
 /*
  * Is this task likely cache-hot:
@@ -2008,7 +2037,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
        /*
         * Buddy candidates are cache hot:
         */
-       if (sched_feat(CACHE_HOT_BUDDY) &&
+       if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
                        (&p->se == cfs_rq_of(&p->se)->next ||
                         &p->se == cfs_rq_of(&p->se)->last))
                return 1;
@@ -9407,6 +9436,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */
 
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+       update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+                                           __alignof__(unsigned long));
+#endif
        for_each_possible_cpu(i) {
                struct rq *rq;
 
@@ -9532,13 +9565,13 @@ void __init sched_init(void)
        current->sched_class = &fair_sched_class;
 
        /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-       alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
+       zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-       alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+       zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-       alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+       zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
        perf_event_init();
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..37087a7fac22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
822 * re-elected due to buddy favours. 822 * re-elected due to buddy favours.
823 */ 823 */
824 clear_buddies(cfs_rq, curr); 824 clear_buddies(cfs_rq, curr);
825 return;
826 }
827
828 /*
829 * Ensure that a task that missed wakeup preemption by a
830 * narrow margin doesn't have to wait for a full slice.
831 * This also mitigates buddy induced latencies under load.
832 */
833 if (!sched_feat(WAKEUP_PREEMPT))
834 return;
835
836 if (delta_exec < sysctl_sched_min_granularity)
837 return;
838
839 if (cfs_rq->nr_running > 1) {
840 struct sched_entity *se = __pick_next_entity(cfs_rq);
841 s64 delta = curr->vruntime - se->vruntime;
842
843 if (delta > ideal_runtime)
844 resched_task(rq_of(cfs_rq)->curr);
825 } 845 }
826} 846}
827 847
@@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
861static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 881static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
862{ 882{
863 struct sched_entity *se = __pick_next_entity(cfs_rq); 883 struct sched_entity *se = __pick_next_entity(cfs_rq);
884 struct sched_entity *left = se;
864 885
865 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) 886 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
866 return cfs_rq->next; 887 se = cfs_rq->next;
867 888
868 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) 889 /*
869 return cfs_rq->last; 890 * Prefer last buddy, try to return the CPU to a preempted task.
891 */
892 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
893 se = cfs_rq->last;
894
895 clear_buddies(cfs_rq, se);
870 896
871 return se; 897 return se;
872} 898}
@@ -1568,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1568 struct sched_entity *se = &curr->se, *pse = &p->se; 1594 struct sched_entity *se = &curr->se, *pse = &p->se;
1569 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1595 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1570 int sync = wake_flags & WF_SYNC; 1596 int sync = wake_flags & WF_SYNC;
1597 int scale = cfs_rq->nr_running >= sched_nr_latency;
1571 1598
1572 update_curr(cfs_rq); 1599 update_curr(cfs_rq);
1573 1600
@@ -1582,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1582 if (unlikely(se == pse)) 1609 if (unlikely(se == pse))
1583 return; 1610 return;
1584 1611
1585 /* 1612 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
1586 * Only set the backward buddy when the current task is still on the
1587 * rq. This can happen when a wakeup gets interleaved with schedule on
1588 * the ->pre_schedule() or idle_balance() point, either of which can
1589 * drop the rq lock.
1590 *
1591 * Also, during early boot the idle thread is in the fair class, for
1592 * obvious reasons its a bad idea to schedule back to the idle thread.
1593 */
1594 if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1595 set_last_buddy(se);
1596 if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
1597 set_next_buddy(pse); 1613 set_next_buddy(pse);
1598 1614
1599 /* 1615 /*
@@ -1639,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1639 1655
1640 BUG_ON(!pse); 1656 BUG_ON(!pse);
1641 1657
1642 if (wakeup_preempt_entity(se, pse) == 1) 1658 if (wakeup_preempt_entity(se, pse) == 1) {
1643 resched_task(curr); 1659 resched_task(curr);
1660 /*
1661 * Only set the backward buddy when the current task is still
1662 * on the rq. This can happen when a wakeup gets interleaved
1663 * with schedule on the ->pre_schedule() or idle_balance()
1664 * point, either of which can * drop the rq lock.
1665 *
1666 * Also, during early boot the idle thread is in the fair class,
1667 * for obvious reasons its a bad idea to schedule back to it.
1668 */
1669 if (unlikely(!se->on_rq || curr == rq->idle))
1670 return;
1671 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
1672 set_last_buddy(se);
1673 }
1644} 1674}
1645 1675
1646static struct task_struct *pick_next_task_fair(struct rq *rq) 1676static struct task_struct *pick_next_task_fair(struct rq *rq)
@@ -1654,16 +1684,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1654 1684
1655 do { 1685 do {
1656 se = pick_next_entity(cfs_rq); 1686 se = pick_next_entity(cfs_rq);
1657 /*
1658 * If se was a buddy, clear it so that it will have to earn
1659 * the favour again.
1660 *
1661 * If se was not a buddy, clear the buddies because neither
1662 * was elegible to run, let them earn it again.
1663 *
1664 * IOW. unconditionally clear buddies.
1665 */
1666 __clear_buddies(cfs_rq, NULL);
1667 set_next_entity(cfs_rq, se); 1687 set_next_entity(cfs_rq, se);
1668 cfs_rq = group_cfs_rq(se); 1688 cfs_rq = group_cfs_rq(se);
1669 } while (cfs_rq); 1689 } while (cfs_rq);
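The reworked pick_next_entity() above starts from the leftmost entity, lets the next buddy replace it when that buddy has not run too far ahead, lets the last buddy override even that (to hand the CPU back to a preempted task), and finally clears the buddy marks of whatever was chosen. A minimal, self-contained sketch of that priority order follows; it is not kernel code, and the entity layout and the vruntime comparison are simplified assumptions.

#include <stdio.h>

struct entity { const char *name; long vruntime; };

/* Stand-in for wakeup_preempt_entity(): 0 means the buddy has not run far
 * enough ahead of the leftmost entity to make picking it unfair. */
static int buddy_ok(const struct entity *buddy, const struct entity *left)
{
	return (buddy->vruntime - left->vruntime) < 1 ? 0 : 1;
}

static const struct entity *pick(const struct entity *left,
				 const struct entity *next,
				 const struct entity *last)
{
	const struct entity *se = left;

	if (next && buddy_ok(next, left) < 1)
		se = next;
	if (last && buddy_ok(last, left) < 1)
		se = last;		/* last buddy wins over next buddy */
	/* the kernel clears the buddy pointers for se at this point */
	return se;
}

int main(void)
{
	struct entity leftmost = { "leftmost", 100 };
	struct entity next     = { "next-buddy", 100 };

	printf("picked: %s\n", pick(&leftmost, &next, NULL)->name);
	return 0;
}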
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..ce17760d9c51 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid)
1110 err = session; 1110 err = session;
1111out: 1111out:
1112 write_unlock_irq(&tasklist_lock); 1112 write_unlock_irq(&tasklist_lock);
1113 if (err > 0)
1114 proc_sid_connector(group_leader);
1113 return err; 1115 return err;
1114} 1116}
1115 1117
@@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1546 if (arg4 | arg5) 1548 if (arg4 | arg5)
1547 return -EINVAL; 1549 return -EINVAL;
1548 switch (arg2) { 1550 switch (arg2) {
1549 case 0: 1551 case PR_MCE_KILL_CLEAR:
1550 if (arg3 != 0) 1552 if (arg3 != 0)
1551 return -EINVAL; 1553 return -EINVAL;
1552 current->flags &= ~PF_MCE_PROCESS; 1554 current->flags &= ~PF_MCE_PROCESS;
1553 break; 1555 break;
1554 case 1: 1556 case PR_MCE_KILL_SET:
1555 current->flags |= PF_MCE_PROCESS; 1557 current->flags |= PF_MCE_PROCESS;
1556 if (arg3 != 0) 1558 if (arg3 == PR_MCE_KILL_EARLY)
1557 current->flags |= PF_MCE_EARLY; 1559 current->flags |= PF_MCE_EARLY;
1558 else 1560 else if (arg3 == PR_MCE_KILL_LATE)
1559 current->flags &= ~PF_MCE_EARLY; 1561 current->flags &= ~PF_MCE_EARLY;
1562 else if (arg3 == PR_MCE_KILL_DEFAULT)
1563 current->flags &=
1564 ~(PF_MCE_EARLY|PF_MCE_PROCESS);
1565 else
1566 return -EINVAL;
1560 break; 1567 break;
1561 default: 1568 default:
1562 return -EINVAL; 1569 return -EINVAL;
1563 } 1570 }
1564 error = 0; 1571 error = 0;
1565 break; 1572 break;
1566 1573 case PR_MCE_KILL_GET:
1574 if (arg2 | arg3 | arg4 | arg5)
1575 return -EINVAL;
1576 if (current->flags & PF_MCE_PROCESS)
1577 error = (current->flags & PF_MCE_EARLY) ?
1578 PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
1579 else
1580 error = PR_MCE_KILL_DEFAULT;
1581 break;
1567 default: 1582 default:
1568 error = -EINVAL; 1583 error = -EINVAL;
1569 break; 1584 break;
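The PR_MCE_KILL changes above replace the magic 0/1 sub-commands with named constants and add PR_MCE_KILL_GET for reading the policy back. A hedged userspace sketch of the interface, assuming a kernel with this patch and a <sys/prctl.h> that exposes the PR_MCE_KILL_* constants:

#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
	/* Request early (synchronous) kill for this process on memory errors. */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
		perror("PR_MCE_KILL set");

	/* PR_MCE_KILL_GET returns the current policy; all other args must be 0. */
	int policy = prctl(PR_MCE_KILL_GET, 0, 0, 0, 0);
	printf("current MCE kill policy: %d\n", policy);
	return 0;
}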
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1521 if (!table->ctl_name && table->strategy) 1521 if (!table->ctl_name && table->strategy)
1522 set_fail(&fail, table, "Strategy without ctl_name"); 1522 set_fail(&fail, table, "Strategy without ctl_name");
1523#endif 1523#endif
1524#ifdef CONFIG_PROC_FS 1524#ifdef CONFIG_PROC_SYSCTL
1525 if (table->procname && !table->proc_handler) 1525 if (table->procname && !table->proc_handler)
1526 set_fail(&fail, table, "No proc_handler"); 1526 set_fail(&fail, table, "No proc_handler");
1527#endif 1527#endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 06c3d5be6759..d006554888dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -449,6 +449,23 @@ config BLK_DEV_IO_TRACE
449 449
450 If unsure, say N. 450 If unsure, say N.
451 451
452config KPROBE_EVENT
453 depends on KPROBES
454 depends on X86
455 bool "Enable kprobes-based dynamic events"
456 select TRACING
457 default y
458 help
459 This allows the user to add tracing events (similar to tracepoints) on the fly
460 via the ftrace interface. See Documentation/trace/kprobetrace.txt
461 for more details.
462
463 Those events can be inserted wherever kprobes can probe, and record
464 various register and memory values.
465
 466 This option is also required by the perf-probe subcommand of perf tools. If
467 you want to use perf tools, this option is strongly recommended.
468
452config DYNAMIC_FTRACE 469config DYNAMIC_FTRACE
453 bool "enable/disable ftrace tracepoints dynamically" 470 bool "enable/disable ftrace tracepoints dynamically"
454 depends on FUNCTION_TRACER 471 depends on FUNCTION_TRACER
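The KPROBE_EVENT help text above says events can be added on the fly through the ftrace interface; the control file is the kprobe_events file added later in this patch. A hedged usage sketch; the debugfs mount point, the probed symbol do_sys_open and the argument names are assumptions, and Documentation/trace/kprobetrace.txt remains the authoritative reference:

#include <stdio.h>

int main(void)
{
	const char *ctl = "/sys/kernel/debug/tracing/kprobe_events";
	FILE *f = fopen(ctl, "a");	/* append so existing probes are kept */

	if (!f) {
		perror(ctl);
		return 1;
	}
	/* p[:EVENT] SYMBOL [FETCHARGS]: probe do_sys_open and record its
	 * first two arguments under the names dfd and filename. */
	fputs("p:myprobe do_sys_open dfd=$arg0 filename=$arg1\n", f);
	fclose(f);

	/* The probe then shows up as events/kprobes/myprobe/ and is enabled
	 * and read like any other ftrace event. */
	return 0;
}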
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0f84c52e58fe..cd9ecd89ec77 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
56obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
57obj-$(CONFIG_EVENT_TRACING) += power-traces.o 58obj-$(CONFIG_EVENT_TRACING) += power-traces.o
58 59
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b10c0d90a6ff..7cb6f1922598 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -751,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
751 out: 751 out:
752 mutex_unlock(&ftrace_profile_lock); 752 mutex_unlock(&ftrace_profile_lock);
753 753
754 filp->f_pos += cnt; 754 *ppos += cnt;
755 755
756 return cnt; 756 return cnt;
757} 757}
@@ -2199,15 +2199,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
2199 ret = ftrace_process_regex(parser->buffer, 2199 ret = ftrace_process_regex(parser->buffer,
2200 parser->idx, enable); 2200 parser->idx, enable);
2201 if (ret) 2201 if (ret)
2202 goto out; 2202 goto out_unlock;
2203 2203
2204 trace_parser_clear(parser); 2204 trace_parser_clear(parser);
2205 } 2205 }
2206 2206
2207 ret = read; 2207 ret = read;
2208 2208out_unlock:
2209 mutex_unlock(&ftrace_regex_lock); 2209 mutex_unlock(&ftrace_regex_lock);
2210out: 2210
2211 return ret; 2211 return ret;
2212} 2212}
2213 2213
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e43c928356ee..db223fe8887f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -486,7 +486,7 @@ struct ring_buffer_iter {
486/* Up this if you want to test the TIME_EXTENTS and normalization */ 486/* Up this if you want to test the TIME_EXTENTS and normalization */
487#define DEBUG_SHIFT 0 487#define DEBUG_SHIFT 0
488 488
489static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) 489static inline u64 rb_time_stamp(struct ring_buffer *buffer)
490{ 490{
491 /* shift to debug/test normalization and TIME_EXTENTS */ 491 /* shift to debug/test normalization and TIME_EXTENTS */
492 return buffer->clock() << DEBUG_SHIFT; 492 return buffer->clock() << DEBUG_SHIFT;
@@ -497,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
497 u64 time; 497 u64 time;
498 498
499 preempt_disable_notrace(); 499 preempt_disable_notrace();
500 time = rb_time_stamp(buffer, cpu); 500 time = rb_time_stamp(buffer);
501 preempt_enable_no_resched_notrace(); 501 preempt_enable_no_resched_notrace();
502 502
503 return time; 503 return time;
@@ -602,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list)
602} 602}
603 603
604/* 604/*
605 * rb_is_head_page - test if the give page is the head page 605 * rb_is_head_page - test if the given page is the head page
606 * 606 *
607 * Because the reader may move the head_page pointer, we can 607 * Because the reader may move the head_page pointer, we can
608 * not trust what the head page is (it may be pointing to 608 * not trust what the head page is (it may be pointing to
@@ -1196,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1196 atomic_inc(&cpu_buffer->record_disabled); 1196 atomic_inc(&cpu_buffer->record_disabled);
1197 synchronize_sched(); 1197 synchronize_sched();
1198 1198
1199 spin_lock_irq(&cpu_buffer->reader_lock);
1199 rb_head_page_deactivate(cpu_buffer); 1200 rb_head_page_deactivate(cpu_buffer);
1200 1201
1201 for (i = 0; i < nr_pages; i++) { 1202 for (i = 0; i < nr_pages; i++) {
@@ -1210,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1210 return; 1211 return;
1211 1212
1212 rb_reset_cpu(cpu_buffer); 1213 rb_reset_cpu(cpu_buffer);
1214 spin_unlock_irq(&cpu_buffer->reader_lock);
1213 1215
1214 rb_check_pages(cpu_buffer); 1216 rb_check_pages(cpu_buffer);
1215 1217
@@ -1871,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
1871 * Nested commits always have zero deltas, so 1873 * Nested commits always have zero deltas, so
1872 * just reread the time stamp 1874 * just reread the time stamp
1873 */ 1875 */
1874 *ts = rb_time_stamp(buffer, cpu_buffer->cpu); 1876 *ts = rb_time_stamp(buffer);
1875 next_page->page->time_stamp = *ts; 1877 next_page->page->time_stamp = *ts;
1876 } 1878 }
1877 1879
@@ -2114,7 +2116,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2114 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) 2116 if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
2115 goto out_fail; 2117 goto out_fail;
2116 2118
2117 ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); 2119 ts = rb_time_stamp(cpu_buffer->buffer);
2118 2120
2119 /* 2121 /*
2120 * Only the first commit can update the timestamp. 2122 * Only the first commit can update the timestamp.
@@ -2684,7 +2686,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
2684EXPORT_SYMBOL_GPL(ring_buffer_entries); 2686EXPORT_SYMBOL_GPL(ring_buffer_entries);
2685 2687
2686/** 2688/**
2687 * ring_buffer_overrun_cpu - get the number of overruns in buffer 2689 * ring_buffer_overruns - get the number of overruns in buffer
2688 * @buffer: The ring buffer 2690 * @buffer: The ring buffer
2689 * 2691 *
2690 * Returns the total number of overruns in the ring buffer 2692 * Returns the total number of overruns in the ring buffer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 026e715a0c7a..9d3067a62d43 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
2440 return ret; 2440 return ret;
2441 } 2441 }
2442 2442
2443 filp->f_pos += cnt; 2443 *ppos += cnt;
2444 2444
2445 return cnt; 2445 return cnt;
2446} 2446}
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2582 } 2582 }
2583 mutex_unlock(&trace_types_lock); 2583 mutex_unlock(&trace_types_lock);
2584 2584
2585 filp->f_pos += cnt; 2585 *ppos += cnt;
2586 2586
2587 return cnt; 2587 return cnt;
2588} 2588}
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2764 if (err) 2764 if (err)
2765 return err; 2765 return err;
2766 2766
2767 filp->f_pos += ret; 2767 *ppos += ret;
2768 2768
2769 return ret; 2769 return ret;
2770} 2770}
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3299 } 3299 }
3300 } 3300 }
3301 3301
3302 filp->f_pos += cnt; 3302 *ppos += cnt;
3303 3303
3304 /* If check pages failed, return ENOMEM */ 3304 /* If check pages failed, return ENOMEM */
3305 if (tracing_disabled) 3305 if (tracing_disabled)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index ee00475742eb..4da6ede74401 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -103,6 +103,29 @@ struct syscall_trace_exit {
103 unsigned long ret; 103 unsigned long ret;
104}; 104};
105 105
106struct kprobe_trace_entry {
107 struct trace_entry ent;
108 unsigned long ip;
109 int nargs;
110 unsigned long args[];
111};
112
113#define SIZEOF_KPROBE_TRACE_ENTRY(n) \
114 (offsetof(struct kprobe_trace_entry, args) + \
115 (sizeof(unsigned long) * (n)))
116
117struct kretprobe_trace_entry {
118 struct trace_entry ent;
119 unsigned long func;
120 unsigned long ret_ip;
121 int nargs;
122 unsigned long args[];
123};
124
125#define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \
126 (offsetof(struct kretprobe_trace_entry, args) + \
127 (sizeof(unsigned long) * (n)))
128
106/* 129/*
107 * trace_flag_type is an enumeration that holds different 130 * trace_flag_type is an enumeration that holds different
108 * states when a trace occurs. These are: 131 * states when a trace occurs. These are:
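The kprobe_trace_entry and kretprobe_trace_entry records added above end in a flexible array of argument values, so their on-buffer size has to be computed with offsetof() plus one slot per argument rather than with sizeof(). A standalone sketch of that sizing trick (demo types only, not the kernel structures):

#include <stdio.h>
#include <stddef.h>

struct demo_entry {
	unsigned long ip;
	int nargs;
	unsigned long args[];	/* one slot per probe argument */
};

#define SIZEOF_DEMO_ENTRY(n) \
	(offsetof(struct demo_entry, args) + sizeof(unsigned long) * (n))

int main(void)
{
	/* A probe with three arguments needs the fixed header plus 3 slots. */
	printf("3-arg entry: %zu bytes\n", SIZEOF_DEMO_ENTRY(3));
	return 0;
}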
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 8d5c171cc998..e0d351b01f5a 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -8,44 +8,39 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include "trace.h" 9#include "trace.h"
10 10
11/*
12 * We can't use a size but a type in alloc_percpu()
13 * So let's create a dummy type that matches the desired size
14 */
15typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t;
16 11
17char *trace_profile_buf; 12struct perf_trace_buf *perf_trace_buf;
18EXPORT_SYMBOL_GPL(trace_profile_buf); 13EXPORT_SYMBOL_GPL(perf_trace_buf);
19 14
20char *trace_profile_buf_nmi; 15struct perf_trace_buf *perf_trace_buf_nmi;
21EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); 16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
22 17
23/* Count the events in use (per event id, not per instance) */ 18/* Count the events in use (per event id, not per instance) */
24static int total_profile_count; 19static int total_profile_count;
25 20
26static int ftrace_profile_enable_event(struct ftrace_event_call *event) 21static int ftrace_profile_enable_event(struct ftrace_event_call *event)
27{ 22{
28 char *buf; 23 struct perf_trace_buf *buf;
29 int ret = -ENOMEM; 24 int ret = -ENOMEM;
30 25
31 if (atomic_inc_return(&event->profile_count)) 26 if (atomic_inc_return(&event->profile_count))
32 return 0; 27 return 0;
33 28
34 if (!total_profile_count) { 29 if (!total_profile_count) {
35 buf = (char *)alloc_percpu(profile_buf_t); 30 buf = alloc_percpu(struct perf_trace_buf);
36 if (!buf) 31 if (!buf)
37 goto fail_buf; 32 goto fail_buf;
38 33
39 rcu_assign_pointer(trace_profile_buf, buf); 34 rcu_assign_pointer(perf_trace_buf, buf);
40 35
41 buf = (char *)alloc_percpu(profile_buf_t); 36 buf = alloc_percpu(struct perf_trace_buf);
42 if (!buf) 37 if (!buf)
43 goto fail_buf_nmi; 38 goto fail_buf_nmi;
44 39
45 rcu_assign_pointer(trace_profile_buf_nmi, buf); 40 rcu_assign_pointer(perf_trace_buf_nmi, buf);
46 } 41 }
47 42
48 ret = event->profile_enable(); 43 ret = event->profile_enable(event);
49 if (!ret) { 44 if (!ret) {
50 total_profile_count++; 45 total_profile_count++;
51 return 0; 46 return 0;
@@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
53 48
54fail_buf_nmi: 49fail_buf_nmi:
55 if (!total_profile_count) { 50 if (!total_profile_count) {
56 free_percpu(trace_profile_buf_nmi); 51 free_percpu(perf_trace_buf_nmi);
57 free_percpu(trace_profile_buf); 52 free_percpu(perf_trace_buf);
58 trace_profile_buf_nmi = NULL; 53 perf_trace_buf_nmi = NULL;
59 trace_profile_buf = NULL; 54 perf_trace_buf = NULL;
60 } 55 }
61fail_buf: 56fail_buf:
62 atomic_dec(&event->profile_count); 57 atomic_dec(&event->profile_count);
@@ -84,19 +79,19 @@ int ftrace_profile_enable(int event_id)
84 79
85static void ftrace_profile_disable_event(struct ftrace_event_call *event) 80static void ftrace_profile_disable_event(struct ftrace_event_call *event)
86{ 81{
87 char *buf, *nmi_buf; 82 struct perf_trace_buf *buf, *nmi_buf;
88 83
89 if (!atomic_add_negative(-1, &event->profile_count)) 84 if (!atomic_add_negative(-1, &event->profile_count))
90 return; 85 return;
91 86
92 event->profile_disable(); 87 event->profile_disable(event);
93 88
94 if (!--total_profile_count) { 89 if (!--total_profile_count) {
95 buf = trace_profile_buf; 90 buf = perf_trace_buf;
96 rcu_assign_pointer(trace_profile_buf, NULL); 91 rcu_assign_pointer(perf_trace_buf, NULL);
97 92
98 nmi_buf = trace_profile_buf_nmi; 93 nmi_buf = perf_trace_buf_nmi;
99 rcu_assign_pointer(trace_profile_buf_nmi, NULL); 94 rcu_assign_pointer(perf_trace_buf_nmi, NULL);
100 95
101 /* 96 /*
102 * Ensure every events in profiling have finished before 97 * Ensure every events in profiling have finished before
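The enable/disable paths above follow a first-user-allocates, last-user-frees pattern: the shared per-cpu buffers are allocated only when total_profile_count goes from 0 to 1 and freed when it drops back to 0. A minimal userspace sketch of that refcounting shape, with malloc() standing in for alloc_percpu() purely to keep the example runnable:

#include <stdio.h>
#include <stdlib.h>

static void *shared_buf;
static int total_count;

static int profile_enable(void)
{
	if (!total_count) {
		shared_buf = malloc(4096);	/* stands in for alloc_percpu() */
		if (!shared_buf)
			return -1;
	}
	total_count++;
	return 0;
}

static void profile_disable(void)
{
	if (--total_count == 0) {
		free(shared_buf);		/* stands in for free_percpu() */
		shared_buf = NULL;
	}
}

int main(void)
{
	profile_enable();
	profile_enable();
	profile_disable();
	profile_disable();
	printf("buffer freed: %s\n", shared_buf ? "no" : "yes");
	return 0;
}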
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7c18d154ea28..1d18315dc836 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)
93} 93}
94EXPORT_SYMBOL_GPL(trace_define_common_fields); 94EXPORT_SYMBOL_GPL(trace_define_common_fields);
95 95
96#ifdef CONFIG_MODULES 96void trace_destroy_fields(struct ftrace_event_call *call)
97
98static void trace_destroy_fields(struct ftrace_event_call *call)
99{ 97{
100 struct ftrace_event_field *field, *next; 98 struct ftrace_event_field *field, *next;
101 99
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
107 } 105 }
108} 106}
109 107
110#endif /* CONFIG_MODULES */
111
112static void ftrace_event_enable_disable(struct ftrace_event_call *call, 108static void ftrace_event_enable_disable(struct ftrace_event_call *call,
113 int enable) 109 int enable)
114{ 110{
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
117 if (call->enabled) { 113 if (call->enabled) {
118 call->enabled = 0; 114 call->enabled = 0;
119 tracing_stop_cmdline_record(); 115 tracing_stop_cmdline_record();
120 call->unregfunc(call->data); 116 call->unregfunc(call);
121 } 117 }
122 break; 118 break;
123 case 1: 119 case 1:
124 if (!call->enabled) { 120 if (!call->enabled) {
125 call->enabled = 1; 121 call->enabled = 1;
126 tracing_start_cmdline_record(); 122 tracing_start_cmdline_record();
127 call->regfunc(call->data); 123 call->regfunc(call);
128 } 124 }
129 break; 125 break;
130 } 126 }
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
937 return 0; 933 return 0;
938} 934}
939 935
940#define for_each_event(event, start, end) \ 936static int __trace_add_event_call(struct ftrace_event_call *call)
941 for (event = start; \ 937{
942 (unsigned long)event < (unsigned long)end; \ 938 struct dentry *d_events;
943 event++) 939 int ret;
944 940
945#ifdef CONFIG_MODULES 941 if (!call->name)
942 return -EINVAL;
946 943
947static LIST_HEAD(ftrace_module_file_list); 944 if (call->raw_init) {
945 ret = call->raw_init(call);
946 if (ret < 0) {
947 if (ret != -ENOSYS)
948 pr_warning("Could not initialize trace "
949 "events/%s\n", call->name);
950 return ret;
951 }
952 }
948 953
949/* 954 d_events = event_trace_events_dir();
950 * Modules must own their file_operations to keep up with 955 if (!d_events)
951 * reference counting. 956 return -ENOENT;
952 */ 957
953struct ftrace_module_file_ops { 958 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
954 struct list_head list; 959 &ftrace_enable_fops, &ftrace_event_filter_fops,
955 struct module *mod; 960 &ftrace_event_format_fops);
956 struct file_operations id; 961 if (!ret)
957 struct file_operations enable; 962 list_add(&call->list, &ftrace_events);
958 struct file_operations format; 963
959 struct file_operations filter; 964 return ret;
960}; 965}
966
967/* Add an additional event_call dynamically */
968int trace_add_event_call(struct ftrace_event_call *call)
969{
970 int ret;
971 mutex_lock(&event_mutex);
972 ret = __trace_add_event_call(call);
973 mutex_unlock(&event_mutex);
974 return ret;
975}
961 976
962static void remove_subsystem_dir(const char *name) 977static void remove_subsystem_dir(const char *name)
963{ 978{
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name)
985 } 1000 }
986} 1001}
987 1002
1003/*
 1004 * Must be called with both event_mutex and trace_event_mutex held.
1005 */
1006static void __trace_remove_event_call(struct ftrace_event_call *call)
1007{
1008 ftrace_event_enable_disable(call, 0);
1009 if (call->event)
1010 __unregister_ftrace_event(call->event);
1011 debugfs_remove_recursive(call->dir);
1012 list_del(&call->list);
1013 trace_destroy_fields(call);
1014 destroy_preds(call);
1015 remove_subsystem_dir(call->system);
1016}
1017
1018/* Remove an event_call */
1019void trace_remove_event_call(struct ftrace_event_call *call)
1020{
1021 mutex_lock(&event_mutex);
1022 down_write(&trace_event_mutex);
1023 __trace_remove_event_call(call);
1024 up_write(&trace_event_mutex);
1025 mutex_unlock(&event_mutex);
1026}
1027
1028#define for_each_event(event, start, end) \
1029 for (event = start; \
1030 (unsigned long)event < (unsigned long)end; \
1031 event++)
1032
1033#ifdef CONFIG_MODULES
1034
1035static LIST_HEAD(ftrace_module_file_list);
1036
1037/*
1038 * Modules must own their file_operations to keep up with
1039 * reference counting.
1040 */
1041struct ftrace_module_file_ops {
1042 struct list_head list;
1043 struct module *mod;
1044 struct file_operations id;
1045 struct file_operations enable;
1046 struct file_operations format;
1047 struct file_operations filter;
1048};
1049
988static struct ftrace_module_file_ops * 1050static struct ftrace_module_file_ops *
989trace_create_file_ops(struct module *mod) 1051trace_create_file_ops(struct module *mod)
990{ 1052{
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod)
1042 if (!call->name) 1104 if (!call->name)
1043 continue; 1105 continue;
1044 if (call->raw_init) { 1106 if (call->raw_init) {
1045 ret = call->raw_init(); 1107 ret = call->raw_init(call);
1046 if (ret < 0) { 1108 if (ret < 0) {
1047 if (ret != -ENOSYS) 1109 if (ret != -ENOSYS)
1048 pr_warning("Could not initialize trace " 1110 pr_warning("Could not initialize trace "
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod)
1060 return; 1122 return;
1061 } 1123 }
1062 call->mod = mod; 1124 call->mod = mod;
1063 list_add(&call->list, &ftrace_events); 1125 ret = event_create_dir(call, d_events,
1064 event_create_dir(call, d_events, 1126 &file_ops->id, &file_ops->enable,
1065 &file_ops->id, &file_ops->enable, 1127 &file_ops->filter, &file_ops->format);
1066 &file_ops->filter, &file_ops->format); 1128 if (!ret)
1129 list_add(&call->list, &ftrace_events);
1067 } 1130 }
1068} 1131}
1069 1132
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod)
1077 list_for_each_entry_safe(call, p, &ftrace_events, list) { 1140 list_for_each_entry_safe(call, p, &ftrace_events, list) {
1078 if (call->mod == mod) { 1141 if (call->mod == mod) {
1079 found = true; 1142 found = true;
1080 ftrace_event_enable_disable(call, 0); 1143 __trace_remove_event_call(call);
1081 if (call->event)
1082 __unregister_ftrace_event(call->event);
1083 debugfs_remove_recursive(call->dir);
1084 list_del(&call->list);
1085 trace_destroy_fields(call);
1086 destroy_preds(call);
1087 remove_subsystem_dir(call->system);
1088 } 1144 }
1089 } 1145 }
1090 1146
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void)
1202 if (!call->name) 1258 if (!call->name)
1203 continue; 1259 continue;
1204 if (call->raw_init) { 1260 if (call->raw_init) {
1205 ret = call->raw_init(); 1261 ret = call->raw_init(call);
1206 if (ret < 0) { 1262 if (ret < 0) {
1207 if (ret != -ENOSYS) 1263 if (ret != -ENOSYS)
1208 pr_warning("Could not initialize trace " 1264 pr_warning("Could not initialize trace "
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void)
1210 continue; 1266 continue;
1211 } 1267 }
1212 } 1268 }
1213 list_add(&call->list, &ftrace_events); 1269 ret = event_create_dir(call, d_events, &ftrace_event_id_fops,
1214 event_create_dir(call, d_events, &ftrace_event_id_fops, 1270 &ftrace_enable_fops,
1215 &ftrace_enable_fops, &ftrace_event_filter_fops, 1271 &ftrace_event_filter_fops,
1216 &ftrace_event_format_fops); 1272 &ftrace_event_format_fops);
1273 if (!ret)
1274 list_add(&call->list, &ftrace_events);
1217 } 1275 }
1218 1276
1219 while (true) { 1277 while (true) {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 21d34757b955..50504cb228de 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1230,12 +1230,12 @@ static int replace_system_preds(struct event_subsystem *system,
1230 struct filter_parse_state *ps, 1230 struct filter_parse_state *ps,
1231 char *filter_string) 1231 char *filter_string)
1232{ 1232{
1233 struct event_filter *filter = system->filter;
1234 struct ftrace_event_call *call; 1233 struct ftrace_event_call *call;
1235 bool fail = true; 1234 bool fail = true;
1236 int err; 1235 int err;
1237 1236
1238 list_for_each_entry(call, &ftrace_events, list) { 1237 list_for_each_entry(call, &ftrace_events, list) {
1238 struct event_filter *filter = call->filter;
1239 1239
1240 if (!call->define_fields) 1240 if (!call->define_fields)
1241 continue; 1241 continue;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 31da218ee10f..934d81fb4ca4 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -134,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \
134 134
135#include "trace_entries.h" 135#include "trace_entries.h"
136 136
137
138#undef __field 137#undef __field
139#define __field(type, item) \ 138#define __field(type, item) \
140 ret = trace_define_field(event_call, #type, #item, \ 139 ret = trace_define_field(event_call, #type, #item, \
@@ -196,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
196 195
197#include "trace_entries.h" 196#include "trace_entries.h"
198 197
198static int ftrace_raw_init_event(struct ftrace_event_call *call)
199{
200 INIT_LIST_HEAD(&call->fields);
201 return 0;
202}
199 203
200#undef __field 204#undef __field
201#define __field(type, item) 205#define __field(type, item)
@@ -214,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
214 218
215#undef FTRACE_ENTRY 219#undef FTRACE_ENTRY
216#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ 220#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \
217static int ftrace_raw_init_event_##call(void); \
218 \ 221 \
219struct ftrace_event_call __used \ 222struct ftrace_event_call __used \
220__attribute__((__aligned__(4))) \ 223__attribute__((__aligned__(4))) \
@@ -222,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \
222 .name = #call, \ 225 .name = #call, \
223 .id = type, \ 226 .id = type, \
224 .system = __stringify(TRACE_SYSTEM), \ 227 .system = __stringify(TRACE_SYSTEM), \
225 .raw_init = ftrace_raw_init_event_##call, \ 228 .raw_init = ftrace_raw_init_event, \
226 .show_format = ftrace_format_##call, \ 229 .show_format = ftrace_format_##call, \
227 .define_fields = ftrace_define_fields_##call, \ 230 .define_fields = ftrace_define_fields_##call, \
228}; \ 231}; \
229static int ftrace_raw_init_event_##call(void) \
230{ \
231 INIT_LIST_HEAD(&event_##call.fields); \
232 return 0; \
233} \
234 232
235#include "trace_entries.h" 233#include "trace_entries.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
new file mode 100644
index 000000000000..3696476f307d
--- /dev/null
+++ b/kernel/trace/trace_kprobe.c
@@ -0,0 +1,1513 @@
1/*
2 * Kprobes-based tracing events
3 *
4 * Created by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#include <linux/module.h>
21#include <linux/uaccess.h>
22#include <linux/kprobes.h>
23#include <linux/seq_file.h>
24#include <linux/slab.h>
25#include <linux/smp.h>
26#include <linux/debugfs.h>
27#include <linux/types.h>
28#include <linux/string.h>
29#include <linux/ctype.h>
30#include <linux/ptrace.h>
31#include <linux/perf_event.h>
32
33#include "trace.h"
34#include "trace_output.h"
35
36#define MAX_TRACE_ARGS 128
37#define MAX_ARGSTR_LEN 63
38#define MAX_EVENT_NAME_LEN 64
39#define KPROBE_EVENT_SYSTEM "kprobes"
40
41/* Reserved field names */
42#define FIELD_STRING_IP "__probe_ip"
43#define FIELD_STRING_NARGS "__probe_nargs"
44#define FIELD_STRING_RETIP "__probe_ret_ip"
45#define FIELD_STRING_FUNC "__probe_func"
46
47const char *reserved_field_names[] = {
48 "common_type",
49 "common_flags",
50 "common_preempt_count",
51 "common_pid",
52 "common_tgid",
53 "common_lock_depth",
54 FIELD_STRING_IP,
55 FIELD_STRING_NARGS,
56 FIELD_STRING_RETIP,
57 FIELD_STRING_FUNC,
58};
59
60struct fetch_func {
61 unsigned long (*func)(struct pt_regs *, void *);
62 void *data;
63};
64
65static __kprobes unsigned long call_fetch(struct fetch_func *f,
66 struct pt_regs *regs)
67{
68 return f->func(regs, f->data);
69}
70
71/* fetch handlers */
72static __kprobes unsigned long fetch_register(struct pt_regs *regs,
73 void *offset)
74{
75 return regs_get_register(regs, (unsigned int)((unsigned long)offset));
76}
77
78static __kprobes unsigned long fetch_stack(struct pt_regs *regs,
79 void *num)
80{
81 return regs_get_kernel_stack_nth(regs,
82 (unsigned int)((unsigned long)num));
83}
84
85static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
86{
87 unsigned long retval;
88
89 if (probe_kernel_address(addr, retval))
90 return 0;
91 return retval;
92}
93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy)
101{
102 return regs_return_value(regs);
103}
104
105static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs,
106 void *dummy)
107{
108 return kernel_stack_pointer(regs);
109}
110
111/* Memory fetching by symbol */
112struct symbol_cache {
113 char *symbol;
114 long offset;
115 unsigned long addr;
116};
117
118static unsigned long update_symbol_cache(struct symbol_cache *sc)
119{
120 sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol);
121 if (sc->addr)
122 sc->addr += sc->offset;
123 return sc->addr;
124}
125
126static void free_symbol_cache(struct symbol_cache *sc)
127{
128 kfree(sc->symbol);
129 kfree(sc);
130}
131
132static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
133{
134 struct symbol_cache *sc;
135
136 if (!sym || strlen(sym) == 0)
137 return NULL;
138 sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL);
139 if (!sc)
140 return NULL;
141
142 sc->symbol = kstrdup(sym, GFP_KERNEL);
143 if (!sc->symbol) {
144 kfree(sc);
145 return NULL;
146 }
147 sc->offset = offset;
148
149 update_symbol_cache(sc);
150 return sc;
151}
152
153static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data)
154{
155 struct symbol_cache *sc = data;
156
157 if (sc->addr)
158 return fetch_memory(regs, (void *)sc->addr);
159 else
160 return 0;
161}
162
163/* Special indirect memory access interface */
164struct indirect_fetch_data {
165 struct fetch_func orig;
166 long offset;
167};
168
169static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data)
170{
171 struct indirect_fetch_data *ind = data;
172 unsigned long addr;
173
174 addr = call_fetch(&ind->orig, regs);
175 if (addr) {
176 addr += ind->offset;
177 return fetch_memory(regs, (void *)addr);
178 } else
179 return 0;
180}
181
182static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data)
183{
184 if (data->orig.func == fetch_indirect)
185 free_indirect_fetch_data(data->orig.data);
186 else if (data->orig.func == fetch_symbol)
187 free_symbol_cache(data->orig.data);
188 kfree(data);
189}
190
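The fetch handlers above all share one shape: a fetch_func is a function pointer plus opaque data, and fetch_indirect wraps another fetch_func, so argument sources compose. A userspace sketch of that composition; unlike the kernel code it does not dereference the computed address, it only adds the offset, purely so the example stays runnable:

#include <stdio.h>

struct fetch_func {
	unsigned long (*func)(void *data);
	void *data;
};

static unsigned long call_fetch(struct fetch_func *f)
{
	return f->func(f->data);
}

/* A leaf fetch: returns a constant, standing in for a register read. */
static unsigned long fetch_const(void *data)
{
	return (unsigned long)data;
}

struct indirect_data {
	struct fetch_func orig;
	long offset;
};

/* A wrapping fetch: evaluates the inner fetch and applies an offset. */
static unsigned long fetch_indirect(void *data)
{
	struct indirect_data *ind = data;

	return call_fetch(&ind->orig) + ind->offset;
}

int main(void)
{
	struct indirect_data ind = {
		.orig   = { fetch_const, (void *)100UL },
		.offset = 8,
	};
	struct fetch_func f = { fetch_indirect, &ind };

	printf("%lu\n", call_fetch(&f));	/* prints 108 */
	return 0;
}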
191/**
192 * Kprobe event core functions
193 */
194
195struct probe_arg {
196 struct fetch_func fetch;
197 const char *name;
198};
199
200/* Flags for trace_probe */
201#define TP_FLAG_TRACE 1
202#define TP_FLAG_PROFILE 2
203
204struct trace_probe {
205 struct list_head list;
206 struct kretprobe rp; /* Use rp.kp for kprobe use */
207 unsigned long nhit;
208 unsigned int flags; /* For TP_FLAG_* */
209 const char *symbol; /* symbol name */
210 struct ftrace_event_call call;
211 struct trace_event event;
212 unsigned int nr_args;
213 struct probe_arg args[];
214};
215
216#define SIZEOF_TRACE_PROBE(n) \
217 (offsetof(struct trace_probe, args) + \
218 (sizeof(struct probe_arg) * (n)))
219
220static __kprobes int probe_is_return(struct trace_probe *tp)
221{
222 return tp->rp.handler != NULL;
223}
224
225static __kprobes const char *probe_symbol(struct trace_probe *tp)
226{
227 return tp->symbol ? tp->symbol : "unknown";
228}
229
230static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{
232 int ret = -EINVAL;
233
234 if (ff->func == fetch_argument)
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name);
240 } else if (ff->func == fetch_stack)
241 ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data);
242 else if (ff->func == fetch_memory)
243 ret = snprintf(buf, n, "@0x%p", ff->data);
244 else if (ff->func == fetch_symbol) {
245 struct symbol_cache *sc = ff->data;
246 ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset);
247 } else if (ff->func == fetch_retvalue)
248 ret = snprintf(buf, n, "$retval");
249 else if (ff->func == fetch_stack_address)
250 ret = snprintf(buf, n, "$stack");
251 else if (ff->func == fetch_indirect) {
252 struct indirect_fetch_data *id = ff->data;
253 size_t l = 0;
254 ret = snprintf(buf, n, "%+ld(", id->offset);
255 if (ret >= n)
256 goto end;
257 l += ret;
258 ret = probe_arg_string(buf + l, n - l, &id->orig);
259 if (ret < 0)
260 goto end;
261 l += ret;
262 ret = snprintf(buf + l, n - l, ")");
263 ret += l;
264 }
265end:
266 if (ret >= n)
267 return -ENOSPC;
268 return ret;
269}
270
271static int register_probe_event(struct trace_probe *tp);
272static void unregister_probe_event(struct trace_probe *tp);
273
274static DEFINE_MUTEX(probe_lock);
275static LIST_HEAD(probe_list);
276
277static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs);
278static int kretprobe_dispatcher(struct kretprobe_instance *ri,
279 struct pt_regs *regs);
280
281/*
282 * Allocate new trace_probe and initialize it (including kprobes).
283 */
284static struct trace_probe *alloc_trace_probe(const char *group,
285 const char *event,
286 void *addr,
287 const char *symbol,
288 unsigned long offs,
289 int nargs, int is_return)
290{
291 struct trace_probe *tp;
292
293 tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL);
294 if (!tp)
295 return ERR_PTR(-ENOMEM);
296
297 if (symbol) {
298 tp->symbol = kstrdup(symbol, GFP_KERNEL);
299 if (!tp->symbol)
300 goto error;
301 tp->rp.kp.symbol_name = tp->symbol;
302 tp->rp.kp.offset = offs;
303 } else
304 tp->rp.kp.addr = addr;
305
306 if (is_return)
307 tp->rp.handler = kretprobe_dispatcher;
308 else
309 tp->rp.kp.pre_handler = kprobe_dispatcher;
310
311 if (!event)
312 goto error;
313 tp->call.name = kstrdup(event, GFP_KERNEL);
314 if (!tp->call.name)
315 goto error;
316
317 if (!group)
318 goto error;
319 tp->call.system = kstrdup(group, GFP_KERNEL);
320 if (!tp->call.system)
321 goto error;
322
323 INIT_LIST_HEAD(&tp->list);
324 return tp;
325error:
326 kfree(tp->call.name);
327 kfree(tp->symbol);
328 kfree(tp);
329 return ERR_PTR(-ENOMEM);
330}
331
332static void free_probe_arg(struct probe_arg *arg)
333{
334 if (arg->fetch.func == fetch_symbol)
335 free_symbol_cache(arg->fetch.data);
336 else if (arg->fetch.func == fetch_indirect)
337 free_indirect_fetch_data(arg->fetch.data);
338 kfree(arg->name);
339}
340
341static void free_trace_probe(struct trace_probe *tp)
342{
343 int i;
344
345 for (i = 0; i < tp->nr_args; i++)
346 free_probe_arg(&tp->args[i]);
347
348 kfree(tp->call.system);
349 kfree(tp->call.name);
350 kfree(tp->symbol);
351 kfree(tp);
352}
353
354static struct trace_probe *find_probe_event(const char *event,
355 const char *group)
356{
357 struct trace_probe *tp;
358
359 list_for_each_entry(tp, &probe_list, list)
360 if (strcmp(tp->call.name, event) == 0 &&
361 strcmp(tp->call.system, group) == 0)
362 return tp;
363 return NULL;
364}
365
366/* Unregister a trace_probe and probe_event: call with locking probe_lock */
367static void unregister_trace_probe(struct trace_probe *tp)
368{
369 if (probe_is_return(tp))
370 unregister_kretprobe(&tp->rp);
371 else
372 unregister_kprobe(&tp->rp.kp);
373 list_del(&tp->list);
374 unregister_probe_event(tp);
375}
376
377/* Register a trace_probe and probe_event */
378static int register_trace_probe(struct trace_probe *tp)
379{
380 struct trace_probe *old_tp;
381 int ret;
382
383 mutex_lock(&probe_lock);
384
385 /* register as an event */
386 old_tp = find_probe_event(tp->call.name, tp->call.system);
387 if (old_tp) {
388 /* delete old event */
389 unregister_trace_probe(old_tp);
390 free_trace_probe(old_tp);
391 }
392 ret = register_probe_event(tp);
393 if (ret) {
 394		pr_warning("Failed to register probe event(%d)\n", ret);
395 goto end;
396 }
397
398 tp->rp.kp.flags |= KPROBE_FLAG_DISABLED;
399 if (probe_is_return(tp))
400 ret = register_kretprobe(&tp->rp);
401 else
402 ret = register_kprobe(&tp->rp.kp);
403
404 if (ret) {
405 pr_warning("Could not insert probe(%d)\n", ret);
406 if (ret == -EILSEQ) {
407 pr_warning("Probing address(0x%p) is not an "
408 "instruction boundary.\n",
409 tp->rp.kp.addr);
410 ret = -EINVAL;
411 }
412 unregister_probe_event(tp);
413 } else
414 list_add_tail(&tp->list, &probe_list);
415end:
416 mutex_unlock(&probe_lock);
417 return ret;
418}
419
420/* Split symbol and offset. */
421static int split_symbol_offset(char *symbol, unsigned long *offset)
422{
423 char *tmp;
424 int ret;
425
426 if (!offset)
427 return -EINVAL;
428
429 tmp = strchr(symbol, '+');
430 if (tmp) {
431 /* skip sign because strict_strtol doesn't accept '+' */
432 ret = strict_strtoul(tmp + 1, 0, offset);
433 if (ret)
434 return ret;
435 *tmp = '\0';
436 } else
437 *offset = 0;
438 return 0;
439}
440
441#define PARAM_MAX_ARGS 16
442#define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long))
443
444static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
445{
446 int ret = 0;
447 unsigned long param;
448
449 if (strcmp(arg, "retval") == 0) {
450 if (is_return) {
451 ff->func = fetch_retvalue;
452 ff->data = NULL;
453 } else
454 ret = -EINVAL;
455 } else if (strncmp(arg, "stack", 5) == 0) {
456 if (arg[5] == '\0') {
457 ff->func = fetch_stack_address;
458 ff->data = NULL;
459 } else if (isdigit(arg[5])) {
460 ret = strict_strtoul(arg + 5, 10, &param);
461 if (ret || param > PARAM_MAX_STACK)
462 ret = -EINVAL;
463 else {
464 ff->func = fetch_stack;
465 ff->data = (void *)param;
466 }
467 } else
468 ret = -EINVAL;
469 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
470 ret = strict_strtoul(arg + 3, 10, &param);
471 if (ret || param > PARAM_MAX_ARGS)
472 ret = -EINVAL;
473 else {
474 ff->func = fetch_argument;
475 ff->data = (void *)param;
476 }
477 } else
478 ret = -EINVAL;
479 return ret;
480}
481
482static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return)
483{
484 int ret = 0;
485 unsigned long param;
486 long offset;
487 char *tmp;
488
489 switch (arg[0]) {
490 case '$':
491 ret = parse_probe_vars(arg + 1, ff, is_return);
492 break;
493 case '%': /* named register */
494 ret = regs_query_register_offset(arg + 1);
495 if (ret >= 0) {
496 ff->func = fetch_register;
497 ff->data = (void *)(unsigned long)ret;
498 ret = 0;
499 }
500 break;
501 case '@': /* memory or symbol */
502 if (isdigit(arg[1])) {
503 ret = strict_strtoul(arg + 1, 0, &param);
504 if (ret)
505 break;
506 ff->func = fetch_memory;
507 ff->data = (void *)param;
508 } else {
509 ret = split_symbol_offset(arg + 1, &offset);
510 if (ret)
511 break;
512 ff->data = alloc_symbol_cache(arg + 1, offset);
513 if (ff->data)
514 ff->func = fetch_symbol;
515 else
516 ret = -EINVAL;
517 }
518 break;
519 case '+': /* indirect memory */
520 case '-':
521 tmp = strchr(arg, '(');
522 if (!tmp) {
523 ret = -EINVAL;
524 break;
525 }
526 *tmp = '\0';
527 ret = strict_strtol(arg + 1, 0, &offset);
528 if (ret)
529 break;
530 if (arg[0] == '-')
531 offset = -offset;
532 arg = tmp + 1;
533 tmp = strrchr(arg, ')');
534 if (tmp) {
535 struct indirect_fetch_data *id;
536 *tmp = '\0';
537 id = kzalloc(sizeof(struct indirect_fetch_data),
538 GFP_KERNEL);
539 if (!id)
540 return -ENOMEM;
541 id->offset = offset;
542 ret = parse_probe_arg(arg, &id->orig, is_return);
543 if (ret)
544 kfree(id);
545 else {
546 ff->func = fetch_indirect;
547 ff->data = (void *)id;
548 }
549 } else
550 ret = -EINVAL;
551 break;
552 default:
553 /* TODO: support custom handler */
554 ret = -EINVAL;
555 }
556 return ret;
557}
558
559/* Return 1 if name is reserved or already used by another argument */
560static int conflict_field_name(const char *name,
561 struct probe_arg *args, int narg)
562{
563 int i;
564 for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++)
565 if (strcmp(reserved_field_names[i], name) == 0)
566 return 1;
567 for (i = 0; i < narg; i++)
568 if (strcmp(args[i].name, name) == 0)
569 return 1;
570 return 0;
571}
572
573static int create_trace_probe(int argc, char **argv)
574{
575 /*
576 * Argument syntax:
577 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
578 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
579 * Fetch args:
580 * $argN : fetch Nth of function argument. (N:0-)
581 * $retval : fetch return value
582 * $stack : fetch stack address
583 * $stackN : fetch Nth of stack (N:0-)
584 * @ADDR : fetch memory at ADDR (ADDR should be in kernel)
585 * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol)
586 * %REG : fetch register REG
587 * Indirect memory fetch:
588 * +|-offs(ARG) : fetch memory at ARG +|- offs address.
589 * Alias name of args:
590 * NAME=FETCHARG : set NAME as alias of FETCHARG.
591 */
592 struct trace_probe *tp;
593 int i, ret = 0;
594 int is_return = 0;
595 char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL;
596 unsigned long offset = 0;
597 void *addr = NULL;
598 char buf[MAX_EVENT_NAME_LEN];
599
600 if (argc < 2) {
601 pr_info("Probe point is not specified.\n");
602 return -EINVAL;
603 }
604
605 if (argv[0][0] == 'p')
606 is_return = 0;
607 else if (argv[0][0] == 'r')
608 is_return = 1;
609 else {
610 pr_info("Probe definition must be started with 'p' or 'r'.\n");
611 return -EINVAL;
612 }
613
614 if (argv[0][1] == ':') {
615 event = &argv[0][2];
616 if (strchr(event, '/')) {
617 group = event;
618 event = strchr(group, '/') + 1;
619 event[-1] = '\0';
620 if (strlen(group) == 0) {
 621				pr_info("Group name is not specified\n");
622 return -EINVAL;
623 }
624 }
625 if (strlen(event) == 0) {
 626			pr_info("Event name is not specified\n");
627 return -EINVAL;
628 }
629 }
630
631 if (isdigit(argv[1][0])) {
632 if (is_return) {
633 pr_info("Return probe point must be a symbol.\n");
634 return -EINVAL;
635 }
636 /* an address specified */
 637		ret = strict_strtoul(argv[1], 0, (unsigned long *)&addr);
638 if (ret) {
639 pr_info("Failed to parse address.\n");
640 return ret;
641 }
642 } else {
643 /* a symbol specified */
644 symbol = argv[1];
645 /* TODO: support .init module functions */
646 ret = split_symbol_offset(symbol, &offset);
647 if (ret) {
648 pr_info("Failed to parse symbol.\n");
649 return ret;
650 }
651 if (offset && is_return) {
652 pr_info("Return probe must be used without offset.\n");
653 return -EINVAL;
654 }
655 }
656 argc -= 2; argv += 2;
657
658 /* setup a probe */
659 if (!group)
660 group = KPROBE_EVENT_SYSTEM;
661 if (!event) {
662 /* Make a new event name */
663 if (symbol)
664 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld",
665 is_return ? 'r' : 'p', symbol, offset);
666 else
667 snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p",
668 is_return ? 'r' : 'p', addr);
669 event = buf;
670 }
671 tp = alloc_trace_probe(group, event, addr, symbol, offset, argc,
672 is_return);
673 if (IS_ERR(tp)) {
674 pr_info("Failed to allocate trace_probe.(%d)\n",
675 (int)PTR_ERR(tp));
676 return PTR_ERR(tp);
677 }
678
679 /* parse arguments */
680 ret = 0;
681 for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) {
682 /* Parse argument name */
683 arg = strchr(argv[i], '=');
684 if (arg)
685 *arg++ = '\0';
686 else
687 arg = argv[i];
688
689 if (conflict_field_name(argv[i], tp->args, i)) {
690 pr_info("Argument%d name '%s' conflicts with "
691 "another field.\n", i, argv[i]);
692 ret = -EINVAL;
693 goto error;
694 }
695
696 tp->args[i].name = kstrdup(argv[i], GFP_KERNEL);
697
698 /* Parse fetch argument */
699 if (strlen(arg) > MAX_ARGSTR_LEN) {
700 pr_info("Argument%d(%s) is too long.\n", i, arg);
701 ret = -ENOSPC;
702 goto error;
703 }
704 ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return);
705 if (ret) {
706 pr_info("Parse error at argument%d. (%d)\n", i, ret);
707 goto error;
708 }
709 }
710 tp->nr_args = i;
711
712 ret = register_trace_probe(tp);
713 if (ret)
714 goto error;
715 return 0;
716
717error:
718 free_trace_probe(tp);
719 return ret;
720}
721
722static void cleanup_all_probes(void)
723{
724 struct trace_probe *tp;
725
726 mutex_lock(&probe_lock);
727 /* TODO: Use batch unregistration */
728 while (!list_empty(&probe_list)) {
729 tp = list_entry(probe_list.next, struct trace_probe, list);
730 unregister_trace_probe(tp);
731 free_trace_probe(tp);
732 }
733 mutex_unlock(&probe_lock);
734}
735
736
737/* Probes listing interfaces */
738static void *probes_seq_start(struct seq_file *m, loff_t *pos)
739{
740 mutex_lock(&probe_lock);
741 return seq_list_start(&probe_list, *pos);
742}
743
744static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos)
745{
746 return seq_list_next(v, &probe_list, pos);
747}
748
749static void probes_seq_stop(struct seq_file *m, void *v)
750{
751 mutex_unlock(&probe_lock);
752}
753
754static int probes_seq_show(struct seq_file *m, void *v)
755{
756 struct trace_probe *tp = v;
757 int i, ret;
758 char buf[MAX_ARGSTR_LEN + 1];
759
760 seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p');
761 seq_printf(m, ":%s", tp->call.name);
762
763 if (tp->symbol)
764 seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset);
765 else
766 seq_printf(m, " 0x%p", tp->rp.kp.addr);
767
768 for (i = 0; i < tp->nr_args; i++) {
769 ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch);
770 if (ret < 0) {
771 pr_warning("Argument%d decoding error(%d).\n", i, ret);
772 return ret;
773 }
774 seq_printf(m, " %s=%s", tp->args[i].name, buf);
775 }
776 seq_printf(m, "\n");
777 return 0;
778}
779
780static const struct seq_operations probes_seq_op = {
781 .start = probes_seq_start,
782 .next = probes_seq_next,
783 .stop = probes_seq_stop,
784 .show = probes_seq_show
785};
786
787static int probes_open(struct inode *inode, struct file *file)
788{
789 if ((file->f_mode & FMODE_WRITE) &&
790 (file->f_flags & O_TRUNC))
791 cleanup_all_probes();
792
793 return seq_open(file, &probes_seq_op);
794}
795
796static int command_trace_probe(const char *buf)
797{
798 char **argv;
799 int argc = 0, ret = 0;
800
801 argv = argv_split(GFP_KERNEL, buf, &argc);
802 if (!argv)
803 return -ENOMEM;
804
805 if (argc)
806 ret = create_trace_probe(argc, argv);
807
808 argv_free(argv);
809 return ret;
810}
811
812#define WRITE_BUFSIZE 128
813
814static ssize_t probes_write(struct file *file, const char __user *buffer,
815 size_t count, loff_t *ppos)
816{
817 char *kbuf, *tmp;
818 int ret;
819 size_t done;
820 size_t size;
821
822 kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL);
823 if (!kbuf)
824 return -ENOMEM;
825
826 ret = done = 0;
827 while (done < count) {
828 size = count - done;
829 if (size >= WRITE_BUFSIZE)
830 size = WRITE_BUFSIZE - 1;
831 if (copy_from_user(kbuf, buffer + done, size)) {
832 ret = -EFAULT;
833 goto out;
834 }
835 kbuf[size] = '\0';
836 tmp = strchr(kbuf, '\n');
837 if (tmp) {
838 *tmp = '\0';
839 size = tmp - kbuf + 1;
840 } else if (done + size < count) {
841 pr_warning("Line length is too long: "
842 "Should be less than %d.", WRITE_BUFSIZE);
843 ret = -EINVAL;
844 goto out;
845 }
846 done += size;
847 /* Remove comments */
848 tmp = strchr(kbuf, '#');
849 if (tmp)
850 *tmp = '\0';
851
852 ret = command_trace_probe(kbuf);
853 if (ret)
854 goto out;
855 }
856 ret = done;
857out:
858 kfree(kbuf);
859 return ret;
860}
861
862static const struct file_operations kprobe_events_ops = {
863 .owner = THIS_MODULE,
864 .open = probes_open,
865 .read = seq_read,
866 .llseek = seq_lseek,
867 .release = seq_release,
868 .write = probes_write,
869};
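Two more behaviours of the kprobe_events file worth illustrating: the r: form defines a return probe (with $retval available as a fetch argument), and probes_open() above clears every probe when the file is opened for writing with O_TRUNC. A hedged sketch; the path and the probed symbol are assumptions:

#include <stdio.h>

int main(void)
{
	const char *ctl = "/sys/kernel/debug/tracing/kprobe_events";
	FILE *f;

	f = fopen(ctl, "a");		/* append: existing probes are kept */
	if (!f) {
		perror(ctl);
		return 1;
	}
	fputs("r:myretprobe do_sys_open rc=$retval\n", f);
	fclose(f);

	f = fopen(ctl, "w");		/* "w" implies O_TRUNC: cleanup_all_probes() */
	if (f)
		fclose(f);
	return 0;
}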
870
871/* Probes profiling interfaces */
872static int probes_profile_seq_show(struct seq_file *m, void *v)
873{
874 struct trace_probe *tp = v;
875
876 seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit,
877 tp->rp.kp.nmissed);
878
879 return 0;
880}
881
882static const struct seq_operations profile_seq_op = {
883 .start = probes_seq_start,
884 .next = probes_seq_next,
885 .stop = probes_seq_stop,
886 .show = probes_profile_seq_show
887};
888
889static int profile_open(struct inode *inode, struct file *file)
890{
891 return seq_open(file, &profile_seq_op);
892}
893
894static const struct file_operations kprobe_profile_ops = {
895 .owner = THIS_MODULE,
896 .open = profile_open,
897 .read = seq_read,
898 .llseek = seq_lseek,
899 .release = seq_release,
900};
901
902/* Kprobe handler */
903static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
904{
905 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
906 struct kprobe_trace_entry *entry;
907 struct ring_buffer_event *event;
908 struct ring_buffer *buffer;
909 int size, i, pc;
910 unsigned long irq_flags;
911 struct ftrace_event_call *call = &tp->call;
912
913 tp->nhit++;
914
915 local_save_flags(irq_flags);
916 pc = preempt_count();
917
918 size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
919
920 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
921 irq_flags, pc);
922 if (!event)
923 return 0;
924
925 entry = ring_buffer_event_data(event);
926 entry->nargs = tp->nr_args;
927 entry->ip = (unsigned long)kp->addr;
928 for (i = 0; i < tp->nr_args; i++)
929 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
930
931 if (!filter_current_check_discard(buffer, call, entry, event))
932 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
933 return 0;
934}
935
936/* Kretprobe handler */
937static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
938 struct pt_regs *regs)
939{
940 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
941 struct kretprobe_trace_entry *entry;
942 struct ring_buffer_event *event;
943 struct ring_buffer *buffer;
944 int size, i, pc;
945 unsigned long irq_flags;
946 struct ftrace_event_call *call = &tp->call;
947
948 local_save_flags(irq_flags);
949 pc = preempt_count();
950
951 size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
952
953 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
954 irq_flags, pc);
955 if (!event)
956 return 0;
957
958 entry = ring_buffer_event_data(event);
959 entry->nargs = tp->nr_args;
960 entry->func = (unsigned long)tp->rp.kp.addr;
961 entry->ret_ip = (unsigned long)ri->ret_addr;
962 for (i = 0; i < tp->nr_args; i++)
963 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
964
965 if (!filter_current_check_discard(buffer, call, entry, event))
966 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
967
968 return 0;
969}
970
971/* Event entry printers */
972enum print_line_t
973print_kprobe_event(struct trace_iterator *iter, int flags)
974{
975 struct kprobe_trace_entry *field;
976 struct trace_seq *s = &iter->seq;
977 struct trace_event *event;
978 struct trace_probe *tp;
979 int i;
980
981 field = (struct kprobe_trace_entry *)iter->ent;
982 event = ftrace_find_event(field->ent.type);
983 tp = container_of(event, struct trace_probe, event);
984
985 if (!trace_seq_printf(s, "%s: (", tp->call.name))
986 goto partial;
987
988 if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
989 goto partial;
990
991 if (!trace_seq_puts(s, ")"))
992 goto partial;
993
994 for (i = 0; i < field->nargs; i++)
995 if (!trace_seq_printf(s, " %s=%lx",
996 tp->args[i].name, field->args[i]))
997 goto partial;
998
999 if (!trace_seq_puts(s, "\n"))
1000 goto partial;
1001
1002 return TRACE_TYPE_HANDLED;
1003partial:
1004 return TRACE_TYPE_PARTIAL_LINE;
1005}
1006
1007enum print_line_t
1008print_kretprobe_event(struct trace_iterator *iter, int flags)
1009{
1010 struct kretprobe_trace_entry *field;
1011 struct trace_seq *s = &iter->seq;
1012 struct trace_event *event;
1013 struct trace_probe *tp;
1014 int i;
1015
1016 field = (struct kretprobe_trace_entry *)iter->ent;
1017 event = ftrace_find_event(field->ent.type);
1018 tp = container_of(event, struct trace_probe, event);
1019
1020 if (!trace_seq_printf(s, "%s: (", tp->call.name))
1021 goto partial;
1022
1023 if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET))
1024 goto partial;
1025
1026 if (!trace_seq_puts(s, " <- "))
1027 goto partial;
1028
1029 if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET))
1030 goto partial;
1031
1032 if (!trace_seq_puts(s, ")"))
1033 goto partial;
1034
1035 for (i = 0; i < field->nargs; i++)
1036 if (!trace_seq_printf(s, " %s=%lx",
1037 tp->args[i].name, field->args[i]))
1038 goto partial;
1039
1040 if (!trace_seq_puts(s, "\n"))
1041 goto partial;
1042
1043 return TRACE_TYPE_HANDLED;
1044partial:
1045 return TRACE_TYPE_PARTIAL_LINE;
1046}
1047
1048static int probe_event_enable(struct ftrace_event_call *call)
1049{
1050 struct trace_probe *tp = (struct trace_probe *)call->data;
1051
1052 tp->flags |= TP_FLAG_TRACE;
1053 if (probe_is_return(tp))
1054 return enable_kretprobe(&tp->rp);
1055 else
1056 return enable_kprobe(&tp->rp.kp);
1057}
1058
1059static void probe_event_disable(struct ftrace_event_call *call)
1060{
1061 struct trace_probe *tp = (struct trace_probe *)call->data;
1062
1063 tp->flags &= ~TP_FLAG_TRACE;
1064 if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) {
1065 if (probe_is_return(tp))
1066 disable_kretprobe(&tp->rp);
1067 else
1068 disable_kprobe(&tp->rp.kp);
1069 }
1070}
1071
1072static int probe_event_raw_init(struct ftrace_event_call *event_call)
1073{
1074 INIT_LIST_HEAD(&event_call->fields);
1075
1076 return 0;
1077}
1078
1079#undef DEFINE_FIELD
1080#define DEFINE_FIELD(type, item, name, is_signed) \
1081 do { \
1082 ret = trace_define_field(event_call, #type, name, \
1083 offsetof(typeof(field), item), \
1084 sizeof(field.item), is_signed, \
1085 FILTER_OTHER); \
1086 if (ret) \
1087 return ret; \
1088 } while (0)
1089
1090static int kprobe_event_define_fields(struct ftrace_event_call *event_call)
1091{
1092 int ret, i;
1093 struct kprobe_trace_entry field;
1094 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1095
1096 ret = trace_define_common_fields(event_call);
 1097	if (ret)
1098 return ret;
1099
1100 DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
1101 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1102 /* Set argument names as fields */
1103 for (i = 0; i < tp->nr_args; i++)
1104 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1105 return 0;
1106}
1107
1108static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
1109{
1110 int ret, i;
1111 struct kretprobe_trace_entry field;
1112 struct trace_probe *tp = (struct trace_probe *)event_call->data;
1113
1114 ret = trace_define_common_fields(event_call);
 1115	if (ret)
1116 return ret;
1117
1118 DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0);
1119 DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0);
1120 DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1);
1121 /* Set argument names as fields */
1122 for (i = 0; i < tp->nr_args; i++)
1123 DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0);
1124 return 0;
1125}
1126
1127static int __probe_event_show_format(struct trace_seq *s,
1128 struct trace_probe *tp, const char *fmt,
1129 const char *arg)
1130{
1131 int i;
1132
1133 /* Show format */
1134 if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt))
1135 return 0;
1136
1137 for (i = 0; i < tp->nr_args; i++)
1138 if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name))
1139 return 0;
1140
1141 if (!trace_seq_printf(s, "\", %s", arg))
1142 return 0;
1143
1144 for (i = 0; i < tp->nr_args; i++)
1145 if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name))
1146 return 0;
1147
1148 return trace_seq_puts(s, "\n");
1149}
1150
1151#undef SHOW_FIELD
1152#define SHOW_FIELD(type, item, name) \
1153 do { \
1154 ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \
1155 "offset:%u;\tsize:%u;\n", name, \
1156 (unsigned int)offsetof(typeof(field), item),\
1157 (unsigned int)sizeof(type)); \
1158 if (!ret) \
1159 return 0; \
1160 } while (0)
1161
1162static int kprobe_event_show_format(struct ftrace_event_call *call,
1163 struct trace_seq *s)
1164{
1165 struct kprobe_trace_entry field __attribute__((unused));
1166 int ret, i;
1167 struct trace_probe *tp = (struct trace_probe *)call->data;
1168
1169 SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP);
1170 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1171
1172 /* Show fields */
1173 for (i = 0; i < tp->nr_args; i++)
1174 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1175 trace_seq_puts(s, "\n");
1176
1177 return __probe_event_show_format(s, tp, "(%lx)",
1178 "REC->" FIELD_STRING_IP);
1179}
1180
1181static int kretprobe_event_show_format(struct ftrace_event_call *call,
1182 struct trace_seq *s)
1183{
1184 struct kretprobe_trace_entry field __attribute__((unused));
1185 int ret, i;
1186 struct trace_probe *tp = (struct trace_probe *)call->data;
1187
1188 SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC);
1189 SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP);
1190 SHOW_FIELD(int, nargs, FIELD_STRING_NARGS);
1191
1192 /* Show fields */
1193 for (i = 0; i < tp->nr_args; i++)
1194 SHOW_FIELD(unsigned long, args[i], tp->args[i].name);
1195 trace_seq_puts(s, "\n");
1196
1197 return __probe_event_show_format(s, tp, "(%lx <- %lx)",
1198 "REC->" FIELD_STRING_FUNC
1199 ", REC->" FIELD_STRING_RETIP);
1200}
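
As an illustration of the format these two callbacks produce (hypothetical probe with one argument named dfd, and assuming FIELD_STRING_IP, FIELD_STRING_FUNC and FIELD_STRING_RETIP expand to "ip", "func" and "ret_ip"), the kprobe flavour ends the event's format file with roughly:

	print fmt: "(%lx) dfd=%lx", REC->ip, REC->dfd

and the kretprobe flavour with:

	print fmt: "(%lx <- %lx) dfd=%lx", REC->func, REC->ret_ip, REC->dfd
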
1201
1202#ifdef CONFIG_EVENT_PROFILE
1203
1204/* Kprobe profile handler */
1205static __kprobes int kprobe_profile_func(struct kprobe *kp,
1206 struct pt_regs *regs)
1207{
1208 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1209 struct ftrace_event_call *call = &tp->call;
1210 struct kprobe_trace_entry *entry;
1211 struct perf_trace_buf *trace_buf;
1212 struct trace_entry *ent;
1213 int size, __size, i, pc, __cpu;
1214 unsigned long irq_flags;
1215 char *raw_data;
1216
1217 pc = preempt_count();
1218 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1219 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1220 size -= sizeof(u32);
1221 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1222 "profile buffer not large enough"))
1223 return 0;
1224
1225 /*
1226	 * Protect the non-NMI buffer.
1227	 * This also protects the RCU read side.
1228 */
1229 local_irq_save(irq_flags);
1230 __cpu = smp_processor_id();
1231
1232 if (in_nmi())
1233 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1234 else
1235 trace_buf = rcu_dereference(perf_trace_buf);
1236
1237 if (!trace_buf)
1238 goto end;
1239
1240 trace_buf = per_cpu_ptr(trace_buf, __cpu);
1241
1242 if (trace_buf->recursion++)
1243 goto end_recursion;
1244
1245 /*
1246 * Make recursion update visible before entering perf_tp_event
1247 * so that we protect from perf recursions.
1248 */
1249 barrier();
1250
1251 raw_data = trace_buf->buf;
1252
1253 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1254 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1255 entry = (struct kprobe_trace_entry *)raw_data;
1256 ent = &entry->ent;
1257
1258 tracing_generic_entry_update(ent, irq_flags, pc);
1259 ent->type = call->id;
1260 entry->nargs = tp->nr_args;
1261 entry->ip = (unsigned long)kp->addr;
1262 for (i = 0; i < tp->nr_args; i++)
1263 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1264 perf_tp_event(call->id, entry->ip, 1, entry, size);
1265
1266end_recursion:
1267 trace_buf->recursion--;
1268end:
1269 local_irq_restore(irq_flags);
1270
1271 return 0;
1272}
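
The size arithmetic at the top of this handler is easy to misread: the raw entry size is padded so that, together with the u32 size header perf prepends, the record lands on a u64 boundary (that is the usual reading of this pattern; it mirrors the same dance in trace_syscalls.c below). A minimal standalone C sketch with an assumed raw size:

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	/* assumed raw entry size; in the handler this comes from
	   SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args) */
	unsigned long __size = 30;
	unsigned long size;

	size = ALIGN(__size + sizeof(unsigned int), sizeof(unsigned long long));
	size -= sizeof(unsigned int);

	printf("__size=%lu -> size=%lu\n", __size, size);	/* prints 30 -> 36 */
	return 0;
}
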
1273
1274/* Kretprobe profile handler */
1275static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri,
1276 struct pt_regs *regs)
1277{
1278 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1279 struct ftrace_event_call *call = &tp->call;
1280 struct kretprobe_trace_entry *entry;
1281 struct perf_trace_buf *trace_buf;
1282 struct trace_entry *ent;
1283 int size, __size, i, pc, __cpu;
1284 unsigned long irq_flags;
1285 char *raw_data;
1286
1287 pc = preempt_count();
1288 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1289 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1290 size -= sizeof(u32);
1291 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1292 "profile buffer not large enough"))
1293 return 0;
1294
1295 /*
1296	 * Protect the non-NMI buffer.
1297	 * This also protects the RCU read side.
1298 */
1299 local_irq_save(irq_flags);
1300 __cpu = smp_processor_id();
1301
1302 if (in_nmi())
1303 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1304 else
1305 trace_buf = rcu_dereference(perf_trace_buf);
1306
1307 if (!trace_buf)
1308 goto end;
1309
1310 trace_buf = per_cpu_ptr(trace_buf, __cpu);
1311
1312 if (trace_buf->recursion++)
1313 goto end_recursion;
1314
1315 /*
1316 * Make recursion update visible before entering perf_tp_event
1317 * so that we protect from perf recursions.
1318 */
1319 barrier();
1320
1321 raw_data = trace_buf->buf;
1322
1323 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1324 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1325 entry = (struct kretprobe_trace_entry *)raw_data;
1326 ent = &entry->ent;
1327
1328 tracing_generic_entry_update(ent, irq_flags, pc);
1329 ent->type = call->id;
1330 entry->nargs = tp->nr_args;
1331 entry->func = (unsigned long)tp->rp.kp.addr;
1332 entry->ret_ip = (unsigned long)ri->ret_addr;
1333 for (i = 0; i < tp->nr_args; i++)
1334 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1335 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1336
1337end_recursion:
1338 trace_buf->recursion--;
1339end:
1340 local_irq_restore(irq_flags);
1341
1342 return 0;
1343}
1344
1345static int probe_profile_enable(struct ftrace_event_call *call)
1346{
1347 struct trace_probe *tp = (struct trace_probe *)call->data;
1348
1349 tp->flags |= TP_FLAG_PROFILE;
1350
1351 if (probe_is_return(tp))
1352 return enable_kretprobe(&tp->rp);
1353 else
1354 return enable_kprobe(&tp->rp.kp);
1355}
1356
1357static void probe_profile_disable(struct ftrace_event_call *call)
1358{
1359 struct trace_probe *tp = (struct trace_probe *)call->data;
1360
1361 tp->flags &= ~TP_FLAG_PROFILE;
1362
1363 if (!(tp->flags & TP_FLAG_TRACE)) {
1364 if (probe_is_return(tp))
1365 disable_kretprobe(&tp->rp);
1366 else
1367 disable_kprobe(&tp->rp.kp);
1368 }
1369}
1370#endif /* CONFIG_EVENT_PROFILE */
1371
1372
1373static __kprobes
1374int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1375{
1376 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1377
1378 if (tp->flags & TP_FLAG_TRACE)
1379 kprobe_trace_func(kp, regs);
1380#ifdef CONFIG_EVENT_PROFILE
1381 if (tp->flags & TP_FLAG_PROFILE)
1382 kprobe_profile_func(kp, regs);
1383#endif /* CONFIG_EVENT_PROFILE */
1384	return 0;	/* We don't tweak the kernel, so just return 0 */
1385}
1386
1387static __kprobes
1388int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1389{
1390 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1391
1392 if (tp->flags & TP_FLAG_TRACE)
1393 kretprobe_trace_func(ri, regs);
1394#ifdef CONFIG_EVENT_PROFILE
1395 if (tp->flags & TP_FLAG_PROFILE)
1396 kretprobe_profile_func(ri, regs);
1397#endif /* CONFIG_EVENT_PROFILE */
1398	return 0;	/* We don't tweak the kernel, so just return 0 */
1399}
1400
1401static int register_probe_event(struct trace_probe *tp)
1402{
1403 struct ftrace_event_call *call = &tp->call;
1404 int ret;
1405
1406 /* Initialize ftrace_event_call */
1407 if (probe_is_return(tp)) {
1408 tp->event.trace = print_kretprobe_event;
1409 call->raw_init = probe_event_raw_init;
1410 call->show_format = kretprobe_event_show_format;
1411 call->define_fields = kretprobe_event_define_fields;
1412 } else {
1413 tp->event.trace = print_kprobe_event;
1414 call->raw_init = probe_event_raw_init;
1415 call->show_format = kprobe_event_show_format;
1416 call->define_fields = kprobe_event_define_fields;
1417 }
1418 call->event = &tp->event;
1419 call->id = register_ftrace_event(&tp->event);
1420 if (!call->id)
1421 return -ENODEV;
1422 call->enabled = 0;
1423 call->regfunc = probe_event_enable;
1424 call->unregfunc = probe_event_disable;
1425
1426#ifdef CONFIG_EVENT_PROFILE
1427 atomic_set(&call->profile_count, -1);
1428 call->profile_enable = probe_profile_enable;
1429 call->profile_disable = probe_profile_disable;
1430#endif
1431 call->data = tp;
1432 ret = trace_add_event_call(call);
1433 if (ret) {
1434 pr_info("Failed to register kprobe event: %s\n", call->name);
1435 unregister_ftrace_event(&tp->event);
1436 }
1437 return ret;
1438}
1439
1440static void unregister_probe_event(struct trace_probe *tp)
1441{
1442 /* tp->event is unregistered in trace_remove_event_call() */
1443 trace_remove_event_call(&tp->call);
1444}
1445
1446 /* Make a debugfs interface for controlling probe points */
1447static __init int init_kprobe_trace(void)
1448{
1449 struct dentry *d_tracer;
1450 struct dentry *entry;
1451
1452 d_tracer = tracing_init_dentry();
1453 if (!d_tracer)
1454 return 0;
1455
1456 entry = debugfs_create_file("kprobe_events", 0644, d_tracer,
1457 NULL, &kprobe_events_ops);
1458
1459 /* Event list interface */
1460 if (!entry)
1461 pr_warning("Could not create debugfs "
1462 "'kprobe_events' entry\n");
1463
1464 /* Profile interface */
1465 entry = debugfs_create_file("kprobe_profile", 0444, d_tracer,
1466 NULL, &kprobe_profile_ops);
1467
1468 if (!entry)
1469 pr_warning("Could not create debugfs "
1470 "'kprobe_profile' entry\n");
1471 return 0;
1472}
1473fs_initcall(init_kprobe_trace);
1474
1475
1476#ifdef CONFIG_FTRACE_STARTUP_TEST
1477
1478static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1479 int a4, int a5, int a6)
1480{
1481 return a1 + a2 + a3 + a4 + a5 + a6;
1482}
1483
1484static __init int kprobe_trace_self_tests_init(void)
1485{
1486 int ret;
1487 int (*target)(int, int, int, int, int, int);
1488
1489 target = kprobe_trace_selftest_target;
1490
1491 pr_info("Testing kprobe tracing: ");
1492
1493 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1494 "$arg1 $arg2 $arg3 $arg4 $stack $stack0");
1495 if (WARN_ON_ONCE(ret))
1496 pr_warning("error enabling function entry\n");
1497
1498 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1499 "$retval");
1500 if (WARN_ON_ONCE(ret))
1501 pr_warning("error enabling function return\n");
1502
1503 ret = target(1, 2, 3, 4, 5, 6);
1504
1505 cleanup_all_probes();
1506
1507 pr_cont("OK\n");
1508 return 0;
1509}
1510
1511late_initcall(kprobe_trace_self_tests_init);
1512
1513#endif
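
Once this file is built in, probes are defined by writing the same "p:NAME SYMBOL args" / "r:NAME SYMBOL args" strings used by the selftest above into the kprobe_events file created in init_kprobe_trace(). A hedged userspace sketch (the debugfs mount point, probe name and target symbol are illustrative assumptions):

#include <stdio.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/tracing/kprobe_events";
	FILE *f = fopen(path, "a");	/* append a new probe definition */

	if (!f) {
		perror("fopen");
		return 1;
	}
	/* same shape as the selftest command: an entry probe with one argument */
	fprintf(f, "p:myprobe do_sys_open $arg1\n");
	fclose(f);
	return 0;
}
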
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b0..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
69 * @s: trace sequence descriptor 69 * @s: trace sequence descriptor
70 * @fmt: printf format string 70 * @fmt: printf format string
71 * 71 *
 72 * It returns 0 if the trace does not fit in the buffer's free
 73 * space, and 1 otherwise.
74 *
72 * The tracer may use either sequence operations or its own 75 * The tracer may use either sequence operations or its own
73 * copy to user routines. To simplify formating of a trace 76 * copy to user routines. To simplify formating of a trace
74 * trace_seq_printf is used to store strings into a special 77 * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
95 98
96 s->len += ret; 99 s->len += ret;
97 100
98 return len; 101 return 1;
99} 102}
100EXPORT_SYMBOL_GPL(trace_seq_printf); 103EXPORT_SYMBOL_GPL(trace_seq_printf);
101 104
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index d00d1a8f1f26..51213b0aa81b 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -354,13 +354,13 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret)
354 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 354 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
355} 355}
356 356
357int reg_event_syscall_enter(void *ptr) 357int reg_event_syscall_enter(struct ftrace_event_call *call)
358{ 358{
359 int ret = 0; 359 int ret = 0;
360 int num; 360 int num;
361 char *name; 361 char *name;
362 362
363 name = (char *)ptr; 363 name = (char *)call->data;
364 num = syscall_name_to_nr(name); 364 num = syscall_name_to_nr(name);
365 if (num < 0 || num >= NR_syscalls) 365 if (num < 0 || num >= NR_syscalls)
366 return -ENOSYS; 366 return -ENOSYS;
@@ -378,12 +378,12 @@ int reg_event_syscall_enter(void *ptr)
378 return ret; 378 return ret;
379} 379}
380 380
381void unreg_event_syscall_enter(void *ptr) 381void unreg_event_syscall_enter(struct ftrace_event_call *call)
382{ 382{
383 int num; 383 int num;
384 char *name; 384 char *name;
385 385
386 name = (char *)ptr; 386 name = (char *)call->data;
387 num = syscall_name_to_nr(name); 387 num = syscall_name_to_nr(name);
388 if (num < 0 || num >= NR_syscalls) 388 if (num < 0 || num >= NR_syscalls)
389 return; 389 return;
@@ -395,13 +395,13 @@ void unreg_event_syscall_enter(void *ptr)
395 mutex_unlock(&syscall_trace_lock); 395 mutex_unlock(&syscall_trace_lock);
396} 396}
397 397
398int reg_event_syscall_exit(void *ptr) 398int reg_event_syscall_exit(struct ftrace_event_call *call)
399{ 399{
400 int ret = 0; 400 int ret = 0;
401 int num; 401 int num;
402 char *name; 402 char *name;
403 403
404 name = (char *)ptr; 404 name = call->data;
405 num = syscall_name_to_nr(name); 405 num = syscall_name_to_nr(name);
406 if (num < 0 || num >= NR_syscalls) 406 if (num < 0 || num >= NR_syscalls)
407 return -ENOSYS; 407 return -ENOSYS;
@@ -419,12 +419,12 @@ int reg_event_syscall_exit(void *ptr)
419 return ret; 419 return ret;
420} 420}
421 421
422void unreg_event_syscall_exit(void *ptr) 422void unreg_event_syscall_exit(struct ftrace_event_call *call)
423{ 423{
424 int num; 424 int num;
425 char *name; 425 char *name;
426 426
427 name = (char *)ptr; 427 name = call->data;
428 num = syscall_name_to_nr(name); 428 num = syscall_name_to_nr(name);
429 if (num < 0 || num >= NR_syscalls) 429 if (num < 0 || num >= NR_syscalls)
430 return; 430 return;
@@ -477,6 +477,7 @@ static int sys_prof_refcount_exit;
477static void prof_syscall_enter(struct pt_regs *regs, long id) 477static void prof_syscall_enter(struct pt_regs *regs, long id)
478{ 478{
479 struct syscall_metadata *sys_data; 479 struct syscall_metadata *sys_data;
480 struct perf_trace_buf *trace_buf;
480 struct syscall_trace_enter *rec; 481 struct syscall_trace_enter *rec;
481 unsigned long flags; 482 unsigned long flags;
482 char *raw_data; 483 char *raw_data;
@@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
507 cpu = smp_processor_id(); 508 cpu = smp_processor_id();
508 509
509 if (in_nmi()) 510 if (in_nmi())
510 raw_data = rcu_dereference(trace_profile_buf_nmi); 511 trace_buf = rcu_dereference(perf_trace_buf_nmi);
511 else 512 else
512 raw_data = rcu_dereference(trace_profile_buf); 513 trace_buf = rcu_dereference(perf_trace_buf);
513 514
514 if (!raw_data) 515 if (!trace_buf)
515 goto end; 516 goto end;
516 517
517 raw_data = per_cpu_ptr(raw_data, cpu); 518 trace_buf = per_cpu_ptr(trace_buf, cpu);
519
520 if (trace_buf->recursion++)
521 goto end_recursion;
522
523 /*
524 * Make recursion update visible before entering perf_tp_event
525 * so that we protect from perf recursions.
526 */
527 barrier();
528
529 raw_data = trace_buf->buf;
518 530
519 /* zero the dead bytes from alignment so we don't leak stack to userspace */ 531 /* zero the dead bytes from alignment so we don't leak stack to userspace */
520 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 532 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
527 (unsigned long *)&rec->args); 539 (unsigned long *)&rec->args);
528 perf_tp_event(sys_data->enter_id, 0, 1, rec, size); 540 perf_tp_event(sys_data->enter_id, 0, 1, rec, size);
529 541
542end_recursion:
543 trace_buf->recursion--;
530end: 544end:
531 local_irq_restore(flags); 545 local_irq_restore(flags);
532} 546}
@@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
574{ 588{
575 struct syscall_metadata *sys_data; 589 struct syscall_metadata *sys_data;
576 struct syscall_trace_exit *rec; 590 struct syscall_trace_exit *rec;
591 struct perf_trace_buf *trace_buf;
577 unsigned long flags; 592 unsigned long flags;
578 int syscall_nr; 593 int syscall_nr;
579 char *raw_data; 594 char *raw_data;
@@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
605 cpu = smp_processor_id(); 620 cpu = smp_processor_id();
606 621
607 if (in_nmi()) 622 if (in_nmi())
608 raw_data = rcu_dereference(trace_profile_buf_nmi); 623 trace_buf = rcu_dereference(perf_trace_buf_nmi);
609 else 624 else
610 raw_data = rcu_dereference(trace_profile_buf); 625 trace_buf = rcu_dereference(perf_trace_buf);
611 626
612 if (!raw_data) 627 if (!trace_buf)
613 goto end; 628 goto end;
614 629
615 raw_data = per_cpu_ptr(raw_data, cpu); 630 trace_buf = per_cpu_ptr(trace_buf, cpu);
631
632 if (trace_buf->recursion++)
633 goto end_recursion;
634
635 /*
636 * Make recursion update visible before entering perf_tp_event
637 * so that we protect from perf recursions.
638 */
639 barrier();
640
641 raw_data = trace_buf->buf;
616 642
617 /* zero the dead bytes from alignment so we don't leak stack to userspace */ 643 /* zero the dead bytes from alignment so we don't leak stack to userspace */
618 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 644 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
@@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
626 652
627 perf_tp_event(sys_data->exit_id, 0, 1, rec, size); 653 perf_tp_event(sys_data->exit_id, 0, 1, rec, size);
628 654
655end_recursion:
656 trace_buf->recursion--;
629end: 657end:
630 local_irq_restore(flags); 658 local_irq_restore(flags);
631} 659}
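
The recursion counter added around both perf handlers (here and in the kprobe handlers above) is a small per-cpu re-entrancy guard: claim the buffer by bumping ->recursion, back off if it was already claimed, and drop the claim after perf_tp_event() returns. A hedged standalone sketch of that pattern (names invented, barrier() reduced to a compiler barrier):

#include <stddef.h>

struct cpu_buf {
	int	recursion;
	char	data[256];
};

#define barrier()	__asm__ __volatile__("" ::: "memory")

/* Claim this cpu's buffer; NULL means it is already in use further up
 * the call chain (e.g. perf recursed into another traced event). */
static char *claim_buf(struct cpu_buf *buf)
{
	if (buf->recursion++) {
		buf->recursion--;
		return NULL;
	}
	barrier();	/* make the claim visible before using the buffer */
	return buf->data;
}

static void release_buf(struct cpu_buf *buf)
{
	buf->recursion--;
}

int main(void)
{
	static struct cpu_buf buf;
	char *p = claim_buf(&buf);

	if (p)
		release_buf(&buf);
	return 0;
}
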
diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132ac..46d0165ca70c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
330 */ 330 */
331static void free_user(struct user_struct *up, unsigned long flags) 331static void free_user(struct user_struct *up, unsigned long flags)
332{ 332{
333 spin_unlock_irqrestore(&uidhash_lock, flags);
334 INIT_DELAYED_WORK(&up->work, cleanup_user_struct); 333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
335 schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); 334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336} 336}
337 337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ 338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccefe574dcf7..12328147132c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -647,7 +647,7 @@ EXPORT_SYMBOL(schedule_delayed_work);
647 */ 647 */
648void flush_delayed_work(struct delayed_work *dwork) 648void flush_delayed_work(struct delayed_work *dwork)
649{ 649{
650 if (del_timer(&dwork->timer)) { 650 if (del_timer_sync(&dwork->timer)) {
651 struct cpu_workqueue_struct *cwq; 651 struct cpu_workqueue_struct *cwq;
652 cwq = wq_per_cpu(keventd_wq, get_cpu()); 652 cwq = wq_per_cpu(keventd_wq, get_cpu());
653 __queue_work(cwq, &dwork->work); 653 __queue_work(cwq, &dwork->work);
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
685int schedule_on_each_cpu(work_func_t func) 685int schedule_on_each_cpu(work_func_t func)
686{ 686{
687 int cpu; 687 int cpu;
688 int orig = -1;
688 struct work_struct *works; 689 struct work_struct *works;
689 690
690 works = alloc_percpu(struct work_struct); 691 works = alloc_percpu(struct work_struct);
691 if (!works) 692 if (!works)
692 return -ENOMEM; 693 return -ENOMEM;
693 694
695 /*
696	 * When running in keventd, don't schedule a work item on itself.
697	 * We can just call it directly because the work queue is already bound.
698	 * This is also faster.
699 * Make this a generic parameter for other workqueues?
700 */
701 if (current_is_keventd()) {
702 orig = raw_smp_processor_id();
703 INIT_WORK(per_cpu_ptr(works, orig), func);
704 func(per_cpu_ptr(works, orig));
705 }
706
694 get_online_cpus(); 707 get_online_cpus();
695 for_each_online_cpu(cpu) { 708 for_each_online_cpu(cpu) {
696 struct work_struct *work = per_cpu_ptr(works, cpu); 709 struct work_struct *work = per_cpu_ptr(works, cpu);
697 710
711 if (cpu == orig)
712 continue;
698 INIT_WORK(work, func); 713 INIT_WORK(work, func);
699 schedule_work_on(cpu, work); 714 schedule_work_on(cpu, work);
700 } 715 }
701 for_each_online_cpu(cpu) 716 for_each_online_cpu(cpu) {
702 flush_work(per_cpu_ptr(works, cpu)); 717 if (cpu != orig)
718 flush_work(per_cpu_ptr(works, cpu));
719 }
703 put_online_cpus(); 720 put_online_cpus();
704 free_percpu(works); 721 free_percpu(works);
705 return 0; 722 return 0;