Diffstat (limited to 'kernel')
 36 files changed, 2151 insertions(+), 315 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba19..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		return -EFAULT;
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);
 	if (cft->write_u64) {
-		u64 val = simple_strtoull(buffer, &end, 0);
+		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_u64(cgrp, cft, val);
 	} else {
-		s64 val = simple_strtoll(buffer, &end, 0);
+		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	buffer[nbytes] = 0;	/* nul-terminate */
-	strstrip(buffer);
-	retval = cft->write_string(cgrp, cft, buffer);
+	retval = cft->write_string(cgrp, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
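[Editor's note: the two hunks above switch from calling strstrip() for its side effect to passing its return value, which points past any leading whitespace, into the parser. A userspace sketch of the resulting parse pattern; trim() is a hypothetical stand-in for the kernel's strstrip(), and strtoull for simple_strtoull:]

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical analogue of strstrip(): erase trailing whitespace in
 * place, return a pointer past the leading whitespace. */
static char *trim(char *s)
{
	char *end = s + strlen(s);

	while (end > s && isspace((unsigned char)end[-1]))
		*--end = '\0';
	while (isspace((unsigned char)*s))
		s++;
	return s;
}

int main(void)
{
	char buffer[] = "  42\n";
	char *end;
	/* Parse the trimmed pointer, as the new kernel code does; a
	 * fully trimmed string lets the *end check reject any junk. */
	unsigned long long val = strtoull(trim(buffer), &end, 0);

	if (*end)
		return 1;	/* the kernel returns -EINVAL here */
	printf("parsed %llu\n", val);
	return 0;
}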
diff --git a/kernel/exit.c b/kernel/exit.c
index 266f8920628a..3f45e3cf931d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -360,10 +360,8 @@ void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
 
-	if (task_session(curr) != pid) {
+	if (task_session(curr) != pid)
 		change_pid(curr, PIDTYPE_SID, pid);
-		proc_sid_connector(curr);
-	}
 
 	if (task_pgrp(curr) != pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13a..166b8c49257c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
 	int cpu;
 	int total = 0;
 
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		total += per_cpu(process_counts, cpu);
 
 	return total;
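[Editor's note: process_counts is a per-CPU counter that keeps whatever value it had when its CPU went offline, so summing only online CPUs can undercount; the fix iterates possible CPUs. A small userspace analogue, with invented names, where "offline" slots still hold counts:]

#include <stdio.h>

#define NPOSSIBLE 4	/* "possible CPUs": every counter slot that exists */

static long counts[NPOSSIBLE];	/* analogue of per_cpu(process_counts, cpu) */
static int online[NPOSSIBLE] = { 1, 1, 0, 0 };	/* slots 2 and 3 went "offline" */

static long nr_items_online_only(void)
{
	long total = 0;
	for (int i = 0; i < NPOSSIBLE; i++)
		if (online[i])	/* misses counts parked on offline slots */
			total += counts[i];
	return total;
}

static long nr_items_all_possible(void)
{
	long total = 0;
	for (int i = 0; i < NPOSSIBLE; i++)	/* what the fix does */
		total += counts[i];
	return total;
}

int main(void)
{
	counts[0] = 5; counts[1] = 3;
	counts[2] = 2;	/* incremented before slot 2 went offline */

	printf("online-only: %ld, all-possible: %ld\n",
	       nr_items_online_only(), nr_items_all_possible());
	return 0;
}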
diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88d..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-	return (key1->both.word == key2->both.word
+	return (key1 && key2
+		&& key1->both.word == key2->both.word
 		&& key1->both.ptr == key2->both.ptr
 		&& key1->both.offset == key2->both.offset);
 }
@@ -1028,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 			   struct futex_hash_bucket *hb)
 {
-	drop_futex_key_refs(&q->key);
 	get_futex_key_refs(key);
 	q->key = *key;
 
@@ -1226,6 +1226,7 @@ retry_private:
 		 */
		if (ret == 1) {
 			WARN_ON(pi_state);
+			drop_count++;
 			task_count++;
 			ret = get_futex_value_locked(&curval2, uaddr2);
 			if (!ret)
@@ -1304,6 +1305,7 @@ retry_private:
 		if (ret == 1) {
 			/* We got the lock. */
 			requeue_pi_wake_futex(this, &key2, hb2);
+			drop_count++;
 			continue;
 		} else if (ret) {
 			/* -EDEADLK */
@@ -1791,6 +1793,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 					     current->timer_slack_ns);
 	}
 
+retry:
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
 	if (ret)
@@ -1808,9 +1811,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		goto out_put_key;
 
 	/*
-	 * We expect signal_pending(current), but another thread may
-	 * have handled it for us already.
+	 * We expect signal_pending(current), but we might be the
+	 * victim of a spurious wakeup as well.
 	 */
+	if (!signal_pending(current)) {
+		put_futex_key(fshared, &q.key);
+		goto retry;
+	}
+
 	ret = -ERESTARTSYS;
 	if (!abs_time)
 		goto out_put_key;
@@ -2118,9 +2126,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		 */
 		plist_del(&q->list, &q->list.plist);
 
+		/* Handle spurious wakeups gracefully */
+		ret = -EWOULDBLOCK;
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
-		else
+		else if (signal_pending(current))
 			ret = -ERESTARTNOINTR;
 	}
 	return ret;
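[Editor's note: the futex_wait() hunks above add a retry when the task wakes with no signal pending, treating the wakeup as spurious. A hedged userspace sketch of the same discipline around the raw futex syscall; the function name and the atomic recheck are illustrative, not kernel code:]

#define _GNU_SOURCE
#include <errno.h>
#include <linux/futex.h>
#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wait until *uaddr no longer holds val, retrying spurious returns. */
static int futex_wait_retry(uint32_t *uaddr, uint32_t val)
{
	for (;;) {
		/* The kernel rechecks *uaddr == val under the hash-bucket
		 * lock, so a stale value means "don't sleep at all". */
		long ret = syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
				   NULL, NULL, 0);
		if (ret == 0) {
			if (__atomic_load_n(uaddr, __ATOMIC_ACQUIRE) != val)
				return 0;	/* genuine wakeup */
			continue;		/* spurious: wait again */
		}
		if (errno == EWOULDBLOCK)
			return 0;	/* value already changed */
		if (errno == EINTR)
			continue;	/* interrupted: retry the wait */
		return -1;		/* real error */
	}
}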
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..bd7273e6282e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void)
 		if (!(status & IRQ_SPURIOUS_DISABLED))
 			continue;
 
+		local_irq_disable();
 		try_one_irq(i, desc);
+		local_irq_enable();
 	}
 }
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5240d75f4c60..84495958e703 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -90,6 +90,9 @@ static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
  */
 static struct kprobe_blackpoint kprobe_blacklist[] = {
 	{"preempt_schedule",},
+	{"native_get_debugreg",},
+	{"irq_entries_start",},
+	{"common_interrupt",},
 	{NULL}    /* Terminator */
 };
 
@@ -673,6 +676,40 @@ static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
 	return (kprobe_opcode_t *)(((char *)addr) + p->offset);
 }
 
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+{
+	struct kprobe *old_p, *list_p;
+
+	old_p = get_kprobe(p->addr);
+	if (unlikely(!old_p))
+		return NULL;
+
+	if (p != old_p) {
+		list_for_each_entry_rcu(list_p, &old_p->list, list)
+			if (list_p == p)
+			/* kprobe p is a valid probe */
+				goto valid;
+		return NULL;
+	}
+valid:
+	return old_p;
+}
+
+/* Return error if the kprobe is being re-registered */
+static inline int check_kprobe_rereg(struct kprobe *p)
+{
+	int ret = 0;
+	struct kprobe *old_p;
+
+	mutex_lock(&kprobe_mutex);
+	old_p = __get_valid_kprobe(p);
+	if (old_p)
+		ret = -EINVAL;
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+
 int __kprobes register_kprobe(struct kprobe *p)
 {
 	int ret = 0;
@@ -685,6 +722,10 @@ int __kprobes register_kprobe(struct kprobe *p)
 		return -EINVAL;
 	p->addr = addr;
 
+	ret = check_kprobe_rereg(p);
+	if (ret)
+		return ret;
+
 	preempt_disable();
 	if (!kernel_text_address((unsigned long) p->addr) ||
 	    in_kprobes_functions((unsigned long) p->addr)) {
@@ -754,26 +795,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
-{
-	struct kprobe *old_p, *list_p;
-
-	old_p = get_kprobe(p->addr);
-	if (unlikely(!old_p))
-		return NULL;
-
-	if (p != old_p) {
-		list_for_each_entry_rcu(list_p, &old_p->list, list)
-			if (list_p == p)
-			/* kprobe p is a valid probe */
-				goto valid;
-		return NULL;
-	}
-valid:
-	return old_p;
-}
-
 /*
  * Unregister a kprobe without a scheduler synchronization.
  */
@@ -1141,6 +1162,13 @@ static void __kprobes kill_kprobe(struct kprobe *p)
 	arch_remove_kprobe(p);
 }
 
+void __kprobes dump_kprobe(struct kprobe *kp)
+{
+	printk(KERN_WARNING "Dumping kprobe:\n");
+	printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
+	       kp->symbol_name, kp->addr, kp->offset);
+}
+
 /* Module notifier call back, checking kprobes on the module */
 static int __kprobes kprobes_module_callback(struct notifier_block *nb,
 					     unsigned long val, void *data)
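[Editor's note: with check_kprobe_rereg() in place, registering the same struct kprobe twice fails with -EINVAL instead of corrupting the kprobe table. A minimal module sketch exercising that behaviour; the handler and the probed symbol are illustrative choices, not part of the patch:]

#include <linux/kernel.h>
#include <linux/kprobes.h>
#include <linux/module.h>

static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* observe only; let the probed code run */
}

static struct kprobe kp = {
	.symbol_name = "do_fork",	/* any probe-able symbol works */
	.pre_handler = demo_pre_handler,
};

static int __init rereg_demo_init(void)
{
	int ret = register_kprobe(&kp);

	if (ret)
		return ret;

	/* A second registration of the same kprobe object is now
	 * rejected by check_kprobe_rereg() with -EINVAL. */
	ret = register_kprobe(&kp);
	pr_info("re-register returned %d (expect -EINVAL)\n", ret);
	return 0;
}

static void __exit rereg_demo_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(rereg_demo_init);
module_exit(rereg_demo_exit);
MODULE_LICENSE("GPL");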
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -150,29 +150,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 EXPORT_SYMBOL(kthread_create);
 
 /**
- * kthread_bind - bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (i.e., just returned from kthread_create()).
- */
-void kthread_bind(struct task_struct *k, unsigned int cpu)
-{
-	/* Must have done schedule() in kthread() before we set_task_cpu */
-	if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
-		WARN_ON(1);
-		return;
-	}
-	set_task_cpu(k, cpu);
-	k->cpus_allowed = cpumask_of_cpu(cpu);
-	k->rt.nr_cpus_allowed = 1;
-	k->flags |= PF_THREAD_BOUND;
-}
-EXPORT_SYMBOL(kthread_bind);
-
-/**
  * kthread_stop - stop a thread created by kthread_create().
  * @k: thread created by kthread_create().
 *
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 9af56723c096..f5dcd36d3151 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -49,7 +49,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/events/lockdep.h>
+#include <trace/events/lock.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 61d5aa5eced3..acd24e7643eb 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -558,7 +558,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
 
 static ATOMIC_NOTIFIER_HEAD(die_chain);
 
-int notrace notify_die(enum die_val val, const char *str,
+int notrace __kprobes notify_die(enum die_val val, const char *str,
 	       struct pt_regs *regs, long err, int trap, int sig)
 {
 	struct die_args args = {
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb2..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,15 +218,11 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 		return -ENOSPC;
 	}
 
-	if (kp->flags & KPARAM_KMALLOCED)
-		kfree(*(char **)kp->arg);
-
 	/* This is a hack.  We can't need to strdup in early boot, and we
 	 * don't need to; this mangled commandline is preserved. */
 	if (slab_is_available()) {
-		kp->flags |= KPARAM_KMALLOCED;
 		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
-		if (!kp->arg)
+		if (!*(char **)kp->arg)
 			return -ENOMEM;
 	} else
 		*(const char **)kp->arg = val;
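[Editor's note: the one-character fix above is easy to miss: the old code tested kp->arg, the address of the char * slot, which is never NULL, instead of the pointer kstrdup() just stored there. A userspace reduction of the bug; my set_charp() and its names are invented, and strdup stands in for kstrdup:]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int set_charp(const char *val, void *arg)
{
	*(char **)arg = strdup(val);	/* stands in for kstrdup() */

	/* Buggy form: `if (!arg)` tests the slot's address, which can
	 * never be NULL, so an allocation failure went unnoticed.
	 * Correct form, as in the patch: */
	if (!*(char **)arg)
		return -1;	/* -ENOMEM in the kernel */
	return 0;
}

int main(void)
{
	char *param = NULL;

	if (set_charp("value", &param))
		return 1;
	printf("param = %s\n", param);
	free(param);
	return 0;
}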
@@ -304,6 +300,7 @@ static int param_array(const char *name,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
 		       int (*set)(const char *, struct kernel_param *kp),
+		       u16 flags,
 		       unsigned int *num)
 {
 	int ret;
@@ -313,6 +310,7 @@ static int param_array(const char *name,
 	/* Get the name right for errors. */
 	kp.name = name;
 	kp.arg = elem;
+	kp.flags = flags;
 
 	/* No equals sign? */
 	if (!val) {
@@ -358,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
 	unsigned int temp_num;
 
 	return param_array(kp->name, val, 1, arr->max, arr->elem,
-			   arr->elemsize, arr->set, arr->num ?: &temp_num);
+			   arr->elemsize, arr->set, kp->flags,
+			   arr->num ?: &temp_num);
 }
 
 int param_array_get(char *buffer, struct kernel_param *kp)
@@ -605,11 +604,7 @@ void module_param_sysfs_remove(struct module *mod)
 
 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-	unsigned int i;
-
-	for (i = 0; i < num; i++)
-		if (params[i].flags & KPARAM_KMALLOCED)
-			kfree(*(char **)params[i].arg);
+	/* FIXME: This should free kmalloced charp parameters.  It doesn't. */
 }
 
 static void __init kernel_add_sysfs_param(const char *name,
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 98dc56b2ebe4..3852e2656bb0 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1357,7 +1357,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 	u64 interrupts, freq;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
@@ -2696,20 +2696,21 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 static void perf_output_lock(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int cpu;
+	int cur, cpu = get_cpu();
 
 	handle->locked = 0;
 
-	local_irq_save(handle->flags);
-	cpu = smp_processor_id();
-
-	if (in_nmi() && atomic_read(&data->lock) == cpu)
-		return;
+	for (;;) {
+		cur = atomic_cmpxchg(&data->lock, -1, cpu);
+		if (cur == -1) {
+			handle->locked = 1;
+			break;
+		}
+		if (cur == cpu)
+			break;
 
-	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 		cpu_relax();
-
-	handle->locked = 1;
+	}
 }
 
 static void perf_output_unlock(struct perf_output_handle *handle)
@@ -2755,7 +2756,7 @@ again:
 	if (atomic_xchg(&data->wakeup, 0))
 		perf_output_wakeup(handle);
 out:
-	local_irq_restore(handle->flags);
+	put_cpu();
 }
 
 void perf_output_copy(struct perf_output_handle *handle,
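[Editor's note: the reworked perf_output_lock() is a tiny recursion-aware spinlock: the lock word holds -1 when free or the owning CPU number, so a nested acquisition on the same CPU (for example from an NMI) falls through instead of deadlocking, and handle->locked records which level must unlock. A C11 userspace sketch of the same idea keyed on a caller-supplied id instead of a CPU number; all names here are invented:]

#include <stdatomic.h>
#include <stdbool.h>

static atomic_long owner = ATOMIC_VAR_INIT(-1);	/* -1 == free */

/* Returns true if this call actually took the lock and must release
 * it; false on a nested acquisition by the current owner. */
static bool recursive_lock(long self)
{
	for (;;) {
		long cur = -1;

		if (atomic_compare_exchange_strong(&owner, &cur, self))
			return true;	/* we took it */
		if (cur == self)
			return false;	/* already ours: nested, don't spin */
		/* owned by someone else: spin and retry */
	}
}

static void recursive_unlock(long self, bool locked)
{
	if (locked)	/* only the outermost acquisition releases */
		atomic_store(&owner, -1);
}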
@@ -3998,8 +3999,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs) {
-		if (perf_event_overflow(event, 0, &data, regs))
-			ret = HRTIMER_NORESTART;
+		if (!(event->attr.exclude_idle && current->pid == 0))
+			if (perf_event_overflow(event, 0, &data, regs))
+				ret = HRTIMER_NORESTART;
 	}
 
 	period = max_t(u64, 10000, event->hw.sample_period);
@@ -4008,6 +4010,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	return ret;
 }
 
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		u64 period;
+
+		if (hwc->remaining) {
+			if (hwc->remaining < 0)
+				period = 10000;
+			else
+				period = hwc->remaining;
+			hwc->remaining = 0;
+		} else {
+			period = max_t(u64, 10000, hwc->sample_period);
+		}
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->sample_period) {
+		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+		hwc->remaining = ktime_to_ns(remaining);
+
+		hrtimer_cancel(&hwc->hrtimer);
+	}
+}
+
 /*
  * Software event:	cpu wall time clock
 */
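[Editor's note: perf_swevent_cancel_hrtimer() stashes the timer's unexpired remainder in hwc->remaining so that perf_swevent_start_hrtimer() can re-arm with what was left rather than a full period; otherwise a counter that is rapidly disabled and re-enabled would keep restarting its period and might never fire. A userspace sketch of the save/restore idea using timerfd, assuming a sub-second period; error handling elided, names invented:]

#include <string.h>
#include <sys/timerfd.h>

static struct itimerspec saved;	/* analogue of hwc->remaining */

/* "Disable": record the unexpired time, then disarm the timer. */
static void timer_cancel_save(int tfd)
{
	struct itimerspec stop;

	timerfd_gettime(tfd, &saved);		/* it_value = time left */
	memset(&stop, 0, sizeof(stop));
	timerfd_settime(tfd, 0, &stop, NULL);	/* zero it_value disarms */
}

/* "Enable": re-arm with the remainder if any, else a full period. */
static void timer_start_restore(int tfd, long period_ns)
{
	struct itimerspec its;

	memset(&its, 0, sizeof(its));
	if (saved.it_value.tv_sec || saved.it_value.tv_nsec) {
		its.it_value = saved.it_value;	/* resume where we stopped */
		memset(&saved, 0, sizeof(saved));
	} else {
		its.it_value.tv_nsec = period_ns; /* assumes period < 1s */
	}
	timerfd_settime(tfd, 0, &its, NULL);
}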
@@ -4030,22 +4068,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
 	int cpu = raw_smp_processor_id();
 
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
+	perf_swevent_start_hrtimer(event);
 
 	return 0;
 }
 
 static void cpu_clock_perf_event_disable(struct perf_event *event)
 {
-	if (event->hw.sample_period)
-		hrtimer_cancel(&event->hw.hrtimer);
+	perf_swevent_cancel_hrtimer(event);
 	cpu_clock_perf_event_update(event);
 }
 
@@ -4082,22 +4112,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
 	now = event->ctx->time;
 
 	atomic64_set(&hwc->prev_count, now);
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
+
+	perf_swevent_start_hrtimer(event);
 
 	return 0;
 }
 
 static void task_clock_perf_event_disable(struct perf_event *event)
 {
-	if (event->hw.sample_period)
-		hrtimer_cancel(&event->hw.hrtimer);
+	perf_swevent_cancel_hrtimer(event);
 	task_clock_perf_event_update(event, event->ctx->time);
 
 }
@@ -4319,6 +4342,8 @@ static const struct pmu *sw_perf_event_init(struct perf_event *event)
 	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_SW_CONTEXT_SWITCHES:
 	case PERF_COUNT_SW_CPU_MIGRATIONS:
+	case PERF_COUNT_SW_ALIGNMENT_FAULTS:
+	case PERF_COUNT_SW_EMULATION_FAULTS:
 		if (!event->parent) {
 			atomic_inc(&perf_swevent_enabled[event_id]);
 			event->destroy = sw_perf_event_destroy;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
 	/* The snapshot device should not be opened while we're running */
 	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
 		error = -EBUSY;
+		swsusp_close(FMODE_READ);
 		goto Unlock;
 	}
 
 	pm_prepare_console();
 	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	error = usermodehelper_disable();
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	error = create_basic_memory_bitmaps();
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	pr_debug("PM: Preparing processes for restore.\n");
 	error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
 	pr_debug("PM: Reading hibernation image.\n");
 
 	error = swsusp_read(&flags);
+	swsusp_close(FMODE_READ);
 	if (!error)
 		hibernation_restore(flags & SF_PLATFORM_MODE);
 
@@ -737,6 +739,9 @@ static int software_resume(void)
 	mutex_unlock(&pm_mutex);
 	pr_debug("PM: Resume from disk failed.\n");
 	return error;
+close_finish:
+	swsusp_close(FMODE_READ);
+	goto Finish;
 }
 
 late_initcall(software_resume);
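[Editor's note: the fix above plugs a missing swsusp_close() on every early-exit path by funnelling them through one close_finish label, the usual kernel goto-cleanup idiom. A self-contained userspace illustration of that idiom; the resource names are invented:]

#include <stdio.h>
#include <stdlib.h>

static int do_resume(const char *path)
{
	int error = 0;
	char *bitmap;
	FILE *dev = fopen(path, "rb");	/* analogue of the opened device */

	if (!dev)
		return -1;

	bitmap = malloc(4096);	/* analogue of create_basic_memory_bitmaps() */
	if (!bitmap) {
		error = -1;
		goto close_finish;	/* must still release `dev` */
	}

	/* ... read the image; success and failure both fall through ... */

	free(bitmap);
close_finish:
	fclose(dev);	/* one place closes the device on every path */
	return error;
}

int main(void)
{
	return do_resume("/dev/null") ? 1 : 0;
}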
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
  * The time it takes is system-specific though, so when we test this
  * during system bootup we allow a LOT of time.
  */
-#define TEST_SUSPEND_SECONDS	5
+#define TEST_SUSPEND_SECONDS	10
 
 static unsigned long suspend_test_start_time;
 
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
 	 * has some performance issues.  The stack dump of a WARN_ON
 	 * is more likely to get the right attention than a printk...
 	 */
-	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+	WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+	     "Component: %s, time: %u\n", label, msec);
 }
 
 /*
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3f..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
 {
 	unsigned int m;
 	int ret;
-	int error = 0;
 	int nr_pages;
 	int err2;
 	struct bio *bio;
@@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
 	nr_pages = 0;
 	bio = NULL;
 	do_gettimeofday(&start);
-	do {
+	while (1) {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot),
-						&bio);
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
+		if (ret <= 0)
+			break;
+		ret = swap_write_page(handle, data_of(*snapshot), &bio);
+		if (ret)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
 	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
-	if (!error)
-		error = err2;
-	if (!error)
+	if (!ret)
+		ret = err2;
+	if (!ret)
 		printk("\b\b\b\bdone\n");
+	else
+		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-	return error;
+	return ret;
 }
 
 /**
@@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
-	}
+	} else
+		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
@@ -572,8 +573,6 @@ int swsusp_read(unsigned int *flags_p)
 	error = load_image(&handle, &snapshot, header->pages - 1);
 	release_swap_reader(&handle);
 
-	blkdev_put(resume_bdev, FMODE_READ);
-
 	if (!error)
 		pr_debug("PM: Image successfully loaded\n");
 	else
@@ -596,7 +595,7 @@ int swsusp_check(void)
 		error = bio_read_page(swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
-			return error;
+			goto put;
 
 		if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +603,10 @@ int swsusp_check(void)
 			error = bio_write_page(swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
-			return -EINVAL;
+			error = -EINVAL;
 		}
+
+put:
 		if (error)
 			blkdev_put(resume_bdev, FMODE_READ);
 		else
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac7433..f3077c0ab181 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -59,7 +59,7 @@
 	NUM_RCU_LVL_2, \
 	NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
 }, \
-	.signaled = RCU_SIGNAL_INIT, \
+	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
@@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	 * irqs disabled.
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		spin_lock(&rnp->lock);	/* irqs already disabled. */
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
-		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	}
 
+	rnp = rcu_get_root(rsp);
+	spin_lock(&rnp->lock);		/* irqs already disabled. */
 	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+	spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
@@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	rsp->completed = rsp->gpnum;
+	rsp->signaled = RCU_GP_IDLE;
 	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 	rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
 }
@@ -913,7 +917,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
-		rcu_preempt_offline_tasks(rsp, rnp, rdp);
+
+		/*
+		 * If there was a task blocking the current grace period,
+		 * and if all CPUs have checked in, we need to propagate
+		 * the quiescent state up the rcu_node hierarchy.  But that
+		 * is inconvenient at the moment due to deadlock issues if
+		 * this should end the current grace period.  So set the
+		 * offlined CPU's bit in ->qsmask in order to force the
+		 * next force_quiescent_state() invocation to clean up this
+		 * mess in a deadlock-free manner.
+		 */
+		if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+			rnp->qsmask |= mask;
+
 		mask = rnp->grpmask;
 		spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		rnp = rnp->parent;
@@ -958,7 +975,7 @@ static void rcu_offline_cpu(int cpu)
 * Invoke any RCU callbacks that have made it to the end of their grace
 * period.  Thottle as specified by rdp->blimit.
 */
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
@@ -1011,6 +1028,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
 		rdp->blimit = blimit;
 
+	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+	if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+		rdp->qlen_last_fqs_check = 0;
+		rdp->n_force_qs_snap = rsp->n_force_qs;
+	} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+		rdp->qlen_last_fqs_check = rdp->qlen;
+
 	local_irq_restore(flags);
 
 	/* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1142,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	}
 	spin_unlock(&rnp->lock);
 	switch (signaled) {
+	case RCU_GP_IDLE:
 	case RCU_GP_INIT:
 
-		break; /* grace period still initializing, ignore. */
+		break; /* grace period idle or initializing, ignore. */
 
 	case RCU_SAVE_DYNTICK:
 
@@ -1158,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 
 		/* Update state, record completion counter. */
 		spin_lock(&rnp->lock);
-		if (lastcomp == rsp->completed) {
+		if (lastcomp == rsp->completed &&
+		    rsp->signaled == RCU_SAVE_DYNTICK) {
 			rsp->signaled = RCU_FORCE_QS;
 			dyntick_record_completed(rsp, lastcomp);
 		}
@@ -1224,7 +1250,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* If there are callbacks ready, invoke them. */
-	rcu_do_batch(rdp);
+	rcu_do_batch(rsp, rdp);
 }
 
 /*
@@ -1288,10 +1314,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
 	}
 
-	/* Force the grace period if too many callbacks or too long waiting. */
-	if (unlikely(++rdp->qlen > qhimark)) {
+	/*
+	 * Force the grace period if too many callbacks or too long waiting.
+	 * Enforce hysteresis, and don't invoke force_quiescent_state()
+	 * if some other CPU has recently done so.  Also, don't bother
+	 * invoking force_quiescent_state() if the newly enqueued callback
+	 * is the only one waiting for a grace period to complete.
+	 */
+	if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
 		rdp->blimit = LONG_MAX;
-		force_quiescent_state(rsp, 0);
+		if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+		    *rdp->nxttail[RCU_DONE_TAIL] != head)
+			force_quiescent_state(rsp, 0);
+		rdp->n_force_qs_snap = rsp->n_force_qs;
+		rdp->qlen_last_fqs_check = rdp->qlen;
 	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
 		force_quiescent_state(rsp, 1);
 	local_irq_restore(flags);
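[Editor's note: the new __call_rcu() test compares the queue length against its value at the last forcing check plus qhimark, and skips the call when another CPU has bumped rsp->n_force_qs since our snapshot; that hysteresis keeps every enqueue past the threshold from hammering force_quiescent_state(). The trigger logic in isolation, as a sketch with plain scalars standing in for the rcu_data/rcu_state fields:]

/* Stand-ins for the rcu_data / rcu_state fields used by the check. */
static long qlen;			/* rdp->qlen */
static long qlen_last_fqs_check;	/* rdp->qlen_last_fqs_check */
static unsigned long n_force_qs;	/* rsp->n_force_qs (global) */
static unsigned long n_force_qs_snap;	/* rdp->n_force_qs_snap */

#define QHIMARK 10000

static void force_quiescent_state(void)
{
	n_force_qs++;	/* visible to every CPU's next snapshot check */
}

static void on_enqueue(int other_callbacks_queued)
{
	if (++qlen > qlen_last_fqs_check + QHIMARK) {
		/* Only force if nobody has since our last snapshot and
		 * this callback is not the lone one waiting. */
		if (n_force_qs == n_force_qs_snap && other_callbacks_queued)
			force_quiescent_state();
		/* Re-arm the hysteresis window either way. */
		n_force_qs_snap = n_force_qs;
		qlen_last_fqs_check = qlen;
	}
}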
@@ -1523,6 +1559,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptable = preemptable;
 	rdp->passed_quiesc_completed = lastcomp - 1;
+	rdp->qlen_last_fqs_check = 0;
+	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
 	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac5706040..1899023b0962 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail[RCU_NEXT_SIZE];
 	long		qlen;		/* # of queued callbacks */
+	long		qlen_last_fqs_check;
+					/* qlen at last check for QS forcing */
+	unsigned long	n_force_qs_snap;
+					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
 
 #ifdef CONFIG_NO_HZ
@@ -197,9 +201,10 @@ struct rcu_data {
 };
 
 /* Values for signaled field in struct rcu_state. */
-#define RCU_GP_INIT		0	/* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK	1	/* Need to scan dyntick state. */
-#define RCU_FORCE_QS		2	/* Need to force quiescent state. */
+#define RCU_GP_IDLE		0	/* No grace period in progress. */
+#define RCU_GP_INIT		1	/* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK	2	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		3	/* Need to force quiescent state. */
 #ifdef CONFIG_NO_HZ
 #define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
 #else /* #ifdef CONFIG_NO_HZ */
@@ -302,9 +307,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp);
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp);
 static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16a..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 * parent is to remove the need for rcu_read_unlock_special() to
 * make more than two attempts to acquire the target rcu_node's lock.
 *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
 * The caller must hold rnp->lock with irqs disabled.
 */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp)
 {
 	int i;
 	struct list_head *lp;
 	struct list_head *lp_root;
+	int retval = rcu_preempted_readers(rnp);
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
 
 	if (rnp == rnp_root) {
 		WARN_ONCE(1, "Last CPU thought to be offlined?");
-		return;  /* Shouldn't happen: at least one CPU online. */
+		return 0;  /* Shouldn't happen: at least one CPU online. */
 	}
 	WARN_ON_ONCE(rnp != rdp->mynode &&
 		     (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
 			spin_unlock(&rnp_root->lock); /* irqs remain disabled */
 		}
 	}
+
+	return retval;
 }
 
 /*
@@ -393,6 +399,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 
 /*
+ * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * grace period, but this is the crude slow compatability hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
 * Check to see if there is any immediate preemptable-RCU-related work
 * to be done.
 */
@@ -521,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
 /*
 * Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
 */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp)
 {
+	return 0;
 }
 
 /*
@@ -565,6 +585,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 EXPORT_SYMBOL_GPL(call_rcu);
 
 /*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+	synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
+/*
 * Because preemptable RCU does not exist, it never has any work to do.
 */
 static int rcu_preempt_pending(int cpu)
diff --git a/kernel/sched.c b/kernel/sched.c index e88689522e66..3c11ae0a948d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq); | |||
309 | */ | 309 | */ |
310 | static DEFINE_SPINLOCK(task_group_lock); | 310 | static DEFINE_SPINLOCK(task_group_lock); |
311 | 311 | ||
312 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
313 | |||
312 | #ifdef CONFIG_SMP | 314 | #ifdef CONFIG_SMP |
313 | static int root_task_group_empty(void) | 315 | static int root_task_group_empty(void) |
314 | { | 316 | { |
@@ -316,7 +318,6 @@ static int root_task_group_empty(void) | |||
316 | } | 318 | } |
317 | #endif | 319 | #endif |
318 | 320 | ||
319 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
320 | #ifdef CONFIG_USER_SCHED | 321 | #ifdef CONFIG_USER_SCHED |
321 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) | 322 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
322 | #else /* !CONFIG_USER_SCHED */ | 323 | #else /* !CONFIG_USER_SCHED */ |
@@ -1564,11 +1565,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1564 | 1565 | ||
1565 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1566 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1566 | 1567 | ||
1567 | struct update_shares_data { | 1568 | static __read_mostly unsigned long *update_shares_data; |
1568 | unsigned long rq_weight[NR_CPUS]; | ||
1569 | }; | ||
1570 | |||
1571 | static DEFINE_PER_CPU(struct update_shares_data, update_shares_data); | ||
1572 | 1569 | ||
1573 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | 1570 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); |
1574 | 1571 | ||
@@ -1578,12 +1575,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares); | |||
1578 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | 1575 | static void update_group_shares_cpu(struct task_group *tg, int cpu, |
1579 | unsigned long sd_shares, | 1576 | unsigned long sd_shares, |
1580 | unsigned long sd_rq_weight, | 1577 | unsigned long sd_rq_weight, |
1581 | struct update_shares_data *usd) | 1578 | unsigned long *usd_rq_weight) |
1582 | { | 1579 | { |
1583 | unsigned long shares, rq_weight; | 1580 | unsigned long shares, rq_weight; |
1584 | int boost = 0; | 1581 | int boost = 0; |
1585 | 1582 | ||
1586 | rq_weight = usd->rq_weight[cpu]; | 1583 | rq_weight = usd_rq_weight[cpu]; |
1587 | if (!rq_weight) { | 1584 | if (!rq_weight) { |
1588 | boost = 1; | 1585 | boost = 1; |
1589 | rq_weight = NICE_0_LOAD; | 1586 | rq_weight = NICE_0_LOAD; |
@@ -1618,7 +1615,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu, | |||
1618 | static int tg_shares_up(struct task_group *tg, void *data) | 1615 | static int tg_shares_up(struct task_group *tg, void *data) |
1619 | { | 1616 | { |
1620 | unsigned long weight, rq_weight = 0, shares = 0; | 1617 | unsigned long weight, rq_weight = 0, shares = 0; |
1621 | struct update_shares_data *usd; | 1618 | unsigned long *usd_rq_weight; |
1622 | struct sched_domain *sd = data; | 1619 | struct sched_domain *sd = data; |
1623 | unsigned long flags; | 1620 | unsigned long flags; |
1624 | int i; | 1621 | int i; |
@@ -1627,11 +1624,11 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1627 | return 0; | 1624 | return 0; |
1628 | 1625 | ||
1629 | local_irq_save(flags); | 1626 | local_irq_save(flags); |
1630 | usd = &__get_cpu_var(update_shares_data); | 1627 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); |
1631 | 1628 | ||
1632 | for_each_cpu(i, sched_domain_span(sd)) { | 1629 | for_each_cpu(i, sched_domain_span(sd)) { |
1633 | weight = tg->cfs_rq[i]->load.weight; | 1630 | weight = tg->cfs_rq[i]->load.weight; |
1634 | usd->rq_weight[i] = weight; | 1631 | usd_rq_weight[i] = weight; |
1635 | 1632 | ||
1636 | /* | 1633 | /* |
1637 | * If there are currently no tasks on the cpu pretend there | 1634 | * If there are currently no tasks on the cpu pretend there |
@@ -1652,7 +1649,7 @@ static int tg_shares_up(struct task_group *tg, void *data) | |||
1652 | shares = tg->shares; | 1649 | shares = tg->shares; |
1653 | 1650 | ||
1654 | for_each_cpu(i, sched_domain_span(sd)) | 1651 | for_each_cpu(i, sched_domain_span(sd)) |
1655 | update_group_shares_cpu(tg, i, shares, rq_weight, usd); | 1652 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); |
1656 | 1653 | ||
1657 | local_irq_restore(flags); | 1654 | local_irq_restore(flags); |
1658 | 1655 | ||
@@ -1996,6 +1993,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1996 | p->sched_class->prio_changed(rq, p, oldprio, running); | 1993 | p->sched_class->prio_changed(rq, p, oldprio, running); |
1997 | } | 1994 | } |
1998 | 1995 | ||
1996 | /** | ||
1997 | * kthread_bind - bind a just-created kthread to a cpu. | ||
1998 | * @p: thread created by kthread_create(). | ||
1999 | * @cpu: cpu (might not be online, must be possible) for @k to run on. | ||
2000 | * | ||
2001 | * Description: This function is equivalent to set_cpus_allowed(), | ||
2002 | * except that @cpu doesn't need to be online, and the thread must be | ||
2003 | * stopped (i.e., just returned from kthread_create()). | ||
2004 | * | ||
2005 | * Function lives here instead of kthread.c because it messes with | ||
2006 | * scheduler internals which require locking. | ||
2007 | */ | ||
2008 | void kthread_bind(struct task_struct *p, unsigned int cpu) | ||
2009 | { | ||
2010 | struct rq *rq = cpu_rq(cpu); | ||
2011 | unsigned long flags; | ||
2012 | |||
2013 | /* Must have done schedule() in kthread() before we set_task_cpu */ | ||
2014 | if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { | ||
2015 | WARN_ON(1); | ||
2016 | return; | ||
2017 | } | ||
2018 | |||
2019 | spin_lock_irqsave(&rq->lock, flags); | ||
2020 | set_task_cpu(p, cpu); | ||
2021 | p->cpus_allowed = cpumask_of_cpu(cpu); | ||
2022 | p->rt.nr_cpus_allowed = 1; | ||
2023 | p->flags |= PF_THREAD_BOUND; | ||
2024 | spin_unlock_irqrestore(&rq->lock, flags); | ||
2025 | } | ||
2026 | EXPORT_SYMBOL(kthread_bind); | ||
2027 | |||
1999 | #ifdef CONFIG_SMP | 2028 | #ifdef CONFIG_SMP |
2000 | /* | 2029 | /* |
2001 | * Is this task likely cache-hot: | 2030 | * Is this task likely cache-hot: |
@@ -2008,7 +2037,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2008 | /* | 2037 | /* |
2009 | * Buddy candidates are cache hot: | 2038 | * Buddy candidates are cache hot: |
2010 | */ | 2039 | */ |
2011 | if (sched_feat(CACHE_HOT_BUDDY) && | 2040 | if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running && |
2012 | (&p->se == cfs_rq_of(&p->se)->next || | 2041 | (&p->se == cfs_rq_of(&p->se)->next || |
2013 | &p->se == cfs_rq_of(&p->se)->last)) | 2042 | &p->se == cfs_rq_of(&p->se)->last)) |
2014 | return 1; | 2043 | return 1; |
@@ -9407,6 +9436,10 @@ void __init sched_init(void) | |||
9407 | #endif /* CONFIG_USER_SCHED */ | 9436 | #endif /* CONFIG_USER_SCHED */ |
9408 | #endif /* CONFIG_GROUP_SCHED */ | 9437 | #endif /* CONFIG_GROUP_SCHED */ |
9409 | 9438 | ||
9439 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
9440 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
9441 | __alignof__(unsigned long)); | ||
9442 | #endif | ||
9410 | for_each_possible_cpu(i) { | 9443 | for_each_possible_cpu(i) { |
9411 | struct rq *rq; | 9444 | struct rq *rq; |
9412 | 9445 | ||
@@ -9532,13 +9565,13 @@ void __init sched_init(void) | |||
9532 | current->sched_class = &fair_sched_class; | 9565 | current->sched_class = &fair_sched_class; |
9533 | 9566 | ||
9534 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ | 9567 | /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ |
9535 | alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); | 9568 | zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); |
9536 | #ifdef CONFIG_SMP | 9569 | #ifdef CONFIG_SMP |
9537 | #ifdef CONFIG_NO_HZ | 9570 | #ifdef CONFIG_NO_HZ |
9538 | alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); | 9571 | zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); |
9539 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); | 9572 | alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); |
9540 | #endif | 9573 | #endif |
9541 | alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 9574 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
9542 | #endif /* SMP */ | 9575 | #endif /* SMP */ |
9543 | 9576 | ||
9544 | perf_event_init(); | 9577 | perf_event_init(); |
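The kthread_bind() helper added to sched.c above is meant to be called between kthread_create() and the first wake-up, while the thread is still inactive. A minimal sketch of the intended call sequence (thread_fn, start_bound_worker and the sleep interval are illustrative names, not part of this patch):

	#include <linux/kthread.h>
	#include <linux/sched.h>
	#include <linux/err.h>

	static int thread_fn(void *data)
	{
		/* idle on the bound CPU until asked to stop */
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);
		return 0;
	}

	static struct task_struct *start_bound_worker(unsigned int target_cpu)
	{
		struct task_struct *t;

		t = kthread_create(thread_fn, NULL, "bound/%u", target_cpu);
		if (IS_ERR(t))
			return t;
		/* bind before the first wake-up: the thread has never run yet */
		kthread_bind(t, target_cpu);
		wake_up_process(t);
		return t;
	}

Because the thread has not yet been woken, wait_task_inactive() in kthread_bind() succeeds and the cpus_allowed mask can be rewritten without a migration.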
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 4e777b47eeda..37087a7fac22 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
822 | * re-elected due to buddy favours. | 822 | * re-elected due to buddy favours. |
823 | */ | 823 | */ |
824 | clear_buddies(cfs_rq, curr); | 824 | clear_buddies(cfs_rq, curr); |
825 | return; | ||
826 | } | ||
827 | |||
828 | /* | ||
829 | * Ensure that a task that missed wakeup preemption by a | ||
830 | * narrow margin doesn't have to wait for a full slice. | ||
831 | * This also mitigates buddy-induced latencies under load. | ||
832 | */ | ||
833 | if (!sched_feat(WAKEUP_PREEMPT)) | ||
834 | return; | ||
835 | |||
836 | if (delta_exec < sysctl_sched_min_granularity) | ||
837 | return; | ||
838 | |||
839 | if (cfs_rq->nr_running > 1) { | ||
840 | struct sched_entity *se = __pick_next_entity(cfs_rq); | ||
841 | s64 delta = curr->vruntime - se->vruntime; | ||
842 | |||
843 | if (delta > ideal_runtime) | ||
844 | resched_task(rq_of(cfs_rq)->curr); | ||
825 | } | 845 | } |
826 | } | 846 | } |
827 | 847 | ||
@@ -861,12 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se); | |||
861 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) | 881 | static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) |
862 | { | 882 | { |
863 | struct sched_entity *se = __pick_next_entity(cfs_rq); | 883 | struct sched_entity *se = __pick_next_entity(cfs_rq); |
884 | struct sched_entity *left = se; | ||
864 | 885 | ||
865 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1) | 886 | if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) |
866 | return cfs_rq->next; | 887 | se = cfs_rq->next; |
867 | 888 | ||
868 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1) | 889 | /* |
869 | return cfs_rq->last; | 890 | * Prefer last buddy, try to return the CPU to a preempted task. |
891 | */ | ||
892 | if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) | ||
893 | se = cfs_rq->last; | ||
894 | |||
895 | clear_buddies(cfs_rq, se); | ||
870 | 896 | ||
871 | return se; | 897 | return se; |
872 | } | 898 | } |
@@ -1568,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1568 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1594 | struct sched_entity *se = &curr->se, *pse = &p->se; |
1569 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1595 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
1570 | int sync = wake_flags & WF_SYNC; | 1596 | int sync = wake_flags & WF_SYNC; |
1597 | int scale = cfs_rq->nr_running >= sched_nr_latency; | ||
1571 | 1598 | ||
1572 | update_curr(cfs_rq); | 1599 | update_curr(cfs_rq); |
1573 | 1600 | ||
@@ -1582,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1582 | if (unlikely(se == pse)) | 1609 | if (unlikely(se == pse)) |
1583 | return; | 1610 | return; |
1584 | 1611 | ||
1585 | /* | 1612 | if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) |
1586 | * Only set the backward buddy when the current task is still on the | ||
1587 | * rq. This can happen when a wakeup gets interleaved with schedule on | ||
1588 | * the ->pre_schedule() or idle_balance() point, either of which can | ||
1589 | * drop the rq lock. | ||
1590 | * | ||
1591 | * Also, during early boot the idle thread is in the fair class, for | ||
1592 | * obvious reasons its a bad idea to schedule back to the idle thread. | ||
1593 | */ | ||
1594 | if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle)) | ||
1595 | set_last_buddy(se); | ||
1596 | if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) | ||
1597 | set_next_buddy(pse); | 1613 | set_next_buddy(pse); |
1598 | 1614 | ||
1599 | /* | 1615 | /* |
@@ -1639,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ | |||
1639 | 1655 | ||
1640 | BUG_ON(!pse); | 1656 | BUG_ON(!pse); |
1641 | 1657 | ||
1642 | if (wakeup_preempt_entity(se, pse) == 1) | 1658 | if (wakeup_preempt_entity(se, pse) == 1) { |
1643 | resched_task(curr); | 1659 | resched_task(curr); |
1660 | /* | ||
1661 | * Only set the backward buddy when the current task is still | ||
1662 | * on the rq. This can happen when a wakeup gets interleaved | ||
1663 | * with schedule on the ->pre_schedule() or idle_balance() | ||
1664 | * point, either of which can drop the rq lock. | ||
1665 | * | ||
1666 | * Also, during early boot the idle thread is in the fair class, | ||
1667 | * for obvious reasons its a bad idea to schedule back to it. | ||
1668 | */ | ||
1669 | if (unlikely(!se->on_rq || curr == rq->idle)) | ||
1670 | return; | ||
1671 | if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se)) | ||
1672 | set_last_buddy(se); | ||
1673 | } | ||
1644 | } | 1674 | } |
1645 | 1675 | ||
1646 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1676 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
@@ -1654,16 +1684,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
1654 | 1684 | ||
1655 | do { | 1685 | do { |
1656 | se = pick_next_entity(cfs_rq); | 1686 | se = pick_next_entity(cfs_rq); |
1657 | /* | ||
1658 | * If se was a buddy, clear it so that it will have to earn | ||
1659 | * the favour again. | ||
1660 | * | ||
1661 | * If se was not a buddy, clear the buddies because neither | ||
1662 | * was elegible to run, let them earn it again. | ||
1663 | * | ||
1664 | * IOW. unconditionally clear buddies. | ||
1665 | */ | ||
1666 | __clear_buddies(cfs_rq, NULL); | ||
1667 | set_next_entity(cfs_rq, se); | 1687 | set_next_entity(cfs_rq, se); |
1668 | cfs_rq = group_cfs_rq(se); | 1688 | cfs_rq = group_cfs_rq(se); |
1669 | } while (cfs_rq); | 1689 | } while (cfs_rq); |
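The reworked pick_next_entity() only honours a buddy when doing so cannot starve the leftmost entity: wakeup_preempt_entity(buddy, left) < 1 means the leftmost task would not itself be entitled to wakeup-preempt that buddy. Restating the selection order above with explanatory comments (same logic as the hunk, not new code):

	struct sched_entity *left = __pick_next_entity(cfs_rq);	/* fairest choice */
	struct sched_entity *se = left;

	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
		se = cfs_rq->next;	/* wakeup (next) buddy, if not unfair to left */

	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
		se = cfs_rq->last;	/* last buddy wins: return the CPU to the preemptee */

	clear_buddies(cfs_rq, se);	/* a buddy must earn the favour again */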
diff --git a/kernel/sys.c b/kernel/sys.c index 255475d163e0..ce17760d9c51 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid) | |||
1110 | err = session; | 1110 | err = session; |
1111 | out: | 1111 | out: |
1112 | write_unlock_irq(&tasklist_lock); | 1112 | write_unlock_irq(&tasklist_lock); |
1113 | if (err > 0) | ||
1114 | proc_sid_connector(group_leader); | ||
1113 | return err; | 1115 | return err; |
1114 | } | 1116 | } |
1115 | 1117 | ||
@@ -1546,24 +1548,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, | |||
1546 | if (arg4 | arg5) | 1548 | if (arg4 | arg5) |
1547 | return -EINVAL; | 1549 | return -EINVAL; |
1548 | switch (arg2) { | 1550 | switch (arg2) { |
1549 | case 0: | 1551 | case PR_MCE_KILL_CLEAR: |
1550 | if (arg3 != 0) | 1552 | if (arg3 != 0) |
1551 | return -EINVAL; | 1553 | return -EINVAL; |
1552 | current->flags &= ~PF_MCE_PROCESS; | 1554 | current->flags &= ~PF_MCE_PROCESS; |
1553 | break; | 1555 | break; |
1554 | case 1: | 1556 | case PR_MCE_KILL_SET: |
1555 | current->flags |= PF_MCE_PROCESS; | 1557 | current->flags |= PF_MCE_PROCESS; |
1556 | if (arg3 != 0) | 1558 | if (arg3 == PR_MCE_KILL_EARLY) |
1557 | current->flags |= PF_MCE_EARLY; | 1559 | current->flags |= PF_MCE_EARLY; |
1558 | else | 1560 | else if (arg3 == PR_MCE_KILL_LATE) |
1559 | current->flags &= ~PF_MCE_EARLY; | 1561 | current->flags &= ~PF_MCE_EARLY; |
1562 | else if (arg3 == PR_MCE_KILL_DEFAULT) | ||
1563 | current->flags &= | ||
1564 | ~(PF_MCE_EARLY|PF_MCE_PROCESS); | ||
1565 | else | ||
1566 | return -EINVAL; | ||
1560 | break; | 1567 | break; |
1561 | default: | 1568 | default: |
1562 | return -EINVAL; | 1569 | return -EINVAL; |
1563 | } | 1570 | } |
1564 | error = 0; | 1571 | error = 0; |
1565 | break; | 1572 | break; |
1566 | 1573 | case PR_MCE_KILL_GET: | |
1574 | if (arg2 | arg3 | arg4 | arg5) | ||
1575 | return -EINVAL; | ||
1576 | if (current->flags & PF_MCE_PROCESS) | ||
1577 | error = (current->flags & PF_MCE_EARLY) ? | ||
1578 | PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE; | ||
1579 | else | ||
1580 | error = PR_MCE_KILL_DEFAULT; | ||
1581 | break; | ||
1567 | default: | 1582 | default: |
1568 | error = -EINVAL; | 1583 | error = -EINVAL; |
1569 | break; | 1584 | break; |
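With the raw 0/1 cases replaced by named constants, the userspace side of the interface reads naturally. A hedged sketch, assuming a libc/kernel that exposes the PR_MCE_KILL* definitions via <linux/prctl.h>:

	#include <stdio.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	int main(void)
	{
		/* opt this process in to early machine-check kills */
		if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
			perror("PR_MCE_KILL");

		/* the new PR_MCE_KILL_GET option reads the policy back */
		printf("mce kill policy: %d\n", prctl(PR_MCE_KILL_GET, 0, 0, 0, 0));
		return 0;
	}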
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index b38423ca711a..b6e7aaea4604 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) | |||
1521 | if (!table->ctl_name && table->strategy) | 1521 | if (!table->ctl_name && table->strategy) |
1522 | set_fail(&fail, table, "Strategy without ctl_name"); | 1522 | set_fail(&fail, table, "Strategy without ctl_name"); |
1523 | #endif | 1523 | #endif |
1524 | #ifdef CONFIG_PROC_FS | 1524 | #ifdef CONFIG_PROC_SYSCTL |
1525 | if (table->procname && !table->proc_handler) | 1525 | if (table->procname && !table->proc_handler) |
1526 | set_fail(&fail, table, "No proc_handler"); | 1526 | set_fail(&fail, table, "No proc_handler"); |
1527 | #endif | 1527 | #endif |
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 06c3d5be6759..d006554888dc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig | |||
@@ -449,6 +449,23 @@ config BLK_DEV_IO_TRACE | |||
449 | 449 | ||
450 | If unsure, say N. | 450 | If unsure, say N. |
451 | 451 | ||
452 | config KPROBE_EVENT | ||
453 | depends on KPROBES | ||
454 | depends on X86 | ||
455 | bool "Enable kprobes-based dynamic events" | ||
456 | select TRACING | ||
457 | default y | ||
458 | help | ||
459 | This allows the user to add tracing events (similar to tracepoints) on the fly | ||
460 | via the ftrace interface. See Documentation/trace/kprobetrace.txt | ||
461 | for more details. | ||
462 | |||
463 | Those events can be inserted wherever kprobes can probe, and record | ||
464 | various register and memory values. | ||
465 | |||
466 | This option is also required by the perf-probe subcommand of perf tools. If | ||
467 | you want to use perf tools, this option is strongly recommended. | ||
468 | |||
452 | config DYNAMIC_FTRACE | 469 | config DYNAMIC_FTRACE |
453 | bool "enable/disable ftrace tracepoints dynamically" | 470 | bool "enable/disable ftrace tracepoints dynamically" |
454 | depends on FUNCTION_TRACER | 471 | depends on FUNCTION_TRACER |
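The help text refers to the kprobe_events control file documented in Documentation/trace/kprobetrace.txt. A hedged userspace sketch of registering a probe through it (the path and probe spec are illustrative; the grammar is the one spelled out in create_trace_probe() further below):

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "w");

		if (!f) {
			perror("kprobe_events");
			return 1;
		}
		/* p[:[GRP/]EVENT] KSYM[+OFFS] [FETCHARGS] */
		fprintf(f, "p:myprobe do_sys_open $arg0 $arg1\n");
		return fclose(f) ? 1 : 0;
	}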
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 0f84c52e58fe..cd9ecd89ec77 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile | |||
@@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o | |||
53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o | 53 | obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o |
54 | obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o | 54 | obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o |
55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o | 55 | obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o |
56 | obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o | ||
56 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o | 57 | obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o |
57 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o | 58 | obj-$(CONFIG_EVENT_TRACING) += power-traces.o |
58 | 59 | ||
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index b10c0d90a6ff..7cb6f1922598 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c | |||
@@ -751,7 +751,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, | |||
751 | out: | 751 | out: |
752 | mutex_unlock(&ftrace_profile_lock); | 752 | mutex_unlock(&ftrace_profile_lock); |
753 | 753 | ||
754 | filp->f_pos += cnt; | 754 | *ppos += cnt; |
755 | 755 | ||
756 | return cnt; | 756 | return cnt; |
757 | } | 757 | } |
@@ -2199,15 +2199,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, | |||
2199 | ret = ftrace_process_regex(parser->buffer, | 2199 | ret = ftrace_process_regex(parser->buffer, |
2200 | parser->idx, enable); | 2200 | parser->idx, enable); |
2201 | if (ret) | 2201 | if (ret) |
2202 | goto out; | 2202 | goto out_unlock; |
2203 | 2203 | ||
2204 | trace_parser_clear(parser); | 2204 | trace_parser_clear(parser); |
2205 | } | 2205 | } |
2206 | 2206 | ||
2207 | ret = read; | 2207 | ret = read; |
2208 | 2208 | out_unlock: | |
2209 | mutex_unlock(&ftrace_regex_lock); | 2209 | mutex_unlock(&ftrace_regex_lock); |
2210 | out: | 2210 | |
2211 | return ret; | 2211 | return ret; |
2212 | } | 2212 | } |
2213 | 2213 | ||
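Several hunks in this series make the same conversion: a ->write() handler should advance the offset pointer it is handed rather than poking filp->f_pos directly, which is wrong for pwrite() and for in-kernel callers that supply their own loff_t. The resulting shape, sketched with illustrative names:

	static ssize_t demo_write(struct file *filp, const char __user *ubuf,
				  size_t cnt, loff_t *ppos)
	{
		/* ... copy in and consume cnt bytes from ubuf ... */

		*ppos += cnt;		/* advance the caller-supplied position */
		return cnt;		/* not: filp->f_pos += cnt */
	}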
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e43c928356ee..db223fe8887f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c | |||
@@ -486,7 +486,7 @@ struct ring_buffer_iter { | |||
486 | /* Up this if you want to test the TIME_EXTENTS and normalization */ | 486 | /* Up this if you want to test the TIME_EXTENTS and normalization */ |
487 | #define DEBUG_SHIFT 0 | 487 | #define DEBUG_SHIFT 0 |
488 | 488 | ||
489 | static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) | 489 | static inline u64 rb_time_stamp(struct ring_buffer *buffer) |
490 | { | 490 | { |
491 | /* shift to debug/test normalization and TIME_EXTENTS */ | 491 | /* shift to debug/test normalization and TIME_EXTENTS */ |
492 | return buffer->clock() << DEBUG_SHIFT; | 492 | return buffer->clock() << DEBUG_SHIFT; |
@@ -497,7 +497,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) | |||
497 | u64 time; | 497 | u64 time; |
498 | 498 | ||
499 | preempt_disable_notrace(); | 499 | preempt_disable_notrace(); |
500 | time = rb_time_stamp(buffer, cpu); | 500 | time = rb_time_stamp(buffer); |
501 | preempt_enable_no_resched_notrace(); | 501 | preempt_enable_no_resched_notrace(); |
502 | 502 | ||
503 | return time; | 503 | return time; |
@@ -602,7 +602,7 @@ static struct list_head *rb_list_head(struct list_head *list) | |||
602 | } | 602 | } |
603 | 603 | ||
604 | /* | 604 | /* |
605 | * rb_is_head_page - test if the give page is the head page | 605 | * rb_is_head_page - test if the given page is the head page |
606 | * | 606 | * |
607 | * Because the reader may move the head_page pointer, we can | 607 | * Because the reader may move the head_page pointer, we can |
608 | * not trust what the head page is (it may be pointing to | 608 | * not trust what the head page is (it may be pointing to |
@@ -1196,6 +1196,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1196 | atomic_inc(&cpu_buffer->record_disabled); | 1196 | atomic_inc(&cpu_buffer->record_disabled); |
1197 | synchronize_sched(); | 1197 | synchronize_sched(); |
1198 | 1198 | ||
1199 | spin_lock_irq(&cpu_buffer->reader_lock); | ||
1199 | rb_head_page_deactivate(cpu_buffer); | 1200 | rb_head_page_deactivate(cpu_buffer); |
1200 | 1201 | ||
1201 | for (i = 0; i < nr_pages; i++) { | 1202 | for (i = 0; i < nr_pages; i++) { |
@@ -1210,6 +1211,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages) | |||
1210 | return; | 1211 | return; |
1211 | 1212 | ||
1212 | rb_reset_cpu(cpu_buffer); | 1213 | rb_reset_cpu(cpu_buffer); |
1214 | spin_unlock_irq(&cpu_buffer->reader_lock); | ||
1213 | 1215 | ||
1214 | rb_check_pages(cpu_buffer); | 1216 | rb_check_pages(cpu_buffer); |
1215 | 1217 | ||
@@ -1871,7 +1873,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, | |||
1871 | * Nested commits always have zero deltas, so | 1873 | * Nested commits always have zero deltas, so |
1872 | * just reread the time stamp | 1874 | * just reread the time stamp |
1873 | */ | 1875 | */ |
1874 | *ts = rb_time_stamp(buffer, cpu_buffer->cpu); | 1876 | *ts = rb_time_stamp(buffer); |
1875 | next_page->page->time_stamp = *ts; | 1877 | next_page->page->time_stamp = *ts; |
1876 | } | 1878 | } |
1877 | 1879 | ||
@@ -2114,7 +2116,7 @@ rb_reserve_next_event(struct ring_buffer *buffer, | |||
2114 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) | 2116 | if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) |
2115 | goto out_fail; | 2117 | goto out_fail; |
2116 | 2118 | ||
2117 | ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); | 2119 | ts = rb_time_stamp(cpu_buffer->buffer); |
2118 | 2120 | ||
2119 | /* | 2121 | /* |
2120 | * Only the first commit can update the timestamp. | 2122 | * Only the first commit can update the timestamp. |
@@ -2684,7 +2686,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) | |||
2684 | EXPORT_SYMBOL_GPL(ring_buffer_entries); | 2686 | EXPORT_SYMBOL_GPL(ring_buffer_entries); |
2685 | 2687 | ||
2686 | /** | 2688 | /** |
2687 | * ring_buffer_overrun_cpu - get the number of overruns in buffer | 2689 | * ring_buffer_overruns - get the number of overruns in buffer |
2688 | * @buffer: The ring buffer | 2690 | * @buffer: The ring buffer |
2689 | * | 2691 | * |
2690 | * Returns the total number of overruns in the ring buffer | 2692 | * Returns the total number of overruns in the ring buffer |
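Taken together, the rb_remove_pages() hunks extend the usual pattern for mutating reader-visible ring-buffer state: stop recording, wait out in-flight writers, then hold reader_lock across the structural change. Condensed from the hunks above (not a complete function):

	atomic_inc(&cpu_buffer->record_disabled);
	synchronize_sched();			/* let in-flight writers drain */

	spin_lock_irq(&cpu_buffer->reader_lock);
	rb_head_page_deactivate(cpu_buffer);
	/* ... unlink nr_pages pages from the buffer ... */
	rb_reset_cpu(cpu_buffer);
	spin_unlock_irq(&cpu_buffer->reader_lock);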
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 026e715a0c7a..9d3067a62d43 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c | |||
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, | |||
2440 | return ret; | 2440 | return ret; |
2441 | } | 2441 | } |
2442 | 2442 | ||
2443 | filp->f_pos += cnt; | 2443 | *ppos += cnt; |
2444 | 2444 | ||
2445 | return cnt; | 2445 | return cnt; |
2446 | } | 2446 | } |
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf, | |||
2582 | } | 2582 | } |
2583 | mutex_unlock(&trace_types_lock); | 2583 | mutex_unlock(&trace_types_lock); |
2584 | 2584 | ||
2585 | filp->f_pos += cnt; | 2585 | *ppos += cnt; |
2586 | 2586 | ||
2587 | return cnt; | 2587 | return cnt; |
2588 | } | 2588 | } |
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf, | |||
2764 | if (err) | 2764 | if (err) |
2765 | return err; | 2765 | return err; |
2766 | 2766 | ||
2767 | filp->f_pos += ret; | 2767 | *ppos += ret; |
2768 | 2768 | ||
2769 | return ret; | 2769 | return ret; |
2770 | } | 2770 | } |
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, | |||
3299 | } | 3299 | } |
3300 | } | 3300 | } |
3301 | 3301 | ||
3302 | filp->f_pos += cnt; | 3302 | *ppos += cnt; |
3303 | 3303 | ||
3304 | /* If check pages failed, return ENOMEM */ | 3304 | /* If check pages failed, return ENOMEM */ |
3305 | if (tracing_disabled) | 3305 | if (tracing_disabled) |
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ee00475742eb..4da6ede74401 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h | |||
@@ -103,6 +103,29 @@ struct syscall_trace_exit { | |||
103 | unsigned long ret; | 103 | unsigned long ret; |
104 | }; | 104 | }; |
105 | 105 | ||
106 | struct kprobe_trace_entry { | ||
107 | struct trace_entry ent; | ||
108 | unsigned long ip; | ||
109 | int nargs; | ||
110 | unsigned long args[]; | ||
111 | }; | ||
112 | |||
113 | #define SIZEOF_KPROBE_TRACE_ENTRY(n) \ | ||
114 | (offsetof(struct kprobe_trace_entry, args) + \ | ||
115 | (sizeof(unsigned long) * (n))) | ||
116 | |||
117 | struct kretprobe_trace_entry { | ||
118 | struct trace_entry ent; | ||
119 | unsigned long func; | ||
120 | unsigned long ret_ip; | ||
121 | int nargs; | ||
122 | unsigned long args[]; | ||
123 | }; | ||
124 | |||
125 | #define SIZEOF_KRETPROBE_TRACE_ENTRY(n) \ | ||
126 | (offsetof(struct kretprobe_trace_entry, args) + \ | ||
127 | (sizeof(unsigned long) * (n))) | ||
128 | |||
106 | /* | 129 | /* |
107 | * trace_flag_type is an enumeration that holds different | 130 | * trace_flag_type is an enumeration that holds different |
108 | * states when a trace occurs. These are: | 131 | * states when a trace occurs. These are: |
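The SIZEOF_*_TRACE_ENTRY() macros are the standard offsetof() idiom for sizing a record that ends in a flexible array member. A standalone illustration with demo names (not from the patch):

	#include <stddef.h>
	#include <stdlib.h>

	struct demo_entry {		/* stand-in for kprobe_trace_entry */
		unsigned long ip;
		int nargs;
		unsigned long args[];	/* flexible array member */
	};

	#define SIZEOF_DEMO_ENTRY(n) \
		(offsetof(struct demo_entry, args) + sizeof(unsigned long) * (n))

	int main(void)
	{
		/* header plus exactly three argument slots */
		struct demo_entry *e = malloc(SIZEOF_DEMO_ENTRY(3));

		if (!e)
			return 1;
		e->nargs = 3;
		free(e);
		return 0;
	}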
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 8d5c171cc998..e0d351b01f5a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c | |||
@@ -8,44 +8,39 @@ | |||
8 | #include <linux/module.h> | 8 | #include <linux/module.h> |
9 | #include "trace.h" | 9 | #include "trace.h" |
10 | 10 | ||
11 | /* | ||
12 | * We can't use a size but a type in alloc_percpu() | ||
13 | * So let's create a dummy type that matches the desired size | ||
14 | */ | ||
15 | typedef struct {char buf[FTRACE_MAX_PROFILE_SIZE];} profile_buf_t; | ||
16 | 11 | ||
17 | char *trace_profile_buf; | 12 | struct perf_trace_buf *perf_trace_buf; |
18 | EXPORT_SYMBOL_GPL(trace_profile_buf); | 13 | EXPORT_SYMBOL_GPL(perf_trace_buf); |
19 | 14 | ||
20 | char *trace_profile_buf_nmi; | 15 | struct perf_trace_buf *perf_trace_buf_nmi; |
21 | EXPORT_SYMBOL_GPL(trace_profile_buf_nmi); | 16 | EXPORT_SYMBOL_GPL(perf_trace_buf_nmi); |
22 | 17 | ||
23 | /* Count the events in use (per event id, not per instance) */ | 18 | /* Count the events in use (per event id, not per instance) */ |
24 | static int total_profile_count; | 19 | static int total_profile_count; |
25 | 20 | ||
26 | static int ftrace_profile_enable_event(struct ftrace_event_call *event) | 21 | static int ftrace_profile_enable_event(struct ftrace_event_call *event) |
27 | { | 22 | { |
28 | char *buf; | 23 | struct perf_trace_buf *buf; |
29 | int ret = -ENOMEM; | 24 | int ret = -ENOMEM; |
30 | 25 | ||
31 | if (atomic_inc_return(&event->profile_count)) | 26 | if (atomic_inc_return(&event->profile_count)) |
32 | return 0; | 27 | return 0; |
33 | 28 | ||
34 | if (!total_profile_count) { | 29 | if (!total_profile_count) { |
35 | buf = (char *)alloc_percpu(profile_buf_t); | 30 | buf = alloc_percpu(struct perf_trace_buf); |
36 | if (!buf) | 31 | if (!buf) |
37 | goto fail_buf; | 32 | goto fail_buf; |
38 | 33 | ||
39 | rcu_assign_pointer(trace_profile_buf, buf); | 34 | rcu_assign_pointer(perf_trace_buf, buf); |
40 | 35 | ||
41 | buf = (char *)alloc_percpu(profile_buf_t); | 36 | buf = alloc_percpu(struct perf_trace_buf); |
42 | if (!buf) | 37 | if (!buf) |
43 | goto fail_buf_nmi; | 38 | goto fail_buf_nmi; |
44 | 39 | ||
45 | rcu_assign_pointer(trace_profile_buf_nmi, buf); | 40 | rcu_assign_pointer(perf_trace_buf_nmi, buf); |
46 | } | 41 | } |
47 | 42 | ||
48 | ret = event->profile_enable(); | 43 | ret = event->profile_enable(event); |
49 | if (!ret) { | 44 | if (!ret) { |
50 | total_profile_count++; | 45 | total_profile_count++; |
51 | return 0; | 46 | return 0; |
@@ -53,10 +48,10 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event) | |||
53 | 48 | ||
54 | fail_buf_nmi: | 49 | fail_buf_nmi: |
55 | if (!total_profile_count) { | 50 | if (!total_profile_count) { |
56 | free_percpu(trace_profile_buf_nmi); | 51 | free_percpu(perf_trace_buf_nmi); |
57 | free_percpu(trace_profile_buf); | 52 | free_percpu(perf_trace_buf); |
58 | trace_profile_buf_nmi = NULL; | 53 | perf_trace_buf_nmi = NULL; |
59 | trace_profile_buf = NULL; | 54 | perf_trace_buf = NULL; |
60 | } | 55 | } |
61 | fail_buf: | 56 | fail_buf: |
62 | atomic_dec(&event->profile_count); | 57 | atomic_dec(&event->profile_count); |
@@ -84,19 +79,19 @@ int ftrace_profile_enable(int event_id) | |||
84 | 79 | ||
85 | static void ftrace_profile_disable_event(struct ftrace_event_call *event) | 80 | static void ftrace_profile_disable_event(struct ftrace_event_call *event) |
86 | { | 81 | { |
87 | char *buf, *nmi_buf; | 82 | struct perf_trace_buf *buf, *nmi_buf; |
88 | 83 | ||
89 | if (!atomic_add_negative(-1, &event->profile_count)) | 84 | if (!atomic_add_negative(-1, &event->profile_count)) |
90 | return; | 85 | return; |
91 | 86 | ||
92 | event->profile_disable(); | 87 | event->profile_disable(event); |
93 | 88 | ||
94 | if (!--total_profile_count) { | 89 | if (!--total_profile_count) { |
95 | buf = trace_profile_buf; | 90 | buf = perf_trace_buf; |
96 | rcu_assign_pointer(trace_profile_buf, NULL); | 91 | rcu_assign_pointer(perf_trace_buf, NULL); |
97 | 92 | ||
98 | nmi_buf = trace_profile_buf_nmi; | 93 | nmi_buf = perf_trace_buf_nmi; |
99 | rcu_assign_pointer(trace_profile_buf_nmi, NULL); | 94 | rcu_assign_pointer(perf_trace_buf_nmi, NULL); |
100 | 95 | ||
101 | /* | 96 | /* |
102 | * Ensure every events in profiling have finished before | 97 | * Ensure every events in profiling have finished before |
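The conversion above drops the sized-dummy-typedef trick in favour of alloc_percpu() on a real struct type. The general pattern, sketched with assumed names (the actual perf_trace_buf layout is defined outside this hunk):

	#include <linux/percpu.h>
	#include <linux/smp.h>

	struct my_buf {
		char data[4096];	/* per-CPU scratch area */
	};

	static struct my_buf *bufs;	/* per-CPU base pointer */

	static int demo_setup(void)
	{
		bufs = alloc_percpu(struct my_buf);	/* one copy per possible CPU */
		if (!bufs)
			return -ENOMEM;
		return 0;
	}

	static void demo_use(void)
	{
		struct my_buf *b = per_cpu_ptr(bufs, smp_processor_id());
		/* ... fill b->data for the local CPU ... */
	}

	static void demo_teardown(void)
	{
		free_percpu(bufs);
	}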
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7c18d154ea28..1d18315dc836 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c | |||
@@ -93,9 +93,7 @@ int trace_define_common_fields(struct ftrace_event_call *call) | |||
93 | } | 93 | } |
94 | EXPORT_SYMBOL_GPL(trace_define_common_fields); | 94 | EXPORT_SYMBOL_GPL(trace_define_common_fields); |
95 | 95 | ||
96 | #ifdef CONFIG_MODULES | 96 | void trace_destroy_fields(struct ftrace_event_call *call) |
97 | |||
98 | static void trace_destroy_fields(struct ftrace_event_call *call) | ||
99 | { | 97 | { |
100 | struct ftrace_event_field *field, *next; | 98 | struct ftrace_event_field *field, *next; |
101 | 99 | ||
@@ -107,8 +105,6 @@ static void trace_destroy_fields(struct ftrace_event_call *call) | |||
107 | } | 105 | } |
108 | } | 106 | } |
109 | 107 | ||
110 | #endif /* CONFIG_MODULES */ | ||
111 | |||
112 | static void ftrace_event_enable_disable(struct ftrace_event_call *call, | 108 | static void ftrace_event_enable_disable(struct ftrace_event_call *call, |
113 | int enable) | 109 | int enable) |
114 | { | 110 | { |
@@ -117,14 +113,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, | |||
117 | if (call->enabled) { | 113 | if (call->enabled) { |
118 | call->enabled = 0; | 114 | call->enabled = 0; |
119 | tracing_stop_cmdline_record(); | 115 | tracing_stop_cmdline_record(); |
120 | call->unregfunc(call->data); | 116 | call->unregfunc(call); |
121 | } | 117 | } |
122 | break; | 118 | break; |
123 | case 1: | 119 | case 1: |
124 | if (!call->enabled) { | 120 | if (!call->enabled) { |
125 | call->enabled = 1; | 121 | call->enabled = 1; |
126 | tracing_start_cmdline_record(); | 122 | tracing_start_cmdline_record(); |
127 | call->regfunc(call->data); | 123 | call->regfunc(call); |
128 | } | 124 | } |
129 | break; | 125 | break; |
130 | } | 126 | } |
@@ -937,27 +933,46 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, | |||
937 | return 0; | 933 | return 0; |
938 | } | 934 | } |
939 | 935 | ||
940 | #define for_each_event(event, start, end) \ | 936 | static int __trace_add_event_call(struct ftrace_event_call *call) |
941 | for (event = start; \ | 937 | { |
942 | (unsigned long)event < (unsigned long)end; \ | 938 | struct dentry *d_events; |
943 | event++) | 939 | int ret; |
944 | 940 | ||
945 | #ifdef CONFIG_MODULES | 941 | if (!call->name) |
942 | return -EINVAL; | ||
946 | 943 | ||
947 | static LIST_HEAD(ftrace_module_file_list); | 944 | if (call->raw_init) { |
945 | ret = call->raw_init(call); | ||
946 | if (ret < 0) { | ||
947 | if (ret != -ENOSYS) | ||
948 | pr_warning("Could not initialize trace " | ||
949 | "events/%s\n", call->name); | ||
950 | return ret; | ||
951 | } | ||
952 | } | ||
948 | 953 | ||
949 | /* | 954 | d_events = event_trace_events_dir(); |
950 | * Modules must own their file_operations to keep up with | 955 | if (!d_events) |
951 | * reference counting. | 956 | return -ENOENT; |
952 | */ | 957 | |
953 | struct ftrace_module_file_ops { | 958 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, |
954 | struct list_head list; | 959 | &ftrace_enable_fops, &ftrace_event_filter_fops, |
955 | struct module *mod; | 960 | &ftrace_event_format_fops); |
956 | struct file_operations id; | 961 | if (!ret) |
957 | struct file_operations enable; | 962 | list_add(&call->list, &ftrace_events); |
958 | struct file_operations format; | 963 | |
959 | struct file_operations filter; | 964 | return ret; |
960 | }; | 965 | } |
966 | |||
967 | /* Add an additional event_call dynamically */ | ||
968 | int trace_add_event_call(struct ftrace_event_call *call) | ||
969 | { | ||
970 | int ret; | ||
971 | mutex_lock(&event_mutex); | ||
972 | ret = __trace_add_event_call(call); | ||
973 | mutex_unlock(&event_mutex); | ||
974 | return ret; | ||
975 | } | ||
961 | 976 | ||
962 | static void remove_subsystem_dir(const char *name) | 977 | static void remove_subsystem_dir(const char *name) |
963 | { | 978 | { |
@@ -985,6 +1000,53 @@ static void remove_subsystem_dir(const char *name) | |||
985 | } | 1000 | } |
986 | } | 1001 | } |
987 | 1002 | ||
1003 | /* | ||
1004 | * Must be called with both event_mutex and trace_event_mutex held. | ||
1005 | */ | ||
1006 | static void __trace_remove_event_call(struct ftrace_event_call *call) | ||
1007 | { | ||
1008 | ftrace_event_enable_disable(call, 0); | ||
1009 | if (call->event) | ||
1010 | __unregister_ftrace_event(call->event); | ||
1011 | debugfs_remove_recursive(call->dir); | ||
1012 | list_del(&call->list); | ||
1013 | trace_destroy_fields(call); | ||
1014 | destroy_preds(call); | ||
1015 | remove_subsystem_dir(call->system); | ||
1016 | } | ||
1017 | |||
1018 | /* Remove an event_call */ | ||
1019 | void trace_remove_event_call(struct ftrace_event_call *call) | ||
1020 | { | ||
1021 | mutex_lock(&event_mutex); | ||
1022 | down_write(&trace_event_mutex); | ||
1023 | __trace_remove_event_call(call); | ||
1024 | up_write(&trace_event_mutex); | ||
1025 | mutex_unlock(&event_mutex); | ||
1026 | } | ||
1027 | |||
1028 | #define for_each_event(event, start, end) \ | ||
1029 | for (event = start; \ | ||
1030 | (unsigned long)event < (unsigned long)end; \ | ||
1031 | event++) | ||
1032 | |||
1033 | #ifdef CONFIG_MODULES | ||
1034 | |||
1035 | static LIST_HEAD(ftrace_module_file_list); | ||
1036 | |||
1037 | /* | ||
1038 | * Modules must own their file_operations to keep up with | ||
1039 | * reference counting. | ||
1040 | */ | ||
1041 | struct ftrace_module_file_ops { | ||
1042 | struct list_head list; | ||
1043 | struct module *mod; | ||
1044 | struct file_operations id; | ||
1045 | struct file_operations enable; | ||
1046 | struct file_operations format; | ||
1047 | struct file_operations filter; | ||
1048 | }; | ||
1049 | |||
988 | static struct ftrace_module_file_ops * | 1050 | static struct ftrace_module_file_ops * |
989 | trace_create_file_ops(struct module *mod) | 1051 | trace_create_file_ops(struct module *mod) |
990 | { | 1052 | { |
@@ -1042,7 +1104,7 @@ static void trace_module_add_events(struct module *mod) | |||
1042 | if (!call->name) | 1104 | if (!call->name) |
1043 | continue; | 1105 | continue; |
1044 | if (call->raw_init) { | 1106 | if (call->raw_init) { |
1045 | ret = call->raw_init(); | 1107 | ret = call->raw_init(call); |
1046 | if (ret < 0) { | 1108 | if (ret < 0) { |
1047 | if (ret != -ENOSYS) | 1109 | if (ret != -ENOSYS) |
1048 | pr_warning("Could not initialize trace " | 1110 | pr_warning("Could not initialize trace " |
@@ -1060,10 +1122,11 @@ static void trace_module_add_events(struct module *mod) | |||
1060 | return; | 1122 | return; |
1061 | } | 1123 | } |
1062 | call->mod = mod; | 1124 | call->mod = mod; |
1063 | list_add(&call->list, &ftrace_events); | 1125 | ret = event_create_dir(call, d_events, |
1064 | event_create_dir(call, d_events, | 1126 | &file_ops->id, &file_ops->enable, |
1065 | &file_ops->id, &file_ops->enable, | 1127 | &file_ops->filter, &file_ops->format); |
1066 | &file_ops->filter, &file_ops->format); | 1128 | if (!ret) |
1129 | list_add(&call->list, &ftrace_events); | ||
1067 | } | 1130 | } |
1068 | } | 1131 | } |
1069 | 1132 | ||
@@ -1077,14 +1140,7 @@ static void trace_module_remove_events(struct module *mod) | |||
1077 | list_for_each_entry_safe(call, p, &ftrace_events, list) { | 1140 | list_for_each_entry_safe(call, p, &ftrace_events, list) { |
1078 | if (call->mod == mod) { | 1141 | if (call->mod == mod) { |
1079 | found = true; | 1142 | found = true; |
1080 | ftrace_event_enable_disable(call, 0); | 1143 | __trace_remove_event_call(call); |
1081 | if (call->event) | ||
1082 | __unregister_ftrace_event(call->event); | ||
1083 | debugfs_remove_recursive(call->dir); | ||
1084 | list_del(&call->list); | ||
1085 | trace_destroy_fields(call); | ||
1086 | destroy_preds(call); | ||
1087 | remove_subsystem_dir(call->system); | ||
1088 | } | 1144 | } |
1089 | } | 1145 | } |
1090 | 1146 | ||
@@ -1202,7 +1258,7 @@ static __init int event_trace_init(void) | |||
1202 | if (!call->name) | 1258 | if (!call->name) |
1203 | continue; | 1259 | continue; |
1204 | if (call->raw_init) { | 1260 | if (call->raw_init) { |
1205 | ret = call->raw_init(); | 1261 | ret = call->raw_init(call); |
1206 | if (ret < 0) { | 1262 | if (ret < 0) { |
1207 | if (ret != -ENOSYS) | 1263 | if (ret != -ENOSYS) |
1208 | pr_warning("Could not initialize trace " | 1264 | pr_warning("Could not initialize trace " |
@@ -1210,10 +1266,12 @@ static __init int event_trace_init(void) | |||
1210 | continue; | 1266 | continue; |
1211 | } | 1267 | } |
1212 | } | 1268 | } |
1213 | list_add(&call->list, &ftrace_events); | 1269 | ret = event_create_dir(call, d_events, &ftrace_event_id_fops, |
1214 | event_create_dir(call, d_events, &ftrace_event_id_fops, | 1270 | &ftrace_enable_fops, |
1215 | &ftrace_enable_fops, &ftrace_event_filter_fops, | 1271 | &ftrace_event_filter_fops, |
1216 | &ftrace_event_format_fops); | 1272 | &ftrace_event_format_fops); |
1273 | if (!ret) | ||
1274 | list_add(&call->list, &ftrace_events); | ||
1217 | } | 1275 | } |
1218 | 1276 | ||
1219 | while (true) { | 1277 | while (true) { |
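trace_add_event_call() and trace_remove_event_call() give dynamic-event providers (trace_kprobe.c below is the first user) a properly locked entry point into the event-directory machinery. Roughly how a provider drives them, with the ftrace_event_call field setup abbreviated (a hedged sketch, not code from this patch):

	static int demo_register_event(struct ftrace_event_call *call)
	{
		int ret;

		/* caller fills in call->name, ->system, ->raw_init, ... */
		ret = trace_add_event_call(call);  /* creates events/<system>/<name>/ */
		if (ret)
			pr_warning("Failed to register event %s (%d)\n",
				   call->name, ret);
		return ret;
	}

	static void demo_unregister_event(struct ftrace_event_call *call)
	{
		trace_remove_event_call(call);	/* disable, unregister, remove dir */
	}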
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 21d34757b955..50504cb228de 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c | |||
@@ -1230,12 +1230,12 @@ static int replace_system_preds(struct event_subsystem *system, | |||
1230 | struct filter_parse_state *ps, | 1230 | struct filter_parse_state *ps, |
1231 | char *filter_string) | 1231 | char *filter_string) |
1232 | { | 1232 | { |
1233 | struct event_filter *filter = system->filter; | ||
1234 | struct ftrace_event_call *call; | 1233 | struct ftrace_event_call *call; |
1235 | bool fail = true; | 1234 | bool fail = true; |
1236 | int err; | 1235 | int err; |
1237 | 1236 | ||
1238 | list_for_each_entry(call, &ftrace_events, list) { | 1237 | list_for_each_entry(call, &ftrace_events, list) { |
1238 | struct event_filter *filter = call->filter; | ||
1239 | 1239 | ||
1240 | if (!call->define_fields) | 1240 | if (!call->define_fields) |
1241 | continue; | 1241 | continue; |
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 31da218ee10f..934d81fb4ca4 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c | |||
@@ -134,7 +134,6 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ | |||
134 | 134 | ||
135 | #include "trace_entries.h" | 135 | #include "trace_entries.h" |
136 | 136 | ||
137 | |||
138 | #undef __field | 137 | #undef __field |
139 | #define __field(type, item) \ | 138 | #define __field(type, item) \ |
140 | ret = trace_define_field(event_call, #type, #item, \ | 139 | ret = trace_define_field(event_call, #type, #item, \ |
@@ -196,6 +195,11 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
196 | 195 | ||
197 | #include "trace_entries.h" | 196 | #include "trace_entries.h" |
198 | 197 | ||
198 | static int ftrace_raw_init_event(struct ftrace_event_call *call) | ||
199 | { | ||
200 | INIT_LIST_HEAD(&call->fields); | ||
201 | return 0; | ||
202 | } | ||
199 | 203 | ||
200 | #undef __field | 204 | #undef __field |
201 | #define __field(type, item) | 205 | #define __field(type, item) |
@@ -214,7 +218,6 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ | |||
214 | 218 | ||
215 | #undef FTRACE_ENTRY | 219 | #undef FTRACE_ENTRY |
216 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ | 220 | #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ |
217 | static int ftrace_raw_init_event_##call(void); \ | ||
218 | \ | 221 | \ |
219 | struct ftrace_event_call __used \ | 222 | struct ftrace_event_call __used \ |
220 | __attribute__((__aligned__(4))) \ | 223 | __attribute__((__aligned__(4))) \ |
@@ -222,14 +225,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ | |||
222 | .name = #call, \ | 225 | .name = #call, \ |
223 | .id = type, \ | 226 | .id = type, \ |
224 | .system = __stringify(TRACE_SYSTEM), \ | 227 | .system = __stringify(TRACE_SYSTEM), \ |
225 | .raw_init = ftrace_raw_init_event_##call, \ | 228 | .raw_init = ftrace_raw_init_event, \ |
226 | .show_format = ftrace_format_##call, \ | 229 | .show_format = ftrace_format_##call, \ |
227 | .define_fields = ftrace_define_fields_##call, \ | 230 | .define_fields = ftrace_define_fields_##call, \ |
228 | }; \ | 231 | }; \ |
229 | static int ftrace_raw_init_event_##call(void) \ | ||
230 | { \ | ||
231 | INIT_LIST_HEAD(&event_##call.fields); \ | ||
232 | return 0; \ | ||
233 | } \ | ||
234 | 232 | ||
235 | #include "trace_entries.h" | 233 | #include "trace_entries.h" |
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c new file mode 100644 index 000000000000..3696476f307d --- /dev/null +++ b/kernel/trace/trace_kprobe.c | |||
@@ -0,0 +1,1513 @@ | |||
1 | /* | ||
2 | * Kprobes-based tracing events | ||
3 | * | ||
4 | * Created by Masami Hiramatsu <mhiramat@redhat.com> | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License version 2 as | ||
8 | * published by the Free Software Foundation. | ||
9 | * | ||
10 | * This program is distributed in the hope that it will be useful, | ||
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
13 | * GNU General Public License for more details. | ||
14 | * | ||
15 | * You should have received a copy of the GNU General Public License | ||
16 | * along with this program; if not, write to the Free Software | ||
17 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | ||
18 | */ | ||
19 | |||
20 | #include <linux/module.h> | ||
21 | #include <linux/uaccess.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/seq_file.h> | ||
24 | #include <linux/slab.h> | ||
25 | #include <linux/smp.h> | ||
26 | #include <linux/debugfs.h> | ||
27 | #include <linux/types.h> | ||
28 | #include <linux/string.h> | ||
29 | #include <linux/ctype.h> | ||
30 | #include <linux/ptrace.h> | ||
31 | #include <linux/perf_event.h> | ||
32 | |||
33 | #include "trace.h" | ||
34 | #include "trace_output.h" | ||
35 | |||
36 | #define MAX_TRACE_ARGS 128 | ||
37 | #define MAX_ARGSTR_LEN 63 | ||
38 | #define MAX_EVENT_NAME_LEN 64 | ||
39 | #define KPROBE_EVENT_SYSTEM "kprobes" | ||
40 | |||
41 | /* Reserved field names */ | ||
42 | #define FIELD_STRING_IP "__probe_ip" | ||
43 | #define FIELD_STRING_NARGS "__probe_nargs" | ||
44 | #define FIELD_STRING_RETIP "__probe_ret_ip" | ||
45 | #define FIELD_STRING_FUNC "__probe_func" | ||
46 | |||
47 | const char *reserved_field_names[] = { | ||
48 | "common_type", | ||
49 | "common_flags", | ||
50 | "common_preempt_count", | ||
51 | "common_pid", | ||
52 | "common_tgid", | ||
53 | "common_lock_depth", | ||
54 | FIELD_STRING_IP, | ||
55 | FIELD_STRING_NARGS, | ||
56 | FIELD_STRING_RETIP, | ||
57 | FIELD_STRING_FUNC, | ||
58 | }; | ||
59 | |||
60 | struct fetch_func { | ||
61 | unsigned long (*func)(struct pt_regs *, void *); | ||
62 | void *data; | ||
63 | }; | ||
64 | |||
65 | static __kprobes unsigned long call_fetch(struct fetch_func *f, | ||
66 | struct pt_regs *regs) | ||
67 | { | ||
68 | return f->func(regs, f->data); | ||
69 | } | ||
70 | |||
71 | /* fetch handlers */ | ||
72 | static __kprobes unsigned long fetch_register(struct pt_regs *regs, | ||
73 | void *offset) | ||
74 | { | ||
75 | return regs_get_register(regs, (unsigned int)((unsigned long)offset)); | ||
76 | } | ||
77 | |||
78 | static __kprobes unsigned long fetch_stack(struct pt_regs *regs, | ||
79 | void *num) | ||
80 | { | ||
81 | return regs_get_kernel_stack_nth(regs, | ||
82 | (unsigned int)((unsigned long)num)); | ||
83 | } | ||
84 | |||
85 | static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr) | ||
86 | { | ||
87 | unsigned long retval; | ||
88 | |||
89 | if (probe_kernel_address(addr, retval)) | ||
90 | return 0; | ||
91 | return retval; | ||
92 | } | ||
93 | |||
94 | static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num) | ||
95 | { | ||
96 | return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num)); | ||
97 | } | ||
98 | |||
99 | static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, | ||
100 | void *dummy) | ||
101 | { | ||
102 | return regs_return_value(regs); | ||
103 | } | ||
104 | |||
105 | static __kprobes unsigned long fetch_stack_address(struct pt_regs *regs, | ||
106 | void *dummy) | ||
107 | { | ||
108 | return kernel_stack_pointer(regs); | ||
109 | } | ||
110 | |||
111 | /* Memory fetching by symbol */ | ||
112 | struct symbol_cache { | ||
113 | char *symbol; | ||
114 | long offset; | ||
115 | unsigned long addr; | ||
116 | }; | ||
117 | |||
118 | static unsigned long update_symbol_cache(struct symbol_cache *sc) | ||
119 | { | ||
120 | sc->addr = (unsigned long)kallsyms_lookup_name(sc->symbol); | ||
121 | if (sc->addr) | ||
122 | sc->addr += sc->offset; | ||
123 | return sc->addr; | ||
124 | } | ||
125 | |||
126 | static void free_symbol_cache(struct symbol_cache *sc) | ||
127 | { | ||
128 | kfree(sc->symbol); | ||
129 | kfree(sc); | ||
130 | } | ||
131 | |||
132 | static struct symbol_cache *alloc_symbol_cache(const char *sym, long offset) | ||
133 | { | ||
134 | struct symbol_cache *sc; | ||
135 | |||
136 | if (!sym || strlen(sym) == 0) | ||
137 | return NULL; | ||
138 | sc = kzalloc(sizeof(struct symbol_cache), GFP_KERNEL); | ||
139 | if (!sc) | ||
140 | return NULL; | ||
141 | |||
142 | sc->symbol = kstrdup(sym, GFP_KERNEL); | ||
143 | if (!sc->symbol) { | ||
144 | kfree(sc); | ||
145 | return NULL; | ||
146 | } | ||
147 | sc->offset = offset; | ||
148 | |||
149 | update_symbol_cache(sc); | ||
150 | return sc; | ||
151 | } | ||
152 | |||
153 | static __kprobes unsigned long fetch_symbol(struct pt_regs *regs, void *data) | ||
154 | { | ||
155 | struct symbol_cache *sc = data; | ||
156 | |||
157 | if (sc->addr) | ||
158 | return fetch_memory(regs, (void *)sc->addr); | ||
159 | else | ||
160 | return 0; | ||
161 | } | ||
162 | |||
163 | /* Special indirect memory access interface */ | ||
164 | struct indirect_fetch_data { | ||
165 | struct fetch_func orig; | ||
166 | long offset; | ||
167 | }; | ||
168 | |||
169 | static __kprobes unsigned long fetch_indirect(struct pt_regs *regs, void *data) | ||
170 | { | ||
171 | struct indirect_fetch_data *ind = data; | ||
172 | unsigned long addr; | ||
173 | |||
174 | addr = call_fetch(&ind->orig, regs); | ||
175 | if (addr) { | ||
176 | addr += ind->offset; | ||
177 | return fetch_memory(regs, (void *)addr); | ||
178 | } else | ||
179 | return 0; | ||
180 | } | ||
181 | |||
182 | static __kprobes void free_indirect_fetch_data(struct indirect_fetch_data *data) | ||
183 | { | ||
184 | if (data->orig.func == fetch_indirect) | ||
185 | free_indirect_fetch_data(data->orig.data); | ||
186 | else if (data->orig.func == fetch_symbol) | ||
187 | free_symbol_cache(data->orig.data); | ||
188 | kfree(data); | ||
189 | } | ||
190 | |||
191 | /* | ||
192 | * Kprobe event core functions | ||
193 | */ | ||
194 | |||
195 | struct probe_arg { | ||
196 | struct fetch_func fetch; | ||
197 | const char *name; | ||
198 | }; | ||
199 | |||
200 | /* Flags for trace_probe */ | ||
201 | #define TP_FLAG_TRACE 1 | ||
202 | #define TP_FLAG_PROFILE 2 | ||
203 | |||
204 | struct trace_probe { | ||
205 | struct list_head list; | ||
206 | struct kretprobe rp; /* Use rp.kp for kprobe use */ | ||
207 | unsigned long nhit; | ||
208 | unsigned int flags; /* For TP_FLAG_* */ | ||
209 | const char *symbol; /* symbol name */ | ||
210 | struct ftrace_event_call call; | ||
211 | struct trace_event event; | ||
212 | unsigned int nr_args; | ||
213 | struct probe_arg args[]; | ||
214 | }; | ||
215 | |||
216 | #define SIZEOF_TRACE_PROBE(n) \ | ||
217 | (offsetof(struct trace_probe, args) + \ | ||
218 | (sizeof(struct probe_arg) * (n))) | ||
219 | |||
220 | static __kprobes int probe_is_return(struct trace_probe *tp) | ||
221 | { | ||
222 | return tp->rp.handler != NULL; | ||
223 | } | ||
224 | |||
225 | static __kprobes const char *probe_symbol(struct trace_probe *tp) | ||
226 | { | ||
227 | return tp->symbol ? tp->symbol : "unknown"; | ||
228 | } | ||
229 | |||
230 | static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff) | ||
231 | { | ||
232 | int ret = -EINVAL; | ||
233 | |||
234 | if (ff->func == fetch_argument) | ||
235 | ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data); | ||
236 | else if (ff->func == fetch_register) { | ||
237 | const char *name; | ||
238 | name = regs_query_register_name((unsigned int)((long)ff->data)); | ||
239 | ret = snprintf(buf, n, "%%%s", name); | ||
240 | } else if (ff->func == fetch_stack) | ||
241 | ret = snprintf(buf, n, "$stack%lu", (unsigned long)ff->data); | ||
242 | else if (ff->func == fetch_memory) | ||
243 | ret = snprintf(buf, n, "@0x%p", ff->data); | ||
244 | else if (ff->func == fetch_symbol) { | ||
245 | struct symbol_cache *sc = ff->data; | ||
246 | ret = snprintf(buf, n, "@%s%+ld", sc->symbol, sc->offset); | ||
247 | } else if (ff->func == fetch_retvalue) | ||
248 | ret = snprintf(buf, n, "$retval"); | ||
249 | else if (ff->func == fetch_stack_address) | ||
250 | ret = snprintf(buf, n, "$stack"); | ||
251 | else if (ff->func == fetch_indirect) { | ||
252 | struct indirect_fetch_data *id = ff->data; | ||
253 | size_t l = 0; | ||
254 | ret = snprintf(buf, n, "%+ld(", id->offset); | ||
255 | if (ret >= n) | ||
256 | goto end; | ||
257 | l += ret; | ||
258 | ret = probe_arg_string(buf + l, n - l, &id->orig); | ||
259 | if (ret < 0) | ||
260 | goto end; | ||
261 | l += ret; | ||
262 | ret = snprintf(buf + l, n - l, ")"); | ||
263 | ret += l; | ||
264 | } | ||
265 | end: | ||
266 | if (ret >= n) | ||
267 | return -ENOSPC; | ||
268 | return ret; | ||
269 | } | ||
270 | |||
271 | static int register_probe_event(struct trace_probe *tp); | ||
272 | static void unregister_probe_event(struct trace_probe *tp); | ||
273 | |||
274 | static DEFINE_MUTEX(probe_lock); | ||
275 | static LIST_HEAD(probe_list); | ||
276 | |||
277 | static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs); | ||
278 | static int kretprobe_dispatcher(struct kretprobe_instance *ri, | ||
279 | struct pt_regs *regs); | ||
280 | |||
281 | /* | ||
282 | * Allocate new trace_probe and initialize it (including kprobes). | ||
283 | */ | ||
284 | static struct trace_probe *alloc_trace_probe(const char *group, | ||
285 | const char *event, | ||
286 | void *addr, | ||
287 | const char *symbol, | ||
288 | unsigned long offs, | ||
289 | int nargs, int is_return) | ||
290 | { | ||
291 | struct trace_probe *tp; | ||
292 | |||
293 | tp = kzalloc(SIZEOF_TRACE_PROBE(nargs), GFP_KERNEL); | ||
294 | if (!tp) | ||
295 | return ERR_PTR(-ENOMEM); | ||
296 | |||
297 | if (symbol) { | ||
298 | tp->symbol = kstrdup(symbol, GFP_KERNEL); | ||
299 | if (!tp->symbol) | ||
300 | goto error; | ||
301 | tp->rp.kp.symbol_name = tp->symbol; | ||
302 | tp->rp.kp.offset = offs; | ||
303 | } else | ||
304 | tp->rp.kp.addr = addr; | ||
305 | |||
306 | if (is_return) | ||
307 | tp->rp.handler = kretprobe_dispatcher; | ||
308 | else | ||
309 | tp->rp.kp.pre_handler = kprobe_dispatcher; | ||
310 | |||
311 | if (!event) | ||
312 | goto error; | ||
313 | tp->call.name = kstrdup(event, GFP_KERNEL); | ||
314 | if (!tp->call.name) | ||
315 | goto error; | ||
316 | |||
317 | if (!group) | ||
318 | goto error; | ||
319 | tp->call.system = kstrdup(group, GFP_KERNEL); | ||
320 | if (!tp->call.system) | ||
321 | goto error; | ||
322 | |||
323 | INIT_LIST_HEAD(&tp->list); | ||
324 | return tp; | ||
325 | error: | ||
326 | kfree(tp->call.name); | ||
327 | kfree(tp->symbol); | ||
328 | kfree(tp); | ||
329 | return ERR_PTR(-ENOMEM); | ||
330 | } | ||
331 | |||
332 | static void free_probe_arg(struct probe_arg *arg) | ||
333 | { | ||
334 | if (arg->fetch.func == fetch_symbol) | ||
335 | free_symbol_cache(arg->fetch.data); | ||
336 | else if (arg->fetch.func == fetch_indirect) | ||
337 | free_indirect_fetch_data(arg->fetch.data); | ||
338 | kfree(arg->name); | ||
339 | } | ||
340 | |||
341 | static void free_trace_probe(struct trace_probe *tp) | ||
342 | { | ||
343 | int i; | ||
344 | |||
345 | for (i = 0; i < tp->nr_args; i++) | ||
346 | free_probe_arg(&tp->args[i]); | ||
347 | |||
348 | kfree(tp->call.system); | ||
349 | kfree(tp->call.name); | ||
350 | kfree(tp->symbol); | ||
351 | kfree(tp); | ||
352 | } | ||
353 | |||
354 | static struct trace_probe *find_probe_event(const char *event, | ||
355 | const char *group) | ||
356 | { | ||
357 | struct trace_probe *tp; | ||
358 | |||
359 | list_for_each_entry(tp, &probe_list, list) | ||
360 | if (strcmp(tp->call.name, event) == 0 && | ||
361 | strcmp(tp->call.system, group) == 0) | ||
362 | return tp; | ||
363 | return NULL; | ||
364 | } | ||
365 | |||
366 | /* Unregister a trace_probe and probe_event: call with probe_lock held */ | ||
367 | static void unregister_trace_probe(struct trace_probe *tp) | ||
368 | { | ||
369 | if (probe_is_return(tp)) | ||
370 | unregister_kretprobe(&tp->rp); | ||
371 | else | ||
372 | unregister_kprobe(&tp->rp.kp); | ||
373 | list_del(&tp->list); | ||
374 | unregister_probe_event(tp); | ||
375 | } | ||
376 | |||
377 | /* Register a trace_probe and probe_event */ | ||
378 | static int register_trace_probe(struct trace_probe *tp) | ||
379 | { | ||
380 | struct trace_probe *old_tp; | ||
381 | int ret; | ||
382 | |||
383 | mutex_lock(&probe_lock); | ||
384 | |||
385 | /* register as an event */ | ||
386 | old_tp = find_probe_event(tp->call.name, tp->call.system); | ||
387 | if (old_tp) { | ||
388 | /* delete old event */ | ||
389 | unregister_trace_probe(old_tp); | ||
390 | free_trace_probe(old_tp); | ||
391 | } | ||
392 | ret = register_probe_event(tp); | ||
393 | if (ret) { | ||
394 | pr_warning("Failed to register probe event(%d)\n", ret); | ||
395 | goto end; | ||
396 | } | ||
397 | |||
398 | tp->rp.kp.flags |= KPROBE_FLAG_DISABLED; | ||
399 | if (probe_is_return(tp)) | ||
400 | ret = register_kretprobe(&tp->rp); | ||
401 | else | ||
402 | ret = register_kprobe(&tp->rp.kp); | ||
403 | |||
404 | if (ret) { | ||
405 | pr_warning("Could not insert probe(%d)\n", ret); | ||
406 | if (ret == -EILSEQ) { | ||
407 | pr_warning("Probing address (0x%p) is not an " | ||
408 | "instruction boundary.\n", | ||
409 | tp->rp.kp.addr); | ||
410 | ret = -EINVAL; | ||
411 | } | ||
412 | unregister_probe_event(tp); | ||
413 | } else | ||
414 | list_add_tail(&tp->list, &probe_list); | ||
415 | end: | ||
416 | mutex_unlock(&probe_lock); | ||
417 | return ret; | ||
418 | } | ||
419 | |||
420 | /* Split symbol and offset. */ | ||
421 | static int split_symbol_offset(char *symbol, unsigned long *offset) | ||
422 | { | ||
423 | char *tmp; | ||
424 | int ret; | ||
425 | |||
426 | if (!offset) | ||
427 | return -EINVAL; | ||
428 | |||
429 | tmp = strchr(symbol, '+'); | ||
430 | if (tmp) { | ||
431 | /* skip sign because strict_strtoul() doesn't accept '+' */ | ||
432 | ret = strict_strtoul(tmp + 1, 0, offset); | ||
433 | if (ret) | ||
434 | return ret; | ||
435 | *tmp = '\0'; | ||
436 | } else | ||
437 | *offset = 0; | ||
438 | return 0; | ||
439 | } | ||
440 | |||
441 | #define PARAM_MAX_ARGS 16 | ||
442 | #define PARAM_MAX_STACK (THREAD_SIZE / sizeof(unsigned long)) | ||
443 | |||
444 | static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return) | ||
445 | { | ||
446 | int ret = 0; | ||
447 | unsigned long param; | ||
448 | |||
449 | if (strcmp(arg, "retval") == 0) { | ||
450 | if (is_return) { | ||
451 | ff->func = fetch_retvalue; | ||
452 | ff->data = NULL; | ||
453 | } else | ||
454 | ret = -EINVAL; | ||
455 | } else if (strncmp(arg, "stack", 5) == 0) { | ||
456 | if (arg[5] == '\0') { | ||
457 | ff->func = fetch_stack_address; | ||
458 | ff->data = NULL; | ||
459 | } else if (isdigit(arg[5])) { | ||
460 | ret = strict_strtoul(arg + 5, 10, ¶m); | ||
461 | if (ret || param > PARAM_MAX_STACK) | ||
462 | ret = -EINVAL; | ||
463 | else { | ||
464 | ff->func = fetch_stack; | ||
465 | ff->data = (void *)param; | ||
466 | } | ||
467 | } else | ||
468 | ret = -EINVAL; | ||
469 | } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) { | ||
470 | ret = strict_strtoul(arg + 3, 10, ¶m); | ||
471 | if (ret || param > PARAM_MAX_ARGS) | ||
472 | ret = -EINVAL; | ||
473 | else { | ||
474 | ff->func = fetch_argument; | ||
475 | ff->data = (void *)param; | ||
476 | } | ||
477 | } else | ||
478 | ret = -EINVAL; | ||
479 | return ret; | ||
480 | } | ||
481 | |||
482 | static int parse_probe_arg(char *arg, struct fetch_func *ff, int is_return) | ||
483 | { | ||
484 | int ret = 0; | ||
485 | unsigned long param; | ||
486 | long offset; | ||
487 | char *tmp; | ||
488 | |||
489 | switch (arg[0]) { | ||
490 | case '$': | ||
491 | ret = parse_probe_vars(arg + 1, ff, is_return); | ||
492 | break; | ||
493 | case '%': /* named register */ | ||
494 | ret = regs_query_register_offset(arg + 1); | ||
495 | if (ret >= 0) { | ||
496 | ff->func = fetch_register; | ||
497 | ff->data = (void *)(unsigned long)ret; | ||
498 | ret = 0; | ||
499 | } | ||
500 | break; | ||
501 | case '@': /* memory or symbol */ | ||
502 | if (isdigit(arg[1])) { | ||
503 | ret = strict_strtoul(arg + 1, 0, ¶m); | ||
504 | if (ret) | ||
505 | break; | ||
506 | ff->func = fetch_memory; | ||
507 | ff->data = (void *)param; | ||
508 | } else { | ||
509 | ret = split_symbol_offset(arg + 1, &offset); | ||
510 | if (ret) | ||
511 | break; | ||
512 | ff->data = alloc_symbol_cache(arg + 1, offset); | ||
513 | if (ff->data) | ||
514 | ff->func = fetch_symbol; | ||
515 | else | ||
516 | ret = -EINVAL; | ||
517 | } | ||
518 | break; | ||
519 | case '+': /* indirect memory */ | ||
520 | case '-': | ||
521 | tmp = strchr(arg, '('); | ||
522 | if (!tmp) { | ||
523 | ret = -EINVAL; | ||
524 | break; | ||
525 | } | ||
526 | *tmp = '\0'; | ||
527 | ret = strict_strtol(arg + 1, 0, &offset); | ||
528 | if (ret) | ||
529 | break; | ||
530 | if (arg[0] == '-') | ||
531 | offset = -offset; | ||
532 | arg = tmp + 1; | ||
533 | tmp = strrchr(arg, ')'); | ||
534 | if (tmp) { | ||
535 | struct indirect_fetch_data *id; | ||
536 | *tmp = '\0'; | ||
537 | id = kzalloc(sizeof(struct indirect_fetch_data), | ||
538 | GFP_KERNEL); | ||
539 | if (!id) | ||
540 | return -ENOMEM; | ||
541 | id->offset = offset; | ||
542 | ret = parse_probe_arg(arg, &id->orig, is_return); | ||
543 | if (ret) | ||
544 | kfree(id); | ||
545 | else { | ||
546 | ff->func = fetch_indirect; | ||
547 | ff->data = (void *)id; | ||
548 | } | ||
549 | } else | ||
550 | ret = -EINVAL; | ||
551 | break; | ||
552 | default: | ||
553 | /* TODO: support custom handler */ | ||
554 | ret = -EINVAL; | ||
555 | } | ||
556 | return ret; | ||
557 | } | ||
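
To make the dispatch above concrete, a few complete fetch-argument strings and the fetch functions parse_probe_arg() resolves them to; "%ax" assumes x86 register naming via regs_query_register_offset(), and "jiffies" is just an example data symbol:

	/*
	 * "%ax"         -> fetch_register,  data = pt_regs offset of ax
	 * "@0xc0100000" -> fetch_memory,    data = (void *)0xc0100000
	 * "@jiffies+4"  -> fetch_symbol,    data = symbol cache for jiffies+4
	 * "+8($stack)"  -> fetch_indirect,  id->offset = 8 and id->orig
	 *                  filled in by a recursive parse of "$stack"
	 */
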
558 | |||
559 | /* Return 1 if name is reserved or already used by another argument */ | ||
560 | static int conflict_field_name(const char *name, | ||
561 | struct probe_arg *args, int narg) | ||
562 | { | ||
563 | int i; | ||
564 | for (i = 0; i < ARRAY_SIZE(reserved_field_names); i++) | ||
565 | if (strcmp(reserved_field_names[i], name) == 0) | ||
566 | return 1; | ||
567 | for (i = 0; i < narg; i++) | ||
568 | if (strcmp(args[i].name, name) == 0) | ||
569 | return 1; | ||
570 | return 0; | ||
571 | } | ||
572 | |||
573 | static int create_trace_probe(int argc, char **argv) | ||
574 | { | ||
575 | /* | ||
576 | * Argument syntax: | ||
577 | * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] | ||
578 | * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] | ||
579 | * Fetch args: | ||
580 | * $argN : fetch the Nth function argument. (N:0-) | ||
581 | * $retval : fetch the return value | ||
582 | * $stack : fetch the stack address | ||
583 | * $stackN : fetch the Nth entry of the stack (N:0-) | ||
584 | * @ADDR : fetch memory at ADDR (ADDR must be a kernel address) | ||
585 | * @SYM[+|-offs] : fetch memory at SYM +|- offs (SYM is a data symbol) | ||
586 | * %REG : fetch register REG | ||
587 | * Indirect memory fetch: | ||
588 | * +|-offs(ARG) : fetch memory at the address ARG +|- offs. | ||
589 | * Alias names for args: | ||
590 | * NAME=FETCHARG : set NAME as an alias of FETCHARG. | ||
591 | */ | ||
592 | struct trace_probe *tp; | ||
593 | int i, ret = 0; | ||
594 | int is_return = 0; | ||
595 | char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; | ||
596 | unsigned long offset = 0; | ||
597 | void *addr = NULL; | ||
598 | char buf[MAX_EVENT_NAME_LEN]; | ||
599 | |||
600 | if (argc < 2) { | ||
601 | pr_info("Probe point is not specified.\n"); | ||
602 | return -EINVAL; | ||
603 | } | ||
604 | |||
605 | if (argv[0][0] == 'p') | ||
606 | is_return = 0; | ||
607 | else if (argv[0][0] == 'r') | ||
608 | is_return = 1; | ||
609 | else { | ||
610 | pr_info("Probe definition must be started with 'p' or 'r'.\n"); | ||
611 | return -EINVAL; | ||
612 | } | ||
613 | |||
614 | if (argv[0][1] == ':') { | ||
615 | event = &argv[0][2]; | ||
616 | if (strchr(event, '/')) { | ||
617 | group = event; | ||
618 | event = strchr(group, '/') + 1; | ||
619 | event[-1] = '\0'; | ||
620 | if (strlen(group) == 0) { | ||
621 | pr_info("Group name is not specifiled\n"); | ||
622 | return -EINVAL; | ||
623 | } | ||
624 | } | ||
625 | if (strlen(event) == 0) { | ||
626 | pr_info("Event name is not specifiled\n"); | ||
627 | return -EINVAL; | ||
628 | } | ||
629 | } | ||
630 | |||
631 | if (isdigit(argv[1][0])) { | ||
632 | if (is_return) { | ||
633 | pr_info("Return probe point must be a symbol.\n"); | ||
634 | return -EINVAL; | ||
635 | } | ||
636 | /* an address specified */ | ||
637 | ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); | ||
638 | if (ret) { | ||
639 | pr_info("Failed to parse address.\n"); | ||
640 | return ret; | ||
641 | } | ||
642 | } else { | ||
643 | /* a symbol specified */ | ||
644 | symbol = argv[1]; | ||
645 | /* TODO: support .init module functions */ | ||
646 | ret = split_symbol_offset(symbol, &offset); | ||
647 | if (ret) { | ||
648 | pr_info("Failed to parse symbol.\n"); | ||
649 | return ret; | ||
650 | } | ||
651 | if (offset && is_return) { | ||
652 | pr_info("Return probe must be used without offset.\n"); | ||
653 | return -EINVAL; | ||
654 | } | ||
655 | } | ||
656 | argc -= 2; argv += 2; | ||
657 | |||
658 | /* setup a probe */ | ||
659 | if (!group) | ||
660 | group = KPROBE_EVENT_SYSTEM; | ||
661 | if (!event) { | ||
662 | /* Make a new event name */ | ||
663 | if (symbol) | ||
664 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c@%s%+ld", | ||
665 | is_return ? 'r' : 'p', symbol, offset); | ||
666 | else | ||
667 | snprintf(buf, MAX_EVENT_NAME_LEN, "%c@0x%p", | ||
668 | is_return ? 'r' : 'p', addr); | ||
669 | event = buf; | ||
670 | } | ||
671 | tp = alloc_trace_probe(group, event, addr, symbol, offset, argc, | ||
672 | is_return); | ||
673 | if (IS_ERR(tp)) { | ||
674 | pr_info("Failed to allocate trace_probe.(%d)\n", | ||
675 | (int)PTR_ERR(tp)); | ||
676 | return PTR_ERR(tp); | ||
677 | } | ||
678 | |||
679 | /* parse arguments */ | ||
680 | ret = 0; | ||
681 | for (i = 0; i < argc && i < MAX_TRACE_ARGS; i++) { | ||
682 | /* Parse argument name */ | ||
683 | arg = strchr(argv[i], '='); | ||
684 | if (arg) | ||
685 | *arg++ = '\0'; | ||
686 | else | ||
687 | arg = argv[i]; | ||
688 | |||
689 | if (conflict_field_name(argv[i], tp->args, i)) { | ||
690 | pr_info("Argument%d name '%s' conflicts with " | ||
691 | "another field.\n", i, argv[i]); | ||
692 | ret = -EINVAL; | ||
693 | goto error; | ||
694 | } | ||
695 | |||
696 | tp->args[i].name = kstrdup(argv[i], GFP_KERNEL); | ||
697 | |||
698 | /* Parse fetch argument */ | ||
699 | if (strlen(arg) > MAX_ARGSTR_LEN) { | ||
700 | pr_info("Argument%d(%s) is too long.\n", i, arg); | ||
701 | ret = -ENOSPC; | ||
702 | goto error; | ||
703 | } | ||
704 | ret = parse_probe_arg(arg, &tp->args[i].fetch, is_return); | ||
705 | if (ret) { | ||
706 | pr_info("Parse error at argument%d. (%d)\n", i, ret); | ||
707 | goto error; | ||
708 | } | ||
709 | } | ||
710 | tp->nr_args = i; | ||
711 | |||
712 | ret = register_trace_probe(tp); | ||
713 | if (ret) | ||
714 | goto error; | ||
715 | return 0; | ||
716 | |||
717 | error: | ||
718 | free_trace_probe(tp); | ||
719 | return ret; | ||
720 | } | ||
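
Putting the syntax comment into practice, a few definition strings that create_trace_probe() accepts; the event, group, and symbol names are hypothetical, and the last line uses the raw-address form:

	p:myprobe do_sys_open $arg1 $arg2
	r:mygroup/myretprobe do_sys_open fd=$retval
	p:myaddr 0xc01000f0 +8($stack)
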
721 | |||
722 | static void cleanup_all_probes(void) | ||
723 | { | ||
724 | struct trace_probe *tp; | ||
725 | |||
726 | mutex_lock(&probe_lock); | ||
727 | /* TODO: Use batch unregistration */ | ||
728 | while (!list_empty(&probe_list)) { | ||
729 | tp = list_entry(probe_list.next, struct trace_probe, list); | ||
730 | unregister_trace_probe(tp); | ||
731 | free_trace_probe(tp); | ||
732 | } | ||
733 | mutex_unlock(&probe_lock); | ||
734 | } | ||
735 | |||
736 | |||
737 | /* Probes listing interfaces */ | ||
738 | static void *probes_seq_start(struct seq_file *m, loff_t *pos) | ||
739 | { | ||
740 | mutex_lock(&probe_lock); | ||
741 | return seq_list_start(&probe_list, *pos); | ||
742 | } | ||
743 | |||
744 | static void *probes_seq_next(struct seq_file *m, void *v, loff_t *pos) | ||
745 | { | ||
746 | return seq_list_next(v, &probe_list, pos); | ||
747 | } | ||
748 | |||
749 | static void probes_seq_stop(struct seq_file *m, void *v) | ||
750 | { | ||
751 | mutex_unlock(&probe_lock); | ||
752 | } | ||
753 | |||
754 | static int probes_seq_show(struct seq_file *m, void *v) | ||
755 | { | ||
756 | struct trace_probe *tp = v; | ||
757 | int i, ret; | ||
758 | char buf[MAX_ARGSTR_LEN + 1]; | ||
759 | |||
760 | seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); | ||
761 | seq_printf(m, ":%s", tp->call.name); | ||
762 | |||
763 | if (tp->symbol) | ||
764 | seq_printf(m, " %s+%u", probe_symbol(tp), tp->rp.kp.offset); | ||
765 | else | ||
766 | seq_printf(m, " 0x%p", tp->rp.kp.addr); | ||
767 | |||
768 | for (i = 0; i < tp->nr_args; i++) { | ||
769 | ret = probe_arg_string(buf, MAX_ARGSTR_LEN, &tp->args[i].fetch); | ||
770 | if (ret < 0) { | ||
771 | pr_warning("Argument%d decoding error(%d).\n", i, ret); | ||
772 | return ret; | ||
773 | } | ||
774 | seq_printf(m, " %s=%s", tp->args[i].name, buf); | ||
775 | } | ||
776 | seq_printf(m, "\n"); | ||
777 | return 0; | ||
778 | } | ||
779 | |||
780 | static const struct seq_operations probes_seq_op = { | ||
781 | .start = probes_seq_start, | ||
782 | .next = probes_seq_next, | ||
783 | .stop = probes_seq_stop, | ||
784 | .show = probes_seq_show | ||
785 | }; | ||
786 | |||
787 | static int probes_open(struct inode *inode, struct file *file) | ||
788 | { | ||
789 | if ((file->f_mode & FMODE_WRITE) && | ||
790 | (file->f_flags & O_TRUNC)) | ||
791 | cleanup_all_probes(); | ||
792 | |||
793 | return seq_open(file, &probes_seq_op); | ||
794 | } | ||
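
Note the O_TRUNC check above: opening kprobe_events for writing with truncation clears every registered probe before any new definitions are parsed. A minimal userspace illustration, assuming debugfs is mounted at /sys/kernel/debug:

	/* Equivalent of the shell's ":> kprobe_events": removes all probes */
	FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "w");
	if (f)
		fclose(f);
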
795 | |||
796 | static int command_trace_probe(const char *buf) | ||
797 | { | ||
798 | char **argv; | ||
799 | int argc = 0, ret = 0; | ||
800 | |||
801 | argv = argv_split(GFP_KERNEL, buf, &argc); | ||
802 | if (!argv) | ||
803 | return -ENOMEM; | ||
804 | |||
805 | if (argc) | ||
806 | ret = create_trace_probe(argc, argv); | ||
807 | |||
808 | argv_free(argv); | ||
809 | return ret; | ||
810 | } | ||
811 | |||
812 | #define WRITE_BUFSIZE 128 | ||
813 | |||
814 | static ssize_t probes_write(struct file *file, const char __user *buffer, | ||
815 | size_t count, loff_t *ppos) | ||
816 | { | ||
817 | char *kbuf, *tmp; | ||
818 | int ret; | ||
819 | size_t done; | ||
820 | size_t size; | ||
821 | |||
822 | kbuf = kmalloc(WRITE_BUFSIZE, GFP_KERNEL); | ||
823 | if (!kbuf) | ||
824 | return -ENOMEM; | ||
825 | |||
826 | ret = done = 0; | ||
827 | while (done < count) { | ||
828 | size = count - done; | ||
829 | if (size >= WRITE_BUFSIZE) | ||
830 | size = WRITE_BUFSIZE - 1; | ||
831 | if (copy_from_user(kbuf, buffer + done, size)) { | ||
832 | ret = -EFAULT; | ||
833 | goto out; | ||
834 | } | ||
835 | kbuf[size] = '\0'; | ||
836 | tmp = strchr(kbuf, '\n'); | ||
837 | if (tmp) { | ||
838 | *tmp = '\0'; | ||
839 | size = tmp - kbuf + 1; | ||
840 | } else if (done + size < count) { | ||
841 | pr_warning("Line length is too long: " | ||
842 | "Should be less than %d.", WRITE_BUFSIZE); | ||
843 | ret = -EINVAL; | ||
844 | goto out; | ||
845 | } | ||
846 | done += size; | ||
847 | /* Remove comments */ | ||
848 | tmp = strchr(kbuf, '#'); | ||
849 | if (tmp) | ||
850 | *tmp = '\0'; | ||
851 | |||
852 | ret = command_trace_probe(kbuf); | ||
853 | if (ret) | ||
854 | goto out; | ||
855 | } | ||
856 | ret = done; | ||
857 | out: | ||
858 | kfree(kbuf); | ||
859 | return ret; | ||
860 | } | ||
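
A sketch of the line handling in probes_write(): each newline-delimited chunk becomes one command and anything after '#' is discarded, so writing the following two lines (probe names are hypothetical) yields exactly two calls to command_trace_probe():

	p:open_entry do_sys_open $arg1   # entry probe; the comment is dropped
	r:open_exit do_sys_open $retval
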
861 | |||
862 | static const struct file_operations kprobe_events_ops = { | ||
863 | .owner = THIS_MODULE, | ||
864 | .open = probes_open, | ||
865 | .read = seq_read, | ||
866 | .llseek = seq_lseek, | ||
867 | .release = seq_release, | ||
868 | .write = probes_write, | ||
869 | }; | ||
870 | |||
871 | /* Probes profiling interfaces */ | ||
872 | static int probes_profile_seq_show(struct seq_file *m, void *v) | ||
873 | { | ||
874 | struct trace_probe *tp = v; | ||
875 | |||
876 | seq_printf(m, " %-44s %15lu %15lu\n", tp->call.name, tp->nhit, | ||
877 | tp->rp.kp.nmissed); | ||
878 | |||
879 | return 0; | ||
880 | } | ||
881 | |||
882 | static const struct seq_operations profile_seq_op = { | ||
883 | .start = probes_seq_start, | ||
884 | .next = probes_seq_next, | ||
885 | .stop = probes_seq_stop, | ||
886 | .show = probes_profile_seq_show | ||
887 | }; | ||
888 | |||
889 | static int profile_open(struct inode *inode, struct file *file) | ||
890 | { | ||
891 | return seq_open(file, &profile_seq_op); | ||
892 | } | ||
893 | |||
894 | static const struct file_operations kprobe_profile_ops = { | ||
895 | .owner = THIS_MODULE, | ||
896 | .open = profile_open, | ||
897 | .read = seq_read, | ||
898 | .llseek = seq_lseek, | ||
899 | .release = seq_release, | ||
900 | }; | ||
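
Given the " %-44s %15lu %15lu" format in probes_profile_seq_show(), each kprobe_profile line shows the event name, hit count, and missed count. The line below is a hypothetical example of the layout, not captured output:

	 myprobe                                                  123               0
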
901 | |||
902 | /* Kprobe handler */ | ||
903 | static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) | ||
904 | { | ||
905 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | ||
906 | struct kprobe_trace_entry *entry; | ||
907 | struct ring_buffer_event *event; | ||
908 | struct ring_buffer *buffer; | ||
909 | int size, i, pc; | ||
910 | unsigned long irq_flags; | ||
911 | struct ftrace_event_call *call = &tp->call; | ||
912 | |||
913 | tp->nhit++; | ||
914 | |||
915 | local_save_flags(irq_flags); | ||
916 | pc = preempt_count(); | ||
917 | |||
918 | size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); | ||
919 | |||
920 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, | ||
921 | irq_flags, pc); | ||
922 | if (!event) | ||
923 | return 0; | ||
924 | |||
925 | entry = ring_buffer_event_data(event); | ||
926 | entry->nargs = tp->nr_args; | ||
927 | entry->ip = (unsigned long)kp->addr; | ||
928 | for (i = 0; i < tp->nr_args; i++) | ||
929 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | ||
930 | |||
931 | if (!filter_current_check_discard(buffer, call, entry, event)) | ||
932 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | ||
933 | return 0; | ||
934 | } | ||
935 | |||
936 | /* Kretprobe handler */ | ||
937 | static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, | ||
938 | struct pt_regs *regs) | ||
939 | { | ||
940 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | ||
941 | struct kretprobe_trace_entry *entry; | ||
942 | struct ring_buffer_event *event; | ||
943 | struct ring_buffer *buffer; | ||
944 | int size, i, pc; | ||
945 | unsigned long irq_flags; | ||
946 | struct ftrace_event_call *call = &tp->call; | ||
947 | |||
948 | local_save_flags(irq_flags); | ||
949 | pc = preempt_count(); | ||
950 | |||
951 | size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); | ||
952 | |||
953 | event = trace_current_buffer_lock_reserve(&buffer, call->id, size, | ||
954 | irq_flags, pc); | ||
955 | if (!event) | ||
956 | return 0; | ||
957 | |||
958 | entry = ring_buffer_event_data(event); | ||
959 | entry->nargs = tp->nr_args; | ||
960 | entry->func = (unsigned long)tp->rp.kp.addr; | ||
961 | entry->ret_ip = (unsigned long)ri->ret_addr; | ||
962 | for (i = 0; i < tp->nr_args; i++) | ||
963 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | ||
964 | |||
965 | if (!filter_current_check_discard(buffer, call, entry, event)) | ||
966 | trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); | ||
967 | |||
968 | return 0; | ||
969 | } | ||
970 | |||
971 | /* Event entry printers */ | ||
972 | enum print_line_t | ||
973 | print_kprobe_event(struct trace_iterator *iter, int flags) | ||
974 | { | ||
975 | struct kprobe_trace_entry *field; | ||
976 | struct trace_seq *s = &iter->seq; | ||
977 | struct trace_event *event; | ||
978 | struct trace_probe *tp; | ||
979 | int i; | ||
980 | |||
981 | field = (struct kprobe_trace_entry *)iter->ent; | ||
982 | event = ftrace_find_event(field->ent.type); | ||
983 | tp = container_of(event, struct trace_probe, event); | ||
984 | |||
985 | if (!trace_seq_printf(s, "%s: (", tp->call.name)) | ||
986 | goto partial; | ||
987 | |||
988 | if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET)) | ||
989 | goto partial; | ||
990 | |||
991 | if (!trace_seq_puts(s, ")")) | ||
992 | goto partial; | ||
993 | |||
994 | for (i = 0; i < field->nargs; i++) | ||
995 | if (!trace_seq_printf(s, " %s=%lx", | ||
996 | tp->args[i].name, field->args[i])) | ||
997 | goto partial; | ||
998 | |||
999 | if (!trace_seq_puts(s, "\n")) | ||
1000 | goto partial; | ||
1001 | |||
1002 | return TRACE_TYPE_HANDLED; | ||
1003 | partial: | ||
1004 | return TRACE_TYPE_PARTIAL_LINE; | ||
1005 | } | ||
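
The printer above yields trace lines of the following shape; the symbol, offsets, and argument values are hypothetical, and arguments print in the "%s=%lx" hex form:

	myprobe: (do_sys_open+0x0/0xd6) dfd=ffffff9c mode=0
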
1006 | |||
1007 | enum print_line_t | ||
1008 | print_kretprobe_event(struct trace_iterator *iter, int flags) | ||
1009 | { | ||
1010 | struct kretprobe_trace_entry *field; | ||
1011 | struct trace_seq *s = &iter->seq; | ||
1012 | struct trace_event *event; | ||
1013 | struct trace_probe *tp; | ||
1014 | int i; | ||
1015 | |||
1016 | field = (struct kretprobe_trace_entry *)iter->ent; | ||
1017 | event = ftrace_find_event(field->ent.type); | ||
1018 | tp = container_of(event, struct trace_probe, event); | ||
1019 | |||
1020 | if (!trace_seq_printf(s, "%s: (", tp->call.name)) | ||
1021 | goto partial; | ||
1022 | |||
1023 | if (!seq_print_ip_sym(s, field->ret_ip, flags | TRACE_ITER_SYM_OFFSET)) | ||
1024 | goto partial; | ||
1025 | |||
1026 | if (!trace_seq_puts(s, " <- ")) | ||
1027 | goto partial; | ||
1028 | |||
1029 | if (!seq_print_ip_sym(s, field->func, flags & ~TRACE_ITER_SYM_OFFSET)) | ||
1030 | goto partial; | ||
1031 | |||
1032 | if (!trace_seq_puts(s, ")")) | ||
1033 | goto partial; | ||
1034 | |||
1035 | for (i = 0; i < field->nargs; i++) | ||
1036 | if (!trace_seq_printf(s, " %s=%lx", | ||
1037 | tp->args[i].name, field->args[i])) | ||
1038 | goto partial; | ||
1039 | |||
1040 | if (!trace_seq_puts(s, "\n")) | ||
1041 | goto partial; | ||
1042 | |||
1043 | return TRACE_TYPE_HANDLED; | ||
1044 | partial: | ||
1045 | return TRACE_TYPE_PARTIAL_LINE; | ||
1046 | } | ||
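
Return events render the return site followed by the probed function; again a hypothetical example:

	myretprobe: (sys_open+0x1b/0x30 <- do_sys_open) fd=3
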
1047 | |||
1048 | static int probe_event_enable(struct ftrace_event_call *call) | ||
1049 | { | ||
1050 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1051 | |||
1052 | tp->flags |= TP_FLAG_TRACE; | ||
1053 | if (probe_is_return(tp)) | ||
1054 | return enable_kretprobe(&tp->rp); | ||
1055 | else | ||
1056 | return enable_kprobe(&tp->rp.kp); | ||
1057 | } | ||
1058 | |||
1059 | static void probe_event_disable(struct ftrace_event_call *call) | ||
1060 | { | ||
1061 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1062 | |||
1063 | tp->flags &= ~TP_FLAG_TRACE; | ||
1064 | if (!(tp->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE))) { | ||
1065 | if (probe_is_return(tp)) | ||
1066 | disable_kretprobe(&tp->rp); | ||
1067 | else | ||
1068 | disable_kprobe(&tp->rp.kp); | ||
1069 | } | ||
1070 | } | ||
1071 | |||
1072 | static int probe_event_raw_init(struct ftrace_event_call *event_call) | ||
1073 | { | ||
1074 | INIT_LIST_HEAD(&event_call->fields); | ||
1075 | |||
1076 | return 0; | ||
1077 | } | ||
1078 | |||
1079 | #undef DEFINE_FIELD | ||
1080 | #define DEFINE_FIELD(type, item, name, is_signed) \ | ||
1081 | do { \ | ||
1082 | ret = trace_define_field(event_call, #type, name, \ | ||
1083 | offsetof(typeof(field), item), \ | ||
1084 | sizeof(field.item), is_signed, \ | ||
1085 | FILTER_OTHER); \ | ||
1086 | if (ret) \ | ||
1087 | return ret; \ | ||
1088 | } while (0) | ||
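
For reference, one invocation such as DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0) in the functions below expands to:

	ret = trace_define_field(event_call, "unsigned long", FIELD_STRING_IP,
				 offsetof(typeof(field), ip),
				 sizeof(field.ip), 0, FILTER_OTHER);
	if (ret)
		return ret;
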
1089 | |||
1090 | static int kprobe_event_define_fields(struct ftrace_event_call *event_call) | ||
1091 | { | ||
1092 | int ret, i; | ||
1093 | struct kprobe_trace_entry field; | ||
1094 | struct trace_probe *tp = (struct trace_probe *)event_call->data; | ||
1095 | |||
1096 | ret = trace_define_common_fields(event_call); | ||
1097 | if (ret) | ||
1098 | return ret; | ||
1099 | |||
1100 | DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); | ||
1101 | DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); | ||
1102 | /* Set argument names as fields */ | ||
1103 | for (i = 0; i < tp->nr_args; i++) | ||
1104 | DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); | ||
1105 | return 0; | ||
1106 | } | ||
1107 | |||
1108 | static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) | ||
1109 | { | ||
1110 | int ret, i; | ||
1111 | struct kretprobe_trace_entry field; | ||
1112 | struct trace_probe *tp = (struct trace_probe *)event_call->data; | ||
1113 | |||
1114 | ret = trace_define_common_fields(event_call); | ||
1115 | if (ret) | ||
1116 | return ret; | ||
1117 | |||
1118 | DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); | ||
1119 | DEFINE_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP, 0); | ||
1120 | DEFINE_FIELD(int, nargs, FIELD_STRING_NARGS, 1); | ||
1121 | /* Set argument names as fields */ | ||
1122 | for (i = 0; i < tp->nr_args; i++) | ||
1123 | DEFINE_FIELD(unsigned long, args[i], tp->args[i].name, 0); | ||
1124 | return 0; | ||
1125 | } | ||
1126 | |||
1127 | static int __probe_event_show_format(struct trace_seq *s, | ||
1128 | struct trace_probe *tp, const char *fmt, | ||
1129 | const char *arg) | ||
1130 | { | ||
1131 | int i; | ||
1132 | |||
1133 | /* Show format */ | ||
1134 | if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) | ||
1135 | return 0; | ||
1136 | |||
1137 | for (i = 0; i < tp->nr_args; i++) | ||
1138 | if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) | ||
1139 | return 0; | ||
1140 | |||
1141 | if (!trace_seq_printf(s, "\", %s", arg)) | ||
1142 | return 0; | ||
1143 | |||
1144 | for (i = 0; i < tp->nr_args; i++) | ||
1145 | if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) | ||
1146 | return 0; | ||
1147 | |||
1148 | return trace_seq_puts(s, "\n"); | ||
1149 | } | ||
1150 | |||
1151 | #undef SHOW_FIELD | ||
1152 | #define SHOW_FIELD(type, item, name) \ | ||
1153 | do { \ | ||
1154 | ret = trace_seq_printf(s, "\tfield: " #type " %s;\t" \ | ||
1155 | "offset:%u;\tsize:%u;\n", name, \ | ||
1156 | (unsigned int)offsetof(typeof(field), item),\ | ||
1157 | (unsigned int)sizeof(type)); \ | ||
1158 | if (!ret) \ | ||
1159 | return 0; \ | ||
1160 | } while (0) | ||
1161 | |||
1162 | static int kprobe_event_show_format(struct ftrace_event_call *call, | ||
1163 | struct trace_seq *s) | ||
1164 | { | ||
1165 | struct kprobe_trace_entry field __attribute__((unused)); | ||
1166 | int ret, i; | ||
1167 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1168 | |||
1169 | SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); | ||
1170 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | ||
1171 | |||
1172 | /* Show fields */ | ||
1173 | for (i = 0; i < tp->nr_args; i++) | ||
1174 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | ||
1175 | trace_seq_puts(s, "\n"); | ||
1176 | |||
1177 | return __probe_event_show_format(s, tp, "(%lx)", | ||
1178 | "REC->" FIELD_STRING_IP); | ||
1179 | } | ||
1180 | |||
1181 | static int kretprobe_event_show_format(struct ftrace_event_call *call, | ||
1182 | struct trace_seq *s) | ||
1183 | { | ||
1184 | struct kretprobe_trace_entry field __attribute__((unused)); | ||
1185 | int ret, i; | ||
1186 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1187 | |||
1188 | SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); | ||
1189 | SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); | ||
1190 | SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); | ||
1191 | |||
1192 | /* Show fields */ | ||
1193 | for (i = 0; i < tp->nr_args; i++) | ||
1194 | SHOW_FIELD(unsigned long, args[i], tp->args[i].name); | ||
1195 | trace_seq_puts(s, "\n"); | ||
1196 | |||
1197 | return __probe_event_show_format(s, tp, "(%lx <- %lx)", | ||
1198 | "REC->" FIELD_STRING_FUNC | ||
1199 | ", REC->" FIELD_STRING_RETIP); | ||
1200 | } | ||
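
Taken together, SHOW_FIELD() and __probe_event_show_format() emit a "format" description roughly like the sketch below for a kprobe event with one argument named fd; the offsets, sizes, and the expansion of the FIELD_STRING_* macros are illustrative assumptions:

	field: unsigned long ip;	offset:12;	size:4;
	field: int nargs;	offset:16;	size:4;
	field: unsigned long fd;	offset:20;	size:4;

	print fmt: "(%lx) fd=%lx", REC->ip, REC->fd
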
1201 | |||
1202 | #ifdef CONFIG_EVENT_PROFILE | ||
1203 | |||
1204 | /* Kprobe profile handler */ | ||
1205 | static __kprobes int kprobe_profile_func(struct kprobe *kp, | ||
1206 | struct pt_regs *regs) | ||
1207 | { | ||
1208 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | ||
1209 | struct ftrace_event_call *call = &tp->call; | ||
1210 | struct kprobe_trace_entry *entry; | ||
1211 | struct perf_trace_buf *trace_buf; | ||
1212 | struct trace_entry *ent; | ||
1213 | int size, __size, i, pc, __cpu; | ||
1214 | unsigned long irq_flags; | ||
1215 | char *raw_data; | ||
1216 | |||
1217 | pc = preempt_count(); | ||
1218 | __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); | ||
1219 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
1220 | size -= sizeof(u32); | ||
1221 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | ||
1222 | "profile buffer not large enough")) | ||
1223 | return 0; | ||
1224 | |||
1225 | /* | ||
1226 | * Protect the non-NMI buffer. | ||
1227 | * This also protects the RCU read side. | ||
1228 | */ | ||
1229 | local_irq_save(irq_flags); | ||
1230 | __cpu = smp_processor_id(); | ||
1231 | |||
1232 | if (in_nmi()) | ||
1233 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | ||
1234 | else | ||
1235 | trace_buf = rcu_dereference(perf_trace_buf); | ||
1236 | |||
1237 | if (!trace_buf) | ||
1238 | goto end; | ||
1239 | |||
1240 | trace_buf = per_cpu_ptr(trace_buf, __cpu); | ||
1241 | |||
1242 | if (trace_buf->recursion++) | ||
1243 | goto end_recursion; | ||
1244 | |||
1245 | /* | ||
1246 | * Make the recursion update visible before entering perf_tp_event | ||
1247 | * so that we are protected from perf recursion. | ||
1248 | */ | ||
1249 | barrier(); | ||
1250 | |||
1251 | raw_data = trace_buf->buf; | ||
1252 | |||
1253 | /* Zero dead bytes from alignment to avoid buffer leak to userspace */ | ||
1254 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
1255 | entry = (struct kprobe_trace_entry *)raw_data; | ||
1256 | ent = &entry->ent; | ||
1257 | |||
1258 | tracing_generic_entry_update(ent, irq_flags, pc); | ||
1259 | ent->type = call->id; | ||
1260 | entry->nargs = tp->nr_args; | ||
1261 | entry->ip = (unsigned long)kp->addr; | ||
1262 | for (i = 0; i < tp->nr_args; i++) | ||
1263 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | ||
1264 | perf_tp_event(call->id, entry->ip, 1, entry, size); | ||
1265 | |||
1266 | end_recursion: | ||
1267 | trace_buf->recursion--; | ||
1268 | end: | ||
1269 | local_irq_restore(irq_flags); | ||
1270 | |||
1271 | return 0; | ||
1272 | } | ||
1273 | |||
1274 | /* Kretprobe profile handler */ | ||
1275 | static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, | ||
1276 | struct pt_regs *regs) | ||
1277 | { | ||
1278 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | ||
1279 | struct ftrace_event_call *call = &tp->call; | ||
1280 | struct kretprobe_trace_entry *entry; | ||
1281 | struct perf_trace_buf *trace_buf; | ||
1282 | struct trace_entry *ent; | ||
1283 | int size, __size, i, pc, __cpu; | ||
1284 | unsigned long irq_flags; | ||
1285 | char *raw_data; | ||
1286 | |||
1287 | pc = preempt_count(); | ||
1288 | __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); | ||
1289 | size = ALIGN(__size + sizeof(u32), sizeof(u64)); | ||
1290 | size -= sizeof(u32); | ||
1291 | if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, | ||
1292 | "profile buffer not large enough")) | ||
1293 | return 0; | ||
1294 | |||
1295 | /* | ||
1296 | * Protect the non-NMI buffer. | ||
1297 | * This also protects the RCU read side. | ||
1298 | */ | ||
1299 | local_irq_save(irq_flags); | ||
1300 | __cpu = smp_processor_id(); | ||
1301 | |||
1302 | if (in_nmi()) | ||
1303 | trace_buf = rcu_dereference(perf_trace_buf_nmi); | ||
1304 | else | ||
1305 | trace_buf = rcu_dereference(perf_trace_buf); | ||
1306 | |||
1307 | if (!trace_buf) | ||
1308 | goto end; | ||
1309 | |||
1310 | trace_buf = per_cpu_ptr(trace_buf, __cpu); | ||
1311 | |||
1312 | if (trace_buf->recursion++) | ||
1313 | goto end_recursion; | ||
1314 | |||
1315 | /* | ||
1316 | * Make the recursion update visible before entering perf_tp_event | ||
1317 | * so that we are protected from perf recursion. | ||
1318 | */ | ||
1319 | barrier(); | ||
1320 | |||
1321 | raw_data = trace_buf->buf; | ||
1322 | |||
1323 | /* Zero dead bytes from alignment to avoid buffer leak to userspace */ | ||
1324 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | ||
1325 | entry = (struct kretprobe_trace_entry *)raw_data; | ||
1326 | ent = &entry->ent; | ||
1327 | |||
1328 | tracing_generic_entry_update(ent, irq_flags, pc); | ||
1329 | ent->type = call->id; | ||
1330 | entry->nargs = tp->nr_args; | ||
1331 | entry->func = (unsigned long)tp->rp.kp.addr; | ||
1332 | entry->ret_ip = (unsigned long)ri->ret_addr; | ||
1333 | for (i = 0; i < tp->nr_args; i++) | ||
1334 | entry->args[i] = call_fetch(&tp->args[i].fetch, regs); | ||
1335 | perf_tp_event(call->id, entry->ret_ip, 1, entry, size); | ||
1336 | |||
1337 | end_recursion: | ||
1338 | trace_buf->recursion--; | ||
1339 | end: | ||
1340 | local_irq_restore(irq_flags); | ||
1341 | |||
1342 | return 0; | ||
1343 | } | ||
1344 | |||
1345 | static int probe_profile_enable(struct ftrace_event_call *call) | ||
1346 | { | ||
1347 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1348 | |||
1349 | tp->flags |= TP_FLAG_PROFILE; | ||
1350 | |||
1351 | if (probe_is_return(tp)) | ||
1352 | return enable_kretprobe(&tp->rp); | ||
1353 | else | ||
1354 | return enable_kprobe(&tp->rp.kp); | ||
1355 | } | ||
1356 | |||
1357 | static void probe_profile_disable(struct ftrace_event_call *call) | ||
1358 | { | ||
1359 | struct trace_probe *tp = (struct trace_probe *)call->data; | ||
1360 | |||
1361 | tp->flags &= ~TP_FLAG_PROFILE; | ||
1362 | |||
1363 | if (!(tp->flags & TP_FLAG_TRACE)) { | ||
1364 | if (probe_is_return(tp)) | ||
1365 | disable_kretprobe(&tp->rp); | ||
1366 | else | ||
1367 | disable_kprobe(&tp->rp.kp); | ||
1368 | } | ||
1369 | } | ||
1370 | #endif /* CONFIG_EVENT_PROFILE */ | ||
1371 | |||
1372 | |||
1373 | static __kprobes | ||
1374 | int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) | ||
1375 | { | ||
1376 | struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); | ||
1377 | |||
1378 | if (tp->flags & TP_FLAG_TRACE) | ||
1379 | kprobe_trace_func(kp, regs); | ||
1380 | #ifdef CONFIG_EVENT_PROFILE | ||
1381 | if (tp->flags & TP_FLAG_PROFILE) | ||
1382 | kprobe_profile_func(kp, regs); | ||
1383 | #endif /* CONFIG_EVENT_PROFILE */ | ||
1384 | return 0; /* We don't tweak the kernel, so just return 0 */ | ||
1385 | } | ||
1386 | |||
1387 | static __kprobes | ||
1388 | int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
1389 | { | ||
1390 | struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); | ||
1391 | |||
1392 | if (tp->flags & TP_FLAG_TRACE) | ||
1393 | kretprobe_trace_func(ri, regs); | ||
1394 | #ifdef CONFIG_EVENT_PROFILE | ||
1395 | if (tp->flags & TP_FLAG_PROFILE) | ||
1396 | kretprobe_profile_func(ri, regs); | ||
1397 | #endif /* CONFIG_EVENT_PROFILE */ | ||
1398 | return 0; /* We don't tweak the kernel, so just return 0 */ | ||
1399 | } | ||
1400 | |||
1401 | static int register_probe_event(struct trace_probe *tp) | ||
1402 | { | ||
1403 | struct ftrace_event_call *call = &tp->call; | ||
1404 | int ret; | ||
1405 | |||
1406 | /* Initialize ftrace_event_call */ | ||
1407 | if (probe_is_return(tp)) { | ||
1408 | tp->event.trace = print_kretprobe_event; | ||
1409 | call->raw_init = probe_event_raw_init; | ||
1410 | call->show_format = kretprobe_event_show_format; | ||
1411 | call->define_fields = kretprobe_event_define_fields; | ||
1412 | } else { | ||
1413 | tp->event.trace = print_kprobe_event; | ||
1414 | call->raw_init = probe_event_raw_init; | ||
1415 | call->show_format = kprobe_event_show_format; | ||
1416 | call->define_fields = kprobe_event_define_fields; | ||
1417 | } | ||
1418 | call->event = &tp->event; | ||
1419 | call->id = register_ftrace_event(&tp->event); | ||
1420 | if (!call->id) | ||
1421 | return -ENODEV; | ||
1422 | call->enabled = 0; | ||
1423 | call->regfunc = probe_event_enable; | ||
1424 | call->unregfunc = probe_event_disable; | ||
1425 | |||
1426 | #ifdef CONFIG_EVENT_PROFILE | ||
1427 | atomic_set(&call->profile_count, -1); | ||
1428 | call->profile_enable = probe_profile_enable; | ||
1429 | call->profile_disable = probe_profile_disable; | ||
1430 | #endif | ||
1431 | call->data = tp; | ||
1432 | ret = trace_add_event_call(call); | ||
1433 | if (ret) { | ||
1434 | pr_info("Failed to register kprobe event: %s\n", call->name); | ||
1435 | unregister_ftrace_event(&tp->event); | ||
1436 | } | ||
1437 | return ret; | ||
1438 | } | ||
1439 | |||
1440 | static void unregister_probe_event(struct trace_probe *tp) | ||
1441 | { | ||
1442 | /* tp->event is unregistered in trace_remove_event_call() */ | ||
1443 | trace_remove_event_call(&tp->call); | ||
1444 | } | ||
1445 | |||
1446 | /* Make a debugfs interface for controlling probe points */ | ||
1447 | static __init int init_kprobe_trace(void) | ||
1448 | { | ||
1449 | struct dentry *d_tracer; | ||
1450 | struct dentry *entry; | ||
1451 | |||
1452 | d_tracer = tracing_init_dentry(); | ||
1453 | if (!d_tracer) | ||
1454 | return 0; | ||
1455 | |||
1456 | entry = debugfs_create_file("kprobe_events", 0644, d_tracer, | ||
1457 | NULL, &kprobe_events_ops); | ||
1458 | |||
1459 | /* Event list interface */ | ||
1460 | if (!entry) | ||
1461 | pr_warning("Could not create debugfs " | ||
1462 | "'kprobe_events' entry\n"); | ||
1463 | |||
1464 | /* Profile interface */ | ||
1465 | entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, | ||
1466 | NULL, &kprobe_profile_ops); | ||
1467 | |||
1468 | if (!entry) | ||
1469 | pr_warning("Could not create debugfs " | ||
1470 | "'kprobe_profile' entry\n"); | ||
1471 | return 0; | ||
1472 | } | ||
1473 | fs_initcall(init_kprobe_trace); | ||
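
As a usage sketch, a small userspace program driving the two debugfs files created above; the path assumes debugfs is mounted at /sys/kernel/debug and the probe definition is hypothetical:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "w");
		char line[256];

		if (!f)
			return 1;
		/* One definition per line, exactly as parsed by probes_write() */
		fprintf(f, "p:myprobe do_sys_open $arg1 $arg2\n");
		fclose(f);

		/* Read back the hit/miss counters from the profile interface */
		f = fopen("/sys/kernel/debug/tracing/kprobe_profile", "r");
		if (!f)
			return 1;
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
		return 0;
	}
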
1474 | |||
1475 | |||
1476 | #ifdef CONFIG_FTRACE_STARTUP_TEST | ||
1477 | |||
1478 | static int kprobe_trace_selftest_target(int a1, int a2, int a3, | ||
1479 | int a4, int a5, int a6) | ||
1480 | { | ||
1481 | return a1 + a2 + a3 + a4 + a5 + a6; | ||
1482 | } | ||
1483 | |||
1484 | static __init int kprobe_trace_self_tests_init(void) | ||
1485 | { | ||
1486 | int ret; | ||
1487 | int (*target)(int, int, int, int, int, int); | ||
1488 | |||
1489 | target = kprobe_trace_selftest_target; | ||
1490 | |||
1491 | pr_info("Testing kprobe tracing: "); | ||
1492 | |||
1493 | ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " | ||
1494 | "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); | ||
1495 | if (WARN_ON_ONCE(ret)) | ||
1496 | pr_warning("error enabling function entry\n"); | ||
1497 | |||
1498 | ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " | ||
1499 | "$retval"); | ||
1500 | if (WARN_ON_ONCE(ret)) | ||
1501 | pr_warning("error enabling function return\n"); | ||
1502 | |||
1503 | ret = target(1, 2, 3, 4, 5, 6); | ||
1504 | |||
1505 | cleanup_all_probes(); | ||
1506 | |||
1507 | pr_cont("OK\n"); | ||
1508 | return 0; | ||
1509 | } | ||
1510 | |||
1511 | late_initcall(kprobe_trace_self_tests_init); | ||
1512 | |||
1513 | #endif | ||
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ed17565826b0..b6c12c6a1bcd 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c | |||
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) | |||
69 | * @s: trace sequence descriptor | 69 | * @s: trace sequence descriptor |
70 | * @fmt: printf format string | 70 | * @fmt: printf format string |
71 | * | 71 | * |
72 | * Returns 0 if the trace would overflow the buffer's | ||
73 | * free space, 1 otherwise. | ||
74 | * | ||
72 | * The tracer may use either sequence operations or its own | 75 | * The tracer may use either sequence operations or its own |
73 | * copy to user routines. To simplify formatting of a trace, | 76 | * copy to user routines. To simplify formatting of a trace, |
74 | * trace_seq_printf is used to store strings into a special | 77 | * trace_seq_printf is used to store strings into a special |
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) | |||
95 | 98 | ||
96 | s->len += ret; | 99 | s->len += ret; |
97 | 100 | ||
98 | return len; | 101 | return 1; |
99 | } | 102 | } |
100 | EXPORT_SYMBOL_GPL(trace_seq_printf); | 103 | EXPORT_SYMBOL_GPL(trace_seq_printf); |
101 | 104 | ||
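
The boolean return matches how the kprobe printers above consume trace_seq_printf(); callers bail out to a partial-line result once the buffer is full, e.g.:

	if (!trace_seq_printf(s, "%s: (", tp->call.name))
		goto partial;	/* buffer full: TRACE_TYPE_PARTIAL_LINE */
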
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index d00d1a8f1f26..51213b0aa81b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c | |||
@@ -354,13 +354,13 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) | |||
354 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); | 354 | trace_current_buffer_unlock_commit(buffer, event, 0, 0); |
355 | } | 355 | } |
356 | 356 | ||
357 | int reg_event_syscall_enter(void *ptr) | 357 | int reg_event_syscall_enter(struct ftrace_event_call *call) |
358 | { | 358 | { |
359 | int ret = 0; | 359 | int ret = 0; |
360 | int num; | 360 | int num; |
361 | char *name; | 361 | char *name; |
362 | 362 | ||
363 | name = (char *)ptr; | 363 | name = (char *)call->data; |
364 | num = syscall_name_to_nr(name); | 364 | num = syscall_name_to_nr(name); |
365 | if (num < 0 || num >= NR_syscalls) | 365 | if (num < 0 || num >= NR_syscalls) |
366 | return -ENOSYS; | 366 | return -ENOSYS; |
@@ -378,12 +378,12 @@ int reg_event_syscall_enter(void *ptr) | |||
378 | return ret; | 378 | return ret; |
379 | } | 379 | } |
380 | 380 | ||
381 | void unreg_event_syscall_enter(void *ptr) | 381 | void unreg_event_syscall_enter(struct ftrace_event_call *call) |
382 | { | 382 | { |
383 | int num; | 383 | int num; |
384 | char *name; | 384 | char *name; |
385 | 385 | ||
386 | name = (char *)ptr; | 386 | name = (char *)call->data; |
387 | num = syscall_name_to_nr(name); | 387 | num = syscall_name_to_nr(name); |
388 | if (num < 0 || num >= NR_syscalls) | 388 | if (num < 0 || num >= NR_syscalls) |
389 | return; | 389 | return; |
@@ -395,13 +395,13 @@ void unreg_event_syscall_enter(void *ptr) | |||
395 | mutex_unlock(&syscall_trace_lock); | 395 | mutex_unlock(&syscall_trace_lock); |
396 | } | 396 | } |
397 | 397 | ||
398 | int reg_event_syscall_exit(void *ptr) | 398 | int reg_event_syscall_exit(struct ftrace_event_call *call) |
399 | { | 399 | { |
400 | int ret = 0; | 400 | int ret = 0; |
401 | int num; | 401 | int num; |
402 | char *name; | 402 | char *name; |
403 | 403 | ||
404 | name = (char *)ptr; | 404 | name = call->data; |
405 | num = syscall_name_to_nr(name); | 405 | num = syscall_name_to_nr(name); |
406 | if (num < 0 || num >= NR_syscalls) | 406 | if (num < 0 || num >= NR_syscalls) |
407 | return -ENOSYS; | 407 | return -ENOSYS; |
@@ -419,12 +419,12 @@ int reg_event_syscall_exit(void *ptr) | |||
419 | return ret; | 419 | return ret; |
420 | } | 420 | } |
421 | 421 | ||
422 | void unreg_event_syscall_exit(void *ptr) | 422 | void unreg_event_syscall_exit(struct ftrace_event_call *call) |
423 | { | 423 | { |
424 | int num; | 424 | int num; |
425 | char *name; | 425 | char *name; |
426 | 426 | ||
427 | name = (char *)ptr; | 427 | name = call->data; |
428 | num = syscall_name_to_nr(name); | 428 | num = syscall_name_to_nr(name); |
429 | if (num < 0 || num >= NR_syscalls) | 429 | if (num < 0 || num >= NR_syscalls) |
430 | return; | 430 | return; |
@@ -477,6 +477,7 @@ static int sys_prof_refcount_exit; | |||
477 | static void prof_syscall_enter(struct pt_regs *regs, long id) | 477 | static void prof_syscall_enter(struct pt_regs *regs, long id) |
478 | { | 478 | { |
479 | struct syscall_metadata *sys_data; | 479 | struct syscall_metadata *sys_data; |
480 | struct perf_trace_buf *trace_buf; | ||
480 | struct syscall_trace_enter *rec; | 481 | struct syscall_trace_enter *rec; |
481 | unsigned long flags; | 482 | unsigned long flags; |
482 | char *raw_data; | 483 | char *raw_data; |
@@ -507,14 +508,25 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
507 | cpu = smp_processor_id(); | 508 | cpu = smp_processor_id(); |
508 | 509 | ||
509 | if (in_nmi()) | 510 | if (in_nmi()) |
510 | raw_data = rcu_dereference(trace_profile_buf_nmi); | 511 | trace_buf = rcu_dereference(perf_trace_buf_nmi); |
511 | else | 512 | else |
512 | raw_data = rcu_dereference(trace_profile_buf); | 513 | trace_buf = rcu_dereference(perf_trace_buf); |
513 | 514 | ||
514 | if (!raw_data) | 515 | if (!trace_buf) |
515 | goto end; | 516 | goto end; |
516 | 517 | ||
517 | raw_data = per_cpu_ptr(raw_data, cpu); | 518 | trace_buf = per_cpu_ptr(trace_buf, cpu); |
519 | |||
520 | if (trace_buf->recursion++) | ||
521 | goto end_recursion; | ||
522 | |||
523 | /* | ||
524 | * Make the recursion update visible before entering perf_tp_event | ||
525 | * so that we are protected from perf recursion. | ||
526 | */ | ||
527 | barrier(); | ||
528 | |||
529 | raw_data = trace_buf->buf; | ||
518 | 530 | ||
519 | /* zero the dead bytes from align to not leak stack to user */ | 531 | /* zero the dead bytes from align to not leak stack to user */ |
520 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | 532 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; |
@@ -527,6 +539,8 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) | |||
527 | (unsigned long *)&rec->args); | 539 | (unsigned long *)&rec->args); |
528 | perf_tp_event(sys_data->enter_id, 0, 1, rec, size); | 540 | perf_tp_event(sys_data->enter_id, 0, 1, rec, size); |
529 | 541 | ||
542 | end_recursion: | ||
543 | trace_buf->recursion--; | ||
530 | end: | 544 | end: |
531 | local_irq_restore(flags); | 545 | local_irq_restore(flags); |
532 | } | 546 | } |
@@ -574,6 +588,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
574 | { | 588 | { |
575 | struct syscall_metadata *sys_data; | 589 | struct syscall_metadata *sys_data; |
576 | struct syscall_trace_exit *rec; | 590 | struct syscall_trace_exit *rec; |
591 | struct perf_trace_buf *trace_buf; | ||
577 | unsigned long flags; | 592 | unsigned long flags; |
578 | int syscall_nr; | 593 | int syscall_nr; |
579 | char *raw_data; | 594 | char *raw_data; |
@@ -605,14 +620,25 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
605 | cpu = smp_processor_id(); | 620 | cpu = smp_processor_id(); |
606 | 621 | ||
607 | if (in_nmi()) | 622 | if (in_nmi()) |
608 | raw_data = rcu_dereference(trace_profile_buf_nmi); | 623 | trace_buf = rcu_dereference(perf_trace_buf_nmi); |
609 | else | 624 | else |
610 | raw_data = rcu_dereference(trace_profile_buf); | 625 | trace_buf = rcu_dereference(perf_trace_buf); |
611 | 626 | ||
612 | if (!raw_data) | 627 | if (!trace_buf) |
613 | goto end; | 628 | goto end; |
614 | 629 | ||
615 | raw_data = per_cpu_ptr(raw_data, cpu); | 630 | trace_buf = per_cpu_ptr(trace_buf, cpu); |
631 | |||
632 | if (trace_buf->recursion++) | ||
633 | goto end_recursion; | ||
634 | |||
635 | /* | ||
636 | * Make the recursion update visible before entering perf_tp_event | ||
637 | * so that we are protected from perf recursion. | ||
638 | */ | ||
639 | barrier(); | ||
640 | |||
641 | raw_data = trace_buf->buf; | ||
616 | 642 | ||
617 | /* zero the dead bytes from align to not leak stack to user */ | 643 | /* zero the dead bytes from align to not leak stack to user */ |
618 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; | 644 | *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; |
@@ -626,6 +652,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) | |||
626 | 652 | ||
627 | perf_tp_event(sys_data->exit_id, 0, 1, rec, size); | 653 | perf_tp_event(sys_data->exit_id, 0, 1, rec, size); |
628 | 654 | ||
655 | end_recursion: | ||
656 | trace_buf->recursion--; | ||
629 | end: | 657 | end: |
630 | local_irq_restore(flags); | 658 | local_irq_restore(flags); |
631 | } | 659 | } |
diff --git a/kernel/user.c b/kernel/user.c index 2c000e7132ac..46d0165ca70c 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -330,9 +330,9 @@ done: | |||
330 | */ | 330 | */ |
331 | static void free_user(struct user_struct *up, unsigned long flags) | 331 | static void free_user(struct user_struct *up, unsigned long flags) |
332 | { | 332 | { |
333 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
334 | INIT_DELAYED_WORK(&up->work, cleanup_user_struct); | 333 | INIT_DELAYED_WORK(&up->work, cleanup_user_struct); |
335 | schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); | 334 | schedule_delayed_work(&up->work, msecs_to_jiffies(1000)); |
335 | spin_unlock_irqrestore(&uidhash_lock, flags); | ||
336 | } | 336 | } |
337 | 337 | ||
338 | #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ | 338 | #else /* CONFIG_USER_SCHED && CONFIG_SYSFS */ |
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ccefe574dcf7..12328147132c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -647,7 +647,7 @@ EXPORT_SYMBOL(schedule_delayed_work); | |||
647 | */ | 647 | */ |
648 | void flush_delayed_work(struct delayed_work *dwork) | 648 | void flush_delayed_work(struct delayed_work *dwork) |
649 | { | 649 | { |
650 | if (del_timer(&dwork->timer)) { | 650 | if (del_timer_sync(&dwork->timer)) { |
651 | struct cpu_workqueue_struct *cwq; | 651 | struct cpu_workqueue_struct *cwq; |
652 | cwq = wq_per_cpu(keventd_wq, get_cpu()); | 652 | cwq = wq_per_cpu(keventd_wq, get_cpu()); |
653 | __queue_work(cwq, &dwork->work); | 653 | __queue_work(cwq, &dwork->work); |
@@ -685,21 +685,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
685 | int schedule_on_each_cpu(work_func_t func) | 685 | int schedule_on_each_cpu(work_func_t func) |
686 | { | 686 | { |
687 | int cpu; | 687 | int cpu; |
688 | int orig = -1; | ||
688 | struct work_struct *works; | 689 | struct work_struct *works; |
689 | 690 | ||
690 | works = alloc_percpu(struct work_struct); | 691 | works = alloc_percpu(struct work_struct); |
691 | if (!works) | 692 | if (!works) |
692 | return -ENOMEM; | 693 | return -ENOMEM; |
693 | 694 | ||
695 | /* | ||
696 | * When running in keventd, don't schedule a work item on itself. | ||
697 | * Can just call directly because the work queue is already bound. | ||
698 | * This is also faster. | ||
699 | * Make this a generic parameter for other workqueues? | ||
700 | */ | ||
701 | if (current_is_keventd()) { | ||
702 | orig = raw_smp_processor_id(); | ||
703 | INIT_WORK(per_cpu_ptr(works, orig), func); | ||
704 | func(per_cpu_ptr(works, orig)); | ||
705 | } | ||
706 | |||
694 | get_online_cpus(); | 707 | get_online_cpus(); |
695 | for_each_online_cpu(cpu) { | 708 | for_each_online_cpu(cpu) { |
696 | struct work_struct *work = per_cpu_ptr(works, cpu); | 709 | struct work_struct *work = per_cpu_ptr(works, cpu); |
697 | 710 | ||
711 | if (cpu == orig) | ||
712 | continue; | ||
698 | INIT_WORK(work, func); | 713 | INIT_WORK(work, func); |
699 | schedule_work_on(cpu, work); | 714 | schedule_work_on(cpu, work); |
700 | } | 715 | } |
701 | for_each_online_cpu(cpu) | 716 | for_each_online_cpu(cpu) { |
702 | flush_work(per_cpu_ptr(works, cpu)); | 717 | if (cpu != orig) |
718 | flush_work(per_cpu_ptr(works, cpu)); | ||
719 | } | ||
703 | put_online_cpus(); | 720 | put_online_cpus(); |
704 | free_percpu(works); | 721 | free_percpu(works); |
705 | return 0; | 722 | return 0; |