author    Ingo Molnar <mingo@elte.hu>    2010-03-04 05:47:50 -0500
committer Ingo Molnar <mingo@elte.hu>    2010-03-04 05:47:52 -0500
commit    4f16d4e0c9a4b20d9f0db365587b96d6001efd7d
tree      fa25dcf285b26f1fac2bf267d0d1cd2c4eba90b8 /kernel
parent    1e259e0a9982078896f3404240096cbea01daca4
parent    6630125419ef37ff8781713c5e9d416f2a4ba357
Merge branch 'perf/core' into perf/urgent
Merge reason: Switch from pre-merge topical split to the post-merge urgent track.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/futex.c                       30
-rw-r--r--  kernel/hw_breakpoint.c               10
-rw-r--r--  kernel/kfifo.c                        3
-rw-r--r--  kernel/kgdb.c                         6
-rw-r--r--  kernel/kprobes.c                     34
-rw-r--r--  kernel/perf_event.c                 642
-rw-r--r--  kernel/sched.c                       12
-rw-r--r--  kernel/softirq.c                     15
-rw-r--r--  kernel/softlockup.c                  15
-rw-r--r--  kernel/sys.c                          2
-rw-r--r--  kernel/time/timekeeping.c             2
-rw-r--r--  kernel/trace/Makefile                 4
-rw-r--r--  kernel/trace/ftrace.c                54
-rw-r--r--  kernel/trace/trace_event_profile.c   52
-rw-r--r--  kernel/trace/trace_events_filter.c    4
-rw-r--r--  kernel/trace/trace_kprobe.c         198
-rw-r--r--  kernel/trace/trace_stack.c           24
-rw-r--r--  kernel/trace/trace_syscalls.c        76
18 files changed, 666 insertions, 517 deletions
diff --git a/kernel/futex.c b/kernel/futex.c
index d9b3a2228f9d..e7a35f1039e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
530 return -EINVAL; 530 return -EINVAL;
531 531
532 WARN_ON(!atomic_read(&pi_state->refcount)); 532 WARN_ON(!atomic_read(&pi_state->refcount));
533 WARN_ON(pid && pi_state->owner && 533
534 pi_state->owner->pid != pid); 534 /*
535 * When pi_state->owner is NULL then the owner died
536 * and another waiter is on the fly. pi_state->owner
537 * is fixed up by the task which acquires
538 * pi_state->rt_mutex.
539 *
540 * We do not check for pid == 0 which can happen when
541 * the owner died and robust_list_exit() cleared the
542 * TID.
543 */
544 if (pid && pi_state->owner) {
545 /*
546 * Bail out if user space manipulated the
547 * futex value.
548 */
549 if (pid != task_pid_vnr(pi_state->owner))
550 return -EINVAL;
551 }
535 552
536 atomic_inc(&pi_state->refcount); 553 atomic_inc(&pi_state->refcount);
537 *ps = pi_state; 554 *ps = pi_state;
@@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
758 if (!pi_state) 775 if (!pi_state)
759 return -EINVAL; 776 return -EINVAL;
760 777
778 /*
779 * If current does not own the pi_state then the futex is
780 * inconsistent and user space fiddled with the futex value.
781 */
782 if (pi_state->owner != current)
783 return -EINVAL;
784
761 raw_spin_lock(&pi_state->pi_mutex.wait_lock); 785 raw_spin_lock(&pi_state->pi_mutex.wait_lock);
762 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); 786 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
763 787
@@ -1971,7 +1995,7 @@ retry_private:
1971 /* Unqueue and drop the lock */ 1995 /* Unqueue and drop the lock */
1972 unqueue_me_pi(&q); 1996 unqueue_me_pi(&q);
1973 1997
1974 goto out; 1998 goto out_put_key;
1975 1999
1976out_unlock_put_key: 2000out_unlock_put_key:
1977 queue_unlock(&q, hb); 2001 queue_unlock(&q, hb);
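
The owner/TID consistency rule that lookup_pi_state() gains in the hunk above can be read as a stand-alone predicate. A minimal sketch of the same logic, with a hypothetical helper name that is not part of this patch:

/*
 * Illustrative restatement of the check added to lookup_pi_state():
 * returns 0 when the user-space futex value is consistent with the
 * kernel's pi_state, -EINVAL when user space manipulated it.
 */
static int pi_state_owner_consistent(u32 pid, struct futex_pi_state *pi_state)
{
	/*
	 * pi_state->owner == NULL: the owner died and the next waiter
	 * fixes ownership up when it acquires pi_state->rt_mutex.
	 * pid == 0: the owner died and robust_list_exit() cleared the
	 * TID.  Neither case is an error.
	 */
	if (!pid || !pi_state->owner)
		return 0;

	/* A mismatch means user space fiddled with the futex value. */
	return pid == task_pid_vnr(pi_state->owner) ? 0 : -EINVAL;
}
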
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 4d99512ee149..03808ed342a6 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
413 * 413 *
414 * @return a set of per_cpu pointers to perf events 414 * @return a set of per_cpu pointers to perf events
415 */ 415 */
416struct perf_event ** 416struct perf_event * __percpu *
417register_wide_hw_breakpoint(struct perf_event_attr *attr, 417register_wide_hw_breakpoint(struct perf_event_attr *attr,
418 perf_overflow_handler_t triggered) 418 perf_overflow_handler_t triggered)
419{ 419{
420 struct perf_event **cpu_events, **pevent, *bp; 420 struct perf_event * __percpu *cpu_events, **pevent, *bp;
421 long err; 421 long err;
422 int cpu; 422 int cpu;
423 423
424 cpu_events = alloc_percpu(typeof(*cpu_events)); 424 cpu_events = alloc_percpu(typeof(*cpu_events));
425 if (!cpu_events) 425 if (!cpu_events)
426 return ERR_PTR(-ENOMEM); 426 return (void __percpu __force *)ERR_PTR(-ENOMEM);
427 427
428 get_online_cpus(); 428 get_online_cpus();
429 for_each_online_cpu(cpu) { 429 for_each_online_cpu(cpu) {
@@ -451,7 +451,7 @@ fail:
451 put_online_cpus(); 451 put_online_cpus();
452 452
453 free_percpu(cpu_events); 453 free_percpu(cpu_events);
454 return ERR_PTR(err); 454 return (void __percpu __force *)ERR_PTR(err);
455} 455}
456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); 456EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
457 457
@@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel 459 * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel
460 * @cpu_events: the per cpu set of events to unregister 460 * @cpu_events: the per cpu set of events to unregister
461 */ 461 */
462void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) 462void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
463{ 463{
464 int cpu; 464 int cpu;
465 struct perf_event **pevent; 465 struct perf_event **pevent;
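
With the __percpu annotation introduced above, sparse-clean callers have to strip the address-space marker with __force before handing the return value to IS_ERR()/PTR_ERR(), mirroring the casts register_wide_hw_breakpoint() now applies on its error paths. A sketch of the assumed calling pattern (the wrapper function below is illustrative only):

static int wide_bp_example(struct perf_event_attr *attr,
			   perf_overflow_handler_t triggered)
{
	struct perf_event * __percpu *cpu_events;

	cpu_events = register_wide_hw_breakpoint(attr, triggered);
	if (IS_ERR((void __force *)cpu_events))
		return PTR_ERR((void __force *)cpu_events);

	/* ... consume per_cpu(*cpu_events, cpu) on each online CPU ... */

	unregister_wide_hw_breakpoint(cpu_events);
	return 0;
}
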
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 498cabba225e..35edbe22e9a9 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
80 80
81 buffer = kmalloc(size, gfp_mask); 81 buffer = kmalloc(size, gfp_mask);
82 if (!buffer) { 82 if (!buffer) {
83 _kfifo_init(fifo, 0, 0); 83 _kfifo_init(fifo, NULL, 0);
84 return -ENOMEM; 84 return -ENOMEM;
85 } 85 }
86 86
@@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc);
97void kfifo_free(struct kfifo *fifo) 97void kfifo_free(struct kfifo *fifo)
98{ 98{
99 kfree(fifo->buffer); 99 kfree(fifo->buffer);
100 _kfifo_init(fifo, NULL, 0);
100} 101}
101EXPORT_SYMBOL(kfifo_free); 102EXPORT_SYMBOL(kfifo_free);
102 103
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index c7ade62e4ef0..761fdd2b3034 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -599,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs)
599 599
600 /* Signal the primary CPU that we are done: */ 600 /* Signal the primary CPU that we are done: */
601 atomic_set(&cpu_in_kgdb[cpu], 0); 601 atomic_set(&cpu_in_kgdb[cpu], 0);
602 touch_softlockup_watchdog(); 602 touch_softlockup_watchdog_sync();
603 clocksource_touch_watchdog(); 603 clocksource_touch_watchdog();
604 local_irq_restore(flags); 604 local_irq_restore(flags);
605} 605}
@@ -1453,7 +1453,7 @@ acquirelock:
1453 (kgdb_info[cpu].task && 1453 (kgdb_info[cpu].task &&
1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { 1454 kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
1455 atomic_set(&kgdb_active, -1); 1455 atomic_set(&kgdb_active, -1);
1456 touch_softlockup_watchdog(); 1456 touch_softlockup_watchdog_sync();
1457 clocksource_touch_watchdog(); 1457 clocksource_touch_watchdog();
1458 local_irq_restore(flags); 1458 local_irq_restore(flags);
1459 1459
@@ -1553,7 +1553,7 @@ kgdb_restore:
1553 } 1553 }
1554 /* Free kgdb_active */ 1554 /* Free kgdb_active */
1555 atomic_set(&kgdb_active, -1); 1555 atomic_set(&kgdb_active, -1);
1556 touch_softlockup_watchdog(); 1556 touch_softlockup_watchdog_sync();
1557 clocksource_touch_watchdog(); 1557 clocksource_touch_watchdog();
1558 local_irq_restore(flags); 1558 local_irq_restore(flags);
1559 1559
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..ccec774c716d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h> 46#include <linux/memory.h>
47#include <linux/ftrace.h>
47 48
48#include <asm-generic/sections.h> 49#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 94 {"native_get_debugreg",},
94 {"irq_entries_start",}, 95 {"irq_entries_start",},
95 {"common_interrupt",}, 96 {"common_interrupt",},
97 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 98 {NULL} /* Terminator */
97}; 99};
98 100
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
124static int kprobe_garbage_slots; 126static int kprobe_garbage_slots;
125static int collect_garbage_slots(void); 127static int collect_garbage_slots(void);
126 128
127static int __kprobes check_safety(void)
128{
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150
151/** 129/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 130 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 131 * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
235{ 213{
236 struct kprobe_insn_page *kip, *next; 214 struct kprobe_insn_page *kip, *next;
237 215
238 /* Ensure no-one is preepmted on the garbages */ 216 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 217 synchronize_sched();
240 return -EAGAIN;
241 218
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
243 int i; 220 int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 705
729 preempt_disable(); 706 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 707 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 708 in_kprobes_functions((unsigned long) p->addr) ||
709 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 710 preempt_enable();
733 return -EINVAL; 711 return -EINVAL;
734 } 712 }
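
Taken together with ftrace_text_reserved() (added to kernel/trace/ftrace.c later in this diff), the hunk above makes register_kprobe() refuse three classes of addresses. A compact restatement as a predicate, using a hypothetical helper name:

/*
 * Sketch only: an address is probeable if it is kernel text, is not in
 * a blacklisted function, and is not reserved by ftrace for an mcount
 * call site (which ftrace may rewrite at any time).
 */
static int kprobe_addr_allowed(kprobe_opcode_t *addr)
{
	if (!kernel_text_address((unsigned long)addr))
		return 0;
	if (in_kprobes_functions((unsigned long)addr))
		return 0;
	if (ftrace_text_reserved(addr, addr))
		return 0;
	return 1;
}
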
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..482d5e1d3764 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
56 */ 56 */
57int sysctl_perf_event_paranoid __read_mostly = 1; 57int sysctl_perf_event_paranoid __read_mostly = 1;
58 58
59static inline bool perf_paranoid_tracepoint_raw(void)
60{
61 return sysctl_perf_event_paranoid > -1;
62}
63
64static inline bool perf_paranoid_cpu(void)
65{
66 return sysctl_perf_event_paranoid > 0;
67}
68
69static inline bool perf_paranoid_kernel(void)
70{
71 return sysctl_perf_event_paranoid > 1;
72}
73
74int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ 59int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
75 60
76/* 61/*
@@ -98,11 +83,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 83
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 84void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 85void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
86void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 87
102int __weak 88int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 89hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 90 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 91 struct perf_event_context *ctx)
106{ 92{
107 return 0; 93 return 0;
108} 94}
@@ -248,7 +234,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 234
249static inline u64 perf_clock(void) 235static inline u64 perf_clock(void)
250{ 236{
251 return cpu_clock(smp_processor_id()); 237 return cpu_clock(raw_smp_processor_id());
252} 238}
253 239
254/* 240/*
@@ -289,6 +275,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 275 event->total_time_running = run_end - event->tstamp_running;
290} 276}
291 277
278static struct list_head *
279ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
280{
281 if (event->attr.pinned)
282 return &ctx->pinned_groups;
283 else
284 return &ctx->flexible_groups;
285}
286
292/* 287/*
293 * Add a event from the lists for its context. 288 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 289 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +298,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 298 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 299 * leader's sibling list:
305 */ 300 */
306 if (group_leader == event) 301 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 302 struct list_head *list;
308 else { 303
304 if (is_software_event(event))
305 event->group_flags |= PERF_GROUP_SOFTWARE;
306
307 list = ctx_group_list(event, ctx);
308 list_add_tail(&event->group_entry, list);
309 } else {
310 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
311 !is_software_event(event))
312 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
313
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 314 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 315 group_leader->nr_siblings++;
311 } 316 }
@@ -355,9 +360,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 360 * to the context list directly:
356 */ 361 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 362 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
363 struct list_head *list;
358 364
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 365 list = ctx_group_list(event, ctx);
366 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 367 sibling->group_leader = sibling;
368
369 /* Inherit group flags from the previous leader */
370 sibling->group_flags = event->group_flags;
361 } 371 }
362} 372}
363 373
@@ -608,14 +618,13 @@ void perf_event_disable(struct perf_event *event)
608static int 618static int
609event_sched_in(struct perf_event *event, 619event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 620 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 621 struct perf_event_context *ctx)
612 int cpu)
613{ 622{
614 if (event->state <= PERF_EVENT_STATE_OFF) 623 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 624 return 0;
616 625
617 event->state = PERF_EVENT_STATE_ACTIVE; 626 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 627 event->oncpu = smp_processor_id();
619 /* 628 /*
620 * The new state must be visible before we turn it on in the hardware: 629 * The new state must be visible before we turn it on in the hardware:
621 */ 630 */
@@ -642,8 +651,7 @@ event_sched_in(struct perf_event *event,
642static int 651static int
643group_sched_in(struct perf_event *group_event, 652group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 653 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 654 struct perf_event_context *ctx)
646 int cpu)
647{ 655{
648 struct perf_event *event, *partial_group; 656 struct perf_event *event, *partial_group;
649 int ret; 657 int ret;
@@ -651,18 +659,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 659 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 660 return 0;
653 661
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 662 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 663 if (ret)
656 return ret < 0 ? ret : 0; 664 return ret < 0 ? ret : 0;
657 665
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 666 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 667 return -EAGAIN;
660 668
661 /* 669 /*
662 * Schedule in siblings as one group (if any): 670 * Schedule in siblings as one group (if any):
663 */ 671 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 672 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 673 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 674 partial_group = event;
667 goto group_error; 675 goto group_error;
668 } 676 }
@@ -686,24 +694,6 @@ group_error:
686} 694}
687 695
688/* 696/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 697 * Work out whether we can put this event group on the CPU now.
708 */ 698 */
709static int group_can_go_on(struct perf_event *event, 699static int group_can_go_on(struct perf_event *event,
@@ -713,7 +703,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 703 /*
714 * Groups consisting entirely of software events can always go on. 704 * Groups consisting entirely of software events can always go on.
715 */ 705 */
716 if (is_software_only_group(event)) 706 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 707 return 1;
718 /* 708 /*
719 * If an exclusive group is already on, no other hardware 709 * If an exclusive group is already on, no other hardware
@@ -754,7 +744,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 744 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 745 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 746 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 747 int err;
759 748
760 /* 749 /*
@@ -801,7 +790,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 790 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 791 err = -EEXIST;
803 else 792 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 793 err = event_sched_in(event, cpuctx, ctx);
805 794
806 if (err) { 795 if (err) {
807 /* 796 /*
@@ -943,11 +932,9 @@ static void __perf_event_enable(void *info)
943 } else { 932 } else {
944 perf_disable(); 933 perf_disable();
945 if (event == leader) 934 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 935 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 936 else
949 err = event_sched_in(event, cpuctx, ctx, 937 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 938 perf_enable();
952 } 939 }
953 940
@@ -1043,8 +1030,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1030 return 0;
1044} 1031}
1045 1032
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1033enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1034 EVENT_FLEXIBLE = 0x1,
1035 EVENT_PINNED = 0x2,
1036 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1037};
1038
1039static void ctx_sched_out(struct perf_event_context *ctx,
1040 struct perf_cpu_context *cpuctx,
1041 enum event_type_t event_type)
1048{ 1042{
1049 struct perf_event *event; 1043 struct perf_event *event;
1050 1044
@@ -1055,10 +1049,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1049 update_context_time(ctx);
1056 1050
1057 perf_disable(); 1051 perf_disable();
1058 if (ctx->nr_active) { 1052 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1053 goto out_enable;
1054
1055 if (event_type & EVENT_PINNED)
1056 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1057 group_sched_out(event, cpuctx, ctx);
1061 } 1058
1059 if (event_type & EVENT_FLEXIBLE)
1060 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1061 group_sched_out(event, cpuctx, ctx);
1062
1063 out_enable:
1062 perf_enable(); 1064 perf_enable();
1063 out: 1065 out:
1064 raw_spin_unlock(&ctx->lock); 1066 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1172,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1172 * not restart the event.
1171 */ 1173 */
1172void perf_event_task_sched_out(struct task_struct *task, 1174void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1175 struct task_struct *next)
1174{ 1176{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1177 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1178 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1179 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1180 struct perf_event_context *parent;
@@ -1220,15 +1222,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1222 rcu_read_unlock();
1221 1223
1222 if (do_switch) { 1224 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1225 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1226 cpuctx->task_ctx = NULL;
1225 } 1227 }
1226} 1228}
1227 1229
1228/* 1230static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1231 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1232{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1234
@@ -1238,47 +1238,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1239 return;
1240 1240
1241 __perf_event_sched_out(ctx, cpuctx); 1241 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1242 cpuctx->task_ctx = NULL;
1243} 1243}
1244 1244
1245/* 1245/*
1246 * Called with IRQs disabled 1246 * Called with IRQs disabled
1247 */ 1247 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1248static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1249{
1250 task_ctx_sched_out(ctx, EVENT_ALL);
1251}
1252
1253/*
1254 * Called with IRQs disabled
1255 */
1256static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1257 enum event_type_t event_type)
1249{ 1258{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1259 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1260}
1252 1261
1253static void 1262static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1263ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1264 struct perf_cpu_context *cpuctx)
1256{ 1265{
1257 struct perf_event *event; 1266 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264
1265 ctx->timestamp = perf_clock();
1266
1267 perf_disable();
1268 1267
1269 /* 1268 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1270 * First go through the list and put on any pinned groups 1269 if (event->state <= PERF_EVENT_STATE_OFF)
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1270 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1271 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1272 continue;
1279 1273
1280 if (group_can_go_on(event, cpuctx, 1)) 1274 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1275 group_sched_in(event, cpuctx, ctx);
1282 1276
1283 /* 1277 /*
1284 * If this pinned group hasn't been scheduled, 1278 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1283,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1283 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1284 }
1291 } 1285 }
1286}
1292 1287
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1288static void
1294 /* 1289ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1290 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1291{
1297 */ 1292 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1293 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1294
1295 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1296 /* Ignore events in OFF or ERROR state */
1297 if (event->state <= PERF_EVENT_STATE_OFF)
1298 continue;
1302 /* 1299 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1300 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1301 * of events:
1305 */ 1302 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1303 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1304 continue;
1308 1305
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1306 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1307 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1308 can_add_hw = 0;
1312 } 1309 }
1310}
1311
1312static void
1313ctx_sched_in(struct perf_event_context *ctx,
1314 struct perf_cpu_context *cpuctx,
1315 enum event_type_t event_type)
1316{
1317 raw_spin_lock(&ctx->lock);
1318 ctx->is_active = 1;
1319 if (likely(!ctx->nr_events))
1320 goto out;
1321
1322 ctx->timestamp = perf_clock();
1323
1324 perf_disable();
1325
1326 /*
1327 * First go through the list and put on any pinned groups
1328 * in order to give them the best chance of going on.
1329 */
1330 if (event_type & EVENT_PINNED)
1331 ctx_pinned_sched_in(ctx, cpuctx);
1332
1333 /* Then walk through the lower prio flexible groups */
1334 if (event_type & EVENT_FLEXIBLE)
1335 ctx_flexible_sched_in(ctx, cpuctx);
1336
1313 perf_enable(); 1337 perf_enable();
1314 out: 1338 out:
1315 raw_spin_unlock(&ctx->lock); 1339 raw_spin_unlock(&ctx->lock);
1316} 1340}
1317 1341
1342static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1343 enum event_type_t event_type)
1344{
1345 struct perf_event_context *ctx = &cpuctx->ctx;
1346
1347 ctx_sched_in(ctx, cpuctx, event_type);
1348}
1349
1350static void task_ctx_sched_in(struct task_struct *task,
1351 enum event_type_t event_type)
1352{
1353 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1354 struct perf_event_context *ctx = task->perf_event_ctxp;
1355
1356 if (likely(!ctx))
1357 return;
1358 if (cpuctx->task_ctx == ctx)
1359 return;
1360 ctx_sched_in(ctx, cpuctx, event_type);
1361 cpuctx->task_ctx = ctx;
1362}
1318/* 1363/*
1319 * Called from scheduler to add the events of the current task 1364 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1365 * with interrupts disabled.
@@ -1326,38 +1371,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1371 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1372 * keep the event running.
1328 */ 1373 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1374void perf_event_task_sched_in(struct task_struct *task)
1330{ 1375{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1376 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1377 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1378
1334 if (likely(!ctx)) 1379 if (likely(!ctx))
1335 return; 1380 return;
1381
1336 if (cpuctx->task_ctx == ctx) 1382 if (cpuctx->task_ctx == ctx)
1337 return; 1383 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1384
1385 /*
1386 * We want to keep the following priority order:
1387 * cpu pinned (that don't need to move), task pinned,
1388 * cpu flexible, task flexible.
1389 */
1390 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1391
1392 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1393 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1394 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1395
1339 cpuctx->task_ctx = ctx; 1396 cpuctx->task_ctx = ctx;
1340} 1397}
1341 1398
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1399#define MAX_INTERRUPTS (~0ULL)
1400
1401static void perf_log_throttle(struct perf_event *event, int enable);
1402
1403static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1404{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1405 u64 frequency = event->attr.sample_freq;
1406 u64 sec = NSEC_PER_SEC;
1407 u64 divisor, dividend;
1408
1409 int count_fls, nsec_fls, frequency_fls, sec_fls;
1410
1411 count_fls = fls64(count);
1412 nsec_fls = fls64(nsec);
1413 frequency_fls = fls64(frequency);
1414 sec_fls = 30;
1415
1416 /*
1417 * We got @count in @nsec, with a target of sample_freq HZ
1418 * the target period becomes:
1419 *
1420 * @count * 10^9
1421 * period = -------------------
1422 * @nsec * sample_freq
1423 *
1424 */
1425
1426 /*
1427 * Reduce accuracy by one bit such that @a and @b converge
1428 * to a similar magnitude.
1429 */
1430#define REDUCE_FLS(a, b) \
1431do { \
1432 if (a##_fls > b##_fls) { \
1433 a >>= 1; \
1434 a##_fls--; \
1435 } else { \
1436 b >>= 1; \
1437 b##_fls--; \
1438 } \
1439} while (0)
1440
1441 /*
1442 * Reduce accuracy until either term fits in a u64, then proceed with
1443 * the other, so that finally we can do a u64/u64 division.
1444 */
1445 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1446 REDUCE_FLS(nsec, frequency);
1447 REDUCE_FLS(sec, count);
1448 }
1449
1450 if (count_fls + sec_fls > 64) {
1451 divisor = nsec * frequency;
1452
1453 while (count_fls + sec_fls > 64) {
1454 REDUCE_FLS(count, sec);
1455 divisor >>= 1;
1456 }
1345 1457
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1458 dividend = count * sec;
1459 } else {
1460 dividend = count * sec;
1461
1462 while (nsec_fls + frequency_fls > 64) {
1463 REDUCE_FLS(nsec, frequency);
1464 dividend >>= 1;
1465 }
1466
1467 divisor = nsec * frequency;
1468 }
1469
1470 return div64_u64(dividend, divisor);
1347} 1471}
1348 1472
1349#define MAX_INTERRUPTS (~0ULL) 1473static void perf_event_stop(struct perf_event *event)
1474{
1475 if (!event->pmu->stop)
1476 return event->pmu->disable(event);
1350 1477
1351static void perf_log_throttle(struct perf_event *event, int enable); 1478 return event->pmu->stop(event);
1479}
1480
1481static int perf_event_start(struct perf_event *event)
1482{
1483 if (!event->pmu->start)
1484 return event->pmu->enable(event);
1352 1485
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1486 return event->pmu->start(event);
1487}
1488
1489static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1490{
1355 struct hw_perf_event *hwc = &event->hw; 1491 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1492 u64 period, sample_period;
1357 s64 delta; 1493 s64 delta;
1358 1494
1359 events *= hwc->sample_period; 1495 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1496
1362 delta = (s64)(period - hwc->sample_period); 1497 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1498 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1503,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1503 sample_period = 1;
1369 1504
1370 hwc->sample_period = sample_period; 1505 hwc->sample_period = sample_period;
1506
1507 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1508 perf_disable();
1509 perf_event_stop(event);
1510 atomic64_set(&hwc->period_left, 0);
1511 perf_event_start(event);
1512 perf_enable();
1513 }
1371} 1514}
1372 1515
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1516static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1517{
1375 struct perf_event *event; 1518 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1519 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1520 u64 interrupts, now;
1521 s64 delta;
1378 1522
1379 raw_spin_lock(&ctx->lock); 1523 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1524 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1539,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1395 if (interrupts == MAX_INTERRUPTS) { 1539 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1540 perf_log_throttle(event, 1);
1397 event->pmu->unthrottle(event); 1541 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1399 } 1542 }
1400 1543
1401 if (!event->attr.freq || !event->attr.sample_freq) 1544 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1545 continue;
1403 1546
1404 /* 1547 event->pmu->read(event);
1405 * if the specified freq < HZ then we need to skip ticks 1548 now = atomic64_read(&event->count);
1406 */ 1549 delta = now - hwc->freq_count_stamp;
1407 if (event->attr.sample_freq < HZ) { 1550 hwc->freq_count_stamp = now;
1408 freq = event->attr.sample_freq;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1551
1424 /* 1552 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1553 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1554 }
1437 raw_spin_unlock(&ctx->lock); 1555 raw_spin_unlock(&ctx->lock);
1438} 1556}
@@ -1442,26 +1560,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1560 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1561static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1562{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events) 1563 if (!ctx->nr_events)
1448 return; 1564 return;
1449 1565
1450 raw_spin_lock(&ctx->lock); 1566 raw_spin_lock(&ctx->lock);
1451 /* 1567
1452 * Rotate the first entry last (works just fine for group events too): 1568 /* Rotate the first entry last of non-pinned groups */
1453 */ 1569 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1570
1461 raw_spin_unlock(&ctx->lock); 1571 raw_spin_unlock(&ctx->lock);
1462} 1572}
1463 1573
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1574void perf_event_task_tick(struct task_struct *curr)
1465{ 1575{
1466 struct perf_cpu_context *cpuctx; 1576 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1577 struct perf_event_context *ctx;
@@ -1469,24 +1579,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1469 if (!atomic_read(&nr_events)) 1579 if (!atomic_read(&nr_events))
1470 return; 1580 return;
1471 1581
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1582 cpuctx = &__get_cpu_var(perf_cpu_context);
1473 ctx = curr->perf_event_ctxp; 1583 ctx = curr->perf_event_ctxp;
1474 1584
1585 perf_disable();
1586
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1587 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1588 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1589 perf_ctx_adjust_freq(ctx);
1478 1590
1479 perf_event_cpu_sched_out(cpuctx); 1591 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1592 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1593 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1594
1483 rotate_ctx(&cpuctx->ctx); 1595 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1596 if (ctx)
1485 rotate_ctx(ctx); 1597 rotate_ctx(ctx);
1486 1598
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1599 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1600 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1601 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1602
1603 perf_enable();
1604}
1605
1606static int event_enable_on_exec(struct perf_event *event,
1607 struct perf_event_context *ctx)
1608{
1609 if (!event->attr.enable_on_exec)
1610 return 0;
1611
1612 event->attr.enable_on_exec = 0;
1613 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1614 return 0;
1615
1616 __perf_event_mark_enabled(event, ctx);
1617
1618 return 1;
1490} 1619}
1491 1620
1492/* 1621/*
@@ -1499,6 +1628,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1628 struct perf_event *event;
1500 unsigned long flags; 1629 unsigned long flags;
1501 int enabled = 0; 1630 int enabled = 0;
1631 int ret;
1502 1632
1503 local_irq_save(flags); 1633 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1634 ctx = task->perf_event_ctxp;
@@ -1509,14 +1639,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1639
1510 raw_spin_lock(&ctx->lock); 1640 raw_spin_lock(&ctx->lock);
1511 1641
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1642 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1643 ret = event_enable_on_exec(event, ctx);
1514 continue; 1644 if (ret)
1515 event->attr.enable_on_exec = 0; 1645 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1646 }
1517 continue; 1647
1518 __perf_event_mark_enabled(event, ctx); 1648 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1649 ret = event_enable_on_exec(event, ctx);
1650 if (ret)
1651 enabled = 1;
1520 } 1652 }
1521 1653
1522 /* 1654 /*
@@ -1527,7 +1659,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1659
1528 raw_spin_unlock(&ctx->lock); 1660 raw_spin_unlock(&ctx->lock);
1529 1661
1530 perf_event_task_sched_in(task, smp_processor_id()); 1662 perf_event_task_sched_in(task);
1531 out: 1663 out:
1532 local_irq_restore(flags); 1664 local_irq_restore(flags);
1533} 1665}
@@ -1590,7 +1722,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1722{
1591 raw_spin_lock_init(&ctx->lock); 1723 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1724 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1725 INIT_LIST_HEAD(&ctx->pinned_groups);
1726 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1727 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1728 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1729 ctx->task = task;
@@ -3608,7 +3741,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3608 /* .tid */ 3741 /* .tid */
3609 .start = vma->vm_start, 3742 .start = vma->vm_start,
3610 .len = vma->vm_end - vma->vm_start, 3743 .len = vma->vm_end - vma->vm_start,
3611 .pgoff = vma->vm_pgoff, 3744 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3612 }, 3745 },
3613 }; 3746 };
3614 3747
@@ -3688,12 +3821,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 3821
3689 if (event->attr.freq) { 3822 if (event->attr.freq) {
3690 u64 now = perf_clock(); 3823 u64 now = perf_clock();
3691 s64 delta = now - hwc->freq_stamp; 3824 s64 delta = now - hwc->freq_time_stamp;
3692 3825
3693 hwc->freq_stamp = now; 3826 hwc->freq_time_stamp = now;
3694 3827
3695 if (delta > 0 && delta < TICK_NSEC) 3828 if (delta > 0 && delta < 2*TICK_NSEC)
3696 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3829 perf_adjust_period(event, delta, hwc->last_period);
3697 } 3830 }
3698 3831
3699 /* 3832 /*
@@ -4184,7 +4317,7 @@ static const struct pmu perf_ops_task_clock = {
4184 .read = task_clock_perf_event_read, 4317 .read = task_clock_perf_event_read,
4185}; 4318};
4186 4319
4187#ifdef CONFIG_EVENT_PROFILE 4320#ifdef CONFIG_EVENT_TRACING
4188 4321
4189void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4322void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4190 int entry_size) 4323 int entry_size)
@@ -4289,7 +4422,7 @@ static void perf_event_free_filter(struct perf_event *event)
4289{ 4422{
4290} 4423}
4291 4424
4292#endif /* CONFIG_EVENT_PROFILE */ 4425#endif /* CONFIG_EVENT_TRACING */
4293 4426
4294#ifdef CONFIG_HAVE_HW_BREAKPOINT 4427#ifdef CONFIG_HAVE_HW_BREAKPOINT
4295static void bp_perf_event_destroy(struct perf_event *event) 4428static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5003,15 @@ inherit_event(struct perf_event *parent_event,
4870 else 5003 else
4871 child_event->state = PERF_EVENT_STATE_OFF; 5004 child_event->state = PERF_EVENT_STATE_OFF;
4872 5005
4873 if (parent_event->attr.freq) 5006 if (parent_event->attr.freq) {
4874 child_event->hw.sample_period = parent_event->hw.sample_period; 5007 u64 sample_period = parent_event->hw.sample_period;
5008 struct hw_perf_event *hwc = &child_event->hw;
5009
5010 hwc->sample_period = sample_period;
5011 hwc->last_period = sample_period;
5012
5013 atomic64_set(&hwc->period_left, sample_period);
5014 }
4875 5015
4876 child_event->overflow_handler = parent_event->overflow_handler; 5016 child_event->overflow_handler = parent_event->overflow_handler;
4877 5017
@@ -5039,7 +5179,11 @@ void perf_event_exit_task(struct task_struct *child)
5039 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5179 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5040 5180
5041again: 5181again:
5042 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5182 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5183 group_entry)
5184 __perf_event_exit_task(child_event, child_ctx, child);
5185
5186 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5043 group_entry) 5187 group_entry)
5044 __perf_event_exit_task(child_event, child_ctx, child); 5188 __perf_event_exit_task(child_event, child_ctx, child);
5045 5189
@@ -5048,7 +5192,8 @@ again:
5048 * its siblings to the list, but we obtained 'tmp' before that which 5192 * its siblings to the list, but we obtained 'tmp' before that which
5049 * will still point to the list head terminating the iteration. 5193 * will still point to the list head terminating the iteration.
5050 */ 5194 */
5051 if (!list_empty(&child_ctx->group_list)) 5195 if (!list_empty(&child_ctx->pinned_groups) ||
5196 !list_empty(&child_ctx->flexible_groups))
5052 goto again; 5197 goto again;
5053 5198
5054 mutex_unlock(&child_ctx->mutex); 5199 mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5201,24 @@ again:
5056 put_ctx(child_ctx); 5201 put_ctx(child_ctx);
5057} 5202}
5058 5203
5204static void perf_free_event(struct perf_event *event,
5205 struct perf_event_context *ctx)
5206{
5207 struct perf_event *parent = event->parent;
5208
5209 if (WARN_ON_ONCE(!parent))
5210 return;
5211
5212 mutex_lock(&parent->child_mutex);
5213 list_del_init(&event->child_list);
5214 mutex_unlock(&parent->child_mutex);
5215
5216 fput(parent->filp);
5217
5218 list_del_event(event, ctx);
5219 free_event(event);
5220}
5221
5059/* 5222/*
5060 * free an unexposed, unused context as created by inheritance by 5223 * free an unexposed, unused context as created by inheritance by
5061 * init_task below, used by fork() in case of fail. 5224 * init_task below, used by fork() in case of fail.
@@ -5070,36 +5233,70 @@ void perf_event_free_task(struct task_struct *task)
5070 5233
5071 mutex_lock(&ctx->mutex); 5234 mutex_lock(&ctx->mutex);
5072again: 5235again:
5073 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5236 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5074 struct perf_event *parent = event->parent; 5237 perf_free_event(event, ctx);
5075 5238
5076 if (WARN_ON_ONCE(!parent)) 5239 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5077 continue; 5240 group_entry)
5241 perf_free_event(event, ctx);
5242
5243 if (!list_empty(&ctx->pinned_groups) ||
5244 !list_empty(&ctx->flexible_groups))
5245 goto again;
5078 5246
5079 mutex_lock(&parent->child_mutex); 5247 mutex_unlock(&ctx->mutex);
5080 list_del_init(&event->child_list);
5081 mutex_unlock(&parent->child_mutex);
5082 5248
5083 fput(parent->filp); 5249 put_ctx(ctx);
5250}
5084 5251
5085 list_del_event(event, ctx); 5252static int
5086 free_event(event); 5253inherit_task_group(struct perf_event *event, struct task_struct *parent,
5254 struct perf_event_context *parent_ctx,
5255 struct task_struct *child,
5256 int *inherited_all)
5257{
5258 int ret;
5259 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5260
5261 if (!event->attr.inherit) {
5262 *inherited_all = 0;
5263 return 0;
5087 } 5264 }
5088 5265
5089 if (!list_empty(&ctx->group_list)) 5266 if (!child_ctx) {
5090 goto again; 5267 /*
5268 * This is executed from the parent task context, so
5269 * inherit events that have been marked for cloning.
5270 * First allocate and initialize a context for the
5271 * child.
5272 */
5091 5273
5092 mutex_unlock(&ctx->mutex); 5274 child_ctx = kzalloc(sizeof(struct perf_event_context),
5275 GFP_KERNEL);
5276 if (!child_ctx)
5277 return -ENOMEM;
5093 5278
5094 put_ctx(ctx); 5279 __perf_event_init_context(child_ctx, child);
5280 child->perf_event_ctxp = child_ctx;
5281 get_task_struct(child);
5282 }
5283
5284 ret = inherit_group(event, parent, parent_ctx,
5285 child, child_ctx);
5286
5287 if (ret)
5288 *inherited_all = 0;
5289
5290 return ret;
5095} 5291}
5096 5292
5293
5097/* 5294/*
5098 * Initialize the perf_event context in task_struct 5295 * Initialize the perf_event context in task_struct
5099 */ 5296 */
5100int perf_event_init_task(struct task_struct *child) 5297int perf_event_init_task(struct task_struct *child)
5101{ 5298{
5102 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5299 struct perf_event_context *child_ctx, *parent_ctx;
5103 struct perf_event_context *cloned_ctx; 5300 struct perf_event_context *cloned_ctx;
5104 struct perf_event *event; 5301 struct perf_event *event;
5105 struct task_struct *parent = current; 5302 struct task_struct *parent = current;
@@ -5137,41 +5334,22 @@ int perf_event_init_task(struct task_struct *child)
5137 * We dont have to disable NMIs - we are only looking at 5334 * We dont have to disable NMIs - we are only looking at
5138 * the list, not manipulating it: 5335 * the list, not manipulating it:
5139 */ 5336 */
5140 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5337 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5141 5338 ret = inherit_task_group(event, parent, parent_ctx, child,
5142 if (!event->attr.inherit) { 5339 &inherited_all);
5143 inherited_all = 0; 5340 if (ret)
5144 continue; 5341 break;
5145 } 5342 }
5146
5147 if (!child->perf_event_ctxp) {
5148 /*
5149 * This is executed from the parent task context, so
5150 * inherit events that have been marked for cloning.
5151 * First allocate and initialize a context for the
5152 * child.
5153 */
5154
5155 child_ctx = kzalloc(sizeof(struct perf_event_context),
5156 GFP_KERNEL);
5157 if (!child_ctx) {
5158 ret = -ENOMEM;
5159 break;
5160 }
5161
5162 __perf_event_init_context(child_ctx, child);
5163 child->perf_event_ctxp = child_ctx;
5164 get_task_struct(child);
5165 }
5166 5343
5167 ret = inherit_group(event, parent, parent_ctx, 5344 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5168 child, child_ctx); 5345 ret = inherit_task_group(event, parent, parent_ctx, child,
5169 if (ret) { 5346 &inherited_all);
5170 inherited_all = 0; 5347 if (ret)
5171 break; 5348 break;
5172 }
5173 } 5349 }
5174 5350
5351 child_ctx = child->perf_event_ctxp;
5352
5175 if (child_ctx && inherited_all) { 5353 if (child_ctx && inherited_all) {
5176 /* 5354 /*
5177 * Mark the child context as a clone of the parent 5355 * Mark the child context as a clone of the parent
@@ -5220,7 +5398,9 @@ static void __perf_event_exit_cpu(void *info)
5220 struct perf_event_context *ctx = &cpuctx->ctx; 5398 struct perf_event_context *ctx = &cpuctx->ctx;
5221 struct perf_event *event, *tmp; 5399 struct perf_event *event, *tmp;
5222 5400
5223 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5401 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5402 __perf_event_remove_from_context(event);
5403 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5224 __perf_event_remove_from_context(event); 5404 __perf_event_remove_from_context(event);
5225} 5405}
5226static void perf_event_exit_cpu(int cpu) 5406static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5438,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5258 perf_event_exit_cpu(cpu); 5438 perf_event_exit_cpu(cpu);
5259 break; 5439 break;
5260 5440
5441 case CPU_DEAD:
5442 hw_perf_event_setup_offline(cpu);
5443 break;
5444
5261 default: 5445 default:
5262 break; 5446 break;
5263 } 5447 }
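
The new perf_calculate_period() implements the relation spelled out in its comment, period = (count * 10^9) / (nsec * sample_freq); the REDUCE_FLS() loop only exists to keep both 64-bit products from overflowing. A worked example of just that arithmetic, as a self-contained user-space program that uses 128-bit intermediates instead of the precision-reduction loop (the numbers are made up for illustration):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

static uint64_t calc_period(uint64_t count, uint64_t nsec, uint64_t freq)
{
	/* period = count * NSEC_PER_SEC / (nsec * freq), overflow-free */
	unsigned __int128 dividend = (unsigned __int128)count * NSEC_PER_SEC;
	unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

	return divisor ? (uint64_t)(dividend / divisor) : 0;
}

int main(void)
{
	/* 2,000,000 events in 10 ms with a 1000 Hz sampling target */
	printf("%llu\n",
	       (unsigned long long)calc_period(2000000, 10000000, 1000));
	/* prints 200000: sample every 200,000 events for ~1000 samples/s */
	return 0;
}
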
diff --git a/kernel/sched.c b/kernel/sched.c
index 3a8fb30a91b1..3e71ebb101c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2794,7 +2794,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2794 */ 2794 */
2795 prev_state = prev->state; 2795 prev_state = prev->state;
2796 finish_arch_switch(prev); 2796 finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq)); 2797#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2798 local_irq_disable();
2799#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2800 perf_event_task_sched_in(current);
2801#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2802 local_irq_enable();
2803#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2798 finish_lock_switch(rq, prev); 2804 finish_lock_switch(rq, prev);
2799 2805
2800 fire_sched_in_preempt_notifiers(current); 2806 fire_sched_in_preempt_notifiers(current);
@@ -5309,7 +5315,7 @@ void scheduler_tick(void)
5309 curr->sched_class->task_tick(rq, curr, 0); 5315 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock); 5316 raw_spin_unlock(&rq->lock);
5311 5317
5312 perf_event_task_tick(curr, cpu); 5318 perf_event_task_tick(curr);
5313 5319
5314#ifdef CONFIG_SMP 5320#ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu); 5321 rq->idle_at_tick = idle_cpu(cpu);
@@ -5523,7 +5529,7 @@ need_resched_nonpreemptible:
5523 5529
5524 if (likely(prev != next)) { 5530 if (likely(prev != next)) {
5525 sched_info_switch(prev, next); 5531 sched_info_switch(prev, next);
5526 perf_event_task_sched_out(prev, next, cpu); 5532 perf_event_task_sched_out(prev, next);
5527 5533
5528 rq->nr_switches++; 5534 rq->nr_switches++;
5529 rq->curr = next; 5535 rq->curr = next;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a09502e2ef75..7c1a67ef0274 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill);
500 */ 500 */
501 501
502/* 502/*
503 * The trampoline is called when the hrtimer expires. If this is 503 * The trampoline is called when the hrtimer expires. It schedules a tasklet
504 * called from the hrtimer interrupt then we schedule the tasklet as 504 * to run __tasklet_hrtimer_trampoline() which in turn will call the intended
505 * the timer callback function expects to run in softirq context. If 505 * hrtimer callback, but from softirq context.
506 * it's called in softirq context anyway (i.e. high resolution timers
507 * disabled) then the hrtimer callback is called right away.
508 */ 506 */
509static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) 507static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer)
510{ 508{
511 struct tasklet_hrtimer *ttimer = 509 struct tasklet_hrtimer *ttimer =
512 container_of(timer, struct tasklet_hrtimer, timer); 510 container_of(timer, struct tasklet_hrtimer, timer);
513 511
514 if (hrtimer_is_hres_active(timer)) { 512 tasklet_hi_schedule(&ttimer->tasklet);
515 tasklet_hi_schedule(&ttimer->tasklet); 513 return HRTIMER_NORESTART;
516 return HRTIMER_NORESTART;
517 }
518 return ttimer->function(timer);
519} 514}
520 515
521/* 516/*
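
After the simplification above, a tasklet_hrtimer callback is always deferred to softirq context via the tasklet and never invoked straight from the hrtimer interrupt, so it may rely on softirq-only locking. A minimal consumer sketch, assuming the tasklet_hrtimer helpers from <linux/interrupt.h>; all names below are made up for illustration:

static struct tasklet_hrtimer my_ttimer;

static enum hrtimer_restart my_timer_fn(struct hrtimer *timer)
{
	/* Runs in softirq context via __tasklet_hrtimer_trampoline(). */
	return HRTIMER_NORESTART;
}

static void my_timer_setup(void)
{
	tasklet_hrtimer_init(&my_ttimer, my_timer_fn,
			     CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	tasklet_hrtimer_start(&my_ttimer, ktime_set(0, 10 * NSEC_PER_MSEC),
			      HRTIMER_MODE_REL);
}
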
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d22579087e27..0d4c7898ab80 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock);
25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ 25static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */
26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ 26static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */
27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); 27static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
28static DEFINE_PER_CPU(bool, softlock_touch_sync);
28 29
29static int __read_mostly did_panic; 30static int __read_mostly did_panic;
30int __read_mostly softlockup_thresh = 60; 31int __read_mostly softlockup_thresh = 60;
@@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void)
79} 80}
80EXPORT_SYMBOL(touch_softlockup_watchdog); 81EXPORT_SYMBOL(touch_softlockup_watchdog);
81 82
83void touch_softlockup_watchdog_sync(void)
84{
85 __raw_get_cpu_var(softlock_touch_sync) = true;
86 __raw_get_cpu_var(softlockup_touch_ts) = 0;
87}
88
82void touch_all_softlockup_watchdogs(void) 89void touch_all_softlockup_watchdogs(void)
83{ 90{
84 int cpu; 91 int cpu;
@@ -118,6 +125,14 @@ void softlockup_tick(void)
118 } 125 }
119 126
120 if (touch_ts == 0) { 127 if (touch_ts == 0) {
128 if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) {
129 /*
130 * If the time stamp was touched atomically
131 * make sure the scheduler tick is up to date.
132 */
133 per_cpu(softlock_touch_sync, this_cpu) = false;
134 sched_clock_tick();
135 }
121 __touch_softlockup_watchdog(); 136 __touch_softlockup_watchdog();
122 return; 137 return;
123 } 138 }
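
The new touch_softlockup_watchdog_sync() is meant for callers that have kept a CPU with interrupts off for a long stretch (kgdb, earlier in this diff, is the first user): besides zeroing the touch timestamp it asks softlockup_tick() to run sched_clock_tick() first, so the watchdog does not compare against a stale clock. A usage-pattern sketch with a hypothetical caller:

static void long_stop_with_irqs_off(void)
{
	unsigned long flags;

	local_irq_save(flags);

	/* ... stall here for a long time, e.g. waiting on a debugger ... */

	/* Resync both the watchdog timestamp and sched_clock. */
	touch_softlockup_watchdog_sync();
	clocksource_touch_watchdog();

	local_irq_restore(flags);
}
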
diff --git a/kernel/sys.c b/kernel/sys.c
index 26a6b73a6b85..18bde979f346 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
222 if (which > PRIO_USER || which < PRIO_PROCESS) 222 if (which > PRIO_USER || which < PRIO_PROCESS)
223 return -EINVAL; 223 return -EINVAL;
224 224
225 rcu_read_lock();
225 read_lock(&tasklist_lock); 226 read_lock(&tasklist_lock);
226 switch (which) { 227 switch (which) {
227 case PRIO_PROCESS: 228 case PRIO_PROCESS:
@@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
267 } 268 }
268out_unlock: 269out_unlock:
269 read_unlock(&tasklist_lock); 270 read_unlock(&tasklist_lock);
271 rcu_read_unlock();
270 272
271 return retval; 273 return retval;
272} 274}
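
The added rcu_read_lock()/rcu_read_unlock() pair reflects that pid-to-task lookups such as find_task_by_vpid() must run inside an RCU read-side critical section. The same pattern in isolation, as an illustrative sketch (the helper is hypothetical):

static int task_exists(pid_t pid)
{
	struct task_struct *p;
	int ret;

	rcu_read_lock();
	p = find_task_by_vpid(pid);	/* requires the RCU read lock */
	ret = p != NULL;
	rcu_read_unlock();

	return ret;
}
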
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7faaa32fbf4f..e2ab064c6d41 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -880,6 +880,7 @@ void getboottime(struct timespec *ts)
880 880
881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); 881 set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec);
882} 882}
883EXPORT_SYMBOL_GPL(getboottime);
883 884
884/** 885/**
885 * monotonic_to_bootbased - Convert the monotonic time to boot based. 886 * monotonic_to_bootbased - Convert the monotonic time to boot based.
@@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts)
889{ 890{
890 *ts = timespec_add_safe(*ts, total_sleep_time); 891 *ts = timespec_add_safe(*ts, total_sleep_time);
891} 892}
893EXPORT_SYMBOL_GPL(monotonic_to_bootbased);
892 894
893unsigned long get_seconds(void) 895unsigned long get_seconds(void)
894{ 896{
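The timekeeping.c hunks only add EXPORT_SYMBOL_GPL() for getboottime() and monotonic_to_bootbased(), making them callable from modules. A minimal sketch of a module-side caller that wants a suspend-inclusive timestamp; the function name is hypothetical:

#include <linux/module.h>
#include <linux/time.h>
#include <linux/hrtimer.h>

static void report_boot_based_time(void)
{
	struct timespec ts;

	ktime_get_ts(&ts);		/* CLOCK_MONOTONIC */
	monotonic_to_bootbased(&ts);	/* add total sleep time */
	pr_info("boot-based time: %ld.%09ld\n", ts.tv_sec, ts.tv_nsec);
}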
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..1904797f4a8a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
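With the freeze/unfreeze machinery removed from ftrace.c, the coordination with kprobes is reversed: instead of ftrace skipping frozen records, ftrace now exports ftrace_text_reserved() and a probe-registration path is expected to refuse addresses that overlap an mcount record. A minimal sketch of such a check, assuming the prototype is available from linux/ftrace.h and using a hypothetical helper name:

#include <linux/ftrace.h>
#include <linux/errno.h>

static int my_can_probe(void *addr)
{
	/*
	 * ftrace_text_reserved() returns 1 when [start, end] overlaps an
	 * mcount call site that ftrace may rewrite at runtime.
	 */
	if (ftrace_text_reserved(addr, addr))
		return -EBUSY;

	return 0;
}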
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
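The new ftrace_perf_buf_prepare() centralizes the irq-disable, recursion-context and per-cpu buffer handling that the kprobe and syscall profile handlers below used to open-code; its counterpart ftrace_perf_buf_submit() pushes the filled record to perf and undoes the protection. A minimal sketch of the calling pattern, assuming the prototypes live in linux/ftrace_event.h; struct my_entry and the event id are placeholders, not part of this commit:

#include <linux/ftrace_event.h>

struct my_entry {
	struct trace_entry	ent;	/* header filled in by the prepare helper */
	unsigned long		value;
};

static void my_profile_handler(int event_id, unsigned long value)
{
	struct my_entry *entry;
	unsigned long irq_flags;
	int rctx;
	int size;

	/* pad the record to a u64 boundary, as the handlers below do */
	size = ALIGN(sizeof(*entry) + sizeof(u32), sizeof(u64)) - sizeof(u32);

	entry = ftrace_perf_buf_prepare(size, event_id, &rctx, &irq_flags);
	if (!entry)
		return;		/* recursion, or no profile buffer allocated */

	entry->value = value;

	/* addr = 0, count = 1; also drops rctx and restores irqs */
	ftrace_perf_buf_submit(entry, size, rctx, 0, 1, irq_flags);
}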
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1371,7 +1371,7 @@ out_unlock:
1371 return err; 1371 return err;
1372} 1372}
1373 1373
1374#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1375 1375
1376void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1377{
@@ -1439,5 +1439,5 @@ out_unlock:
1439 return err; 1439 return err;
1440} 1440}
1441 1441
1442#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1443 1443
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 6ea90c0e2c96..356c10227c98 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
489 } 482 }
490 } else 483 } else
491 ret = -EINVAL; 484 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 485 } else
501 ret = -EINVAL; 486 ret = -EINVAL;
502 return ret; 487 return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 598 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 599 * $retval : fetch return value
616 * $stack : fetch stack address 600 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -689,7 +673,7 @@ static int create_trace_probe(int argc, char **argv)
689 return -EINVAL; 673 return -EINVAL;
690 } 674 }
691 /* an address specified */ 675 /* an address specified */
692 ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); 676 ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr);
693 if (ret) { 677 if (ret) {
694 pr_info("Failed to parse address.\n"); 678 pr_info("Failed to parse address.\n");
695 return ret; 679 return ret;
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
958}; 942};
959 943
960/* Kprobe handler */ 944/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 946{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc); 963 irq_flags, pc);
980 if (!event) 964 if (!event)
981 return 0; 965 return;
982 966
983 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
988 972
989 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 975}
993 976
994/* Kretprobe handler */ 977/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 979 struct pt_regs *regs)
997{ 980{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc); 995 irq_flags, pc);
1013 if (!event) 996 if (!event)
1014 return 0; 997 return;
1015 998
1016 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1022 1005
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1008}
1028 1009
1029/* Event entry printers */ 1010/* Event entry printers */
@@ -1250,137 +1231,67 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
1250 ", REC->" FIELD_STRING_RETIP); 1231 ", REC->" FIELD_STRING_RETIP);
1251} 1232}
1252 1233
1253#ifdef CONFIG_EVENT_PROFILE 1234#ifdef CONFIG_PERF_EVENTS
1254 1235
1255/* Kprobe profile handler */ 1236/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1237static __kprobes void kprobe_profile_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1238 struct pt_regs *regs)
1258{ 1239{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1240 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1241 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1242 struct kprobe_trace_entry *entry;
1262 struct trace_entry *ent; 1243 int size, __size, i;
1263 int size, __size, i, pc, __cpu;
1264 unsigned long irq_flags; 1244 unsigned long irq_flags;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1245 int rctx;
1268 1246
1269 pc = preempt_count();
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1247 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1248 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1249 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1250 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1274 "profile buffer not large enough")) 1251 "profile buffer not large enough"))
1275 return 0; 1252 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282 1253
1283 rctx = perf_swevent_get_recursion_context(); 1254 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1284 if (rctx < 0) 1255 if (!entry)
1285 goto end_recursion; 1256 return;
1286
1287 __cpu = smp_processor_id();
1288
1289 if (in_nmi())
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293
1294 if (!trace_buf)
1295 goto end;
1296
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1257
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args; 1258 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1259 entry->ip = (unsigned long)kp->addr;
1308 for (i = 0; i < tp->nr_args; i++) 1260 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1261 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311 1262
1312end: 1263 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316
1317 return 0;
1318} 1264}
1319 1265
1320/* Kretprobe profile handler */ 1266/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1267static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1268 struct pt_regs *regs)
1323{ 1269{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1270 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1271 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1272 struct kretprobe_trace_entry *entry;
1327 struct trace_entry *ent; 1273 int size, __size, i;
1328 int size, __size, i, pc, __cpu;
1329 unsigned long irq_flags; 1274 unsigned long irq_flags;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1275 int rctx;
1333 1276
1334 pc = preempt_count();
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1277 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1278 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1279 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1280 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1339 "profile buffer not large enough")) 1281 "profile buffer not large enough"))
1340 return 0; 1282 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347
1348 rctx = perf_swevent_get_recursion_context();
1349 if (rctx < 0)
1350 goto end_recursion;
1351
1352 __cpu = smp_processor_id();
1353
1354 if (in_nmi())
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1356 else
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363 1283
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1284 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1285 if (!entry)
1366 entry = (struct kretprobe_trace_entry *)raw_data; 1286 return;
1367 ent = &entry->ent;
1368 1287
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args; 1288 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1289 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1290 entry->ret_ip = (unsigned long)ri->ret_addr;
1374 for (i = 0; i < tp->nr_args; i++) 1291 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1292 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1293
1383 return 0; 1294 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1384} 1295}
1385 1296
1386static int probe_profile_enable(struct ftrace_event_call *call) 1297static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1408,7 +1319,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1319 disable_kprobe(&tp->rp.kp);
1409 } 1320 }
1410} 1321}
1411#endif /* CONFIG_EVENT_PROFILE */ 1322#endif /* CONFIG_PERF_EVENTS */
1412 1323
1413 1324
1414static __kprobes 1325static __kprobes
@@ -1418,10 +1329,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1329
1419 if (tp->flags & TP_FLAG_TRACE) 1330 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1331 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1332#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1333 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1334 kprobe_profile_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1335#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1336 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1337}
1427 1338
@@ -1432,10 +1343,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1343
1433 if (tp->flags & TP_FLAG_TRACE) 1344 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1345 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1346#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1347 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1348 kretprobe_profile_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1349#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1350 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1351}
1441 1352
@@ -1464,7 +1375,7 @@ static int register_probe_event(struct trace_probe *tp)
1464 call->regfunc = probe_event_enable; 1375 call->regfunc = probe_event_enable;
1465 call->unregfunc = probe_event_disable; 1376 call->unregfunc = probe_event_disable;
1466 1377
1467#ifdef CONFIG_EVENT_PROFILE 1378#ifdef CONFIG_PERF_EVENTS
1468 call->profile_enable = probe_profile_enable; 1379 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable; 1380 call->profile_disable = probe_profile_disable;
1470#endif 1381#endif
@@ -1523,28 +1434,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1434
1524static __init int kprobe_trace_self_tests_init(void) 1435static __init int kprobe_trace_self_tests_init(void)
1525{ 1436{
1526 int ret; 1437 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1438 int (*target)(int, int, int, int, int, int);
1439 struct trace_probe *tp;
1528 1440
1529 target = kprobe_trace_selftest_target; 1441 target = kprobe_trace_selftest_target;
1530 1442
1531 pr_info("Testing kprobe tracing: "); 1443 pr_info("Testing kprobe tracing: ");
1532 1444
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1445 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1446 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1447 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1448 pr_warning("error on probing function entry.\n");
1449 warn++;
1450 } else {
1451 /* Enable trace point */
1452 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1453 if (WARN_ON_ONCE(tp == NULL)) {
1454 pr_warning("error on getting new probe.\n");
1455 warn++;
1456 } else
1457 probe_event_enable(&tp->call);
1458 }
1537 1459
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1460 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1461 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1462 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1463 pr_warning("error on probing function return.\n");
1464 warn++;
1465 } else {
1466 /* Enable trace point */
1467 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1468 if (WARN_ON_ONCE(tp == NULL)) {
1469 pr_warning("error on getting new probe.\n");
1470 warn++;
1471 } else
1472 probe_event_enable(&tp->call);
1473 }
1474
1475 if (warn)
1476 goto end;
1542 1477
1543 ret = target(1, 2, 3, 4, 5, 6); 1478 ret = target(1, 2, 3, 4, 5, 6);
1544 1479
1545 cleanup_all_probes(); 1480 ret = command_trace_probe("-:testprobe");
1481 if (WARN_ON_ONCE(ret)) {
1482 pr_warning("error on deleting a probe.\n");
1483 warn++;
1484 }
1485
1486 ret = command_trace_probe("-:testprobe2");
1487 if (WARN_ON_ONCE(ret)) {
1488 pr_warning("error on deleting a probe.\n");
1489 warn++;
1490 }
1546 1491
1547 pr_cont("OK\n"); 1492end:
1493 cleanup_all_probes();
1494 if (warn)
1495 pr_cont("NG: Some tests are failed. Please check them.\n");
1496 else
1497 pr_cont("OK\n");
1548 return 0; 1498 return 0;
1549} 1499}
1550 1500
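Besides switching to the shared perf buffer helpers, the trace_kprobe.c hunks drop the $argN fetch type, which is why the selftest's "$arg1 $arg2 $arg3 $arg4" string becomes "$stack $stack0 +0($stack)": probe definitions now name registers or stack slots explicitly. A minimal userspace sketch of the new syntax, under the assumptions that debugfs is mounted at /sys/kernel/debug and that the target is x86-64, where the first argument of do_sys_open() arrives in %di:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/debug/tracing/kprobe_events", "w");

	if (!f)
		return 1;

	/* old form: p:myopen do_sys_open $arg1
	 * new form: fetch the register (or a stack slot) explicitly */
	fprintf(f, "p:myopen do_sys_open %%di\n");

	return fclose(f) ? 1 : 0;
}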
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 678a5120ee30..f4bc9b27de5f 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
157 unsigned long val, flags; 157 unsigned long val, flags;
158 char buf[64]; 158 char buf[64];
159 int ret; 159 int ret;
160 int cpu;
160 161
161 if (count >= sizeof(buf)) 162 if (count >= sizeof(buf))
162 return -EINVAL; 163 return -EINVAL;
@@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf,
171 return ret; 172 return ret;
172 173
173 local_irq_save(flags); 174 local_irq_save(flags);
175
176 /*
177 * In case we trace inside arch_spin_lock() or after (NMI),
178 * we will cause circular lock, so we also need to increase
179 * the percpu trace_active here.
180 */
181 cpu = smp_processor_id();
182 per_cpu(trace_active, cpu)++;
183
174 arch_spin_lock(&max_stack_lock); 184 arch_spin_lock(&max_stack_lock);
175 *ptr = val; 185 *ptr = val;
176 arch_spin_unlock(&max_stack_lock); 186 arch_spin_unlock(&max_stack_lock);
187
188 per_cpu(trace_active, cpu)--;
177 local_irq_restore(flags); 189 local_irq_restore(flags);
178 190
179 return count; 191 return count;
@@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
206 218
207static void *t_start(struct seq_file *m, loff_t *pos) 219static void *t_start(struct seq_file *m, loff_t *pos)
208{ 220{
221 int cpu;
222
209 local_irq_disable(); 223 local_irq_disable();
224
225 cpu = smp_processor_id();
226 per_cpu(trace_active, cpu)++;
227
210 arch_spin_lock(&max_stack_lock); 228 arch_spin_lock(&max_stack_lock);
211 229
212 if (*pos == 0) 230 if (*pos == 0)
@@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos)
217 235
218static void t_stop(struct seq_file *m, void *p) 236static void t_stop(struct seq_file *m, void *p)
219{ 237{
238 int cpu;
239
220 arch_spin_unlock(&max_stack_lock); 240 arch_spin_unlock(&max_stack_lock);
241
242 cpu = smp_processor_id();
243 per_cpu(trace_active, cpu)--;
244
221 local_irq_enable(); 245 local_irq_enable();
222} 246}
223 247
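The trace_stack.c hunks bump the per-cpu trace_active counter around every max_stack_lock critical section, so that if the stack tracer fires inside arch_spin_lock() (or from an NMI taken there) its callback sees trace_active set and bails out instead of deadlocking on the same arch_spinlock_t. A minimal sketch of the guard pattern the three hunks repeat, written as if it lived in trace_stack.c next to the trace_active and max_stack_lock it touches:

static void guarded_max_stack_access(void (*body)(void))
{
	unsigned long flags;
	int cpu;

	local_irq_save(flags);

	cpu = smp_processor_id();
	per_cpu(trace_active, cpu)++;	/* stack_trace_call() backs off */

	arch_spin_lock(&max_stack_lock);
	body();
	arch_spin_unlock(&max_stack_lock);

	per_cpu(trace_active, cpu)--;
	local_irq_restore(flags);
}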
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..4e332b9e449c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void)
421} 421}
422core_initcall(init_ftrace_syscalls); 422core_initcall(init_ftrace_syscalls);
423 423
424#ifdef CONFIG_EVENT_PROFILE 424#ifdef CONFIG_PERF_EVENTS
425 425
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 433 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 434 struct syscall_trace_enter *rec;
435 unsigned long flags; 435 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 436 int syscall_nr;
439 int rctx; 437 int rctx;
440 int size; 438 int size;
441 int cpu;
442 439
443 syscall_nr = syscall_get_nr(current, regs); 440 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 441 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 454 "profile buffer not large enough"))
458 return; 455 return;
459 456
460 /* Protect the per cpu buffer, begin the rcu read side */ 457 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 458 sys_data->enter_event->id, &rctx, &flags);
462 459 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 460 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 461
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 462 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 463 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 464 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 465 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 466}
492 467
493int prof_sysenter_enable(struct ftrace_event_call *call) 468int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 506 struct syscall_trace_exit *rec;
532 unsigned long flags; 507 unsigned long flags;
533 int syscall_nr; 508 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 509 int rctx;
537 int size; 510 int size;
538 int cpu;
539 511
540 syscall_nr = syscall_get_nr(current, regs); 512 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 513 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 529 "exit event has grown above profile buffer size"))
558 return; 530 return;
559 531
560 /* Protect the per cpu buffer, begin the rcu read side */ 532 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 533 sys_data->exit_event->id, &rctx, &flags);
562 534 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 535 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 536
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 537 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 538 rec->ret = syscall_get_return_value(current, regs);
585 539
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 540 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 541}
593 542
594int prof_sysexit_enable(struct ftrace_event_call *call) 543int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -626,6 +575,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 575 mutex_unlock(&syscall_trace_lock);
627} 576}
628 577
629#endif 578#endif /* CONFIG_PERF_EVENTS */
630
631 579