Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c                     5
-rw-r--r--  kernel/events/hw_breakpoint.c            2
-rw-r--r--  kernel/events/uprobes.c                466
-rw-r--r--  kernel/kprobes.c                         8
-rw-r--r--  kernel/profile.c                        24
-rw-r--r--  kernel/ptrace.c                          6
-rw-r--r--  kernel/trace/Kconfig                    18
-rw-r--r--  kernel/trace/blktrace.c                  2
-rw-r--r--  kernel/trace/ftrace.c                   88
-rw-r--r--  kernel/trace/ring_buffer.c             108
-rw-r--r--  kernel/trace/trace.c                   252
-rw-r--r--  kernel/trace/trace.h                   134
-rw-r--r--  kernel/trace/trace_clock.c               4
-rw-r--r--  kernel/trace/trace_events.c              1
-rw-r--r--  kernel/trace/trace_functions.c          61
-rw-r--r--  kernel/trace/trace_functions_graph.c    68
-rw-r--r--  kernel/trace/trace_probe.h               1
-rw-r--r--  kernel/trace/trace_selftest.c           21
-rw-r--r--  kernel/trace/trace_syscalls.c           18
-rw-r--r--  kernel/trace/trace_uprobe.c            217
20 files changed, 992 insertions(+), 512 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7b6646a8c067..5c75791d7269 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -6171,11 +6171,14 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
+
+		if (attr->type == PERF_TYPE_TRACEPOINT)
+			event->hw.tp_target = task;
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
 		 * hw_breakpoint is a bit difficult here..
 		 */
-		if (attr->type == PERF_TYPE_BREAKPOINT)
+		else if (attr->type == PERF_TYPE_BREAKPOINT)
 			event->hw.bp_target = task;
 #endif
 	}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index fe8a916507ed..a64f8aeb5c1f 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -676,7 +676,7 @@ int __init init_hw_breakpoint(void)
 err_alloc:
 	for_each_possible_cpu(err_cpu) {
 		for (i = 0; i < TYPE_MAX; i++)
-			kfree(per_cpu(nr_task_bp_pinned[i], cpu));
+			kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
 		if (err_cpu == cpu)
 			break;
 	}
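
The one-character fix above (cpu -> err_cpu) matters because the error path walks the partially initialized per-cpu state: freeing with the allocation cursor instead of the cleanup index would free the failing CPU's buffers repeatedly and leak everyone else's. A user-space sketch of the same unwind-on-error pattern, with hypothetical names, for illustration only:

#include <stdlib.h>

#define NR_ITEMS 8

static void *items[NR_ITEMS];

/*
 * Unwind-on-error sketch of the err_alloc loop fixed above: free with the
 * cleanup index (j), not the allocation cursor (i), and stop once the
 * failing index has been reached.
 */
static int alloc_all(void)
{
	int i, j;

	for (i = 0; i < NR_ITEMS; i++) {
		items[i] = malloc(64);
		if (!items[i])
			goto err_alloc;
	}
	return 0;

err_alloc:
	for (j = 0; j < NR_ITEMS; j++) {
		free(items[j]);		/* free(NULL) at the failing slot is harmless */
		if (j == i)
			break;
	}
	return -1;
}

int main(void)
{
	return alloc_all() ? 1 : 0;
}
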
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index dea7acfbb071..a567c8c7ef31 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -27,6 +27,7 @@
27#include <linux/pagemap.h> /* read_mapping_page */ 27#include <linux/pagemap.h> /* read_mapping_page */
28#include <linux/slab.h> 28#include <linux/slab.h>
29#include <linux/sched.h> 29#include <linux/sched.h>
30#include <linux/export.h>
30#include <linux/rmap.h> /* anon_vma_prepare */ 31#include <linux/rmap.h> /* anon_vma_prepare */
31#include <linux/mmu_notifier.h> /* set_pte_at_notify */ 32#include <linux/mmu_notifier.h> /* set_pte_at_notify */
32#include <linux/swap.h> /* try_to_free_swap */ 33#include <linux/swap.h> /* try_to_free_swap */
@@ -41,58 +42,31 @@
41#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE 42#define MAX_UPROBE_XOL_SLOTS UINSNS_PER_PAGE
42 43
43static struct rb_root uprobes_tree = RB_ROOT; 44static struct rb_root uprobes_tree = RB_ROOT;
44
45static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
46
47#define UPROBES_HASH_SZ 13
48
49/* 45/*
50 * We need separate register/unregister and mmap/munmap lock hashes because 46 * allows us to skip the uprobe_mmap if there are no uprobe events active
51 * of mmap_sem nesting. 47 * at this time. Probably a fine grained per inode count is better?
52 *
53 * uprobe_register() needs to install probes on (potentially) all processes
54 * and thus needs to acquire multiple mmap_sems (consequtively, not
55 * concurrently), whereas uprobe_mmap() is called while holding mmap_sem
56 * for the particular process doing the mmap.
57 *
58 * uprobe_register()->register_for_each_vma() needs to drop/acquire mmap_sem
59 * because of lock order against i_mmap_mutex. This means there's a hole in
60 * the register vma iteration where a mmap() can happen.
61 *
62 * Thus uprobe_register() can race with uprobe_mmap() and we can try and
63 * install a probe where one is already installed.
64 */ 48 */
49#define no_uprobe_events() RB_EMPTY_ROOT(&uprobes_tree)
65 50
66/* serialize (un)register */ 51static DEFINE_SPINLOCK(uprobes_treelock); /* serialize rbtree access */
67static struct mutex uprobes_mutex[UPROBES_HASH_SZ];
68
69#define uprobes_hash(v) (&uprobes_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
70 52
53#define UPROBES_HASH_SZ 13
71/* serialize uprobe->pending_list */ 54/* serialize uprobe->pending_list */
72static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ]; 55static struct mutex uprobes_mmap_mutex[UPROBES_HASH_SZ];
73#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ]) 56#define uprobes_mmap_hash(v) (&uprobes_mmap_mutex[((unsigned long)(v)) % UPROBES_HASH_SZ])
74 57
75static struct percpu_rw_semaphore dup_mmap_sem; 58static struct percpu_rw_semaphore dup_mmap_sem;
76 59
77/*
78 * uprobe_events allows us to skip the uprobe_mmap if there are no uprobe
79 * events active at this time. Probably a fine grained per inode count is
80 * better?
81 */
82static atomic_t uprobe_events = ATOMIC_INIT(0);
83
84/* Have a copy of original instruction */ 60/* Have a copy of original instruction */
85#define UPROBE_COPY_INSN 0 61#define UPROBE_COPY_INSN 0
86/* Dont run handlers when first register/ last unregister in progress*/
87#define UPROBE_RUN_HANDLER 1
88/* Can skip singlestep */ 62/* Can skip singlestep */
89#define UPROBE_SKIP_SSTEP 2 63#define UPROBE_SKIP_SSTEP 1
90 64
91struct uprobe { 65struct uprobe {
92 struct rb_node rb_node; /* node in the rb tree */ 66 struct rb_node rb_node; /* node in the rb tree */
93 atomic_t ref; 67 atomic_t ref;
68 struct rw_semaphore register_rwsem;
94 struct rw_semaphore consumer_rwsem; 69 struct rw_semaphore consumer_rwsem;
95 struct mutex copy_mutex; /* TODO: kill me and UPROBE_COPY_INSN */
96 struct list_head pending_list; 70 struct list_head pending_list;
97 struct uprobe_consumer *consumers; 71 struct uprobe_consumer *consumers;
98 struct inode *inode; /* Also hold a ref to inode */ 72 struct inode *inode; /* Also hold a ref to inode */
@@ -430,9 +404,6 @@ static struct uprobe *insert_uprobe(struct uprobe *uprobe)
430 u = __insert_uprobe(uprobe); 404 u = __insert_uprobe(uprobe);
431 spin_unlock(&uprobes_treelock); 405 spin_unlock(&uprobes_treelock);
432 406
433 /* For now assume that the instruction need not be single-stepped */
434 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
435
436 return u; 407 return u;
437} 408}
438 409
@@ -452,8 +423,10 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
452 423
453 uprobe->inode = igrab(inode); 424 uprobe->inode = igrab(inode);
454 uprobe->offset = offset; 425 uprobe->offset = offset;
426 init_rwsem(&uprobe->register_rwsem);
455 init_rwsem(&uprobe->consumer_rwsem); 427 init_rwsem(&uprobe->consumer_rwsem);
456 mutex_init(&uprobe->copy_mutex); 428 /* For now assume that the instruction need not be single-stepped */
429 __set_bit(UPROBE_SKIP_SSTEP, &uprobe->flags);
457 430
458 /* add to uprobes_tree, sorted on inode:offset */ 431 /* add to uprobes_tree, sorted on inode:offset */
459 cur_uprobe = insert_uprobe(uprobe); 432 cur_uprobe = insert_uprobe(uprobe);
@@ -463,38 +436,17 @@ static struct uprobe *alloc_uprobe(struct inode *inode, loff_t offset)
463 kfree(uprobe); 436 kfree(uprobe);
464 uprobe = cur_uprobe; 437 uprobe = cur_uprobe;
465 iput(inode); 438 iput(inode);
466 } else {
467 atomic_inc(&uprobe_events);
468 } 439 }
469 440
470 return uprobe; 441 return uprobe;
471} 442}
472 443
473static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs) 444static void consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
474{
475 struct uprobe_consumer *uc;
476
477 if (!test_bit(UPROBE_RUN_HANDLER, &uprobe->flags))
478 return;
479
480 down_read(&uprobe->consumer_rwsem);
481 for (uc = uprobe->consumers; uc; uc = uc->next) {
482 if (!uc->filter || uc->filter(uc, current))
483 uc->handler(uc, regs);
484 }
485 up_read(&uprobe->consumer_rwsem);
486}
487
488/* Returns the previous consumer */
489static struct uprobe_consumer *
490consumer_add(struct uprobe *uprobe, struct uprobe_consumer *uc)
491{ 445{
492 down_write(&uprobe->consumer_rwsem); 446 down_write(&uprobe->consumer_rwsem);
493 uc->next = uprobe->consumers; 447 uc->next = uprobe->consumers;
494 uprobe->consumers = uc; 448 uprobe->consumers = uc;
495 up_write(&uprobe->consumer_rwsem); 449 up_write(&uprobe->consumer_rwsem);
496
497 return uc->next;
498} 450}
499 451
500/* 452/*
@@ -588,7 +540,8 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
588 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 540 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
589 return ret; 541 return ret;
590 542
591 mutex_lock(&uprobe->copy_mutex); 543 /* TODO: move this into _register, until then we abuse this sem. */
544 down_write(&uprobe->consumer_rwsem);
592 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags)) 545 if (test_bit(UPROBE_COPY_INSN, &uprobe->flags))
593 goto out; 546 goto out;
594 547
@@ -612,7 +565,30 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
612 set_bit(UPROBE_COPY_INSN, &uprobe->flags); 565 set_bit(UPROBE_COPY_INSN, &uprobe->flags);
613 566
614 out: 567 out:
615 mutex_unlock(&uprobe->copy_mutex); 568 up_write(&uprobe->consumer_rwsem);
569
570 return ret;
571}
572
573static inline bool consumer_filter(struct uprobe_consumer *uc,
574 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
575{
576 return !uc->filter || uc->filter(uc, ctx, mm);
577}
578
579static bool filter_chain(struct uprobe *uprobe,
580 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
581{
582 struct uprobe_consumer *uc;
583 bool ret = false;
584
585 down_read(&uprobe->consumer_rwsem);
586 for (uc = uprobe->consumers; uc; uc = uc->next) {
587 ret = consumer_filter(uc, ctx, mm);
588 if (ret)
589 break;
590 }
591 up_read(&uprobe->consumer_rwsem);
616 592
617 return ret; 593 return ret;
618} 594}
@@ -624,16 +600,6 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
624 bool first_uprobe; 600 bool first_uprobe;
625 int ret; 601 int ret;
626 602
627 /*
628 * If probe is being deleted, unregister thread could be done with
629 * the vma-rmap-walk through. Adding a probe now can be fatal since
630 * nobody will be able to cleanup. Also we could be from fork or
631 * mremap path, where the probe might have already been inserted.
632 * Hence behave as if probe already existed.
633 */
634 if (!uprobe->consumers)
635 return 0;
636
637 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr); 603 ret = prepare_uprobe(uprobe, vma->vm_file, mm, vaddr);
638 if (ret) 604 if (ret)
639 return ret; 605 return ret;
@@ -658,14 +624,14 @@ install_breakpoint(struct uprobe *uprobe, struct mm_struct *mm,
658static int 624static int
659remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr) 625remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vaddr)
660{ 626{
661 /* can happen if uprobe_register() fails */
662 if (!test_bit(MMF_HAS_UPROBES, &mm->flags))
663 return 0;
664
665 set_bit(MMF_RECALC_UPROBES, &mm->flags); 627 set_bit(MMF_RECALC_UPROBES, &mm->flags);
666 return set_orig_insn(&uprobe->arch, mm, vaddr); 628 return set_orig_insn(&uprobe->arch, mm, vaddr);
667} 629}
668 630
631static inline bool uprobe_is_active(struct uprobe *uprobe)
632{
633 return !RB_EMPTY_NODE(&uprobe->rb_node);
634}
669/* 635/*
670 * There could be threads that have already hit the breakpoint. They 636 * There could be threads that have already hit the breakpoint. They
671 * will recheck the current insn and restart if find_uprobe() fails. 637 * will recheck the current insn and restart if find_uprobe() fails.
@@ -673,12 +639,15 @@ remove_breakpoint(struct uprobe *uprobe, struct mm_struct *mm, unsigned long vad
673 */ 639 */
674static void delete_uprobe(struct uprobe *uprobe) 640static void delete_uprobe(struct uprobe *uprobe)
675{ 641{
642 if (WARN_ON(!uprobe_is_active(uprobe)))
643 return;
644
676 spin_lock(&uprobes_treelock); 645 spin_lock(&uprobes_treelock);
677 rb_erase(&uprobe->rb_node, &uprobes_tree); 646 rb_erase(&uprobe->rb_node, &uprobes_tree);
678 spin_unlock(&uprobes_treelock); 647 spin_unlock(&uprobes_treelock);
648 RB_CLEAR_NODE(&uprobe->rb_node); /* for uprobe_is_active() */
679 iput(uprobe->inode); 649 iput(uprobe->inode);
680 put_uprobe(uprobe); 650 put_uprobe(uprobe);
681 atomic_dec(&uprobe_events);
682} 651}
683 652
684struct map_info { 653struct map_info {
@@ -764,8 +733,10 @@ build_map_info(struct address_space *mapping, loff_t offset, bool is_register)
764 return curr; 733 return curr;
765} 734}
766 735
767static int register_for_each_vma(struct uprobe *uprobe, bool is_register) 736static int
737register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
768{ 738{
739 bool is_register = !!new;
769 struct map_info *info; 740 struct map_info *info;
770 int err = 0; 741 int err = 0;
771 742
@@ -794,10 +765,16 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
794 vaddr_to_offset(vma, info->vaddr) != uprobe->offset) 765 vaddr_to_offset(vma, info->vaddr) != uprobe->offset)
795 goto unlock; 766 goto unlock;
796 767
797 if (is_register) 768 if (is_register) {
798 err = install_breakpoint(uprobe, mm, vma, info->vaddr); 769 /* consult only the "caller", new consumer. */
799 else 770 if (consumer_filter(new,
800 err |= remove_breakpoint(uprobe, mm, info->vaddr); 771 UPROBE_FILTER_REGISTER, mm))
772 err = install_breakpoint(uprobe, mm, vma, info->vaddr);
773 } else if (test_bit(MMF_HAS_UPROBES, &mm->flags)) {
774 if (!filter_chain(uprobe,
775 UPROBE_FILTER_UNREGISTER, mm))
776 err |= remove_breakpoint(uprobe, mm, info->vaddr);
777 }
801 778
802 unlock: 779 unlock:
803 up_write(&mm->mmap_sem); 780 up_write(&mm->mmap_sem);
@@ -810,17 +787,23 @@ static int register_for_each_vma(struct uprobe *uprobe, bool is_register)
810 return err; 787 return err;
811} 788}
812 789
813static int __uprobe_register(struct uprobe *uprobe) 790static int __uprobe_register(struct uprobe *uprobe, struct uprobe_consumer *uc)
814{ 791{
815 return register_for_each_vma(uprobe, true); 792 consumer_add(uprobe, uc);
793 return register_for_each_vma(uprobe, uc);
816} 794}
817 795
818static void __uprobe_unregister(struct uprobe *uprobe) 796static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *uc)
819{ 797{
820 if (!register_for_each_vma(uprobe, false)) 798 int err;
821 delete_uprobe(uprobe); 799
800 if (!consumer_del(uprobe, uc)) /* WARN? */
801 return;
822 802
803 err = register_for_each_vma(uprobe, NULL);
823 /* TODO : cant unregister? schedule a worker thread */ 804 /* TODO : cant unregister? schedule a worker thread */
805 if (!uprobe->consumers && !err)
806 delete_uprobe(uprobe);
824} 807}
825 808
826/* 809/*
@@ -845,31 +828,59 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
845 struct uprobe *uprobe; 828 struct uprobe *uprobe;
846 int ret; 829 int ret;
847 830
848 if (!inode || !uc || uc->next) 831 /* Racy, just to catch the obvious mistakes */
849 return -EINVAL;
850
851 if (offset > i_size_read(inode)) 832 if (offset > i_size_read(inode))
852 return -EINVAL; 833 return -EINVAL;
853 834
854 ret = 0; 835 retry:
855 mutex_lock(uprobes_hash(inode));
856 uprobe = alloc_uprobe(inode, offset); 836 uprobe = alloc_uprobe(inode, offset);
857 837 if (!uprobe)
858 if (!uprobe) { 838 return -ENOMEM;
859 ret = -ENOMEM; 839 /*
860 } else if (!consumer_add(uprobe, uc)) { 840 * We can race with uprobe_unregister()->delete_uprobe().
861 ret = __uprobe_register(uprobe); 841 * Check uprobe_is_active() and retry if it is false.
862 if (ret) { 842 */
863 uprobe->consumers = NULL; 843 down_write(&uprobe->register_rwsem);
864 __uprobe_unregister(uprobe); 844 ret = -EAGAIN;
865 } else { 845 if (likely(uprobe_is_active(uprobe))) {
866 set_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 846 ret = __uprobe_register(uprobe, uc);
867 } 847 if (ret)
848 __uprobe_unregister(uprobe, uc);
868 } 849 }
850 up_write(&uprobe->register_rwsem);
851 put_uprobe(uprobe);
869 852
870 mutex_unlock(uprobes_hash(inode)); 853 if (unlikely(ret == -EAGAIN))
871 if (uprobe) 854 goto retry;
872 put_uprobe(uprobe); 855 return ret;
856}
857EXPORT_SYMBOL_GPL(uprobe_register);
858
859/*
860 * uprobe_apply - unregister a already registered probe.
861 * @inode: the file in which the probe has to be removed.
862 * @offset: offset from the start of the file.
863 * @uc: consumer which wants to add more or remove some breakpoints
864 * @add: add or remove the breakpoints
865 */
866int uprobe_apply(struct inode *inode, loff_t offset,
867 struct uprobe_consumer *uc, bool add)
868{
869 struct uprobe *uprobe;
870 struct uprobe_consumer *con;
871 int ret = -ENOENT;
872
873 uprobe = find_uprobe(inode, offset);
874 if (!uprobe)
875 return ret;
876
877 down_write(&uprobe->register_rwsem);
878 for (con = uprobe->consumers; con && con != uc ; con = con->next)
879 ;
880 if (con)
881 ret = register_for_each_vma(uprobe, add ? uc : NULL);
882 up_write(&uprobe->register_rwsem);
883 put_uprobe(uprobe);
873 884
874 return ret; 885 return ret;
875} 886}
@@ -884,25 +895,42 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
884{ 895{
885 struct uprobe *uprobe; 896 struct uprobe *uprobe;
886 897
887 if (!inode || !uc)
888 return;
889
890 uprobe = find_uprobe(inode, offset); 898 uprobe = find_uprobe(inode, offset);
891 if (!uprobe) 899 if (!uprobe)
892 return; 900 return;
893 901
894 mutex_lock(uprobes_hash(inode)); 902 down_write(&uprobe->register_rwsem);
903 __uprobe_unregister(uprobe, uc);
904 up_write(&uprobe->register_rwsem);
905 put_uprobe(uprobe);
906}
907EXPORT_SYMBOL_GPL(uprobe_unregister);
895 908
896 if (consumer_del(uprobe, uc)) { 909static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
897 if (!uprobe->consumers) { 910{
898 __uprobe_unregister(uprobe); 911 struct vm_area_struct *vma;
899 clear_bit(UPROBE_RUN_HANDLER, &uprobe->flags); 912 int err = 0;
900 } 913
914 down_read(&mm->mmap_sem);
915 for (vma = mm->mmap; vma; vma = vma->vm_next) {
916 unsigned long vaddr;
917 loff_t offset;
918
919 if (!valid_vma(vma, false) ||
920 vma->vm_file->f_mapping->host != uprobe->inode)
921 continue;
922
923 offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
924 if (uprobe->offset < offset ||
925 uprobe->offset >= offset + vma->vm_end - vma->vm_start)
926 continue;
927
928 vaddr = offset_to_vaddr(vma, uprobe->offset);
929 err |= remove_breakpoint(uprobe, mm, vaddr);
901 } 930 }
931 up_read(&mm->mmap_sem);
902 932
903 mutex_unlock(uprobes_hash(inode)); 933 return err;
904 if (uprobe)
905 put_uprobe(uprobe);
906} 934}
907 935
908static struct rb_node * 936static struct rb_node *
@@ -979,7 +1007,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
979 struct uprobe *uprobe, *u; 1007 struct uprobe *uprobe, *u;
980 struct inode *inode; 1008 struct inode *inode;
981 1009
982 if (!atomic_read(&uprobe_events) || !valid_vma(vma, true)) 1010 if (no_uprobe_events() || !valid_vma(vma, true))
983 return 0; 1011 return 0;
984 1012
985 inode = vma->vm_file->f_mapping->host; 1013 inode = vma->vm_file->f_mapping->host;
@@ -988,9 +1016,14 @@ int uprobe_mmap(struct vm_area_struct *vma)
988 1016
989 mutex_lock(uprobes_mmap_hash(inode)); 1017 mutex_lock(uprobes_mmap_hash(inode));
990 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list); 1018 build_probe_list(inode, vma, vma->vm_start, vma->vm_end, &tmp_list);
991 1019 /*
1020 * We can race with uprobe_unregister(), this uprobe can be already
1021 * removed. But in this case filter_chain() must return false, all
1022 * consumers have gone away.
1023 */
992 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) { 1024 list_for_each_entry_safe(uprobe, u, &tmp_list, pending_list) {
993 if (!fatal_signal_pending(current)) { 1025 if (!fatal_signal_pending(current) &&
1026 filter_chain(uprobe, UPROBE_FILTER_MMAP, vma->vm_mm)) {
994 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset); 1027 unsigned long vaddr = offset_to_vaddr(vma, uprobe->offset);
995 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr); 1028 install_breakpoint(uprobe, vma->vm_mm, vma, vaddr);
996 } 1029 }
@@ -1025,7 +1058,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
1025 */ 1058 */
1026void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end) 1059void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned long end)
1027{ 1060{
1028 if (!atomic_read(&uprobe_events) || !valid_vma(vma, false)) 1061 if (no_uprobe_events() || !valid_vma(vma, false))
1029 return; 1062 return;
1030 1063
1031 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */ 1064 if (!atomic_read(&vma->vm_mm->mm_users)) /* called by mmput() ? */
@@ -1042,22 +1075,14 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
1042/* Slot allocation for XOL */ 1075/* Slot allocation for XOL */
1043static int xol_add_vma(struct xol_area *area) 1076static int xol_add_vma(struct xol_area *area)
1044{ 1077{
1045 struct mm_struct *mm; 1078 struct mm_struct *mm = current->mm;
1046 int ret; 1079 int ret = -EALREADY;
1047
1048 area->page = alloc_page(GFP_HIGHUSER);
1049 if (!area->page)
1050 return -ENOMEM;
1051
1052 ret = -EALREADY;
1053 mm = current->mm;
1054 1080
1055 down_write(&mm->mmap_sem); 1081 down_write(&mm->mmap_sem);
1056 if (mm->uprobes_state.xol_area) 1082 if (mm->uprobes_state.xol_area)
1057 goto fail; 1083 goto fail;
1058 1084
1059 ret = -ENOMEM; 1085 ret = -ENOMEM;
1060
1061 /* Try to map as high as possible, this is only a hint. */ 1086 /* Try to map as high as possible, this is only a hint. */
1062 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0); 1087 area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
1063 if (area->vaddr & ~PAGE_MASK) { 1088 if (area->vaddr & ~PAGE_MASK) {
@@ -1073,54 +1098,53 @@ static int xol_add_vma(struct xol_area *area)
1073 smp_wmb(); /* pairs with get_xol_area() */ 1098 smp_wmb(); /* pairs with get_xol_area() */
1074 mm->uprobes_state.xol_area = area; 1099 mm->uprobes_state.xol_area = area;
1075 ret = 0; 1100 ret = 0;
1076 1101 fail:
1077fail:
1078 up_write(&mm->mmap_sem); 1102 up_write(&mm->mmap_sem);
1079 if (ret)
1080 __free_page(area->page);
1081 1103
1082 return ret; 1104 return ret;
1083} 1105}
1084 1106
1085static struct xol_area *get_xol_area(struct mm_struct *mm)
1086{
1087 struct xol_area *area;
1088
1089 area = mm->uprobes_state.xol_area;
1090 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1091
1092 return area;
1093}
1094
1095/* 1107/*
1096 * xol_alloc_area - Allocate process's xol_area. 1108 * get_xol_area - Allocate process's xol_area if necessary.
1097 * This area will be used for storing instructions for execution out of 1109 * This area will be used for storing instructions for execution out of line.
1098 * line.
1099 * 1110 *
1100 * Returns the allocated area or NULL. 1111 * Returns the allocated area or NULL.
1101 */ 1112 */
1102static struct xol_area *xol_alloc_area(void) 1113static struct xol_area *get_xol_area(void)
1103{ 1114{
1115 struct mm_struct *mm = current->mm;
1104 struct xol_area *area; 1116 struct xol_area *area;
1105 1117
1118 area = mm->uprobes_state.xol_area;
1119 if (area)
1120 goto ret;
1121
1106 area = kzalloc(sizeof(*area), GFP_KERNEL); 1122 area = kzalloc(sizeof(*area), GFP_KERNEL);
1107 if (unlikely(!area)) 1123 if (unlikely(!area))
1108 return NULL; 1124 goto out;
1109 1125
1110 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL); 1126 area->bitmap = kzalloc(BITS_TO_LONGS(UINSNS_PER_PAGE) * sizeof(long), GFP_KERNEL);
1111
1112 if (!area->bitmap) 1127 if (!area->bitmap)
1113 goto fail; 1128 goto free_area;
1129
1130 area->page = alloc_page(GFP_HIGHUSER);
1131 if (!area->page)
1132 goto free_bitmap;
1114 1133
1115 init_waitqueue_head(&area->wq); 1134 init_waitqueue_head(&area->wq);
1116 if (!xol_add_vma(area)) 1135 if (!xol_add_vma(area))
1117 return area; 1136 return area;
1118 1137
1119fail: 1138 __free_page(area->page);
1139 free_bitmap:
1120 kfree(area->bitmap); 1140 kfree(area->bitmap);
1141 free_area:
1121 kfree(area); 1142 kfree(area);
1122 1143 out:
1123 return get_xol_area(current->mm); 1144 area = mm->uprobes_state.xol_area;
1145 ret:
1146 smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */
1147 return area;
1124} 1148}
1125 1149
1126/* 1150/*
@@ -1186,33 +1210,26 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
1186} 1210}
1187 1211
1188/* 1212/*
1189 * xol_get_insn_slot - If was not allocated a slot, then 1213 * xol_get_insn_slot - allocate a slot for xol.
1190 * allocate a slot.
1191 * Returns the allocated slot address or 0. 1214 * Returns the allocated slot address or 0.
1192 */ 1215 */
1193static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot_addr) 1216static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
1194{ 1217{
1195 struct xol_area *area; 1218 struct xol_area *area;
1196 unsigned long offset; 1219 unsigned long offset;
1220 unsigned long xol_vaddr;
1197 void *vaddr; 1221 void *vaddr;
1198 1222
1199 area = get_xol_area(current->mm); 1223 area = get_xol_area();
1200 if (!area) { 1224 if (!area)
1201 area = xol_alloc_area(); 1225 return 0;
1202 if (!area)
1203 return 0;
1204 }
1205 current->utask->xol_vaddr = xol_take_insn_slot(area);
1206 1226
1207 /* 1227 xol_vaddr = xol_take_insn_slot(area);
1208 * Initialize the slot if xol_vaddr points to valid 1228 if (unlikely(!xol_vaddr))
1209 * instruction slot.
1210 */
1211 if (unlikely(!current->utask->xol_vaddr))
1212 return 0; 1229 return 0;
1213 1230
1214 current->utask->vaddr = slot_addr; 1231 /* Initialize the slot */
1215 offset = current->utask->xol_vaddr & ~PAGE_MASK; 1232 offset = xol_vaddr & ~PAGE_MASK;
1216 vaddr = kmap_atomic(area->page); 1233 vaddr = kmap_atomic(area->page);
1217 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES); 1234 memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
1218 kunmap_atomic(vaddr); 1235 kunmap_atomic(vaddr);
@@ -1222,7 +1239,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe, unsigned long slot
1222 */ 1239 */
1223 flush_dcache_page(area->page); 1240 flush_dcache_page(area->page);
1224 1241
1225 return current->utask->xol_vaddr; 1242 return xol_vaddr;
1226} 1243}
1227 1244
1228/* 1245/*
@@ -1240,8 +1257,7 @@ static void xol_free_insn_slot(struct task_struct *tsk)
1240 return; 1257 return;
1241 1258
1242 slot_addr = tsk->utask->xol_vaddr; 1259 slot_addr = tsk->utask->xol_vaddr;
1243 1260 if (unlikely(!slot_addr))
1244 if (unlikely(!slot_addr || IS_ERR_VALUE(slot_addr)))
1245 return; 1261 return;
1246 1262
1247 area = tsk->mm->uprobes_state.xol_area; 1263 area = tsk->mm->uprobes_state.xol_area;
@@ -1303,33 +1319,48 @@ void uprobe_copy_process(struct task_struct *t)
1303} 1319}
1304 1320
1305/* 1321/*
1306 * Allocate a uprobe_task object for the task. 1322 * Allocate a uprobe_task object for the task if if necessary.
1307 * Called when the thread hits a breakpoint for the first time. 1323 * Called when the thread hits a breakpoint.
1308 * 1324 *
1309 * Returns: 1325 * Returns:
1310 * - pointer to new uprobe_task on success 1326 * - pointer to new uprobe_task on success
1311 * - NULL otherwise 1327 * - NULL otherwise
1312 */ 1328 */
1313static struct uprobe_task *add_utask(void) 1329static struct uprobe_task *get_utask(void)
1314{ 1330{
1315 struct uprobe_task *utask; 1331 if (!current->utask)
1316 1332 current->utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
1317 utask = kzalloc(sizeof *utask, GFP_KERNEL); 1333 return current->utask;
1318 if (unlikely(!utask))
1319 return NULL;
1320
1321 current->utask = utask;
1322 return utask;
1323} 1334}
1324 1335
1325/* Prepare to single-step probed instruction out of line. */ 1336/* Prepare to single-step probed instruction out of line. */
1326static int 1337static int
1327pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long vaddr) 1338pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
1328{ 1339{
1329 if (xol_get_insn_slot(uprobe, vaddr) && !arch_uprobe_pre_xol(&uprobe->arch, regs)) 1340 struct uprobe_task *utask;
1330 return 0; 1341 unsigned long xol_vaddr;
1342 int err;
1343
1344 utask = get_utask();
1345 if (!utask)
1346 return -ENOMEM;
1347
1348 xol_vaddr = xol_get_insn_slot(uprobe);
1349 if (!xol_vaddr)
1350 return -ENOMEM;
1351
1352 utask->xol_vaddr = xol_vaddr;
1353 utask->vaddr = bp_vaddr;
1354
1355 err = arch_uprobe_pre_xol(&uprobe->arch, regs);
1356 if (unlikely(err)) {
1357 xol_free_insn_slot(current);
1358 return err;
1359 }
1331 1360
1332 return -EFAULT; 1361 utask->active_uprobe = uprobe;
1362 utask->state = UTASK_SSTEP;
1363 return 0;
1333} 1364}
1334 1365
1335/* 1366/*
@@ -1391,6 +1422,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
1391 * This is not strictly accurate, we can race with 1422 * This is not strictly accurate, we can race with
1392 * uprobe_unregister() and see the already removed 1423 * uprobe_unregister() and see the already removed
1393 * uprobe if delete_uprobe() was not yet called. 1424 * uprobe if delete_uprobe() was not yet called.
1425 * Or this uprobe can be filtered out.
1394 */ 1426 */
1395 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end)) 1427 if (vma_has_uprobes(vma, vma->vm_start, vma->vm_end))
1396 return; 1428 return;
@@ -1452,13 +1484,33 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
1452 return uprobe; 1484 return uprobe;
1453} 1485}
1454 1486
1487static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
1488{
1489 struct uprobe_consumer *uc;
1490 int remove = UPROBE_HANDLER_REMOVE;
1491
1492 down_read(&uprobe->register_rwsem);
1493 for (uc = uprobe->consumers; uc; uc = uc->next) {
1494 int rc = uc->handler(uc, regs);
1495
1496 WARN(rc & ~UPROBE_HANDLER_MASK,
1497 "bad rc=0x%x from %pf()\n", rc, uc->handler);
1498 remove &= rc;
1499 }
1500
1501 if (remove && uprobe->consumers) {
1502 WARN_ON(!uprobe_is_active(uprobe));
1503 unapply_uprobe(uprobe, current->mm);
1504 }
1505 up_read(&uprobe->register_rwsem);
1506}
1507
1455/* 1508/*
1456 * Run handler and ask thread to singlestep. 1509 * Run handler and ask thread to singlestep.
1457 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps. 1510 * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
1458 */ 1511 */
1459static void handle_swbp(struct pt_regs *regs) 1512static void handle_swbp(struct pt_regs *regs)
1460{ 1513{
1461 struct uprobe_task *utask;
1462 struct uprobe *uprobe; 1514 struct uprobe *uprobe;
1463 unsigned long bp_vaddr; 1515 unsigned long bp_vaddr;
1464 int uninitialized_var(is_swbp); 1516 int uninitialized_var(is_swbp);
@@ -1483,6 +1535,10 @@ static void handle_swbp(struct pt_regs *regs)
1483 } 1535 }
1484 return; 1536 return;
1485 } 1537 }
1538
1539 /* change it in advance for ->handler() and restart */
1540 instruction_pointer_set(regs, bp_vaddr);
1541
1486 /* 1542 /*
1487 * TODO: move copy_insn/etc into _register and remove this hack. 1543 * TODO: move copy_insn/etc into _register and remove this hack.
1488 * After we hit the bp, _unregister + _register can install the 1544 * After we hit the bp, _unregister + _register can install the
@@ -1490,32 +1546,16 @@ static void handle_swbp(struct pt_regs *regs)
1490 */ 1546 */
1491 smp_rmb(); /* pairs with wmb() in install_breakpoint() */ 1547 smp_rmb(); /* pairs with wmb() in install_breakpoint() */
1492 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags))) 1548 if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
1493 goto restart; 1549 goto out;
1494
1495 utask = current->utask;
1496 if (!utask) {
1497 utask = add_utask();
1498 /* Cannot allocate; re-execute the instruction. */
1499 if (!utask)
1500 goto restart;
1501 }
1502 1550
1503 handler_chain(uprobe, regs); 1551 handler_chain(uprobe, regs);
1504 if (can_skip_sstep(uprobe, regs)) 1552 if (can_skip_sstep(uprobe, regs))
1505 goto out; 1553 goto out;
1506 1554
1507 if (!pre_ssout(uprobe, regs, bp_vaddr)) { 1555 if (!pre_ssout(uprobe, regs, bp_vaddr))
1508 utask->active_uprobe = uprobe;
1509 utask->state = UTASK_SSTEP;
1510 return; 1556 return;
1511 }
1512 1557
1513restart: 1558 /* can_skip_sstep() succeeded, or restart if can't singlestep */
1514 /*
1515 * cannot singlestep; cannot skip instruction;
1516 * re-execute the instruction.
1517 */
1518 instruction_pointer_set(regs, bp_vaddr);
1519out: 1559out:
1520 put_uprobe(uprobe); 1560 put_uprobe(uprobe);
1521} 1561}
@@ -1609,10 +1649,8 @@ static int __init init_uprobes(void)
1609{ 1649{
1610 int i; 1650 int i;
1611 1651
1612 for (i = 0; i < UPROBES_HASH_SZ; i++) { 1652 for (i = 0; i < UPROBES_HASH_SZ; i++)
1613 mutex_init(&uprobes_mutex[i]);
1614 mutex_init(&uprobes_mmap_mutex[i]); 1653 mutex_init(&uprobes_mmap_mutex[i]);
1615 }
1616 1654
1617 if (percpu_init_rwsem(&dup_mmap_sem)) 1655 if (percpu_init_rwsem(&dup_mmap_sem))
1618 return -ENOMEM; 1656 return -ENOMEM;
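
The uprobes rework above replaces the global register/unregister mutex hash with a per-uprobe register_rwsem and pushes install/remove decisions into per-consumer callbacks: register_for_each_vma() asks consumer_filter()/filter_chain() whether a given mm wants the breakpoint, and handler_chain() unapplies the probe from the current mm when every handler returns UPROBE_HANDLER_REMOVE. A minimal consumer written against that contract might look like the sketch below; the callback signatures follow this diff, while the module wiring, the target_mm policy, and how the inode/offset pair is obtained are assumptions.

/* Hedged sketch of a uprobe consumer using the interface this series
 * settles on; not taken from the patch itself. */
#include <linux/module.h>
#include <linux/uprobes.h>
#include <linux/sched.h>

static struct mm_struct *target_mm;	/* hypothetical: set by whoever arms the probe */

static bool my_filter(struct uprobe_consumer *self,
		      enum uprobe_filter_ctx ctx, struct mm_struct *mm)
{
	/* Called for UPROBE_FILTER_{REGISTER,UNREGISTER,MMAP}: only the
	 * mm we care about gets (or keeps) the breakpoint. */
	return mm == target_mm;
}

static int my_handler(struct uprobe_consumer *self, struct pt_regs *regs)
{
	pr_info("uprobe hit in %s[%d]\n", current->comm, current->pid);
	return 0;	/* return UPROBE_HANDLER_REMOVE to unapply from this mm */
}

static struct uprobe_consumer my_consumer = {
	.handler = my_handler,
	.filter  = my_filter,
};

/*
 * Registration (error handling and inode lookup, e.g. via kern_path(),
 * omitted):
 *
 *	err = uprobe_register(inode, offset, &my_consumer);
 *	...
 *	uprobe_unregister(inode, offset, &my_consumer);
 */
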
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 098f396aa409..f423c3ef4a82 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -919,7 +919,7 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
 }
 #endif /* CONFIG_OPTPROBES */
 
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
 static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
 	.func = kprobe_ftrace_handler,
 	.flags = FTRACE_OPS_FL_SAVE_REGS,
@@ -964,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
 			   (unsigned long)p->addr, 1, 0);
 	WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret);
 }
-#else	/* !KPROBES_CAN_USE_FTRACE */
+#else	/* !CONFIG_KPROBES_ON_FTRACE */
 #define prepare_kprobe(p)	arch_prepare_kprobe(p)
 #define arm_kprobe_ftrace(p)	do {} while (0)
 #define disarm_kprobe_ftrace(p)	do {} while (0)
@@ -1414,12 +1414,12 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
 	 */
 	ftrace_addr = ftrace_location((unsigned long)p->addr);
 	if (ftrace_addr) {
-#ifdef KPROBES_CAN_USE_FTRACE
+#ifdef CONFIG_KPROBES_ON_FTRACE
 		/* Given address is not on the instruction boundary */
 		if ((unsigned long)p->addr != ftrace_addr)
 			return -EILSEQ;
 		p->flags |= KPROBE_FLAG_FTRACE;
-#else	/* !KPROBES_CAN_USE_FTRACE */
+#else	/* !CONFIG_KPROBES_ON_FTRACE */
 		return -EINVAL;
 #endif
 	}
diff --git a/kernel/profile.c b/kernel/profile.c
index 1f391819c42f..dc3384ee874e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -37,9 +37,6 @@ struct profile_hit {
 #define NR_PROFILE_HIT		(PAGE_SIZE/sizeof(struct profile_hit))
 #define NR_PROFILE_GRP		(NR_PROFILE_HIT/PROFILE_GRPSZ)
 
-/* Oprofile timer tick hook */
-static int (*timer_hook)(struct pt_regs *) __read_mostly;
-
 static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
 
@@ -208,25 +205,6 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 }
 EXPORT_SYMBOL_GPL(profile_event_unregister);
 
-int register_timer_hook(int (*hook)(struct pt_regs *))
-{
-	if (timer_hook)
-		return -EBUSY;
-	timer_hook = hook;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(register_timer_hook);
-
-void unregister_timer_hook(int (*hook)(struct pt_regs *))
-{
-	WARN_ON(hook != timer_hook);
-	timer_hook = NULL;
-	/* make sure all CPUs see the NULL hook */
-	synchronize_sched();  /* Allow ongoing interrupts to complete. */
-}
-EXPORT_SYMBOL_GPL(unregister_timer_hook);
-
-
 #ifdef CONFIG_SMP
 /*
  * Each cpu has a pair of open-addressed hashtables for pending
@@ -436,8 +414,6 @@ void profile_tick(int type)
 {
 	struct pt_regs *regs = get_irq_regs();
 
-	if (type == CPU_PROFILING && timer_hook)
-		timer_hook(regs);
 	if (!user_mode(regs) && prof_cpu_mask != NULL &&
 	    cpumask_test_cpu(smp_processor_id(), prof_cpu_mask))
 		profile_hit(type, (void *)profile_pc(regs));
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6cbeaae4406d..acbd28424d81 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -712,6 +712,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
 					     kiov->iov_len, kiov->iov_base);
 }
 
+/*
+ * This is declared in linux/regset.h and defined in machine-dependent
+ * code. We put the export here, near the primary machine-neutral use,
+ * to ensure no machine forgets it.
+ */
+EXPORT_SYMBOL_GPL(task_user_regset_view);
 #endif
 
 int ptrace_request(struct task_struct *child, long request,
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d89335a485f..36567564e221 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -39,6 +39,9 @@ config HAVE_DYNAMIC_FTRACE
 	help
 	  See Documentation/trace/ftrace-design.txt
 
+config HAVE_DYNAMIC_FTRACE_WITH_REGS
+	bool
+
 config HAVE_FTRACE_MCOUNT_RECORD
 	bool
 	help
@@ -250,6 +253,16 @@ config FTRACE_SYSCALLS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
+config TRACER_SNAPSHOT
+	bool "Create a snapshot trace buffer"
+	select TRACER_MAX_TRACE
+	help
+	  Allow tracing users to take snapshot of the current buffer using the
+	  ftrace interface, e.g.:
+
+	      echo 1 > /sys/kernel/debug/tracing/snapshot
+	      cat snapshot
+
 config TRACE_BRANCH_PROFILING
 	bool
 	select GENERIC_TRACER
@@ -434,6 +447,11 @@ config DYNAMIC_FTRACE
 	  were made. If so, it runs stop_machine (stops all CPUS)
 	  and modifies the code to jump over the call to ftrace.
 
+config DYNAMIC_FTRACE_WITH_REGS
+	def_bool y
+	depends on DYNAMIC_FTRACE
+	depends on HAVE_DYNAMIC_FTRACE_WITH_REGS
+
 config FUNCTION_PROFILER
 	bool "Kernel function profiler"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c0bd0308741c..71259e2b6b61 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 		return;
 
 	local_irq_save(flags);
-	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+	buf = this_cpu_ptr(bt->msg_data);
 	va_start(args, fmt);
 	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
 	va_end(args);
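
The blktrace hunk swaps per_cpu_ptr(bt->msg_data, smp_processor_id()) for this_cpu_ptr(bt->msg_data). Both resolve to the calling CPU's slot; the shorter form skips the explicit processor-id lookup and is safe here only because local_irq_save() has already pinned the task to one CPU. A hedged sketch of the pattern with a made-up structure (my_trace/msg_data are placeholders, not blktrace itself):

#include <linux/percpu.h>
#include <linux/string.h>
#include <linux/irqflags.h>

struct my_trace {
	char __percpu *msg_data;	/* hypothetical: from __alloc_percpu(128, 8) */
};

static void note_message(struct my_trace *t, const char *msg)
{
	unsigned long flags;
	char *buf;

	local_irq_save(flags);			/* no migration, no IRQ reentry */
	buf = this_cpu_ptr(t->msg_data);	/* same slot as per_cpu_ptr(..., smp_processor_id()) */
	strlcpy(buf, msg, 128);
	local_irq_restore(flags);
}
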
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 41473b4ad7a4..ce8c3d68292f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -111,6 +111,26 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip);
111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops) 111#define ftrace_ops_list_func ((ftrace_func_t)ftrace_ops_no_ops)
112#endif 112#endif
113 113
114/*
115 * Traverse the ftrace_global_list, invoking all entries. The reason that we
116 * can use rcu_dereference_raw() is that elements removed from this list
117 * are simply leaked, so there is no need to interact with a grace-period
118 * mechanism. The rcu_dereference_raw() calls are needed to handle
119 * concurrent insertions into the ftrace_global_list.
120 *
121 * Silly Alpha and silly pointer-speculation compiler optimizations!
122 */
123#define do_for_each_ftrace_op(op, list) \
124 op = rcu_dereference_raw(list); \
125 do
126
127/*
128 * Optimized for just a single item in the list (as that is the normal case).
129 */
130#define while_for_each_ftrace_op(op) \
131 while (likely(op = rcu_dereference_raw((op)->next)) && \
132 unlikely((op) != &ftrace_list_end))
133
114/** 134/**
115 * ftrace_nr_registered_ops - return number of ops registered 135 * ftrace_nr_registered_ops - return number of ops registered
116 * 136 *
@@ -132,29 +152,21 @@ int ftrace_nr_registered_ops(void)
132 return cnt; 152 return cnt;
133} 153}
134 154
135/*
136 * Traverse the ftrace_global_list, invoking all entries. The reason that we
137 * can use rcu_dereference_raw() is that elements removed from this list
138 * are simply leaked, so there is no need to interact with a grace-period
139 * mechanism. The rcu_dereference_raw() calls are needed to handle
140 * concurrent insertions into the ftrace_global_list.
141 *
142 * Silly Alpha and silly pointer-speculation compiler optimizations!
143 */
144static void 155static void
145ftrace_global_list_func(unsigned long ip, unsigned long parent_ip, 156ftrace_global_list_func(unsigned long ip, unsigned long parent_ip,
146 struct ftrace_ops *op, struct pt_regs *regs) 157 struct ftrace_ops *op, struct pt_regs *regs)
147{ 158{
148 if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT))) 159 int bit;
160
161 bit = trace_test_and_set_recursion(TRACE_GLOBAL_START, TRACE_GLOBAL_MAX);
162 if (bit < 0)
149 return; 163 return;
150 164
151 trace_recursion_set(TRACE_GLOBAL_BIT); 165 do_for_each_ftrace_op(op, ftrace_global_list) {
152 op = rcu_dereference_raw(ftrace_global_list); /*see above*/
153 while (op != &ftrace_list_end) {
154 op->func(ip, parent_ip, op, regs); 166 op->func(ip, parent_ip, op, regs);
155 op = rcu_dereference_raw(op->next); /*see above*/ 167 } while_for_each_ftrace_op(op);
156 }; 168
157 trace_recursion_clear(TRACE_GLOBAL_BIT); 169 trace_clear_recursion(bit);
158} 170}
159 171
160static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, 172static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip,
@@ -221,10 +233,24 @@ static void update_global_ops(void)
221 * registered callers. 233 * registered callers.
222 */ 234 */
223 if (ftrace_global_list == &ftrace_list_end || 235 if (ftrace_global_list == &ftrace_list_end ||
224 ftrace_global_list->next == &ftrace_list_end) 236 ftrace_global_list->next == &ftrace_list_end) {
225 func = ftrace_global_list->func; 237 func = ftrace_global_list->func;
226 else 238 /*
239 * As we are calling the function directly.
240 * If it does not have recursion protection,
241 * the function_trace_op needs to be updated
242 * accordingly.
243 */
244 if (ftrace_global_list->flags & FTRACE_OPS_FL_RECURSION_SAFE)
245 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
246 else
247 global_ops.flags &= ~FTRACE_OPS_FL_RECURSION_SAFE;
248 } else {
227 func = ftrace_global_list_func; 249 func = ftrace_global_list_func;
250 /* The list has its own recursion protection. */
251 global_ops.flags |= FTRACE_OPS_FL_RECURSION_SAFE;
252 }
253
228 254
229 /* If we filter on pids, update to use the pid function */ 255 /* If we filter on pids, update to use the pid function */
230 if (!list_empty(&ftrace_pids)) { 256 if (!list_empty(&ftrace_pids)) {
@@ -337,7 +363,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
337 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK) 363 if ((ops->flags & FL_GLOBAL_CONTROL_MASK) == FL_GLOBAL_CONTROL_MASK)
338 return -EINVAL; 364 return -EINVAL;
339 365
340#ifndef ARCH_SUPPORTS_FTRACE_SAVE_REGS 366#ifndef CONFIG_DYNAMIC_FTRACE_WITH_REGS
341 /* 367 /*
342 * If the ftrace_ops specifies SAVE_REGS, then it only can be used 368 * If the ftrace_ops specifies SAVE_REGS, then it only can be used
343 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set. 369 * if the arch supports it, or SAVE_REGS_IF_SUPPORTED is also set.
@@ -4090,14 +4116,11 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
4090 */ 4116 */
4091 preempt_disable_notrace(); 4117 preempt_disable_notrace();
4092 trace_recursion_set(TRACE_CONTROL_BIT); 4118 trace_recursion_set(TRACE_CONTROL_BIT);
4093 op = rcu_dereference_raw(ftrace_control_list); 4119 do_for_each_ftrace_op(op, ftrace_control_list) {
4094 while (op != &ftrace_list_end) {
4095 if (!ftrace_function_local_disabled(op) && 4120 if (!ftrace_function_local_disabled(op) &&
4096 ftrace_ops_test(op, ip)) 4121 ftrace_ops_test(op, ip))
4097 op->func(ip, parent_ip, op, regs); 4122 op->func(ip, parent_ip, op, regs);
4098 4123 } while_for_each_ftrace_op(op);
4099 op = rcu_dereference_raw(op->next);
4100 };
4101 trace_recursion_clear(TRACE_CONTROL_BIT); 4124 trace_recursion_clear(TRACE_CONTROL_BIT);
4102 preempt_enable_notrace(); 4125 preempt_enable_notrace();
4103} 4126}
@@ -4112,27 +4135,26 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4112 struct ftrace_ops *ignored, struct pt_regs *regs) 4135 struct ftrace_ops *ignored, struct pt_regs *regs)
4113{ 4136{
4114 struct ftrace_ops *op; 4137 struct ftrace_ops *op;
4138 int bit;
4115 4139
4116 if (function_trace_stop) 4140 if (function_trace_stop)
4117 return; 4141 return;
4118 4142
4119 if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT))) 4143 bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
4144 if (bit < 0)
4120 return; 4145 return;
4121 4146
4122 trace_recursion_set(TRACE_INTERNAL_BIT);
4123 /* 4147 /*
4124 * Some of the ops may be dynamically allocated, 4148 * Some of the ops may be dynamically allocated,
4125 * they must be freed after a synchronize_sched(). 4149 * they must be freed after a synchronize_sched().
4126 */ 4150 */
4127 preempt_disable_notrace(); 4151 preempt_disable_notrace();
4128 op = rcu_dereference_raw(ftrace_ops_list); 4152 do_for_each_ftrace_op(op, ftrace_ops_list) {
4129 while (op != &ftrace_list_end) {
4130 if (ftrace_ops_test(op, ip)) 4153 if (ftrace_ops_test(op, ip))
4131 op->func(ip, parent_ip, op, regs); 4154 op->func(ip, parent_ip, op, regs);
4132 op = rcu_dereference_raw(op->next); 4155 } while_for_each_ftrace_op(op);
4133 };
4134 preempt_enable_notrace(); 4156 preempt_enable_notrace();
4135 trace_recursion_clear(TRACE_INTERNAL_BIT); 4157 trace_clear_recursion(bit);
4136} 4158}
4137 4159
4138/* 4160/*
@@ -4143,8 +4165,8 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
4143 * Archs are to support both the regs and ftrace_ops at the same time. 4165 * Archs are to support both the regs and ftrace_ops at the same time.
4144 * If they support ftrace_ops, it is assumed they support regs. 4166 * If they support ftrace_ops, it is assumed they support regs.
4145 * If call backs want to use regs, they must either check for regs 4167 * If call backs want to use regs, they must either check for regs
4146 * being NULL, or ARCH_SUPPORTS_FTRACE_SAVE_REGS. 4168 * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS.
4147 * Note, ARCH_SUPPORT_SAVE_REGS expects a full regs to be saved. 4169 * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved.
4148 * An architecture can pass partial regs with ftrace_ops and still 4170 * An architecture can pass partial regs with ftrace_ops and still
4149 * set the ARCH_SUPPORT_FTARCE_OPS. 4171 * set the ARCH_SUPPORT_FTARCE_OPS.
4150 */ 4172 */
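
The ftrace.c changes hoist the open-coded RCU list walks into the do_for_each_ftrace_op()/while_for_each_ftrace_op() pair and replace the single TRACE_INTERNAL_BIT/TRACE_GLOBAL_BIT flags with per-level recursion slots (trace_test_and_set_recursion()/trace_clear_recursion()). The user-space sketch below only shows how such a do/while macro pair expands into a sentinel-terminated list walk; the names are hypothetical and the rcu_dereference_raw() wrapping on each pointer load is dropped.

#include <stdio.h>

struct op {
	struct op *next;
	void (*func)(void);
};

static void stub(void)  { }
static void hello(void) { puts("hello from op"); }

/* Sentinel terminating the list, like ftrace_list_end in the kernel;
 * its stub func keeps the "body runs at least once" semantics safe. */
static struct op list_end = { .next = &list_end, .func = stub };
static struct op one      = { .next = &list_end, .func = hello };
static struct op *head    = &one;

/* Same shape as do_for_each_ftrace_op()/while_for_each_ftrace_op():
 * run the body for the head first, then follow ->next until the
 * sentinel is reached. */
#define do_for_each_op(op, list)	\
	op = (list);			\
	do

#define while_for_each_op(op)		\
	while ((op = (op)->next) != &list_end)

int main(void)
{
	struct op *op;

	do_for_each_op(op, head) {
		op->func();
	} while_for_each_op(op);

	return 0;
}
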
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index ce8514feedcd..7244acde77b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3,8 +3,10 @@
3 * 3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com> 4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */ 5 */
6#include <linux/ftrace_event.h>
6#include <linux/ring_buffer.h> 7#include <linux/ring_buffer.h>
7#include <linux/trace_clock.h> 8#include <linux/trace_clock.h>
9#include <linux/trace_seq.h>
8#include <linux/spinlock.h> 10#include <linux/spinlock.h>
9#include <linux/debugfs.h> 11#include <linux/debugfs.h>
10#include <linux/uaccess.h> 12#include <linux/uaccess.h>
@@ -21,7 +23,6 @@
21#include <linux/fs.h> 23#include <linux/fs.h>
22 24
23#include <asm/local.h> 25#include <asm/local.h>
24#include "trace.h"
25 26
26static void update_pages_handler(struct work_struct *work); 27static void update_pages_handler(struct work_struct *work);
27 28
@@ -2432,41 +2433,76 @@ rb_reserve_next_event(struct ring_buffer *buffer,
2432 2433
2433#ifdef CONFIG_TRACING 2434#ifdef CONFIG_TRACING
2434 2435
2435#define TRACE_RECURSIVE_DEPTH 16 2436/*
2437 * The lock and unlock are done within a preempt disable section.
2438 * The current_context per_cpu variable can only be modified
2439 * by the current task between lock and unlock. But it can
2440 * be modified more than once via an interrupt. To pass this
2441 * information from the lock to the unlock without having to
2442 * access the 'in_interrupt()' functions again (which do show
2443 * a bit of overhead in something as critical as function tracing,
2444 * we use a bitmask trick.
2445 *
2446 * bit 0 = NMI context
2447 * bit 1 = IRQ context
2448 * bit 2 = SoftIRQ context
2449 * bit 3 = normal context.
2450 *
2451 * This works because this is the order of contexts that can
2452 * preempt other contexts. A SoftIRQ never preempts an IRQ
2453 * context.
2454 *
2455 * When the context is determined, the corresponding bit is
2456 * checked and set (if it was set, then a recursion of that context
2457 * happened).
2458 *
2459 * On unlock, we need to clear this bit. To do so, just subtract
2460 * 1 from the current_context and AND it to itself.
2461 *
2462 * (binary)
2463 * 101 - 1 = 100
2464 * 101 & 100 = 100 (clearing bit zero)
2465 *
2466 * 1010 - 1 = 1001
2467 * 1010 & 1001 = 1000 (clearing bit 1)
2468 *
2469 * The least significant bit can be cleared this way, and it
2470 * just so happens that it is the same bit corresponding to
2471 * the current context.
2472 */
2473static DEFINE_PER_CPU(unsigned int, current_context);
2436 2474
2437/* Keep this code out of the fast path cache */ 2475static __always_inline int trace_recursive_lock(void)
2438static noinline void trace_recursive_fail(void)
2439{ 2476{
2440 /* Disable all tracing before we do anything else */ 2477 unsigned int val = this_cpu_read(current_context);
2441 tracing_off_permanent(); 2478 int bit;
2442
2443 printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
2444 "HC[%lu]:SC[%lu]:NMI[%lu]\n",
2445 trace_recursion_buffer(),
2446 hardirq_count() >> HARDIRQ_SHIFT,
2447 softirq_count() >> SOFTIRQ_SHIFT,
2448 in_nmi());
2449
2450 WARN_ON_ONCE(1);
2451}
2452 2479
2453static inline int trace_recursive_lock(void) 2480 if (in_interrupt()) {
2454{ 2481 if (in_nmi())
2455 trace_recursion_inc(); 2482 bit = 0;
2483 else if (in_irq())
2484 bit = 1;
2485 else
2486 bit = 2;
2487 } else
2488 bit = 3;
2456 2489
2457 if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH)) 2490 if (unlikely(val & (1 << bit)))
2458 return 0; 2491 return 1;
2459 2492
2460 trace_recursive_fail(); 2493 val |= (1 << bit);
2494 this_cpu_write(current_context, val);
2461 2495
2462 return -1; 2496 return 0;
2463} 2497}
2464 2498
2465static inline void trace_recursive_unlock(void) 2499static __always_inline void trace_recursive_unlock(void)
2466{ 2500{
2467 WARN_ON_ONCE(!trace_recursion_buffer()); 2501 unsigned int val = this_cpu_read(current_context);
2468 2502
2469 trace_recursion_dec(); 2503 val--;
2504 val &= this_cpu_read(current_context);
2505 this_cpu_write(current_context, val);
2470} 2506}
2471 2507
2472#else 2508#else
@@ -3067,6 +3103,24 @@ ring_buffer_dropped_events_cpu(struct ring_buffer *buffer, int cpu)
3067EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu); 3103EXPORT_SYMBOL_GPL(ring_buffer_dropped_events_cpu);
3068 3104
3069/** 3105/**
3106 * ring_buffer_read_events_cpu - get the number of events successfully read
3107 * @buffer: The ring buffer
3108 * @cpu: The per CPU buffer to get the number of events read
3109 */
3110unsigned long
3111ring_buffer_read_events_cpu(struct ring_buffer *buffer, int cpu)
3112{
3113 struct ring_buffer_per_cpu *cpu_buffer;
3114
3115 if (!cpumask_test_cpu(cpu, buffer->cpumask))
3116 return 0;
3117
3118 cpu_buffer = buffer->buffers[cpu];
3119 return cpu_buffer->read;
3120}
3121EXPORT_SYMBOL_GPL(ring_buffer_read_events_cpu);
3122
3123/**
3070 * ring_buffer_entries - get the number of entries in a buffer 3124 * ring_buffer_entries - get the number of entries in a buffer
3071 * @buffer: The ring buffer 3125 * @buffer: The ring buffer
3072 * 3126 *
@@ -3425,7 +3479,7 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
3425 /* check for end of page padding */ 3479 /* check for end of page padding */
3426 if ((iter->head >= rb_page_size(iter->head_page)) && 3480 if ((iter->head >= rb_page_size(iter->head_page)) &&
3427 (iter->head_page != cpu_buffer->commit_page)) 3481 (iter->head_page != cpu_buffer->commit_page))
3428 rb_advance_iter(iter); 3482 rb_inc_iter(iter);
3429} 3483}
3430 3484
3431static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer) 3485static int rb_lost_events(struct ring_buffer_per_cpu *cpu_buffer)
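
The ring_buffer.c rewrite drops the depth-counter recursion check in favour of a per-CPU current_context bitmask: one bit per context (NMI, IRQ, SoftIRQ, normal), set on lock, and cleared on unlock with val = (val - 1) & val, which removes the lowest set bit and therefore exactly the context that locked last. A stand-alone sketch of that arithmetic, using an ordinary variable instead of the per-CPU one:

#include <stdio.h>

/* Bit 0 = NMI, 1 = IRQ, 2 = SoftIRQ, 3 = normal context,
 * matching the comment block in the diff above. */
static unsigned int current_context;

static int recursive_lock(int bit)
{
	unsigned int val = current_context;

	if (val & (1U << bit))
		return 1;			/* same context seen twice: recursion */

	current_context = val | (1U << bit);
	return 0;
}

static void recursive_unlock(void)
{
	unsigned int val = current_context;

	/* (val - 1) & val clears the lowest set bit, i.e. the bit of the
	 * innermost (most recently entered) context: 0b1010 -> 0b1000. */
	current_context = (val - 1) & val;
}

int main(void)
{
	printf("%d\n", recursive_lock(3));	/* 0: normal context locks */
	printf("%d\n", recursive_lock(1));	/* 0: IRQ context nests on top */
	printf("%d\n", recursive_lock(1));	/* 1: IRQ re-enters, recursion caught */
	recursive_unlock();			/* clears bit 1 (IRQ) */
	recursive_unlock();			/* clears bit 3 (normal) */
	printf("%#x\n", current_context);	/* 0 */
	return 0;
}
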
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c13e46d7d24..5d520b7bb4c5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -249,7 +249,7 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
249static struct tracer *trace_types __read_mostly; 249static struct tracer *trace_types __read_mostly;
250 250
251/* current_trace points to the tracer that is currently active */ 251/* current_trace points to the tracer that is currently active */
252static struct tracer *current_trace __read_mostly; 252static struct tracer *current_trace __read_mostly = &nop_trace;
253 253
254/* 254/*
255 * trace_types_lock is used to protect the trace_types list. 255 * trace_types_lock is used to protect the trace_types list.
@@ -709,10 +709,13 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
709 return; 709 return;
710 710
711 WARN_ON_ONCE(!irqs_disabled()); 711 WARN_ON_ONCE(!irqs_disabled());
712 if (!current_trace->use_max_tr) { 712
713 WARN_ON_ONCE(1); 713 if (!current_trace->allocated_snapshot) {
714 /* Only the nop tracer should hit this when disabling */
715 WARN_ON_ONCE(current_trace != &nop_trace);
714 return; 716 return;
715 } 717 }
718
716 arch_spin_lock(&ftrace_max_lock); 719 arch_spin_lock(&ftrace_max_lock);
717 720
718 tr->buffer = max_tr.buffer; 721 tr->buffer = max_tr.buffer;
@@ -739,10 +742,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
739 return; 742 return;
740 743
741 WARN_ON_ONCE(!irqs_disabled()); 744 WARN_ON_ONCE(!irqs_disabled());
742 if (!current_trace->use_max_tr) { 745 if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
743 WARN_ON_ONCE(1);
744 return; 746 return;
745 }
746 747
747 arch_spin_lock(&ftrace_max_lock); 748 arch_spin_lock(&ftrace_max_lock);
748 749
@@ -862,10 +863,13 @@ int register_tracer(struct tracer *type)
862 863
863 current_trace = type; 864 current_trace = type;
864 865
865 /* If we expanded the buffers, make sure the max is expanded too */ 866 if (type->use_max_tr) {
866 if (ring_buffer_expanded && type->use_max_tr) 867 /* If we expanded the buffers, make sure the max is expanded too */
867 ring_buffer_resize(max_tr.buffer, trace_buf_size, 868 if (ring_buffer_expanded)
868 RING_BUFFER_ALL_CPUS); 869 ring_buffer_resize(max_tr.buffer, trace_buf_size,
870 RING_BUFFER_ALL_CPUS);
871 type->allocated_snapshot = true;
872 }
869 873
870 /* the test is responsible for initializing and enabling */ 874 /* the test is responsible for initializing and enabling */
871 pr_info("Testing tracer %s: ", type->name); 875 pr_info("Testing tracer %s: ", type->name);
@@ -881,10 +885,14 @@ int register_tracer(struct tracer *type)
881 /* Only reset on passing, to avoid touching corrupted buffers */ 885 /* Only reset on passing, to avoid touching corrupted buffers */
882 tracing_reset_online_cpus(tr); 886 tracing_reset_online_cpus(tr);
883 887
884 /* Shrink the max buffer again */ 888 if (type->use_max_tr) {
885 if (ring_buffer_expanded && type->use_max_tr) 889 type->allocated_snapshot = false;
886 ring_buffer_resize(max_tr.buffer, 1, 890
887 RING_BUFFER_ALL_CPUS); 891 /* Shrink the max buffer again */
892 if (ring_buffer_expanded)
893 ring_buffer_resize(max_tr.buffer, 1,
894 RING_BUFFER_ALL_CPUS);
895 }
888 896
889 printk(KERN_CONT "PASSED\n"); 897 printk(KERN_CONT "PASSED\n");
890 } 898 }
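For context, a tracer opts into the max/snapshot buffer by setting use_max_tr; register_tracer() above then resizes max_tr around the selftest and toggles allocated_snapshot. A hedged sketch of such a tracer declaration, with my_tracer, my_init and my_reset purely illustrative:

    static int my_init(struct trace_array *tr)
    {
    	return 0;	/* set up per-tracer state here */
    }

    static void my_reset(struct trace_array *tr)
    {
    	/* undo what my_init() did */
    }

    static struct tracer my_tracer __read_mostly = {
    	.name		= "my_tracer",
    	.init		= my_init,
    	.reset		= my_reset,
    	.use_max_tr	= true,	/* request the snapshot/max buffer */
    };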
@@ -922,6 +930,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
922{ 930{
923 struct ring_buffer *buffer = tr->buffer; 931 struct ring_buffer *buffer = tr->buffer;
924 932
933 if (!buffer)
934 return;
935
925 ring_buffer_record_disable(buffer); 936 ring_buffer_record_disable(buffer);
926 937
927 /* Make sure all commits have finished */ 938 /* Make sure all commits have finished */
@@ -936,6 +947,9 @@ void tracing_reset_online_cpus(struct trace_array *tr)
936 struct ring_buffer *buffer = tr->buffer; 947 struct ring_buffer *buffer = tr->buffer;
937 int cpu; 948 int cpu;
938 949
950 if (!buffer)
951 return;
952
939 ring_buffer_record_disable(buffer); 953 ring_buffer_record_disable(buffer);
940 954
941 /* Make sure all commits have finished */ 955 /* Make sure all commits have finished */
@@ -1167,7 +1181,6 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
1167 1181
1168 entry->preempt_count = pc & 0xff; 1182 entry->preempt_count = pc & 0xff;
1169 entry->pid = (tsk) ? tsk->pid : 0; 1183 entry->pid = (tsk) ? tsk->pid : 0;
1170 entry->padding = 0;
1171 entry->flags = 1184 entry->flags =
1172#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT 1185#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
1173 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 1186 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1335,7 +1348,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1335 */ 1348 */
1336 preempt_disable_notrace(); 1349 preempt_disable_notrace();
1337 1350
1338 use_stack = ++__get_cpu_var(ftrace_stack_reserve); 1351 use_stack = __this_cpu_inc_return(ftrace_stack_reserve);
1339 /* 1352 /*
1340 * We don't need any atomic variables, just a barrier. 1353 * We don't need any atomic variables, just a barrier.
1341 * If an interrupt comes in, we don't care, because it would 1354 * If an interrupt comes in, we don't care, because it would
@@ -1389,7 +1402,7 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
1389 out: 1402 out:
1390 /* Again, don't let gcc optimize things here */ 1403 /* Again, don't let gcc optimize things here */
1391 barrier(); 1404 barrier();
1392 __get_cpu_var(ftrace_stack_reserve)--; 1405 __this_cpu_dec(ftrace_stack_reserve);
1393 preempt_enable_notrace(); 1406 preempt_enable_notrace();
1394 1407
1395} 1408}
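The hunk above replaces the __get_cpu_var() arithmetic with the this_cpu accessors. A minimal sketch of the same reserve/release pattern on an illustrative per-cpu counter, assuming preemption is disabled around it exactly as in __ftrace_trace_stack():

    #include <linux/percpu.h>
    #include <linux/preempt.h>
    #include <linux/compiler.h>

    static DEFINE_PER_CPU(int, my_reserve);

    static void my_nested_section(void)
    {
    	int depth;

    	preempt_disable_notrace();
    	depth = __this_cpu_inc_return(my_reserve);	/* returns the new value */

    	if (depth == 1) {
    		/* outermost user on this CPU */
    	}

    	barrier();			/* keep the compiler from reordering */
    	__this_cpu_dec(my_reserve);
    	preempt_enable_notrace();
    }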
@@ -1517,7 +1530,6 @@ static struct trace_buffer_struct *trace_percpu_nmi_buffer;
1517static char *get_trace_buf(void) 1530static char *get_trace_buf(void)
1518{ 1531{
1519 struct trace_buffer_struct *percpu_buffer; 1532 struct trace_buffer_struct *percpu_buffer;
1520 struct trace_buffer_struct *buffer;
1521 1533
1522 /* 1534 /*
1523 * If we have allocated per cpu buffers, then we do not 1535 * If we have allocated per cpu buffers, then we do not
@@ -1535,9 +1547,7 @@ static char *get_trace_buf(void)
1535 if (!percpu_buffer) 1547 if (!percpu_buffer)
1536 return NULL; 1548 return NULL;
1537 1549
1538 buffer = per_cpu_ptr(percpu_buffer, smp_processor_id()); 1550 return this_cpu_ptr(&percpu_buffer->buffer[0]);
1539
1540 return buffer->buffer;
1541} 1551}
1542 1552
1543static int alloc_percpu_trace_buffer(void) 1553static int alloc_percpu_trace_buffer(void)
@@ -1942,21 +1952,27 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
1942static void *s_start(struct seq_file *m, loff_t *pos) 1952static void *s_start(struct seq_file *m, loff_t *pos)
1943{ 1953{
1944 struct trace_iterator *iter = m->private; 1954 struct trace_iterator *iter = m->private;
1945 static struct tracer *old_tracer;
1946 int cpu_file = iter->cpu_file; 1955 int cpu_file = iter->cpu_file;
1947 void *p = NULL; 1956 void *p = NULL;
1948 loff_t l = 0; 1957 loff_t l = 0;
1949 int cpu; 1958 int cpu;
1950 1959
1951 /* copy the tracer to avoid using a global lock all around */ 1960 /*
1961 * copy the tracer to avoid using a global lock all around.
1962 * iter->trace is a copy of current_trace, the pointer to the
1963 * name may be used instead of a strcmp(), as iter->trace->name
1964 * will point to the same string as current_trace->name.
1965 */
1952 mutex_lock(&trace_types_lock); 1966 mutex_lock(&trace_types_lock);
1953 if (unlikely(old_tracer != current_trace && current_trace)) { 1967 if (unlikely(current_trace && iter->trace->name != current_trace->name))
1954 old_tracer = current_trace;
1955 *iter->trace = *current_trace; 1968 *iter->trace = *current_trace;
1956 }
1957 mutex_unlock(&trace_types_lock); 1969 mutex_unlock(&trace_types_lock);
1958 1970
1959 atomic_inc(&trace_record_cmdline_disabled); 1971 if (iter->snapshot && iter->trace->use_max_tr)
1972 return ERR_PTR(-EBUSY);
1973
1974 if (!iter->snapshot)
1975 atomic_inc(&trace_record_cmdline_disabled);
1960 1976
1961 if (*pos != iter->pos) { 1977 if (*pos != iter->pos) {
1962 iter->ent = NULL; 1978 iter->ent = NULL;
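The pointer comparison introduced above relies on *iter->trace = *current_trace being a shallow copy: the copied ->name member still points at the registered tracer's string, so comparing the pointers is as good as strcmp(). A stand-alone user-space sketch of that property, with an illustrative struct:

    #include <stdio.h>
    #include <string.h>

    struct tracer {
    	const char *name;
    };

    int main(void)
    {
    	struct tracer registered = { .name = "nop" };
    	struct tracer copy = registered;	/* like *iter->trace = *current_trace */

    	/* Same pointer, so strcmp() is redundant. */
    	printf("pointers equal: %d, strcmp: %d\n",
    	       copy.name == registered.name,
    	       strcmp(copy.name, registered.name));
    	return 0;
    }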
@@ -1995,7 +2011,11 @@ static void s_stop(struct seq_file *m, void *p)
1995{ 2011{
1996 struct trace_iterator *iter = m->private; 2012 struct trace_iterator *iter = m->private;
1997 2013
1998 atomic_dec(&trace_record_cmdline_disabled); 2014 if (iter->snapshot && iter->trace->use_max_tr)
2015 return;
2016
2017 if (!iter->snapshot)
2018 atomic_dec(&trace_record_cmdline_disabled);
1999 trace_access_unlock(iter->cpu_file); 2019 trace_access_unlock(iter->cpu_file);
2000 trace_event_read_unlock(); 2020 trace_event_read_unlock();
2001} 2021}
@@ -2080,8 +2100,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
2080 unsigned long total; 2100 unsigned long total;
2081 const char *name = "preemption"; 2101 const char *name = "preemption";
2082 2102
2083 if (type) 2103 name = type->name;
2084 name = type->name;
2085 2104
2086 get_total_entries(tr, &total, &entries); 2105 get_total_entries(tr, &total, &entries);
2087 2106
@@ -2430,7 +2449,7 @@ static const struct seq_operations tracer_seq_ops = {
2430}; 2449};
2431 2450
2432static struct trace_iterator * 2451static struct trace_iterator *
2433__tracing_open(struct inode *inode, struct file *file) 2452__tracing_open(struct inode *inode, struct file *file, bool snapshot)
2434{ 2453{
2435 long cpu_file = (long) inode->i_private; 2454 long cpu_file = (long) inode->i_private;
2436 struct trace_iterator *iter; 2455 struct trace_iterator *iter;
@@ -2457,16 +2476,16 @@ __tracing_open(struct inode *inode, struct file *file)
2457 if (!iter->trace) 2476 if (!iter->trace)
2458 goto fail; 2477 goto fail;
2459 2478
2460 if (current_trace) 2479 *iter->trace = *current_trace;
2461 *iter->trace = *current_trace;
2462 2480
2463 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) 2481 if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
2464 goto fail; 2482 goto fail;
2465 2483
2466 if (current_trace && current_trace->print_max) 2484 if (current_trace->print_max || snapshot)
2467 iter->tr = &max_tr; 2485 iter->tr = &max_tr;
2468 else 2486 else
2469 iter->tr = &global_trace; 2487 iter->tr = &global_trace;
2488 iter->snapshot = snapshot;
2470 iter->pos = -1; 2489 iter->pos = -1;
2471 mutex_init(&iter->mutex); 2490 mutex_init(&iter->mutex);
2472 iter->cpu_file = cpu_file; 2491 iter->cpu_file = cpu_file;
@@ -2483,8 +2502,9 @@ __tracing_open(struct inode *inode, struct file *file)
2483 if (trace_clocks[trace_clock_id].in_ns) 2502 if (trace_clocks[trace_clock_id].in_ns)
2484 iter->iter_flags |= TRACE_FILE_TIME_IN_NS; 2503 iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
2485 2504
2486 /* stop the trace while dumping */ 2505 /* stop the trace while dumping if we are not opening "snapshot" */
2487 tracing_stop(); 2506 if (!iter->snapshot)
2507 tracing_stop();
2488 2508
2489 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { 2509 if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
2490 for_each_tracing_cpu(cpu) { 2510 for_each_tracing_cpu(cpu) {
@@ -2547,8 +2567,9 @@ static int tracing_release(struct inode *inode, struct file *file)
2547 if (iter->trace && iter->trace->close) 2567 if (iter->trace && iter->trace->close)
2548 iter->trace->close(iter); 2568 iter->trace->close(iter);
2549 2569
2550 /* reenable tracing if it was previously enabled */ 2570 if (!iter->snapshot)
2551 tracing_start(); 2571 /* reenable tracing if it was previously enabled */
2572 tracing_start();
2552 mutex_unlock(&trace_types_lock); 2573 mutex_unlock(&trace_types_lock);
2553 2574
2554 mutex_destroy(&iter->mutex); 2575 mutex_destroy(&iter->mutex);
@@ -2576,7 +2597,7 @@ static int tracing_open(struct inode *inode, struct file *file)
2576 } 2597 }
2577 2598
2578 if (file->f_mode & FMODE_READ) { 2599 if (file->f_mode & FMODE_READ) {
2579 iter = __tracing_open(inode, file); 2600 iter = __tracing_open(inode, file, false);
2580 if (IS_ERR(iter)) 2601 if (IS_ERR(iter))
2581 ret = PTR_ERR(iter); 2602 ret = PTR_ERR(iter);
2582 else if (trace_flags & TRACE_ITER_LATENCY_FMT) 2603 else if (trace_flags & TRACE_ITER_LATENCY_FMT)
@@ -3014,10 +3035,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
3014 int r; 3035 int r;
3015 3036
3016 mutex_lock(&trace_types_lock); 3037 mutex_lock(&trace_types_lock);
3017 if (current_trace) 3038 r = sprintf(buf, "%s\n", current_trace->name);
3018 r = sprintf(buf, "%s\n", current_trace->name);
3019 else
3020 r = sprintf(buf, "\n");
3021 mutex_unlock(&trace_types_lock); 3039 mutex_unlock(&trace_types_lock);
3022 3040
3023 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); 3041 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3183,6 +3201,7 @@ static int tracing_set_tracer(const char *buf)
3183 static struct trace_option_dentry *topts; 3201 static struct trace_option_dentry *topts;
3184 struct trace_array *tr = &global_trace; 3202 struct trace_array *tr = &global_trace;
3185 struct tracer *t; 3203 struct tracer *t;
3204 bool had_max_tr;
3186 int ret = 0; 3205 int ret = 0;
3187 3206
3188 mutex_lock(&trace_types_lock); 3207 mutex_lock(&trace_types_lock);
@@ -3207,9 +3226,21 @@ static int tracing_set_tracer(const char *buf)
3207 goto out; 3226 goto out;
3208 3227
3209 trace_branch_disable(); 3228 trace_branch_disable();
3210 if (current_trace && current_trace->reset) 3229 if (current_trace->reset)
3211 current_trace->reset(tr); 3230 current_trace->reset(tr);
3212 if (current_trace && current_trace->use_max_tr) { 3231
3232 had_max_tr = current_trace->allocated_snapshot;
3233 current_trace = &nop_trace;
3234
3235 if (had_max_tr && !t->use_max_tr) {
3236 /*
3237 * We need to make sure that the update_max_tr sees that
3238 * current_trace changed to nop_trace to keep it from
3239 * swapping the buffers after we resize it.
 3240 * The update_max_tr is called with interrupts disabled,
 3241 * so a synchronize_sched() is sufficient.
3242 */
3243 synchronize_sched();
3213 /* 3244 /*
 3214 * We don't free the ring buffer. Instead, resize it because 3245 * We don't free the ring buffer. Instead, resize it because
 3215 * the max_tr ring buffer has some state (e.g. ring->clock) and 3246 * the max_tr ring buffer has some state (e.g. ring->clock) and
@@ -3217,18 +3248,19 @@ static int tracing_set_tracer(const char *buf)
3217 */ 3248 */
3218 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); 3249 ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
3219 set_buffer_entries(&max_tr, 1); 3250 set_buffer_entries(&max_tr, 1);
3251 tracing_reset_online_cpus(&max_tr);
3252 current_trace->allocated_snapshot = false;
3220 } 3253 }
3221 destroy_trace_option_files(topts); 3254 destroy_trace_option_files(topts);
3222 3255
3223 current_trace = &nop_trace;
3224
3225 topts = create_trace_option_files(t); 3256 topts = create_trace_option_files(t);
3226 if (t->use_max_tr) { 3257 if (t->use_max_tr && !had_max_tr) {
3227 /* we need to make per cpu buffer sizes equivalent */ 3258 /* we need to make per cpu buffer sizes equivalent */
3228 ret = resize_buffer_duplicate_size(&max_tr, &global_trace, 3259 ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
3229 RING_BUFFER_ALL_CPUS); 3260 RING_BUFFER_ALL_CPUS);
3230 if (ret < 0) 3261 if (ret < 0)
3231 goto out; 3262 goto out;
3263 t->allocated_snapshot = true;
3232 } 3264 }
3233 3265
3234 if (t->init) { 3266 if (t->init) {
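The nop_trace switch followed by synchronize_sched() above is an RCU-sched style handoff: update_max_tr() always runs with interrupts disabled, which counts as a sched read-side section, so once the grace period ends no CPU can still be using the old tracer's snapshot buffer and it is safe to shrink max_tr. A generic sketch of the pattern (the names are illustrative; the tracing code itself keeps the buffer and merely resizes it):

    #include <linux/rcupdate.h>
    #include <linux/slab.h>

    struct my_state { int data; };

    static struct my_state *cur_state;

    /* Writer: publish the replacement, then wait out irqs-off readers. */
    static void retire_old_state(struct my_state *new_state)
    {
    	struct my_state *old = cur_state;

    	cur_state = new_state;
    	synchronize_sched();	/* every irqs-disabled region has finished */
    	kfree(old);		/* nobody can still hold "old" */
    }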
@@ -3336,8 +3368,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
3336 ret = -ENOMEM; 3368 ret = -ENOMEM;
3337 goto fail; 3369 goto fail;
3338 } 3370 }
3339 if (current_trace) 3371 *iter->trace = *current_trace;
3340 *iter->trace = *current_trace;
3341 3372
3342 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { 3373 if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
3343 ret = -ENOMEM; 3374 ret = -ENOMEM;
@@ -3477,7 +3508,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3477 size_t cnt, loff_t *ppos) 3508 size_t cnt, loff_t *ppos)
3478{ 3509{
3479 struct trace_iterator *iter = filp->private_data; 3510 struct trace_iterator *iter = filp->private_data;
3480 static struct tracer *old_tracer;
3481 ssize_t sret; 3511 ssize_t sret;
3482 3512
3483 /* return any leftover data */ 3513 /* return any leftover data */
@@ -3489,10 +3519,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
3489 3519
3490 /* copy the tracer to avoid using a global lock all around */ 3520 /* copy the tracer to avoid using a global lock all around */
3491 mutex_lock(&trace_types_lock); 3521 mutex_lock(&trace_types_lock);
3492 if (unlikely(old_tracer != current_trace && current_trace)) { 3522 if (unlikely(iter->trace->name != current_trace->name))
3493 old_tracer = current_trace;
3494 *iter->trace = *current_trace; 3523 *iter->trace = *current_trace;
3495 }
3496 mutex_unlock(&trace_types_lock); 3524 mutex_unlock(&trace_types_lock);
3497 3525
3498 /* 3526 /*
@@ -3648,7 +3676,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3648 .ops = &tracing_pipe_buf_ops, 3676 .ops = &tracing_pipe_buf_ops,
3649 .spd_release = tracing_spd_release_pipe, 3677 .spd_release = tracing_spd_release_pipe,
3650 }; 3678 };
3651 static struct tracer *old_tracer;
3652 ssize_t ret; 3679 ssize_t ret;
3653 size_t rem; 3680 size_t rem;
3654 unsigned int i; 3681 unsigned int i;
@@ -3658,10 +3685,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
3658 3685
3659 /* copy the tracer to avoid using a global lock all around */ 3686 /* copy the tracer to avoid using a global lock all around */
3660 mutex_lock(&trace_types_lock); 3687 mutex_lock(&trace_types_lock);
3661 if (unlikely(old_tracer != current_trace && current_trace)) { 3688 if (unlikely(iter->trace->name != current_trace->name))
3662 old_tracer = current_trace;
3663 *iter->trace = *current_trace; 3689 *iter->trace = *current_trace;
3664 }
3665 mutex_unlock(&trace_types_lock); 3690 mutex_unlock(&trace_types_lock);
3666 3691
3667 mutex_lock(&iter->mutex); 3692 mutex_lock(&iter->mutex);
@@ -4037,8 +4062,7 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
4037 * Reset the buffer so that it doesn't have incomparable timestamps. 4062 * Reset the buffer so that it doesn't have incomparable timestamps.
4038 */ 4063 */
4039 tracing_reset_online_cpus(&global_trace); 4064 tracing_reset_online_cpus(&global_trace);
4040 if (max_tr.buffer) 4065 tracing_reset_online_cpus(&max_tr);
4041 tracing_reset_online_cpus(&max_tr);
4042 4066
4043 mutex_unlock(&trace_types_lock); 4067 mutex_unlock(&trace_types_lock);
4044 4068
@@ -4054,6 +4078,87 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
4054 return single_open(file, tracing_clock_show, NULL); 4078 return single_open(file, tracing_clock_show, NULL);
4055} 4079}
4056 4080
4081#ifdef CONFIG_TRACER_SNAPSHOT
4082static int tracing_snapshot_open(struct inode *inode, struct file *file)
4083{
4084 struct trace_iterator *iter;
4085 int ret = 0;
4086
4087 if (file->f_mode & FMODE_READ) {
4088 iter = __tracing_open(inode, file, true);
4089 if (IS_ERR(iter))
4090 ret = PTR_ERR(iter);
4091 }
4092 return ret;
4093}
4094
4095static ssize_t
4096tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
4097 loff_t *ppos)
4098{
4099 unsigned long val;
4100 int ret;
4101
4102 ret = tracing_update_buffers();
4103 if (ret < 0)
4104 return ret;
4105
4106 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
4107 if (ret)
4108 return ret;
4109
4110 mutex_lock(&trace_types_lock);
4111
4112 if (current_trace->use_max_tr) {
4113 ret = -EBUSY;
4114 goto out;
4115 }
4116
4117 switch (val) {
4118 case 0:
4119 if (current_trace->allocated_snapshot) {
4120 /* free spare buffer */
4121 ring_buffer_resize(max_tr.buffer, 1,
4122 RING_BUFFER_ALL_CPUS);
4123 set_buffer_entries(&max_tr, 1);
4124 tracing_reset_online_cpus(&max_tr);
4125 current_trace->allocated_snapshot = false;
4126 }
4127 break;
4128 case 1:
4129 if (!current_trace->allocated_snapshot) {
4130 /* allocate spare buffer */
4131 ret = resize_buffer_duplicate_size(&max_tr,
4132 &global_trace, RING_BUFFER_ALL_CPUS);
4133 if (ret < 0)
4134 break;
4135 current_trace->allocated_snapshot = true;
4136 }
4137
4138 local_irq_disable();
4139 /* Now, we're going to swap */
4140 update_max_tr(&global_trace, current, smp_processor_id());
4141 local_irq_enable();
4142 break;
4143 default:
4144 if (current_trace->allocated_snapshot)
4145 tracing_reset_online_cpus(&max_tr);
4146 else
4147 ret = -EINVAL;
4148 break;
4149 }
4150
4151 if (ret >= 0) {
4152 *ppos += cnt;
4153 ret = cnt;
4154 }
4155out:
4156 mutex_unlock(&trace_types_lock);
4157 return ret;
4158}
4159#endif /* CONFIG_TRACER_SNAPSHOT */
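A small user-space sketch of driving the new snapshot file, assuming debugfs is mounted at the usual /sys/kernel/debug path and no latency tracer is active (tracers with use_max_tr make the file return -EBUSY). Per the write handler above, writing 1 allocates the spare buffer and swaps it with the live one, writing 0 frees the spare, and any other value clears an already-allocated snapshot:

    #include <stdio.h>

    #define SNAPSHOT "/sys/kernel/debug/tracing/snapshot"	/* assumed mount point */

    int main(void)
    {
    	FILE *f = fopen(SNAPSHOT, "w");
    	char line[4096];

    	if (!f) {
    		perror(SNAPSHOT);
    		return 1;
    	}
    	fputs("1\n", f);	/* allocate the spare buffer and take a snapshot */
    	fclose(f);

    	f = fopen(SNAPSHOT, "r");
    	if (!f) {
    		perror(SNAPSHOT);
    		return 1;
    	}
    	while (fgets(line, sizeof(line), f))	/* dump the snapshotted trace */
    		fputs(line, stdout);
    	fclose(f);
    	return 0;
    }

Writing 0 afterwards releases the spare buffer again.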
4160
4161
4057static const struct file_operations tracing_max_lat_fops = { 4162static const struct file_operations tracing_max_lat_fops = {
4058 .open = tracing_open_generic, 4163 .open = tracing_open_generic,
4059 .read = tracing_max_lat_read, 4164 .read = tracing_max_lat_read,
@@ -4110,6 +4215,16 @@ static const struct file_operations trace_clock_fops = {
4110 .write = tracing_clock_write, 4215 .write = tracing_clock_write,
4111}; 4216};
4112 4217
4218#ifdef CONFIG_TRACER_SNAPSHOT
4219static const struct file_operations snapshot_fops = {
4220 .open = tracing_snapshot_open,
4221 .read = seq_read,
4222 .write = tracing_snapshot_write,
4223 .llseek = tracing_seek,
4224 .release = tracing_release,
4225};
4226#endif /* CONFIG_TRACER_SNAPSHOT */
4227
4113struct ftrace_buffer_info { 4228struct ftrace_buffer_info {
4114 struct trace_array *tr; 4229 struct trace_array *tr;
4115 void *spare; 4230 void *spare;
@@ -4414,6 +4529,9 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4414 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); 4529 cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
4415 trace_seq_printf(s, "dropped events: %ld\n", cnt); 4530 trace_seq_printf(s, "dropped events: %ld\n", cnt);
4416 4531
4532 cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
4533 trace_seq_printf(s, "read events: %ld\n", cnt);
4534
4417 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4535 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4418 4536
4419 kfree(s); 4537 kfree(s);
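With the change above, each per-CPU stats file gains a "read events:" line next to the existing entry and overrun counters. A user-space sketch that picks it out for CPU 0, assuming the usual debugfs layout:

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
    	FILE *f = fopen("/sys/kernel/debug/tracing/per_cpu/cpu0/stats", "r");
    	char line[256];

    	if (!f) {
    		perror("stats");
    		return 1;
    	}
    	while (fgets(line, sizeof(line), f)) {
    		if (!strncmp(line, "read events:", 12))
    			fputs(line, stdout);	/* e.g. "read events: 1234" */
    	}
    	fclose(f);
    	return 0;
    }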
@@ -4490,7 +4608,7 @@ struct dentry *tracing_init_dentry(void)
4490 4608
4491static struct dentry *d_percpu; 4609static struct dentry *d_percpu;
4492 4610
4493struct dentry *tracing_dentry_percpu(void) 4611static struct dentry *tracing_dentry_percpu(void)
4494{ 4612{
4495 static int once; 4613 static int once;
4496 struct dentry *d_tracer; 4614 struct dentry *d_tracer;
@@ -4906,6 +5024,11 @@ static __init int tracer_init_debugfs(void)
4906 &ftrace_update_tot_cnt, &tracing_dyn_info_fops); 5024 &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
4907#endif 5025#endif
4908 5026
5027#ifdef CONFIG_TRACER_SNAPSHOT
5028 trace_create_file("snapshot", 0644, d_tracer,
5029 (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
5030#endif
5031
4909 create_trace_options_dir(); 5032 create_trace_options_dir();
4910 5033
4911 for_each_tracing_cpu(cpu) 5034 for_each_tracing_cpu(cpu)
@@ -5014,6 +5137,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5014 if (disable_tracing) 5137 if (disable_tracing)
5015 ftrace_kill(); 5138 ftrace_kill();
5016 5139
5140 /* Simulate the iterator */
5017 trace_init_global_iter(&iter); 5141 trace_init_global_iter(&iter);
5018 5142
5019 for_each_tracing_cpu(cpu) { 5143 for_each_tracing_cpu(cpu) {
@@ -5025,10 +5149,6 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
5025 /* don't look at user memory in panic mode */ 5149 /* don't look at user memory in panic mode */
5026 trace_flags &= ~TRACE_ITER_SYM_USEROBJ; 5150 trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
5027 5151
5028 /* Simulate the iterator */
5029 iter.tr = &global_trace;
5030 iter.trace = current_trace;
5031
5032 switch (oops_dump_mode) { 5152 switch (oops_dump_mode) {
5033 case DUMP_ALL: 5153 case DUMP_ALL:
5034 iter.cpu_file = TRACE_PIPE_ALL_CPU; 5154 iter.cpu_file = TRACE_PIPE_ALL_CPU;
@@ -5173,7 +5293,7 @@ __init static int tracer_alloc_buffers(void)
5173 init_irq_work(&trace_work_wakeup, trace_wake_up); 5293 init_irq_work(&trace_work_wakeup, trace_wake_up);
5174 5294
5175 register_tracer(&nop_trace); 5295 register_tracer(&nop_trace);
5176 current_trace = &nop_trace; 5296
5177 /* All seems OK, enable tracing */ 5297 /* All seems OK, enable tracing */
5178 tracing_disabled = 0; 5298 tracing_disabled = 0;
5179 5299
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c75d7988902c..57d7e5397d56 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -287,20 +287,62 @@ struct tracer {
287 struct tracer_flags *flags; 287 struct tracer_flags *flags;
288 bool print_max; 288 bool print_max;
289 bool use_max_tr; 289 bool use_max_tr;
290 bool allocated_snapshot;
290}; 291};
291 292
292 293
293/* Only current can touch trace_recursion */ 294/* Only current can touch trace_recursion */
294#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
295#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
296 295
297/* Ring buffer has the 10 LSB bits to count */ 296/*
298#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff) 297 * For function tracing recursion:
299 298 * The order of these bits are important.
300/* for function tracing recursion */ 299 *
301#define TRACE_INTERNAL_BIT (1<<11) 300 * When function tracing occurs, the following steps are made:
302#define TRACE_GLOBAL_BIT (1<<12) 301 * If arch does not support a ftrace feature:
303#define TRACE_CONTROL_BIT (1<<13) 302 * call internal function (uses INTERNAL bits) which calls...
303 * If callback is registered to the "global" list, the list
304 * function is called and recursion checks the GLOBAL bits.
305 * then this function calls...
306 * The function callback, which can use the FTRACE bits to
307 * check for recursion.
308 *
 309 * Now if the arch does not support a feature, and it calls
 310 * the global list function, which calls the ftrace callback,
 311 * all three of these steps will do recursion protection.
312 * There's no reason to do one if the previous caller already
313 * did. The recursion that we are protecting against will
314 * go through the same steps again.
315 *
 316 * To prevent multiple recursion checks, if a recursion
317 * bit is set that is higher than the MAX bit of the current
318 * check, then we know that the check was made by the previous
319 * caller, and we can skip the current check.
320 */
321enum {
322 TRACE_BUFFER_BIT,
323 TRACE_BUFFER_NMI_BIT,
324 TRACE_BUFFER_IRQ_BIT,
325 TRACE_BUFFER_SIRQ_BIT,
326
327 /* Start of function recursion bits */
328 TRACE_FTRACE_BIT,
329 TRACE_FTRACE_NMI_BIT,
330 TRACE_FTRACE_IRQ_BIT,
331 TRACE_FTRACE_SIRQ_BIT,
332
333 /* GLOBAL_BITs must be greater than FTRACE_BITs */
334 TRACE_GLOBAL_BIT,
335 TRACE_GLOBAL_NMI_BIT,
336 TRACE_GLOBAL_IRQ_BIT,
337 TRACE_GLOBAL_SIRQ_BIT,
338
339 /* INTERNAL_BITs must be greater than GLOBAL_BITs */
340 TRACE_INTERNAL_BIT,
341 TRACE_INTERNAL_NMI_BIT,
342 TRACE_INTERNAL_IRQ_BIT,
343 TRACE_INTERNAL_SIRQ_BIT,
344
345 TRACE_CONTROL_BIT,
304 346
305/* 347/*
306 * Abuse of the trace_recursion. 348 * Abuse of the trace_recursion.
@@ -309,11 +351,77 @@ struct tracer {
309 * was called in irq context but we have irq tracing off. Since this 351 * was called in irq context but we have irq tracing off. Since this
310 * can only be modified by current, we can reuse trace_recursion. 352 * can only be modified by current, we can reuse trace_recursion.
311 */ 353 */
312#define TRACE_IRQ_BIT (1<<13) 354 TRACE_IRQ_BIT,
355};
356
357#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0)
358#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0)
359#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit)))
360
361#define TRACE_CONTEXT_BITS 4
362
363#define TRACE_FTRACE_START TRACE_FTRACE_BIT
364#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
365
366#define TRACE_GLOBAL_START TRACE_GLOBAL_BIT
367#define TRACE_GLOBAL_MAX ((1 << (TRACE_GLOBAL_START + TRACE_CONTEXT_BITS)) - 1)
368
369#define TRACE_LIST_START TRACE_INTERNAL_BIT
370#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
371
372#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
373
374static __always_inline int trace_get_context_bit(void)
375{
376 int bit;
313 377
314#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0) 378 if (in_interrupt()) {
315#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0) 379 if (in_nmi())
316#define trace_recursion_test(bit) ((current)->trace_recursion & (bit)) 380 bit = 0;
381
382 else if (in_irq())
383 bit = 1;
384 else
385 bit = 2;
386 } else
387 bit = 3;
388
389 return bit;
390}
391
392static __always_inline int trace_test_and_set_recursion(int start, int max)
393{
394 unsigned int val = current->trace_recursion;
395 int bit;
396
397 /* A previous recursion check was made */
398 if ((val & TRACE_CONTEXT_MASK) > max)
399 return 0;
400
401 bit = trace_get_context_bit() + start;
402 if (unlikely(val & (1 << bit)))
403 return -1;
404
405 val |= 1 << bit;
406 current->trace_recursion = val;
407 barrier();
408
409 return bit;
410}
411
412static __always_inline void trace_clear_recursion(int bit)
413{
414 unsigned int val = current->trace_recursion;
415
416 if (!bit)
417 return;
418
419 bit = 1 << bit;
420 val &= ~bit;
421
422 barrier();
423 current->trace_recursion = val;
424}
317 425
318#define TRACE_PIPE_ALL_CPU -1 426#define TRACE_PIPE_ALL_CPU -1
319 427
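A minimal sketch of how a function-trace callback is expected to use the new recursion helpers; it mirrors the function_trace_call() rewrite later in this diff, and my_callback plus the work inside it are illustrative:

    static void my_callback(unsigned long ip, unsigned long parent_ip,
    			struct ftrace_ops *op, struct pt_regs *pt_regs)
    {
    	int bit;

    	preempt_disable_notrace();
    	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
    	if (bit < 0)
    		goto out;	/* this context is already inside the callback */

    	/* ... record the event ... */

    	trace_clear_recursion(bit);
    out:
    	preempt_enable_notrace();
    }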
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 1bbb1b200cec..aa8f5f48dae6 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -21,8 +21,6 @@
21#include <linux/ktime.h> 21#include <linux/ktime.h>
22#include <linux/trace_clock.h> 22#include <linux/trace_clock.h>
23 23
24#include "trace.h"
25
26/* 24/*
27 * trace_clock_local(): the simplest and least coherent tracing clock. 25 * trace_clock_local(): the simplest and least coherent tracing clock.
28 * 26 *
@@ -87,7 +85,7 @@ u64 notrace trace_clock_global(void)
87 local_irq_save(flags); 85 local_irq_save(flags);
88 86
89 this_cpu = raw_smp_processor_id(); 87 this_cpu = raw_smp_processor_id();
90 now = cpu_clock(this_cpu); 88 now = sched_clock_cpu(this_cpu);
91 /* 89 /*
 92 * If in an NMI context then don't risk lockups and return the 90 * If in an NMI context then don't risk lockups and return the
93 * cpu_clock() time: 91 * cpu_clock() time:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 880073d0b946..57e9b284250c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,6 @@ static int trace_define_common_fields(void)
116 __common_field(unsigned char, flags); 116 __common_field(unsigned char, flags);
117 __common_field(unsigned char, preempt_count); 117 __common_field(unsigned char, preempt_count);
118 __common_field(int, pid); 118 __common_field(int, pid);
119 __common_field(int, padding);
120 119
121 return ret; 120 return ret;
122} 121}
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 8e3ad8082ab7..601152523326 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -47,34 +47,6 @@ static void function_trace_start(struct trace_array *tr)
47 tracing_reset_online_cpus(tr); 47 tracing_reset_online_cpus(tr);
48} 48}
49 49
50static void
51function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip,
52 struct ftrace_ops *op, struct pt_regs *pt_regs)
53{
54 struct trace_array *tr = func_trace;
55 struct trace_array_cpu *data;
56 unsigned long flags;
57 long disabled;
58 int cpu;
59 int pc;
60
61 if (unlikely(!ftrace_function_enabled))
62 return;
63
64 pc = preempt_count();
65 preempt_disable_notrace();
66 local_save_flags(flags);
67 cpu = raw_smp_processor_id();
68 data = tr->data[cpu];
69 disabled = atomic_inc_return(&data->disabled);
70
71 if (likely(disabled == 1))
72 trace_function(tr, ip, parent_ip, flags, pc);
73
74 atomic_dec(&data->disabled);
75 preempt_enable_notrace();
76}
77
78/* Our option */ 50/* Our option */
79enum { 51enum {
80 TRACE_FUNC_OPT_STACK = 0x1, 52 TRACE_FUNC_OPT_STACK = 0x1,
@@ -85,34 +57,34 @@ static struct tracer_flags func_flags;
85static void 57static void
86function_trace_call(unsigned long ip, unsigned long parent_ip, 58function_trace_call(unsigned long ip, unsigned long parent_ip,
87 struct ftrace_ops *op, struct pt_regs *pt_regs) 59 struct ftrace_ops *op, struct pt_regs *pt_regs)
88
89{ 60{
90 struct trace_array *tr = func_trace; 61 struct trace_array *tr = func_trace;
91 struct trace_array_cpu *data; 62 struct trace_array_cpu *data;
92 unsigned long flags; 63 unsigned long flags;
93 long disabled; 64 int bit;
94 int cpu; 65 int cpu;
95 int pc; 66 int pc;
96 67
97 if (unlikely(!ftrace_function_enabled)) 68 if (unlikely(!ftrace_function_enabled))
98 return; 69 return;
99 70
100 /* 71 pc = preempt_count();
101 * Need to use raw, since this must be called before the 72 preempt_disable_notrace();
102 * recursive protection is performed.
103 */
104 local_irq_save(flags);
105 cpu = raw_smp_processor_id();
106 data = tr->data[cpu];
107 disabled = atomic_inc_return(&data->disabled);
108 73
109 if (likely(disabled == 1)) { 74 bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
110 pc = preempt_count(); 75 if (bit < 0)
76 goto out;
77
78 cpu = smp_processor_id();
79 data = tr->data[cpu];
80 if (!atomic_read(&data->disabled)) {
81 local_save_flags(flags);
111 trace_function(tr, ip, parent_ip, flags, pc); 82 trace_function(tr, ip, parent_ip, flags, pc);
112 } 83 }
84 trace_clear_recursion(bit);
113 85
114 atomic_dec(&data->disabled); 86 out:
115 local_irq_restore(flags); 87 preempt_enable_notrace();
116} 88}
117 89
118static void 90static void
@@ -185,11 +157,6 @@ static void tracing_start_function_trace(void)
185{ 157{
186 ftrace_function_enabled = 0; 158 ftrace_function_enabled = 0;
187 159
188 if (trace_flags & TRACE_ITER_PREEMPTONLY)
189 trace_ops.func = function_trace_call_preempt_only;
190 else
191 trace_ops.func = function_trace_call;
192
193 if (func_flags.val & TRACE_FUNC_OPT_STACK) 160 if (func_flags.val & TRACE_FUNC_OPT_STACK)
194 register_ftrace_function(&trace_stack_ops); 161 register_ftrace_function(&trace_stack_ops);
195 else 162 else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4edb4b74eb7e..39ada66389cc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -47,6 +47,8 @@ struct fgraph_data {
47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20 47#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
48#define TRACE_GRAPH_PRINT_IRQS 0x40 48#define TRACE_GRAPH_PRINT_IRQS 0x40
49 49
50static unsigned int max_depth;
51
50static struct tracer_opt trace_opts[] = { 52static struct tracer_opt trace_opts[] = {
51 /* Display overruns? (for self-debug purpose) */ 53 /* Display overruns? (for self-debug purpose) */
52 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) }, 54 { TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
@@ -189,10 +191,16 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer)
189 191
190 ftrace_pop_return_trace(&trace, &ret, frame_pointer); 192 ftrace_pop_return_trace(&trace, &ret, frame_pointer);
191 trace.rettime = trace_clock_local(); 193 trace.rettime = trace_clock_local();
192 ftrace_graph_return(&trace);
193 barrier(); 194 barrier();
194 current->curr_ret_stack--; 195 current->curr_ret_stack--;
195 196
197 /*
198 * The trace should run after decrementing the ret counter
199 * in case an interrupt were to come in. We don't want to
200 * lose the interrupt if max_depth is set.
201 */
202 ftrace_graph_return(&trace);
203
196 if (unlikely(!ret)) { 204 if (unlikely(!ret)) {
197 ftrace_graph_stop(); 205 ftrace_graph_stop();
198 WARN_ON(1); 206 WARN_ON(1);
@@ -250,8 +258,9 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
250 return 0; 258 return 0;
251 259
252 /* trace it when it is-nested-in or is a function enabled. */ 260 /* trace it when it is-nested-in or is a function enabled. */
253 if (!(trace->depth || ftrace_graph_addr(trace->func)) || 261 if ((!(trace->depth || ftrace_graph_addr(trace->func)) ||
254 ftrace_graph_ignore_irqs()) 262 ftrace_graph_ignore_irqs()) ||
263 (max_depth && trace->depth >= max_depth))
255 return 0; 264 return 0;
256 265
257 local_irq_save(flags); 266 local_irq_save(flags);
@@ -1457,6 +1466,59 @@ static struct tracer graph_trace __read_mostly = {
1457#endif 1466#endif
1458}; 1467};
1459 1468
1469
1470static ssize_t
1471graph_depth_write(struct file *filp, const char __user *ubuf, size_t cnt,
1472 loff_t *ppos)
1473{
1474 unsigned long val;
1475 int ret;
1476
1477 ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
1478 if (ret)
1479 return ret;
1480
1481 max_depth = val;
1482
1483 *ppos += cnt;
1484
1485 return cnt;
1486}
1487
1488static ssize_t
1489graph_depth_read(struct file *filp, char __user *ubuf, size_t cnt,
1490 loff_t *ppos)
1491{
 1492 char buf[15]; /* More than enough to hold UINT_MAX + "\n" */
1493 int n;
1494
1495 n = sprintf(buf, "%d\n", max_depth);
1496
1497 return simple_read_from_buffer(ubuf, cnt, ppos, buf, n);
1498}
1499
1500static const struct file_operations graph_depth_fops = {
1501 .open = tracing_open_generic,
1502 .write = graph_depth_write,
1503 .read = graph_depth_read,
1504 .llseek = generic_file_llseek,
1505};
1506
1507static __init int init_graph_debugfs(void)
1508{
1509 struct dentry *d_tracer;
1510
1511 d_tracer = tracing_init_dentry();
1512 if (!d_tracer)
1513 return 0;
1514
1515 trace_create_file("max_graph_depth", 0644, d_tracer,
1516 NULL, &graph_depth_fops);
1517
1518 return 0;
1519}
1520fs_initcall(init_graph_debugfs);
1521
1460static __init int init_graph_trace(void) 1522static __init int init_graph_trace(void)
1461{ 1523{
1462 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); 1524 max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
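The new max_graph_depth control limits how deep the graph tracer follows call chains; per trace_graph_entry() above, 0 keeps the old unlimited behaviour. A user-space sketch that caps tracing at five levels, assuming the usual debugfs mount point:

    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/sys/kernel/debug/tracing/max_graph_depth", "w");

    	if (!f) {
    		perror("max_graph_depth");
    		return 1;
    	}
    	fputs("5\n", f);	/* trace at most 5 levels of nesting */
    	return fclose(f) ? 1 : 0;
    }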
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index 933708677814..5c7e09d10d74 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -66,7 +66,6 @@
66#define TP_FLAG_TRACE 1 66#define TP_FLAG_TRACE 1
67#define TP_FLAG_PROFILE 2 67#define TP_FLAG_PROFILE 2
68#define TP_FLAG_REGISTERED 4 68#define TP_FLAG_REGISTERED 4
69#define TP_FLAG_UPROBE 8
70 69
71 70
72/* data_rloc: data relative location, compatible with u32 */ 71/* data_rloc: data relative location, compatible with u32 */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 47623169a815..51c819c12c29 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -415,7 +415,8 @@ static void trace_selftest_test_recursion_func(unsigned long ip,
415 * The ftrace infrastructure should provide the recursion 415 * The ftrace infrastructure should provide the recursion
416 * protection. If not, this will crash the kernel! 416 * protection. If not, this will crash the kernel!
417 */ 417 */
418 trace_selftest_recursion_cnt++; 418 if (trace_selftest_recursion_cnt++ > 10)
419 return;
419 DYN_FTRACE_TEST_NAME(); 420 DYN_FTRACE_TEST_NAME();
420} 421}
421 422
@@ -452,7 +453,6 @@ trace_selftest_function_recursion(void)
452 char *func_name; 453 char *func_name;
453 int len; 454 int len;
454 int ret; 455 int ret;
455 int cnt;
456 456
457 /* The previous test PASSED */ 457 /* The previous test PASSED */
458 pr_cont("PASSED\n"); 458 pr_cont("PASSED\n");
@@ -510,19 +510,10 @@ trace_selftest_function_recursion(void)
510 510
511 unregister_ftrace_function(&test_recsafe_probe); 511 unregister_ftrace_function(&test_recsafe_probe);
512 512
513 /*
514 * If arch supports all ftrace features, and no other task
515 * was on the list, we should be fine.
516 */
517 if (!ftrace_nr_registered_ops() && !FTRACE_FORCE_LIST_FUNC)
518 cnt = 2; /* Should have recursed */
519 else
520 cnt = 1;
521
522 ret = -1; 513 ret = -1;
523 if (trace_selftest_recursion_cnt != cnt) { 514 if (trace_selftest_recursion_cnt != 2) {
524 pr_cont("*callback not called expected %d times (%d)* ", 515 pr_cont("*callback not called expected 2 times (%d)* ",
525 cnt, trace_selftest_recursion_cnt); 516 trace_selftest_recursion_cnt);
526 goto out; 517 goto out;
527 } 518 }
528 519
@@ -568,7 +559,7 @@ trace_selftest_function_regs(void)
568 int ret; 559 int ret;
569 int supported = 0; 560 int supported = 0;
570 561
571#ifdef ARCH_SUPPORTS_FTRACE_SAVE_REGS 562#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
572 supported = 1; 563 supported = 1;
573#endif 564#endif
574 565
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7609dd6714c2..5329e13e74a1 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -77,7 +77,7 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr)
77 return syscalls_metadata[nr]; 77 return syscalls_metadata[nr];
78} 78}
79 79
80enum print_line_t 80static enum print_line_t
81print_syscall_enter(struct trace_iterator *iter, int flags, 81print_syscall_enter(struct trace_iterator *iter, int flags,
82 struct trace_event *event) 82 struct trace_event *event)
83{ 83{
@@ -130,7 +130,7 @@ end:
130 return TRACE_TYPE_HANDLED; 130 return TRACE_TYPE_HANDLED;
131} 131}
132 132
133enum print_line_t 133static enum print_line_t
134print_syscall_exit(struct trace_iterator *iter, int flags, 134print_syscall_exit(struct trace_iterator *iter, int flags,
135 struct trace_event *event) 135 struct trace_event *event)
136{ 136{
@@ -270,7 +270,7 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
270 return ret; 270 return ret;
271} 271}
272 272
273void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) 273static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
274{ 274{
275 struct syscall_trace_enter *entry; 275 struct syscall_trace_enter *entry;
276 struct syscall_metadata *sys_data; 276 struct syscall_metadata *sys_data;
@@ -305,7 +305,7 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
305 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 305 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
306} 306}
307 307
308void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) 308static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
309{ 309{
310 struct syscall_trace_exit *entry; 310 struct syscall_trace_exit *entry;
311 struct syscall_metadata *sys_data; 311 struct syscall_metadata *sys_data;
@@ -337,7 +337,7 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
337 trace_current_buffer_unlock_commit(buffer, event, 0, 0); 337 trace_current_buffer_unlock_commit(buffer, event, 0, 0);
338} 338}
339 339
340int reg_event_syscall_enter(struct ftrace_event_call *call) 340static int reg_event_syscall_enter(struct ftrace_event_call *call)
341{ 341{
342 int ret = 0; 342 int ret = 0;
343 int num; 343 int num;
@@ -356,7 +356,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
356 return ret; 356 return ret;
357} 357}
358 358
359void unreg_event_syscall_enter(struct ftrace_event_call *call) 359static void unreg_event_syscall_enter(struct ftrace_event_call *call)
360{ 360{
361 int num; 361 int num;
362 362
@@ -371,7 +371,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
371 mutex_unlock(&syscall_trace_lock); 371 mutex_unlock(&syscall_trace_lock);
372} 372}
373 373
374int reg_event_syscall_exit(struct ftrace_event_call *call) 374static int reg_event_syscall_exit(struct ftrace_event_call *call)
375{ 375{
376 int ret = 0; 376 int ret = 0;
377 int num; 377 int num;
@@ -390,7 +390,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
390 return ret; 390 return ret;
391} 391}
392 392
393void unreg_event_syscall_exit(struct ftrace_event_call *call) 393static void unreg_event_syscall_exit(struct ftrace_event_call *call)
394{ 394{
395 int num; 395 int num;
396 396
@@ -459,7 +459,7 @@ unsigned long __init __weak arch_syscall_addr(int nr)
459 return (unsigned long)sys_call_table[nr]; 459 return (unsigned long)sys_call_table[nr];
460} 460}
461 461
462int __init init_ftrace_syscalls(void) 462static int __init init_ftrace_syscalls(void)
463{ 463{
464 struct syscall_metadata *meta; 464 struct syscall_metadata *meta;
465 unsigned long addr; 465 unsigned long addr;
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c86e6d4f67fb..8dad2a92dee9 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -28,20 +28,21 @@
28 28
29#define UPROBE_EVENT_SYSTEM "uprobes" 29#define UPROBE_EVENT_SYSTEM "uprobes"
30 30
31struct trace_uprobe_filter {
32 rwlock_t rwlock;
33 int nr_systemwide;
34 struct list_head perf_events;
35};
36
31/* 37/*
32 * uprobe event core functions 38 * uprobe event core functions
33 */ 39 */
34struct trace_uprobe;
35struct uprobe_trace_consumer {
36 struct uprobe_consumer cons;
37 struct trace_uprobe *tu;
38};
39
40struct trace_uprobe { 40struct trace_uprobe {
41 struct list_head list; 41 struct list_head list;
42 struct ftrace_event_class class; 42 struct ftrace_event_class class;
43 struct ftrace_event_call call; 43 struct ftrace_event_call call;
44 struct uprobe_trace_consumer *consumer; 44 struct trace_uprobe_filter filter;
45 struct uprobe_consumer consumer;
45 struct inode *inode; 46 struct inode *inode;
46 char *filename; 47 char *filename;
47 unsigned long offset; 48 unsigned long offset;
@@ -64,6 +65,18 @@ static LIST_HEAD(uprobe_list);
64 65
65static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs); 66static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
66 67
68static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
69{
70 rwlock_init(&filter->rwlock);
71 filter->nr_systemwide = 0;
72 INIT_LIST_HEAD(&filter->perf_events);
73}
74
75static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
76{
77 return !filter->nr_systemwide && list_empty(&filter->perf_events);
78}
79
67/* 80/*
68 * Allocate new trace_uprobe and initialize it (including uprobes). 81 * Allocate new trace_uprobe and initialize it (including uprobes).
69 */ 82 */
@@ -92,6 +105,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
92 goto error; 105 goto error;
93 106
94 INIT_LIST_HEAD(&tu->list); 107 INIT_LIST_HEAD(&tu->list);
108 tu->consumer.handler = uprobe_dispatcher;
109 init_trace_uprobe_filter(&tu->filter);
95 return tu; 110 return tu;
96 111
97error: 112error:
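Embedding the uprobe_consumer directly in struct trace_uprobe lets the handlers recover the owning probe with container_of() instead of going through the old uprobe_trace_consumer indirection (see uprobe_dispatcher() later in this diff). A stand-alone sketch of that pattern, with illustrative names and a user-space container_of():

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
    	((type *)((char *)(ptr) - offsetof(type, member)))

    struct consumer {
    	int (*handler)(struct consumer *c);
    };

    struct probe {
    	const char	*name;
    	struct consumer	consumer;	/* embedded, not pointed-to */
    };

    static int probe_handler(struct consumer *c)
    {
    	struct probe *p = container_of(c, struct probe, consumer);

    	printf("hit %s\n", p->name);
    	return 0;
    }

    int main(void)
    {
    	struct probe p = { .name = "demo", .consumer = { .handler = probe_handler } };

    	return p.consumer.handler(&p.consumer);
    }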
@@ -253,12 +268,18 @@ static int create_trace_uprobe(int argc, char **argv)
253 if (ret) 268 if (ret)
254 goto fail_address_parse; 269 goto fail_address_parse;
255 270
271 inode = igrab(path.dentry->d_inode);
272 path_put(&path);
273
274 if (!inode || !S_ISREG(inode->i_mode)) {
275 ret = -EINVAL;
276 goto fail_address_parse;
277 }
278
256 ret = kstrtoul(arg, 0, &offset); 279 ret = kstrtoul(arg, 0, &offset);
257 if (ret) 280 if (ret)
258 goto fail_address_parse; 281 goto fail_address_parse;
259 282
260 inode = igrab(path.dentry->d_inode);
261
262 argc -= 2; 283 argc -= 2;
263 argv += 2; 284 argv += 2;
264 285
@@ -356,7 +377,7 @@ fail_address_parse:
356 if (inode) 377 if (inode)
357 iput(inode); 378 iput(inode);
358 379
359 pr_info("Failed to parse address.\n"); 380 pr_info("Failed to parse address or file.\n");
360 381
361 return ret; 382 return ret;
362} 383}
@@ -465,7 +486,7 @@ static const struct file_operations uprobe_profile_ops = {
465}; 486};
466 487
467/* uprobe handler */ 488/* uprobe handler */
468static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs) 489static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
469{ 490{
470 struct uprobe_trace_entry_head *entry; 491 struct uprobe_trace_entry_head *entry;
471 struct ring_buffer_event *event; 492 struct ring_buffer_event *event;
@@ -475,8 +496,6 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
475 unsigned long irq_flags; 496 unsigned long irq_flags;
476 struct ftrace_event_call *call = &tu->call; 497 struct ftrace_event_call *call = &tu->call;
477 498
478 tu->nhit++;
479
480 local_save_flags(irq_flags); 499 local_save_flags(irq_flags);
481 pc = preempt_count(); 500 pc = preempt_count();
482 501
@@ -485,16 +504,18 @@ static void uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
485 event = trace_current_buffer_lock_reserve(&buffer, call->event.type, 504 event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
486 size, irq_flags, pc); 505 size, irq_flags, pc);
487 if (!event) 506 if (!event)
488 return; 507 return 0;
489 508
490 entry = ring_buffer_event_data(event); 509 entry = ring_buffer_event_data(event);
491 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 510 entry->ip = instruction_pointer(task_pt_regs(current));
492 data = (u8 *)&entry[1]; 511 data = (u8 *)&entry[1];
493 for (i = 0; i < tu->nr_args; i++) 512 for (i = 0; i < tu->nr_args; i++)
494 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 513 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
495 514
496 if (!filter_current_check_discard(buffer, call, entry, event)) 515 if (!filter_current_check_discard(buffer, call, entry, event))
497 trace_buffer_unlock_commit(buffer, event, irq_flags, pc); 516 trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
517
518 return 0;
498} 519}
499 520
500/* Event entry printers */ 521/* Event entry printers */
@@ -533,42 +554,43 @@ partial:
533 return TRACE_TYPE_PARTIAL_LINE; 554 return TRACE_TYPE_PARTIAL_LINE;
534} 555}
535 556
536static int probe_event_enable(struct trace_uprobe *tu, int flag) 557static inline bool is_trace_uprobe_enabled(struct trace_uprobe *tu)
537{ 558{
538 struct uprobe_trace_consumer *utc; 559 return tu->flags & (TP_FLAG_TRACE | TP_FLAG_PROFILE);
539 int ret = 0; 560}
540 561
541 if (!tu->inode || tu->consumer) 562typedef bool (*filter_func_t)(struct uprobe_consumer *self,
542 return -EINTR; 563 enum uprobe_filter_ctx ctx,
564 struct mm_struct *mm);
543 565
544 utc = kzalloc(sizeof(struct uprobe_trace_consumer), GFP_KERNEL); 566static int
545 if (!utc) 567probe_event_enable(struct trace_uprobe *tu, int flag, filter_func_t filter)
568{
569 int ret = 0;
570
571 if (is_trace_uprobe_enabled(tu))
546 return -EINTR; 572 return -EINTR;
547 573
548 utc->cons.handler = uprobe_dispatcher; 574 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
549 utc->cons.filter = NULL;
550 ret = uprobe_register(tu->inode, tu->offset, &utc->cons);
551 if (ret) {
552 kfree(utc);
553 return ret;
554 }
555 575
556 tu->flags |= flag; 576 tu->flags |= flag;
557 utc->tu = tu; 577 tu->consumer.filter = filter;
558 tu->consumer = utc; 578 ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
579 if (ret)
580 tu->flags &= ~flag;
559 581
560 return 0; 582 return ret;
561} 583}
562 584
563static void probe_event_disable(struct trace_uprobe *tu, int flag) 585static void probe_event_disable(struct trace_uprobe *tu, int flag)
564{ 586{
565 if (!tu->inode || !tu->consumer) 587 if (!is_trace_uprobe_enabled(tu))
566 return; 588 return;
567 589
568 uprobe_unregister(tu->inode, tu->offset, &tu->consumer->cons); 590 WARN_ON(!uprobe_filter_is_empty(&tu->filter));
591
592 uprobe_unregister(tu->inode, tu->offset, &tu->consumer);
569 tu->flags &= ~flag; 593 tu->flags &= ~flag;
570 kfree(tu->consumer);
571 tu->consumer = NULL;
572} 594}
573 595
574static int uprobe_event_define_fields(struct ftrace_event_call *event_call) 596static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
@@ -642,8 +664,96 @@ static int set_print_fmt(struct trace_uprobe *tu)
642} 664}
643 665
644#ifdef CONFIG_PERF_EVENTS 666#ifdef CONFIG_PERF_EVENTS
667static bool
668__uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
669{
670 struct perf_event *event;
671
672 if (filter->nr_systemwide)
673 return true;
674
675 list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
676 if (event->hw.tp_target->mm == mm)
677 return true;
678 }
679
680 return false;
681}
682
683static inline bool
684uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
685{
686 return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
687}
688
689static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
690{
691 bool done;
692
693 write_lock(&tu->filter.rwlock);
694 if (event->hw.tp_target) {
695 /*
696 * event->parent != NULL means copy_process(), we can avoid
697 * uprobe_apply(). current->mm must be probed and we can rely
698 * on dup_mmap() which preserves the already installed bp's.
699 *
700 * attr.enable_on_exec means that exec/mmap will install the
701 * breakpoints we need.
702 */
703 done = tu->filter.nr_systemwide ||
704 event->parent || event->attr.enable_on_exec ||
705 uprobe_filter_event(tu, event);
706 list_add(&event->hw.tp_list, &tu->filter.perf_events);
707 } else {
708 done = tu->filter.nr_systemwide;
709 tu->filter.nr_systemwide++;
710 }
711 write_unlock(&tu->filter.rwlock);
712
713 if (!done)
714 uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
715
716 return 0;
717}
718
719static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
720{
721 bool done;
722
723 write_lock(&tu->filter.rwlock);
724 if (event->hw.tp_target) {
725 list_del(&event->hw.tp_list);
726 done = tu->filter.nr_systemwide ||
727 (event->hw.tp_target->flags & PF_EXITING) ||
728 uprobe_filter_event(tu, event);
729 } else {
730 tu->filter.nr_systemwide--;
731 done = tu->filter.nr_systemwide;
732 }
733 write_unlock(&tu->filter.rwlock);
734
735 if (!done)
736 uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
737
738 return 0;
739}
740
741static bool uprobe_perf_filter(struct uprobe_consumer *uc,
742 enum uprobe_filter_ctx ctx, struct mm_struct *mm)
743{
744 struct trace_uprobe *tu;
745 int ret;
746
747 tu = container_of(uc, struct trace_uprobe, consumer);
748 read_lock(&tu->filter.rwlock);
749 ret = __uprobe_perf_filter(&tu->filter, mm);
750 read_unlock(&tu->filter.rwlock);
751
752 return ret;
753}
754
645/* uprobe profile handler */ 755/* uprobe profile handler */
646static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs) 756static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
647{ 757{
648 struct ftrace_event_call *call = &tu->call; 758 struct ftrace_event_call *call = &tu->call;
649 struct uprobe_trace_entry_head *entry; 759 struct uprobe_trace_entry_head *entry;
@@ -652,11 +762,14 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
652 int size, __size, i; 762 int size, __size, i;
653 int rctx; 763 int rctx;
654 764
765 if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
766 return UPROBE_HANDLER_REMOVE;
767
655 __size = sizeof(*entry) + tu->size; 768 __size = sizeof(*entry) + tu->size;
656 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 769 size = ALIGN(__size + sizeof(u32), sizeof(u64));
657 size -= sizeof(u32); 770 size -= sizeof(u32);
658 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) 771 if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
659 return; 772 return 0;
660 773
661 preempt_disable(); 774 preempt_disable();
662 775
@@ -664,7 +777,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
664 if (!entry) 777 if (!entry)
665 goto out; 778 goto out;
666 779
667 entry->ip = uprobe_get_swbp_addr(task_pt_regs(current)); 780 entry->ip = instruction_pointer(task_pt_regs(current));
668 data = (u8 *)&entry[1]; 781 data = (u8 *)&entry[1];
669 for (i = 0; i < tu->nr_args; i++) 782 for (i = 0; i < tu->nr_args; i++)
670 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset); 783 call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
@@ -674,6 +787,7 @@ static void uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
674 787
675 out: 788 out:
676 preempt_enable(); 789 preempt_enable();
790 return 0;
677} 791}
678#endif /* CONFIG_PERF_EVENTS */ 792#endif /* CONFIG_PERF_EVENTS */
679 793
@@ -684,7 +798,7 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
684 798
685 switch (type) { 799 switch (type) {
686 case TRACE_REG_REGISTER: 800 case TRACE_REG_REGISTER:
687 return probe_event_enable(tu, TP_FLAG_TRACE); 801 return probe_event_enable(tu, TP_FLAG_TRACE, NULL);
688 802
689 case TRACE_REG_UNREGISTER: 803 case TRACE_REG_UNREGISTER:
690 probe_event_disable(tu, TP_FLAG_TRACE); 804 probe_event_disable(tu, TP_FLAG_TRACE);
@@ -692,11 +806,18 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
692 806
693#ifdef CONFIG_PERF_EVENTS 807#ifdef CONFIG_PERF_EVENTS
694 case TRACE_REG_PERF_REGISTER: 808 case TRACE_REG_PERF_REGISTER:
695 return probe_event_enable(tu, TP_FLAG_PROFILE); 809 return probe_event_enable(tu, TP_FLAG_PROFILE, uprobe_perf_filter);
696 810
697 case TRACE_REG_PERF_UNREGISTER: 811 case TRACE_REG_PERF_UNREGISTER:
698 probe_event_disable(tu, TP_FLAG_PROFILE); 812 probe_event_disable(tu, TP_FLAG_PROFILE);
699 return 0; 813 return 0;
814
815 case TRACE_REG_PERF_OPEN:
816 return uprobe_perf_open(tu, data);
817
818 case TRACE_REG_PERF_CLOSE:
819 return uprobe_perf_close(tu, data);
820
700#endif 821#endif
701 default: 822 default:
702 return 0; 823 return 0;
@@ -706,22 +827,20 @@ int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type,
706 827
707static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs) 828static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
708{ 829{
709 struct uprobe_trace_consumer *utc;
710 struct trace_uprobe *tu; 830 struct trace_uprobe *tu;
831 int ret = 0;
711 832
712 utc = container_of(con, struct uprobe_trace_consumer, cons); 833 tu = container_of(con, struct trace_uprobe, consumer);
713 tu = utc->tu; 834 tu->nhit++;
714 if (!tu || tu->consumer != utc)
715 return 0;
716 835
717 if (tu->flags & TP_FLAG_TRACE) 836 if (tu->flags & TP_FLAG_TRACE)
718 uprobe_trace_func(tu, regs); 837 ret |= uprobe_trace_func(tu, regs);
719 838
720#ifdef CONFIG_PERF_EVENTS 839#ifdef CONFIG_PERF_EVENTS
721 if (tu->flags & TP_FLAG_PROFILE) 840 if (tu->flags & TP_FLAG_PROFILE)
722 uprobe_perf_func(tu, regs); 841 ret |= uprobe_perf_func(tu, regs);
723#endif 842#endif
724 return 0; 843 return ret;
725} 844}
726 845
727static struct trace_event_functions uprobe_funcs = { 846static struct trace_event_functions uprobe_funcs = {