Diffstat (limited to 'kernel/trace')
-rw-r--r--  kernel/trace/Kconfig                 |  3
-rw-r--r--  kernel/trace/blktrace.c              | 30
-rw-r--r--  kernel/trace/bpf_trace.c             | 27
-rw-r--r--  kernel/trace/ftrace.c                | 29
-rw-r--r--  kernel/trace/ring_buffer.c           | 79
-rw-r--r--  kernel/trace/trace.c                 | 99
-rw-r--r--  kernel/trace/trace_benchmark.c       |  2
-rw-r--r--  kernel/trace/trace_events.c          | 16
-rw-r--r--  kernel/trace/trace_events_trigger.c  | 13
-rw-r--r--  kernel/trace/trace_functions.c       | 49
-rw-r--r--  kernel/trace/trace_stack.c           |  4
11 files changed, 212 insertions, 139 deletions
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index af7dad126c13..f54dc62b599c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS
 	bool "Enable trace events for preempt and irq disable/enable"
 	select TRACE_IRQFLAGS
 	depends on DEBUG_PREEMPT || !PROVE_LOCKING
+	depends on TRACING
 	default n
 	help
 	  Enable tracing of disable and enable events for preemption and irqs.
@@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES
 	  on if you need to profile the system's use of these macros.
 
 config PROFILE_ALL_BRANCHES
-	bool "Profile all if conditionals"
+	bool "Profile all if conditionals" if !FORTIFY_SOURCE
 	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all branch conditions. Every if ()
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 206e0e2ace53..987d9a9ae283 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		return ret;
 
 	if (copy_to_user(arg, &buts, sizeof(buts))) {
-		blk_trace_remove(q);
+		__blk_trace_remove(q);
 		return -EFAULT;
 	}
 	return 0;
@@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
 		return ret;
 
 	if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
-		blk_trace_remove(q);
+		__blk_trace_remove(q);
 		return -EFAULT;
 	}
 
@@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
  *
  **/
 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-			      u32 what, int error, union kernfs_node_id *cgid)
+			      u32 what, int error)
 {
 	struct blk_trace *bt = q->blk_trace;
 
@@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
 		return;
 
 	__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
-			bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid);
+			bio_op(bio), bio->bi_opf, what, error, 0, NULL,
+			blk_trace_bio_get_cgid(q, bio));
 }
 
 static void blk_add_trace_bio_bounce(void *ignore,
 				     struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
 }
 
 static void blk_add_trace_bio_complete(void *ignore,
 				       struct request_queue *q, struct bio *bio,
 				       int error)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
 }
 
 static void blk_add_trace_bio_backmerge(void *ignore,
@@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore,
 					struct request *rq,
 					struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
 }
 
 static void blk_add_trace_bio_frontmerge(void *ignore,
@@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore,
 					 struct request *rq,
 					 struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
 }
 
 static void blk_add_trace_bio_queue(void *ignore,
 				    struct request_queue *q, struct bio *bio)
 {
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0,
-			  blk_trace_bio_get_cgid(q, bio));
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
 }
 
 static void blk_add_trace_getrq(void *ignore,
@@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore,
 				struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0,
-				  blk_trace_bio_get_cgid(q, bio));
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
@@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore,
 				  struct bio *bio, int rw)
 {
 	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0,
-				  blk_trace_bio_get_cgid(q, bio));
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
 	else {
 		struct blk_trace *bt = q->blk_trace;
 
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 27d1f4ffa3de..40207c2a4113 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -343,14 +343,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = {
 	.arg4_type	= ARG_CONST_SIZE,
 };
 
-static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd);
 
 static __always_inline u64
 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
-			u64 flags, struct perf_raw_record *raw)
+			u64 flags, struct perf_sample_data *sd)
 {
 	struct bpf_array *array = container_of(map, struct bpf_array, map);
-	struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
 	unsigned int cpu = smp_processor_id();
 	u64 index = flags & BPF_F_INDEX_MASK;
 	struct bpf_event_entry *ee;
@@ -373,8 +372,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 	if (unlikely(event->oncpu != cpu))
 		return -EOPNOTSUPP;
 
-	perf_sample_data_init(sd, 0, 0);
-	sd->raw = raw;
 	perf_event_output(event, sd, regs);
 	return 0;
 }
@@ -382,6 +379,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
 BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags, void *, data, u64, size)
 {
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd);
 	struct perf_raw_record raw = {
 		.frag = {
 			.size = size,
@@ -392,7 +390,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map,
 	if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
 		return -EINVAL;
 
-	return __bpf_perf_event_output(regs, map, flags, &raw);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = &raw;
+
+	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
 static const struct bpf_func_proto bpf_perf_event_output_proto = {
@@ -407,10 +408,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = {
 };
 
 static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs);
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd);
 
 u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 		     void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
 {
+	struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd);
 	struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs);
 	struct perf_raw_frag frag = {
 		.copy		= ctx_copy,
@@ -428,8 +431,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
 	};
 
 	perf_fetch_caller_regs(regs);
+	perf_sample_data_init(sd, 0, 0);
+	sd->raw = &raw;
 
-	return __bpf_perf_event_output(regs, map, flags, &raw);
+	return __bpf_perf_event_output(regs, map, flags, sd);
 }
 
 BPF_CALL_0(bpf_get_current_task)
@@ -759,6 +764,8 @@ const struct bpf_prog_ops perf_event_prog_ops = {
 
 static DEFINE_MUTEX(bpf_event_mutex);
 
+#define BPF_TRACE_MAX_PROGS 64
+
 int perf_event_attach_bpf_prog(struct perf_event *event,
 			       struct bpf_prog *prog)
 {
@@ -772,6 +779,12 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
 		goto unlock;
 
 	old_array = event->tp_event->prog_array;
+	if (old_array &&
+	    bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) {
+		ret = -E2BIG;
+		goto unlock;
+	}
+
 	ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array);
 	if (ret < 0)
 		goto unlock;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ccdf3664e4a9..554b517c61a0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = {
 };
 
 /*
- * This is used by __kernel_text_address() to return true if the
- * address is on a dynamically allocated trampoline that would
- * not return true for either core_kernel_text() or
- * is_module_text_address().
+ * Used by the stack undwinder to know about dynamic ftrace trampolines.
  */
-bool is_ftrace_trampoline(unsigned long addr)
+struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr)
 {
-	struct ftrace_ops *op;
-	bool ret = false;
+	struct ftrace_ops *op = NULL;
 
 	/*
 	 * Some of the ops may be dynamically allocated,
@@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr)
 		if (op->trampoline && op->trampoline_size)
 			if (addr >= op->trampoline &&
 			    addr < op->trampoline + op->trampoline_size) {
-				ret = true;
-				goto out;
+				preempt_enable_notrace();
+				return op;
 			}
 	} while_for_each_ftrace_op(op);
-
- out:
 	preempt_enable_notrace();
 
-	return ret;
+	return NULL;
+}
+
+/*
+ * This is used by __kernel_text_address() to return true if the
+ * address is on a dynamically allocated trampoline that would
+ * not return true for either core_kernel_text() or
+ * is_module_text_address().
+ */
+bool is_ftrace_trampoline(unsigned long addr)
+{
+	return ftrace_ops_trampoline(addr) != NULL;
 }
 
 struct ftrace_page {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d24d48713ef3..ca6930e0d25e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 /* Missed count stored at end */
 #define RB_MISSED_STORED	(1 << 30)
 
+#define RB_MISSED_FLAGS		(RB_MISSED_EVENTS|RB_MISSED_STORED)
+
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
 	local_t		 commit;	/* write committed index */
@@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
  */
 size_t ring_buffer_page_len(void *page)
 {
-	return local_read(&((struct buffer_data_page *)page)->commit)
+	struct buffer_data_page *bpage = page;
+
+	return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS)
 		+ BUF_PAGE_HDR_SIZE;
 }
 
@@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
 
-static __always_inline void *
-__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
-{
-	return bpage->data + index;
-}
-
 static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index)
 {
 	return bpage->page->data + index;
@@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
  * The lock and unlock are done within a preempt disable section.
 * The current_context per_cpu variable can only be modified
 * by the current task between lock and unlock. But it can
- * be modified more than once via an interrupt. There are four
- * different contexts that we need to consider.
+ * be modified more than once via an interrupt. To pass this
+ * information from the lock to the unlock without having to
+ * access the 'in_interrupt()' functions again (which do show
+ * a bit of overhead in something as critical as function tracing,
+ * we use a bitmask trick.
+ *
+ * bit 0 = NMI context
+ * bit 1 = IRQ context
+ * bit 2 = SoftIRQ context
+ * bit 3 = normal context.
+ *
+ * This works because this is the order of contexts that can
+ * preempt other contexts. A SoftIRQ never preempts an IRQ
+ * context.
+ *
+ * When the context is determined, the corresponding bit is
+ * checked and set (if it was set, then a recursion of that context
+ * happened).
+ *
+ * On unlock, we need to clear this bit. To do so, just subtract
+ * 1 from the current_context and AND it to itself.
 *
- * Normal context.
- * SoftIRQ context
- * IRQ context
- * NMI context
+ * (binary)
+ * 101 - 1 = 100
+ * 101 & 100 = 100 (clearing bit zero)
 *
- * If for some reason the ring buffer starts to recurse, we
- * only allow that to happen at most 4 times (one for each
- * context). If it happens 5 times, then we consider this a
- * recusive loop and do not let it go further.
+ * 1010 - 1 = 1001
+ * 1010 & 1001 = 1000 (clearing bit 1)
+ *
+ * The least significant bit can be cleared this way, and it
+ * just so happens that it is the same bit corresponding to
+ * the current context.
 */
 
 static __always_inline int
 trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	if (cpu_buffer->current_context >= 4)
+	unsigned int val = cpu_buffer->current_context;
+	unsigned long pc = preempt_count();
+	int bit;
+
+	if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)))
+		bit = RB_CTX_NORMAL;
+	else
+		bit = pc & NMI_MASK ? RB_CTX_NMI :
+			pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ;
+
+	if (unlikely(val & (1 << bit)))
 		return 1;
 
-	cpu_buffer->current_context++;
-	/* Interrupts must see this update */
-	barrier();
+	val |= (1 << bit);
+	cpu_buffer->current_context = val;
 
 	return 0;
 }
@@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 static __always_inline void
 trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	/* Don't let the dec leak out */
-	barrier();
-	cpu_buffer->current_context--;
+	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
 }
 
 /**
@@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct buffer_data_page *bpage = data;
+	struct page *page = virt_to_page(bpage);
 	unsigned long flags;
 
+	/* If the page is still in use someplace else, we can't reuse it */
+	if (page_ref_count(page) > 1)
+		goto out;
+
 	local_irq_save(flags);
 	arch_spin_lock(&cpu_buffer->lock);
 
@@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data)
 	arch_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 
+ out:
 	free_page((unsigned long)bpage);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
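
The recursion-protection scheme described in the comment above hinges on two properties: contexts only nest toward lower bit numbers (normal, then softirq, irq, NMI), and x & (x - 1) clears the lowest set bit of x, which is therefore always the bit of the most recently entered context. Below is a minimal user-space sketch of just that bit trick; it is illustrative only, not kernel code, and the enum merely mirrors the RB_CTX_* ordering shown in the diff.

    #include <stdio.h>

    enum { CTX_NMI, CTX_IRQ, CTX_SOFTIRQ, CTX_NORMAL };

    static unsigned int current_context;

    /* Returns nonzero if this context is already inside the buffer. */
    static int recursive_lock(int bit)
    {
    	if (current_context & (1U << bit))
    		return 1;
    	current_context |= 1U << bit;
    	return 0;
    }

    /* Clearing the lowest set bit drops the most recently taken context. */
    static void recursive_unlock(void)
    {
    	current_context &= current_context - 1;
    }

    int main(void)
    {
    	recursive_lock(CTX_NORMAL);	/* 1000 */
    	recursive_lock(CTX_IRQ);	/* 1010: IRQ preempted normal context */
    	printf("locked:   %x\n", current_context);
    	recursive_unlock();		/* 1010 & 1001 = 1000 */
    	printf("unlocked: %x\n", current_context);
    	return 0;
    }

Because unlock always clears the lowest set bit, no barrier or extra bookkeeping is needed to know which context is being released.
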
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1e2a45e87b93..32c069bbf41b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct
 }
 
 /**
- * trace_pid_filter_add_remove - Add or remove a task from a pid_list
+ * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list
 * @pid_list: The list to modify
 * @self: The current task for fork or NULL for exit
 * @task: The task to add or remove
@@ -925,7 +925,7 @@ static void tracing_snapshot_instance(struct trace_array *tr)
 }
 
 /**
- * trace_snapshot - take a snapshot of the current buffer.
+ * tracing_snapshot - take a snapshot of the current buffer.
 *
 * This causes a swap between the snapshot buffer and the current live
 * tracing buffer. You can use this to take snapshots of the live
@@ -1004,9 +1004,9 @@ int tracing_alloc_snapshot(void)
 EXPORT_SYMBOL_GPL(tracing_alloc_snapshot);
 
 /**
- * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer.
 *
- * This is similar to trace_snapshot(), but it will allocate the
+ * This is similar to tracing_snapshot(), but it will allocate the
 * snapshot buffer if it isn't already allocated. Use this only
 * where it is safe to sleep, as the allocation may sleep.
 *
@@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh;
 /*
 * Copy the new maximum trace into the separate maximum-trace
 * structure. (this way the maximum trace is permanently saved,
- * for later retrieval via /sys/kernel/debug/tracing/latency_trace)
+ * for later retrieval via /sys/kernel/tracing/tracing_max_latency)
 */
 static void
 __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
@@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
 }
 EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
 
+/*
+ * Skip 3:
+ *
+ * trace_buffer_unlock_commit_regs()
+ * trace_event_buffer_commit()
+ * trace_event_raw_event_xxx()
+*/
+# define STACK_SKIP 3
+
 void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event,
@@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
 	__buffer_unlock_commit(buffer, event);
 
 	/*
-	 * If regs is not set, then skip the following callers:
-	 * trace_buffer_unlock_commit_regs
-	 * event_trigger_unlock_commit
-	 * trace_event_buffer_commit
-	 * trace_event_raw_event_sched_switch
+	 * If regs is not set, then skip the necessary functions.
	 * Note, we can still get here via blktrace, wakeup tracer
	 * and mmiotrace, but that's ok if they lose a function or
-	 * two. They are that meaningful.
+	 * two. They are not that meaningful.
	 */
-	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
+	ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs);
 	ftrace_trace_userstack(buffer, flags, pc);
 }
 
@@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export,
 
 	entry = ring_buffer_event_data(event);
 	size = ring_buffer_event_length(event);
-	export->write(entry, size);
+	export->write(export, entry, size);
 }
 
 static DEFINE_MUTEX(ftrace_export_lock);
@@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
 	trace.skip		= skip;
 
 	/*
-	 * Add two, for this function and the call to save_stack_trace()
+	 * Add one, for this function and the call to save_stack_trace()
	 * If regs is set, then these functions will not be in the way.
	 */
+#ifndef CONFIG_UNWINDER_ORC
 	if (!regs)
-		trace.skip += 2;
+		trace.skip++;
+#endif
 
 	/*
	 * Since events can happen in NMIs there's no safe way to
@@ -2682,17 +2689,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
 	if (unlikely(in_nmi()))
 		return;
 
-	/*
-	 * It is possible that a function is being traced in a
-	 * location that RCU is not watching. A call to
-	 * rcu_irq_enter() will make sure that it is, but there's
-	 * a few internal rcu functions that could be traced
-	 * where that wont work either. In those cases, we just
-	 * do nothing.
-	 */
-	if (unlikely(rcu_irq_enter_disabled()))
-		return;
-
 	rcu_irq_enter_irqson();
 	__ftrace_trace_stack(buffer, flags, skip, pc, NULL);
 	rcu_irq_exit_irqson();
@@ -2711,11 +2707,10 @@ void trace_dump_stack(int skip)
 
 	local_save_flags(flags);
 
-	/*
-	 * Skip 3 more, seems to get us at the caller of
-	 * this function.
-	 */
-	skip += 3;
+#ifndef CONFIG_UNWINDER_ORC
+	/* Skip 1 to skip this function. */
+	skip++;
+#endif
 	__ftrace_trace_stack(global_trace.trace_buffer.buffer,
 			     flags, skip, preempt_count(), NULL);
 }
@@ -4178,37 +4173,30 @@ static const struct file_operations show_traces_fops = {
 	.llseek		= seq_lseek,
 };
 
-/*
- * The tracer itself will not take this lock, but still we want
- * to provide a consistent cpumask to user-space:
- */
-static DEFINE_MUTEX(tracing_cpumask_update_lock);
-
-/*
- * Temporary storage for the character representation of the
- * CPU bitmask (and one more byte for the newline):
- */
-static char mask_str[NR_CPUS + 1];
-
 static ssize_t
 tracing_cpumask_read(struct file *filp, char __user *ubuf,
 		     size_t count, loff_t *ppos)
 {
 	struct trace_array *tr = file_inode(filp)->i_private;
+	char *mask_str;
 	int len;
 
-	mutex_lock(&tracing_cpumask_update_lock);
+	len = snprintf(NULL, 0, "%*pb\n",
+		       cpumask_pr_args(tr->tracing_cpumask)) + 1;
+	mask_str = kmalloc(len, GFP_KERNEL);
+	if (!mask_str)
+		return -ENOMEM;
 
-	len = snprintf(mask_str, count, "%*pb\n",
+	len = snprintf(mask_str, len, "%*pb\n",
 		       cpumask_pr_args(tr->tracing_cpumask));
 	if (len >= count) {
 		count = -EINVAL;
 		goto out_err;
 	}
-	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
+	count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len);
 
 out_err:
-	mutex_unlock(&tracing_cpumask_update_lock);
+	kfree(mask_str);
 
 	return count;
 }
@@ -4228,8 +4216,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	if (err)
 		goto err_unlock;
 
-	mutex_lock(&tracing_cpumask_update_lock);
-
 	local_irq_disable();
 	arch_spin_lock(&tr->max_lock);
 	for_each_tracing_cpu(cpu) {
@@ -4252,8 +4238,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
 	local_irq_enable();
 
 	cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new);
-
-	mutex_unlock(&tracing_cpumask_update_lock);
 	free_cpumask_var(tracing_cpumask_new);
 
 	return count;
@@ -6780,7 +6764,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
-	int entries, size, i;
+	int entries, i;
 	ssize_t ret = 0;
 
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -6834,14 +6818,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			break;
 		}
 
-		/*
-		 * zero out any left over data, this is going to
-		 * user land.
-		 */
-		size = ring_buffer_page_len(ref->page);
-		if (size < PAGE_SIZE)
-			memset(ref->page + size, 0, PAGE_SIZE - size);
-
 		page = virt_to_page(ref->page);
 
 		spd.pages[i] = page;
@@ -7599,6 +7575,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size
 	buf->data = alloc_percpu(struct trace_array_cpu);
 	if (!buf->data) {
 		ring_buffer_free(buf->buffer);
+		buf->buffer = NULL;
 		return -ENOMEM;
 	}
 
@@ -7622,7 +7599,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
 					allocate_snapshot ? size : 1);
 	if (WARN_ON(ret)) {
 		ring_buffer_free(tr->trace_buffer.buffer);
+		tr->trace_buffer.buffer = NULL;
 		free_percpu(tr->trace_buffer.data);
+		tr->trace_buffer.data = NULL;
 		return -ENOMEM;
 	}
 	tr->allocated_snapshot = allocate_snapshot;
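
The reworked tracing_cpumask_read() above replaces the fixed NR_CPUS-sized static buffer with an allocation sized by a first snprintf(NULL, 0, ...) pass. A small user-space sketch of that sizing idiom follows; the function and variable names here are made up for illustration and are not part of the kernel change.

    #include <stdio.h>
    #include <stdlib.h>

    /* Format "name=value" into a buffer sized exactly for the result. */
    static char *format_pair(const char *name, long value)
    {
    	/* First pass: snprintf() with a zero-sized buffer only reports
    	 * how many characters the output would need (excluding the NUL). */
    	int len = snprintf(NULL, 0, "%s=%ld\n", name, value) + 1;
    	char *buf = malloc(len);

    	if (!buf)
    		return NULL;

    	/* Second pass: the real formatting into the right-sized buffer. */
    	snprintf(buf, len, "%s=%ld\n", name, value);
    	return buf;
    }

    int main(void)
    {
    	char *s = format_pair("tracing_cpumask", 255);

    	if (s) {
    		fputs(s, stdout);
    		free(s);
    	}
    	return 0;
    }
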
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index 79f838a75077..22fee766081b 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg)
 		 * this thread will never voluntarily schedule which would
		 * block synchronize_rcu_tasks() indefinitely.
		 */
-		cond_resched_rcu_qs();
+		cond_resched();
 	}
 
 	return 0;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ec0f9aa4e151..1b87157edbff 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2213,6 +2213,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 {
 	struct trace_event_call *call, *p;
 	const char *last_system = NULL;
+	bool first = false;
 	int last_i;
 	int i;
 
@@ -2220,15 +2221,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		/* events are usually grouped together with systems */
 		if (!last_system || call->class->system != last_system) {
+			first = true;
 			last_i = 0;
 			last_system = call->class->system;
 		}
 
+		/*
+		 * Since calls are grouped by systems, the likelyhood that the
+		 * next call in the iteration belongs to the same system as the
+		 * previous call is high. As an optimization, we skip seaching
+		 * for a map[] that matches the call's system if the last call
+		 * was from the same system. That's what last_i is for. If the
+		 * call has the same system as the previous call, then last_i
+		 * will be the index of the first map[] that has a matching
+		 * system.
+		 */
 		for (i = last_i; i < len; i++) {
 			if (call->class->system == map[i]->system) {
 				/* Save the first system if need be */
-				if (!last_i)
+				if (first) {
 					last_i = i;
+					first = false;
+				}
 				update_event_printk(call, map[i]);
 			}
 		}
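
The first flag introduced above addresses a sentinel problem in the old code: "if (!last_i)" treated index 0 as "nothing saved yet", so when a system's first matching map[] entry sat at index 0, every later match kept overwriting last_i and subsequent events of that system would start searching too far into the array. A tiny user-space sketch contrasting the two approaches, with made-up indices rather than the real trace data:

    #include <stdio.h>

    /* Indices (within a shared map[]) that belong to the current group;
     * the first one is 0, which is what broke the old sentinel check. */
    static const int matches[] = { 0, 5, 9 };
    static const int nr_matches = 3;

    int main(void)
    {
    	int last_i, first, i;

    	/* Old logic: "!last_i" stood for "first match not saved yet". */
    	last_i = 0;
    	for (i = 0; i < nr_matches; i++)
    		if (!last_i)
    			last_i = matches[i];
    	printf("sentinel logic saves index %d (wanted 0)\n", last_i);

    	/* New logic: an explicit flag, so index 0 is a valid answer. */
    	last_i = 0;
    	first = 1;
    	for (i = 0; i < nr_matches; i++) {
    		if (first) {
    			last_i = matches[i];
    			first = 0;
    		}
    	}
    	printf("flag-based logic saves index %d\n", last_i);
    	return 0;
    }
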
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index f2ac9d44f6c4..87411482a46f 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; }
 #endif /* CONFIG_TRACER_SNAPSHOT */
 
 #ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_UNWINDER_ORC
+/* Skip 2:
+ * event_triggers_post_call()
+ * trace_event_raw_event_xxx()
+ */
+# define STACK_SKIP 2
+#else
 /*
- * Skip 3:
+ * Skip 4:
 * stacktrace_trigger()
 * event_triggers_post_call()
+ * trace_event_buffer_commit()
 * trace_event_raw_event_xxx()
 */
-#define STACK_SKIP 3
+#define STACK_SKIP 4
+#endif
 
 static void
 stacktrace_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 27f7ad12c4b1..b611cd36e22d 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
 	preempt_enable_notrace();
 }
 
+#ifdef CONFIG_UNWINDER_ORC
+/*
+ * Skip 2:
+ *
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 2
+#else
+/*
+ * Skip 3:
+ * __trace_stack()
+ * function_stack_trace_call()
+ * ftrace_call()
+ */
+#define STACK_SKIP 3
+#endif
+
 static void
 function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 			  struct ftrace_ops *op, struct pt_regs *pt_regs)
@@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
 		trace_function(tr, ip, parent_ip, flags, pc);
-		/*
-		 * skip over 5 funcs:
-		 * __ftrace_trace_stack,
-		 * __trace_stack,
-		 * function_stack_trace_call
-		 * ftrace_list_func
-		 * ftrace_call
-		 */
-		__trace_stack(tr, flags, 5, pc);
+		__trace_stack(tr, flags, STACK_SKIP, pc);
 	}
 
 	atomic_dec(&data->disabled);
@@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip,
 		tracer_tracing_off(tr);
 }
 
+#ifdef CONFIG_UNWINDER_ORC
 /*
- * Skip 4:
+ * Skip 3:
+ *
+ * function_trace_probe_call()
+ * ftrace_ops_assist_func()
+ * ftrace_call()
+ */
+#define FTRACE_STACK_SKIP 3
+#else
+/*
+ * Skip 5:
+ *
+ * __trace_stack()
 * ftrace_stacktrace()
 * function_trace_probe_call()
- * ftrace_ops_list_func()
+ * ftrace_ops_assist_func()
 * ftrace_call()
 */
-#define STACK_SKIP 4
+#define FTRACE_STACK_SKIP 5
+#endif
 
 static __always_inline void trace_stack(struct trace_array *tr)
 {
@@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr)
 	local_save_flags(flags);
 	pc = preempt_count();
 
-	__trace_stack(tr, flags, STACK_SKIP, pc);
+	__trace_stack(tr, flags, FTRACE_STACK_SKIP, pc);
 }
 
 static void
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 734accc02418..3c7bfc4bf5e9 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
 	if (__this_cpu_read(disable_stack_tracer) != 1)
 		goto out;
 
+	/* If rcu is not watching, then save stack trace can fail */
+	if (!rcu_is_watching())
+		goto out;
+
 	ip += MCOUNT_INSN_SIZE;
 
 	check_stack(ip, &stack);