Diffstat:
 arch/x86/kernel/cpu/perf_counter.c       |   60
 include/linux/perf_counter.h             |   78
 include/trace/events/sched.h             |   33
 kernel/perf_counter.c                    |  363
 kernel/sched_clock.c                     |  122
 kernel/sched_fair.c                      |    1
 tools/perf/Documentation/perf-sched.txt  |   41
 tools/perf/Documentation/perf-trace.txt  |   25
 tools/perf/Makefile                      |    1
 tools/perf/builtin-record.c              |   56
 tools/perf/builtin-sched.c               | 2004
 tools/perf/builtin.h                     |    5
 tools/perf/command-list.txt              |    2
 tools/perf/perf.c                        |    1
 tools/perf/util/event.h                  |    2
 tools/perf/util/parse-events.c           |  212
 tools/perf/util/thread.c                 |    4
 tools/perf/util/thread.h                 |    9
 tools/perf/util/trace-event-info.c       |    7
 tools/perf/util/trace-event-parse.c      |   45
 tools/perf/util/trace-event-read.c       |    6
 tools/perf/util/trace-event.h            |    5
 22 files changed, 2727 insertions(+), 355 deletions(-)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2732e2c1e4d3..dbdf712fae9e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -36,10 +36,10 @@ static u64 perf_counter_mask __read_mostly;
 #define BTS_RECORD_SIZE 24
 
 /* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
+#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
 
 /* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
+#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
 
 
 /*
@@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void)
 	local_irq_restore(flags);
 }
 
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
-				       struct perf_sample_data *data)
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
 {
 	struct debug_store *ds = cpuc->ds;
 	struct bts_record {
@@ -1498,8 +1497,11 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 		u64 flags;
 	};
 	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-	unsigned long orig_ip = data->regs->ip;
 	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
 
 	if (!counter)
 		return;
@@ -1510,19 +1512,38 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 	top = (struct bts_record *)(unsigned long)ds->bts_index;
 
+	if (top <= at)
+		return;
+
 	ds->bts_index = ds->bts_buffer_base;
 
+
+	data.period = counter->hw.last_period;
+	data.addr   = 0;
+	regs.ip     = 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, counter, &regs);
+
+	if (perf_output_begin(&handle, counter,
+			      header.size * (top - at), 1, 1))
+		return;
+
 	for (; at < top; at++) {
-		data->regs->ip	= at->from;
-		data->addr	= at->to;
+		data.ip		= at->from;
+		data.addr	= at->to;
 
-		perf_counter_output(counter, 1, data);
+		perf_output_sample(&handle, &header, &data, counter);
 	}
 
-	data->regs->ip	= orig_ip;
-	data->addr	= 0;
+	perf_output_end(&handle);
 
 	/* There's new data available. */
+	counter->hw.interrupts++;
 	counter->pending_kill = POLL_IN;
 }
 
@@ -1552,13 +1573,9 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 
 	/* Drain the remaining BTS records. */
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-		struct perf_sample_data data;
-		struct pt_regs regs;
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+		intel_pmu_drain_bts_buffer(cpuc);
 
-		data.regs = &regs;
-		intel_pmu_drain_bts_buffer(cpuc, &data);
-	}
 	cpuc->counters[idx] = NULL;
 	clear_bit(idx, cpuc->used_mask);
 
@@ -1619,7 +1636,6 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1644,7 +1660,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			p6_pmu_disable_counter(hwc, idx);
 	}
 
@@ -1665,13 +1681,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	int bit, loops;
 	u64 ack, status;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	perf_disable();
-	intel_pmu_drain_bts_buffer(cpuc, &data);
+	intel_pmu_drain_bts_buffer(cpuc);
 	status = intel_pmu_get_status();
 	if (!status) {
 		perf_enable();
@@ -1702,7 +1717,7 @@ again:
 
 		data.period = counter->hw.last_period;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1729,7 +1744,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1754,7 +1768,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 972f90d7a32f..c7375f97aa19 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -199,10 +199,14 @@ struct perf_counter_attr {
 				inherit_stat   :  1, /* per task counts       */
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
+				watermark      :  1, /* wakeup_watermark      */
 
-				__reserved_1   : 50;
+				__reserved_1   : 49;
 
-	__u32			wakeup_events;	/* wakeup every n events */
+	union {
+		__u32		wakeup_events;	  /* wakeup every n events */
+		__u32		wakeup_watermark; /* bytes before wakeup   */
+	};
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -521,6 +525,8 @@ struct perf_mmap_data {
 	atomic_t		wakeup;		/* needs a wakeup    */
 	atomic_t		lost;		/* nr records lost   */
 
+	long			watermark;	/* wakeup watermark  */
+
 	struct perf_counter_mmap_page *user_page;
 	void			*data_pages[0];
 };
@@ -685,6 +691,17 @@ struct perf_cpu_context {
 	int			recursion[4];
 };
 
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned long		head;
+	unsigned long		offset;
+	int			nmi;
+	int			sample;
+	int			locked;
+	unsigned long		flags;
+};
+
 #ifdef CONFIG_PERF_COUNTERS
 
 /*
@@ -716,16 +733,38 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs			*regs;
+	u64				type;
+
+	u64				ip;
+	struct {
+		u32	pid;
+		u32	tid;
+	}				tid_entry;
+	u64				time;
 	u64				addr;
+	u64				id;
+	u64				stream_id;
+	struct {
+		u32	cpu;
+		u32	reserved;
+	}				cpu_entry;
 	u64				period;
+	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 };
 
+extern void perf_output_sample(struct perf_output_handle *handle,
+			       struct perf_event_header *header,
+			       struct perf_sample_data *data,
+			       struct perf_counter *counter);
+extern void perf_prepare_sample(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_counter *counter,
+				struct pt_regs *regs);
+
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
-				 struct perf_sample_data *data);
-extern void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data);
+				 struct perf_sample_data *data,
+				 struct pt_regs *regs);
 
 /*
  * Return 1 for a software counter, 0 for a hardware counter
@@ -775,6 +814,12 @@ extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
 #define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
 
+extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int sample);
+extern void perf_output_end(struct perf_output_handle *handle);
+extern void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len);
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -801,7 +846,28 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) { }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
+
+static inline int
+perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c,
+		  unsigned int size, int nmi, int sample)		{ }
+static inline void perf_output_end(struct perf_output_handle *handle)	{ }
+static inline void
+perf_output_copy(struct perf_output_handle *handle,
+		 const void *buf, unsigned int len)			{ }
+static inline void
+perf_output_sample(struct perf_output_handle *handle,
+		   struct perf_event_header *header,
+		   struct perf_sample_data *data,
+		   struct perf_counter *counter)			{ }
+static inline void
+perf_prepare_sample(struct perf_event_header *header,
+		    struct perf_sample_data *data,
+		    struct perf_counter *counter,
+		    struct pt_regs *regs)				{ }
 #endif
 
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_COUNTER_H */
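
For reference, the split output API declared above (perf_prepare_sample(), perf_output_begin(), perf_output_sample(), perf_output_end()) is intended to be called in that order by a sampling path; the BTS drain code earlier in this diff does exactly this. Below is a minimal sketch of that calling pattern, not part of the commit; the function name example_emit_sample() and the nmi/sample arguments chosen here are illustrative assumptions only.

/* Illustrative sketch, not from the commit: emitting one sample
 * with the split output API. Error handling elided. */
static void example_emit_sample(struct perf_counter *counter,
				struct pt_regs *regs, u64 addr)
{
	struct perf_output_handle handle;
	struct perf_event_header header;
	struct perf_sample_data data;

	data.addr   = addr;
	data.period = counter->hw.last_period;

	/* Fill in the invariant fields and compute header.size. */
	perf_prepare_sample(&header, &data, counter, regs);

	/* Reserve space in the mmap buffer (nmi = 0, sample = 1). */
	if (perf_output_begin(&handle, counter, header.size, 0, 1))
		return;	/* buffer full: the record is dropped */

	perf_output_sample(&handle, &header, &data, counter);
	perf_output_end(&handle);
}
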
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index b48f1ad7c946..4069c43f4187 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -380,6 +380,39 @@ TRACE_EVENT(sched_stat_wait,
 );
 
 /*
+ * Tracepoint for accounting runtime (time the task is executing
+ * on a CPU).
+ */
+TRACE_EVENT(sched_stat_runtime,
+
+	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+
+	TP_ARGS(tsk, runtime, vruntime),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( u64,	runtime			)
+		__field( u64,	vruntime		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		= tsk->pid;
+		__entry->runtime	= runtime;
+		__entry->vruntime	= vruntime;
+	)
+	TP_perf_assign(
+		__perf_count(runtime);
+	),
+
+	TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]",
+			__entry->comm, __entry->pid,
+			(unsigned long long)__entry->runtime,
+			(unsigned long long)__entry->vruntime)
+);
+
+/*
  * Tracepoint for accounting sleep time (time the task is not runnable,
  * including iowait, see below).
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8cb94a52d1bb..d013f4e89e9c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 	data->nr_pages = nr_pages;
 	atomic_set(&data->lock, -1);
 
+	if (counter->attr.watermark) {
+		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+					counter->attr.wakeup_watermark);
+	}
+	if (!data->watermark)
+		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
 	rcu_assign_pointer(counter->data, data);
 
 	return 0;
@@ -2315,7 +2322,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->locked_vm + extra;
 
-	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+		     !capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
 	}
@@ -2504,35 +2512,15 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 /*
  * Output
  */
-
-struct perf_output_handle {
-	struct perf_counter	*counter;
-	struct perf_mmap_data	*data;
-	unsigned long		head;
-	unsigned long		offset;
-	int			nmi;
-	int			sample;
-	int			locked;
-	unsigned long		flags;
-};
-
-static bool perf_output_space(struct perf_mmap_data *data,
-			      unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+			      unsigned long offset, unsigned long head)
 {
-	unsigned long tail;
 	unsigned long mask;
 
 	if (!data->writable)
 		return true;
 
 	mask = (data->nr_pages << PAGE_SHIFT) - 1;
-	/*
-	 * Userspace could choose to issue a mb() before updating the tail
-	 * pointer. So that all reads will be completed before the write is
-	 * issued.
-	 */
-	tail = ACCESS_ONCE(data->user_page->data_tail);
-	smp_rmb();
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -2633,8 +2621,8 @@ out:
 	local_irq_restore(handle->flags);
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+		      const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
 	unsigned int offset;
@@ -2669,16 +2657,13 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_counter *counter, unsigned int size,
+		      int nmi, int sample)
 {
 	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
-	unsigned int offset, head;
+	unsigned long tail, offset, head;
 	int have_lost;
 	struct {
 		struct perf_event_header header;
@@ -2716,16 +2701,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_lock(handle);
 
 	do {
+		/*
+		 * Userspace could choose to issue a mb() before updating the
+		 * tail pointer. So that all reads will be completed before the
+		 * write is issued.
+		 */
+		tail = ACCESS_ONCE(data->user_page->data_tail);
+		smp_rmb();
 		offset = head = atomic_long_read(&data->head);
 		head += size;
-		if (unlikely(!perf_output_space(data, offset, head)))
+		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
 	handle->head	= head;
 
-	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+	if (head - tail > data->watermark)
 		atomic_set(&data->wakeup, 1);
 
 	if (have_lost) {
@@ -2749,7 +2741,7 @@ out:
 	return -ENOSPC;
 }
 
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
 	struct perf_mmap_data *data = handle->data;
@@ -2863,156 +2855,176 @@ static void perf_output_read(struct perf_output_handle *handle,
 		perf_output_read_one(handle, counter);
 }
 
-void perf_counter_output(struct perf_counter *counter, int nmi,
-			 struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+			struct perf_event_header *header,
+			struct perf_sample_data *data,
+			struct perf_counter *counter)
 {
-	int ret;
-	u64 sample_type = counter->attr.sample_type;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	u64 ip;
-	struct {
-		u32 pid, tid;
-	} tid_entry;
-	struct perf_callchain_entry *callchain = NULL;
-	int callchain_size = 0;
-	u64 time;
-	struct {
-		u32 cpu, reserved;
-	} cpu_entry;
-
-	header.type = PERF_EVENT_SAMPLE;
-	header.size = sizeof(header);
-
-	header.misc = 0;
-	header.misc |= perf_misc_flags(data->regs);
-
-	if (sample_type & PERF_SAMPLE_IP) {
-		ip = perf_instruction_pointer(data->regs);
-		header.size += sizeof(ip);
-	}
+	u64 sample_type = data->type;
 
-	if (sample_type & PERF_SAMPLE_TID) {
-		/* namespace issues */
-		tid_entry.pid = perf_counter_pid(counter, current);
-		tid_entry.tid = perf_counter_tid(counter, current);
+	perf_output_put(handle, *header);
 
-		header.size += sizeof(tid_entry);
-	}
+	if (sample_type & PERF_SAMPLE_IP)
+		perf_output_put(handle, data->ip);
 
-	if (sample_type & PERF_SAMPLE_TIME) {
-		/*
-		 * Maybe do better on x86 and provide cpu_clock_nmi()
-		 */
-		time = sched_clock();
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
 
-		header.size += sizeof(u64);
-	}
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		header.size += sizeof(u64);
+		perf_output_put(handle, data->addr);
 
 	if (sample_type & PERF_SAMPLE_ID)
-		header.size += sizeof(u64);
+		perf_output_put(handle, data->id);
 
 	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		header.size += sizeof(u64);
+		perf_output_put(handle, data->stream_id);
 
-	if (sample_type & PERF_SAMPLE_CPU) {
-		header.size += sizeof(cpu_entry);
-
-		cpu_entry.cpu = raw_smp_processor_id();
-		cpu_entry.reserved = 0;
-	}
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		header.size += sizeof(u64);
+		perf_output_put(handle, data->period);
 
 	if (sample_type & PERF_SAMPLE_READ)
-		header.size += perf_counter_read_size(counter);
+		perf_output_read(handle, counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(data->regs);
+		if (data->callchain) {
+			int size = 1;
 
-		if (callchain) {
-			callchain_size = (1 + callchain->nr) * sizeof(u64);
-			header.size += callchain_size;
-		} else
-			header.size += sizeof(u64);
+			if (data->callchain)
+				size += data->callchain->nr;
+
+			size *= sizeof(u64);
+
+			perf_output_copy(handle, data->callchain, size);
+		} else {
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		int size = sizeof(u32);
+		if (data->raw) {
+			perf_output_put(handle, data->raw->size);
+			perf_output_copy(handle, data->raw->data,
+					 data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(handle, raw);
+		}
+	}
+}
 
-	if (data->raw)
-		size += data->raw->size;
-	else
-		size += sizeof(u32);
+void perf_prepare_sample(struct perf_event_header *header,
+			 struct perf_sample_data *data,
+			 struct perf_counter *counter,
+			 struct pt_regs *regs)
+{
+	u64 sample_type = counter->attr.sample_type;
 
-		WARN_ON_ONCE(size & (sizeof(u64)-1));
-		header.size += size;
-	}
+	data->type = sample_type;
 
-	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
-	if (ret)
-		return;
+	header->type = PERF_EVENT_SAMPLE;
+	header->size = sizeof(*header);
 
-	perf_output_put(&handle, header);
+	header->misc = 0;
+	header->misc |= perf_misc_flags(regs);
 
-	if (sample_type & PERF_SAMPLE_IP)
-		perf_output_put(&handle, ip);
+	if (sample_type & PERF_SAMPLE_IP) {
+		data->ip = perf_instruction_pointer(regs);
 
-	if (sample_type & PERF_SAMPLE_TID)
-		perf_output_put(&handle, tid_entry);
+		header->size += sizeof(data->ip);
+	}
 
-	if (sample_type & PERF_SAMPLE_TIME)
-		perf_output_put(&handle, time);
+	if (sample_type & PERF_SAMPLE_TID) {
+		/* namespace issues */
+		data->tid_entry.pid = perf_counter_pid(counter, current);
+		data->tid_entry.tid = perf_counter_tid(counter, current);
+
+		header->size += sizeof(data->tid_entry);
+	}
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		data->time = perf_clock();
+
+		header->size += sizeof(data->time);
+	}
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(&handle, data->addr);
+		header->size += sizeof(data->addr);
 
 	if (sample_type & PERF_SAMPLE_ID) {
-		u64 id = primary_counter_id(counter);
+		data->id = primary_counter_id(counter);
 
-		perf_output_put(&handle, id);
+		header->size += sizeof(data->id);
 	}
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		perf_output_put(&handle, counter->id);
+	if (sample_type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = counter->id;
 
-	if (sample_type & PERF_SAMPLE_CPU)
-		perf_output_put(&handle, cpu_entry);
+		header->size += sizeof(data->stream_id);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		data->cpu_entry.cpu = raw_smp_processor_id();
+		data->cpu_entry.reserved = 0;
+
+		header->size += sizeof(data->cpu_entry);
+	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(&handle, data->period);
+		header->size += sizeof(data->period);
 
 	if (sample_type & PERF_SAMPLE_READ)
-		perf_output_read(&handle, counter);
+		header->size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (callchain)
-			perf_output_copy(&handle, callchain, callchain_size);
-		else {
-			u64 nr = 0;
-			perf_output_put(&handle, nr);
-		}
+		int size = 1;
+
+		data->callchain = perf_callchain(regs);
+
+		if (data->callchain)
+			size += data->callchain->nr;
+
+		header->size += size * sizeof(u64);
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			perf_output_put(&handle, data->raw->size);
-			perf_output_copy(&handle, data->raw->data, data->raw->size);
-		} else {
-			struct {
-				u32	size;
-				u32	data;
-			} raw = {
-				.size = sizeof(u32),
-				.data = 0,
-			};
-			perf_output_put(&handle, raw);
-		}
+		int size = sizeof(u32);
+
+		if (data->raw)
+			size += data->raw->size;
+		else
+			size += sizeof(u32);
+
+		WARN_ON_ONCE(size & (sizeof(u64)-1));
+		header->size += size;
 	}
+}
+
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+
+	perf_prepare_sample(&header, data, counter, regs);
+
+	if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+		return;
+
+	perf_output_sample(&handle, &header, data, counter);
 
 	perf_output_end(&handle);
 }
@@ -3473,7 +3485,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time = sched_clock(),
+		.time = perf_clock(),
 		.id = primary_counter_id(counter),
 		.stream_id = counter->id,
 	};
@@ -3493,14 +3505,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  * Generic counter overflow handling, sampling.
  */
 
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
-			  struct perf_sample_data *data)
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
 {
 	int events = atomic_read(&counter->event_limit);
-	int throttle = counter->pmu->unthrottle != NULL;
 	struct hw_perf_counter *hwc = &counter->hw;
 	int ret = 0;
 
+	throttle = (throttle && counter->pmu->unthrottle != NULL);
+
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -3523,7 +3537,7 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 	}
 
 	if (counter->attr.freq) {
-		u64 now = sched_clock();
+		u64 now = perf_clock();
 		s64 delta = now - hwc->freq_stamp;
 
 		hwc->freq_stamp = now;
@@ -3549,10 +3563,17 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, data);
+	perf_counter_output(counter, nmi, data, regs);
 	return ret;
 }
 
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+			  struct perf_sample_data *data,
+			  struct pt_regs *regs)
+{
+	return __perf_counter_overflow(counter, nmi, 1, data, regs);
+}
+
 /*
  * Generic software counter infrastructure
  */
@@ -3588,9 +3609,11 @@ again:
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
+				    int nmi, struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	int throttle = 0;
 	u64 overflow;
 
 	data->period = counter->hw.last_period;
@@ -3600,13 +3623,15 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 		return;
 
 	for (; overflow; overflow--) {
-		if (perf_counter_overflow(counter, nmi, data)) {
+		if (__perf_counter_overflow(counter, nmi, throttle,
+					    data, regs)) {
 			/*
 			 * We inhibit the overflow from happening when
 			 * hwc->interrupts == MAX_INTERRUPTS.
 			 */
 			break;
 		}
+		throttle = 1;
 	}
 }
 
@@ -3618,7 +3643,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
+			       int nmi, struct perf_sample_data *data,
+			       struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 
@@ -3627,11 +3653,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 	if (!hwc->sample_period)
 		return;
 
-	if (!data->regs)
+	if (!regs)
 		return;
 
 	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swcounter_overflow(counter, nmi, data);
+		perf_swcounter_overflow(counter, nmi, data, regs);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3690,7 +3716,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
-				     struct perf_sample_data *data)
+				     struct perf_sample_data *data,
+				     struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 
@@ -3699,8 +3726,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event, data->regs))
-			perf_swcounter_add(counter, nr, nmi, data);
+		if (perf_swcounter_match(counter, type, event, regs))
+			perf_swcounter_add(counter, nr, nmi, data, regs);
 	}
 	rcu_read_unlock();
 }
@@ -3721,7 +3748,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 
 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 				    u64 nr, int nmi,
-				    struct perf_sample_data *data)
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3734,7 +3762,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, data);
+				 nr, nmi, data, regs);
 	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
@@ -3742,7 +3770,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
 	rcu_read_unlock();
 
 	barrier();
@@ -3756,11 +3784,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
-		.regs = regs,
 		.addr = addr,
 	};
 
-	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+				&data, regs);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3797,6 +3825,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_sample_data data;
+	struct pt_regs *regs;
 	struct perf_counter *counter;
 	u64 period;
 
@@ -3804,17 +3833,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	counter->pmu->read(counter);
 
 	data.addr = 0;
-	data.regs = get_irq_regs();
+	regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
+	if ((counter->attr.exclude_kernel || !regs) &&
 			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+		regs = task_pt_regs(current);
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
+	if (regs) {
+		if (perf_counter_overflow(counter, 0, &data, regs))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -3950,15 +3979,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 	};
 
 	struct perf_sample_data data = {
-		.regs = get_irq_regs(),
 		.addr = addr,
 		.raw = &raw,
 	};
 
-	if (!data.regs)
-		data.regs = task_pt_regs(current);
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
 __read_mostly int sched_clock_stable;
 
 struct sched_clock_data {
-	/*
-	 * Raw spinlock - this is a special case: this might be called
-	 * from within instrumentation code so we dont want to do any
-	 * instrumentation ourselves.
-	 */
-	raw_spinlock_t		lock;
-
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
 	for_each_possible_cpu(cpu) {
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
-		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
  *  - filter out backward motion
  *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
-static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-	s64 delta = now - scd->tick_raw;
-	u64 clock, min_clock, max_clock;
+	u64 now, clock, old_clock, min_clock, max_clock;
+	s64 delta;
 
+again:
+	now = sched_clock();
+	delta = now - scd->tick_raw;
 	if (unlikely(delta < 0))
 		delta = 0;
 
+	old_clock = scd->clock;
+
 	/*
 	 * scd->clock = clamp(scd->tick_gtod + delta,
 	 *		      max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	 */
 
 	clock = scd->tick_gtod + delta;
-	min_clock = wrap_max(scd->tick_gtod, scd->clock);
-	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+	min_clock = wrap_max(scd->tick_gtod, old_clock);
+	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
 
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
 
-	scd->clock = clock;
+	if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
+		goto again;
 
-	return scd->clock;
+	return clock;
 }
 
-static void lock_double_clock(struct sched_clock_data *data1,
-			      struct sched_clock_data *data2)
+static u64 sched_clock_remote(struct sched_clock_data *scd)
 {
-	if (data1 < data2) {
-		__raw_spin_lock(&data1->lock);
-		__raw_spin_lock(&data2->lock);
+	struct sched_clock_data *my_scd = this_scd();
+	u64 this_clock, remote_clock;
+	u64 *ptr, old_val, val;
+
+	sched_clock_local(my_scd);
+again:
+	this_clock = my_scd->clock;
+	remote_clock = scd->clock;
+
+	/*
+	 * Use the opportunity that we have both locks
+	 * taken to couple the two clocks: we take the
+	 * larger time as the latest time for both
+	 * runqueues. (this creates monotonic movement)
+	 */
+	if (likely((s64)(remote_clock - this_clock) < 0)) {
+		ptr = &scd->clock;
+		old_val = remote_clock;
+		val = this_clock;
 	} else {
-		__raw_spin_lock(&data2->lock);
-		__raw_spin_lock(&data1->lock);
+		/*
+		 * Should be rare, but possible:
+		 */
+		ptr = &my_scd->clock;
+		old_val = this_clock;
+		val = remote_clock;
 	}
+
+	if (cmpxchg(ptr, old_val, val) != old_val)
+		goto again;
+
+	return val;
 }
 
 u64 sched_clock_cpu(int cpu)
 {
-	u64 now, clock, this_clock, remote_clock;
 	struct sched_clock_data *scd;
+	u64 clock;
+
+	WARN_ON_ONCE(!irqs_disabled());
 
 	if (sched_clock_stable)
 		return sched_clock();
 
-	scd = cpu_sdc(cpu);
-
-	/*
-	 * Normally this is not called in NMI context - but if it is,
-	 * trying to do any locking here is totally lethal.
-	 */
-	if (unlikely(in_nmi()))
-		return scd->clock;
-
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-	WARN_ON_ONCE(!irqs_disabled());
-	now = sched_clock();
-
-	if (cpu != raw_smp_processor_id()) {
-		struct sched_clock_data *my_scd = this_scd();
-
-		lock_double_clock(scd, my_scd);
-
-		this_clock = __update_sched_clock(my_scd, now);
-		remote_clock = scd->clock;
-
-		/*
-		 * Use the opportunity that we have both locks
-		 * taken to couple the two clocks: we take the
-		 * larger time as the latest time for both
-		 * runqueues. (this creates monotonic movement)
-		 */
-		if (likely((s64)(remote_clock - this_clock) < 0)) {
-			clock = this_clock;
-			scd->clock = clock;
-		} else {
-			/*
-			 * Should be rare, but possible:
-			 */
-			clock = remote_clock;
-			my_scd->clock = remote_clock;
-		}
-	}
+	scd = cpu_sdc(cpu);
 
-	__raw_spin_unlock(&scd->lock);
+	if (cpu != smp_processor_id())
+		clock = sched_clock_remote(scd);
+	else
+		clock = sched_clock_local(scd);
 
 	return clock;
 }
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
 	now_gtod = ktime_to_ns(ktime_get());
 	now = sched_clock();
 
-	__raw_spin_lock(&scd->lock);
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	__update_sched_clock(scd, now);
-	__raw_spin_unlock(&scd->lock);
+	sched_clock_local(scd);
 }
 
 /*
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 10d218ab69f2..990b188803ce 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
+		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
new file mode 100644
index 000000000000..1ce79198997b
--- /dev/null
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -0,0 +1,41 @@
+perf-sched(1)
+==============
+
+NAME
+----
+perf-sched - Tool to trace/measure scheduler properties (latencies)
+
+SYNOPSIS
+--------
+[verse]
+'perf sched' {record|latency|replay|trace}
+
+DESCRIPTION
+-----------
+There's four variants of perf sched:
+
+  'perf sched record <command>' to record the scheduling events
+  of an arbitrary workload.
+
+  'perf sched latency' to report the per task scheduling latencies
+  and other scheduling properties of the workload.
+
+  'perf sched trace' to see a detailed trace of the workload that
+  was recorded.
+
+  'perf sched replay' to simulate the workload that was recorded
+  via perf sched record. (this is done by starting up mockup threads
+  that mimic the workload based on the events in the trace. These
+  threads can then replay the timings (CPU runtime and sleep patterns)
+  of the workload as it occured when it was recorded - and can repeat
+  it a number of times, measuring its performance.)
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the sched data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
new file mode 100644
index 000000000000..41ed75398ca9
--- /dev/null
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -0,0 +1,25 @@
+perf-trace(1)
+==============
+
+NAME
+----
+perf-trace - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf trace' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the trace data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 9f8d207a91bf..2cb8cc3f6772 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -376,6 +376,7 @@ LIB_OBJS += util/trace-event-info.o
 
 BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-sched.o
 BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 99a12fe86e9f..2459e5a22ed8 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -48,6 +48,8 @@ static int call_graph = 0;
 static int inherit_stat = 0;
 static int no_samples = 0;
 static int sample_address = 0;
+static int multiplex = 0;
+static int multiplex_fd = -1;
 
 static long samples;
 static struct timeval last_read;
@@ -470,19 +472,28 @@ try_again:
 		 */
 		if (group && group_fd == -1)
 			group_fd = fd[nr_cpu][counter];
+		if (multiplex && multiplex_fd == -1)
+			multiplex_fd = fd[nr_cpu][counter];
 
-		event_array[nr_poll].fd = fd[nr_cpu][counter];
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
-
-		mmap_array[nr_cpu][counter].counter = counter;
-		mmap_array[nr_cpu][counter].prev = 0;
-		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
-		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-			exit(-1);
+		if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+			int ret;
+
+			ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+			assert(ret != -1);
+		} else {
+			event_array[nr_poll].fd = fd[nr_cpu][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[nr_cpu][counter].counter = counter;
+			mmap_array[nr_cpu][counter].prev = 0;
+			mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
+			if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+				error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+				exit(-1);
+			}
 		}
 
 		ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
@@ -513,6 +524,7 @@ static int __cmd_record(int argc, const char **argv)
 	pid_t pid = 0;
 	int flags;
 	int ret;
+	unsigned long waking = 0;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -614,17 +626,29 @@ static int __cmd_record(int argc, const char **argv)
 		int hits = samples;
 
 		for (i = 0; i < nr_cpu; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
+			for (counter = 0; counter < nr_counters; counter++) {
+				if (mmap_array[i][counter].base)
+					mmap_read(&mmap_array[i][counter]);
+			}
 		}
 
 		if (hits == samples) {
 			if (done)
 				break;
-			ret = poll(event_array, nr_poll, 100);
+			ret = poll(event_array, nr_poll, -1);
+			waking++;
+		}
+
+		if (done) {
+			for (i = 0; i < nr_cpu; i++) {
+				for (counter = 0; counter < nr_counters; counter++)
+					ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE);
+			}
 		}
 	}
 
+	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+
 	/*
 	 * Approximate RIP event size: 24 bytes.
 	 */
@@ -681,6 +705,8 @@ static const struct option options[] = {
681 "Sample addresses"), 705 "Sample addresses"),
682 OPT_BOOLEAN('n', "no-samples", &no_samples, 706 OPT_BOOLEAN('n', "no-samples", &no_samples,
683 "don't sample"), 707 "don't sample"),
708 OPT_BOOLEAN('M', "multiplex", &multiplex,
709 "multiplex counter output in a single channel"),
684 OPT_END() 710 OPT_END()
685}; 711};
686 712
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
new file mode 100644
index 000000000000..275d79c6627a
--- /dev/null
+++ b/tools/perf/builtin-sched.c
@@ -0,0 +1,2004 @@
1#include "builtin.h"
2#include "perf.h"
3
4#include "util/util.h"
5#include "util/cache.h"
6#include "util/symbol.h"
7#include "util/thread.h"
8#include "util/header.h"
9
10#include "util/parse-options.h"
11#include "util/trace-event.h"
12
13#include "util/debug.h"
14
15#include <sys/types.h>
16#include <sys/prctl.h>
17
18#include <semaphore.h>
19#include <pthread.h>
20#include <math.h>
21
22static char const *input_name = "perf.data";
23static int input;
24static unsigned long page_size;
25static unsigned long mmap_window = 32;
26
27static unsigned long total_comm = 0;
28
29static struct rb_root threads;
30static struct thread *last_match;
31
32static struct perf_header *header;
33static u64 sample_type;
34
35static char default_sort_order[] = "avg, max, switch, runtime";
36static char *sort_order = default_sort_order;
37
38#define PR_SET_NAME 15 /* Set process name */
39#define MAX_CPUS 4096
40
41#define BUG_ON(x) assert(!(x))
42
43static u64 run_measurement_overhead;
44static u64 sleep_measurement_overhead;
45
46#define COMM_LEN 20
47#define SYM_LEN 129
48
49#define MAX_PID 65536
50
51static unsigned long nr_tasks;
52
53struct sched_atom;
54
55struct task_desc {
56 unsigned long nr;
57 unsigned long pid;
58 char comm[COMM_LEN];
59
60 unsigned long nr_events;
61 unsigned long curr_event;
62 struct sched_atom **atoms;
63
64 pthread_t thread;
65 sem_t sleep_sem;
66
67 sem_t ready_for_work;
68 sem_t work_done_sem;
69
70 u64 cpu_usage;
71};
72
73enum sched_event_type {
74 SCHED_EVENT_RUN,
75 SCHED_EVENT_SLEEP,
76 SCHED_EVENT_WAKEUP,
77};
78
79struct sched_atom {
80 enum sched_event_type type;
81 u64 timestamp;
82 u64 duration;
83 unsigned long nr;
84 int specific_wait;
85 sem_t *wait_sem;
86 struct task_desc *wakee;
87};
88
89static struct task_desc *pid_to_task[MAX_PID];
90
91static struct task_desc **tasks;
92
93static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
94static u64 start_time;
95
96static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
97
98static unsigned long nr_run_events;
99static unsigned long nr_sleep_events;
100static unsigned long nr_wakeup_events;
101
102static unsigned long nr_sleep_corrections;
103static unsigned long nr_run_events_optimized;
104
105static unsigned long targetless_wakeups;
106static unsigned long multitarget_wakeups;
107
108static u64 cpu_usage;
109static u64 runavg_cpu_usage;
110static u64 parent_cpu_usage;
111static u64 runavg_parent_cpu_usage;
112
113static unsigned long nr_runs;
114static u64 sum_runtime;
115static u64 sum_fluct;
116static u64 run_avg;
117
118static unsigned long replay_repeat = 10;
119static unsigned long nr_timestamps;
120static unsigned long nr_unordered_timestamps;
121static unsigned long nr_state_machine_bugs;
122static unsigned long nr_context_switch_bugs;
123static unsigned long nr_events;
124static unsigned long nr_lost_chunks;
125static unsigned long nr_lost_events;
126
127#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
128
129enum thread_state {
130 THREAD_SLEEPING = 0,
131 THREAD_WAIT_CPU,
132 THREAD_SCHED_IN,
133 THREAD_IGNORE
134};
135
136struct work_atom {
137 struct list_head list;
138 enum thread_state state;
139 u64 sched_out_time;
140 u64 wake_up_time;
141 u64 sched_in_time;
142 u64 runtime;
143};
144
145struct work_atoms {
146 struct list_head work_list;
147 struct thread *thread;
148 struct rb_node node;
149 u64 max_lat;
150 u64 total_lat;
151 u64 nb_atoms;
152 u64 total_runtime;
153};
154
155typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
156
157static struct rb_root atom_root, sorted_atom_root;
158
159static u64 all_runtime;
160static u64 all_count;
161
162
163static u64 get_nsecs(void)
164{
165 struct timespec ts;
166
167 clock_gettime(CLOCK_MONOTONIC, &ts);
168
169 return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
170}
171
172static void burn_nsecs(u64 nsecs)
173{
174 u64 T0 = get_nsecs(), T1;
175
176 do {
177 T1 = get_nsecs();
178 } while (T1 + run_measurement_overhead < T0 + nsecs);
179}
180
181static void sleep_nsecs(u64 nsecs)
182{
183 struct timespec ts;
184
185 ts.tv_nsec = nsecs % 999999999;
186 ts.tv_sec = nsecs / 999999999;
187
188 nanosleep(&ts, NULL);
189}
190
191static void calibrate_run_measurement_overhead(void)
192{
193 u64 T0, T1, delta, min_delta = 1000000000ULL;
194 int i;
195
196 for (i = 0; i < 10; i++) {
197 T0 = get_nsecs();
198 burn_nsecs(0);
199 T1 = get_nsecs();
200 delta = T1-T0;
201 min_delta = min(min_delta, delta);
202 }
203 run_measurement_overhead = min_delta;
204
205 printf("run measurement overhead: %Ld nsecs\n", min_delta);
206}
207
208static void calibrate_sleep_measurement_overhead(void)
209{
210 u64 T0, T1, delta, min_delta = 1000000000ULL;
211 int i;
212
213 for (i = 0; i < 10; i++) {
214 T0 = get_nsecs();
215 sleep_nsecs(10000);
216 T1 = get_nsecs();
217 delta = T1-T0;
218 min_delta = min(min_delta, delta);
219 }
220 min_delta -= 10000;
221 sleep_measurement_overhead = min_delta;
222
223 printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
224}
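/*
 * Both calibration helpers above take the minimum over ten trials so that
 * scheduling noise can only inflate, never deflate, the measured overhead.
 * Purely illustrative numbers (not from the source): if ten burn_nsecs(0)
 * round trips take between 120 and 380 nsecs, run_measurement_overhead
 * ends up as 120, and a later burn_nsecs(1000) spins only until roughly
 * 880 nsecs have elapsed.
 */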
225
226static struct sched_atom *
227get_new_event(struct task_desc *task, u64 timestamp)
228{
229 struct sched_atom *event = calloc(1, sizeof(*event));
230 unsigned long idx = task->nr_events;
231 size_t size;
232
233 event->timestamp = timestamp;
234 event->nr = idx;
235
236 task->nr_events++;
237 size = sizeof(struct sched_atom *) * task->nr_events;
238 task->atoms = realloc(task->atoms, size);
239 BUG_ON(!task->atoms);
240
241 task->atoms[idx] = event;
242
243 return event;
244}
245
246static struct sched_atom *last_event(struct task_desc *task)
247{
248 if (!task->nr_events)
249 return NULL;
250
251 return task->atoms[task->nr_events - 1];
252}
253
254static void
255add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
256{
257 struct sched_atom *event, *curr_event = last_event(task);
258
259 /*
260	 * optimize an existing RUN event by merging this one
261	 * into it:
262 */
263 if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
264 nr_run_events_optimized++;
265 curr_event->duration += duration;
266 return;
267 }
268
269 event = get_new_event(task, timestamp);
270
271 event->type = SCHED_EVENT_RUN;
272 event->duration = duration;
273
274 nr_run_events++;
275}
276
277static void
278add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
279 struct task_desc *wakee)
280{
281 struct sched_atom *event, *wakee_event;
282
283 event = get_new_event(task, timestamp);
284 event->type = SCHED_EVENT_WAKEUP;
285 event->wakee = wakee;
286
287 wakee_event = last_event(wakee);
288 if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
289 targetless_wakeups++;
290 return;
291 }
292 if (wakee_event->wait_sem) {
293 multitarget_wakeups++;
294 return;
295 }
296
297 wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem));
298 sem_init(wakee_event->wait_sem, 0, 0);
299 wakee_event->specific_wait = 1;
300 event->wait_sem = wakee_event->wait_sem;
301
302 nr_wakeup_events++;
303}
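/*
 * This wakeup/sleep pairing is what makes the replay faithful: the waker's
 * WAKEUP atom gets a pointer to the semaphore that the wakee's most recent
 * SLEEP atom will block on, so sem_post()/sem_wait() in
 * process_sched_event() reproduce the recorded wakeup ordering.  Wakeups
 * whose target is not currently sleeping are only counted
 * (targetless_wakeups / multitarget_wakeups), not replayed.
 */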
304
305static void
306add_sched_event_sleep(struct task_desc *task, u64 timestamp,
307 u64 task_state __used)
308{
309 struct sched_atom *event = get_new_event(task, timestamp);
310
311 event->type = SCHED_EVENT_SLEEP;
312
313 nr_sleep_events++;
314}
315
316static struct task_desc *register_pid(unsigned long pid, const char *comm)
317{
318 struct task_desc *task;
319
320 BUG_ON(pid >= MAX_PID);
321
322 task = pid_to_task[pid];
323
324 if (task)
325 return task;
326
327 task = calloc(1, sizeof(*task));
328 task->pid = pid;
329 task->nr = nr_tasks;
330 strcpy(task->comm, comm);
331 /*
332 * every task starts in sleeping state - this gets ignored
333 * if there's no wakeup pointing to this sleep state:
334 */
335 add_sched_event_sleep(task, 0, 0);
336
337 pid_to_task[pid] = task;
338 nr_tasks++;
339	tasks = realloc(tasks, nr_tasks*sizeof(struct task_desc *));
340 BUG_ON(!tasks);
341 tasks[task->nr] = task;
342
343 if (verbose)
344 printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
345
346 return task;
347}
348
349
350static void print_task_traces(void)
351{
352 struct task_desc *task;
353 unsigned long i;
354
355 for (i = 0; i < nr_tasks; i++) {
356 task = tasks[i];
357 printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
358 task->nr, task->comm, task->pid, task->nr_events);
359 }
360}
361
362static void add_cross_task_wakeups(void)
363{
364 struct task_desc *task1, *task2;
365 unsigned long i, j;
366
367 for (i = 0; i < nr_tasks; i++) {
368 task1 = tasks[i];
369 j = i + 1;
370 if (j == nr_tasks)
371 j = 0;
372 task2 = tasks[j];
373 add_sched_event_wakeup(task1, 0, task2);
374 }
375}
376
377static void
378process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
379{
380 int ret = 0;
381 u64 now;
382 long long delta;
383
384 now = get_nsecs();
385 delta = start_time + atom->timestamp - now;
386
387 switch (atom->type) {
388 case SCHED_EVENT_RUN:
389 burn_nsecs(atom->duration);
390 break;
391 case SCHED_EVENT_SLEEP:
392 if (atom->wait_sem)
393 ret = sem_wait(atom->wait_sem);
394 BUG_ON(ret);
395 break;
396 case SCHED_EVENT_WAKEUP:
397 if (atom->wait_sem)
398 ret = sem_post(atom->wait_sem);
399 BUG_ON(ret);
400 break;
401 default:
402 BUG_ON(1);
403 }
404}
405
406static u64 get_cpu_usage_nsec_parent(void)
407{
408 struct rusage ru;
409 u64 sum;
410 int err;
411
412 err = getrusage(RUSAGE_SELF, &ru);
413 BUG_ON(err);
414
415 sum = ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
416 sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
417
418 return sum;
419}
420
421static u64 get_cpu_usage_nsec_self(void)
422{
423 char filename [] = "/proc/1234567890/sched";
424 unsigned long msecs, nsecs;
425 char *line = NULL;
426 u64 total = 0;
427 size_t len = 0;
428 ssize_t chars;
429 FILE *file;
430 int ret;
431
432 sprintf(filename, "/proc/%d/sched", getpid());
433 file = fopen(filename, "r");
434 BUG_ON(!file);
435
436 while ((chars = getline(&line, &len, file)) != -1) {
437 ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
438 &msecs, &nsecs);
439 if (ret == 2) {
440 total = msecs*1e6 + nsecs;
441 break;
442 }
443 }
444 if (line)
445 free(line);
446 fclose(file);
447
448 return total;
449}
450
451static void *thread_func(void *ctx)
452{
453 struct task_desc *this_task = ctx;
454 u64 cpu_usage_0, cpu_usage_1;
455 unsigned long i, ret;
456 char comm2[22];
457
458 sprintf(comm2, ":%s", this_task->comm);
459 prctl(PR_SET_NAME, comm2);
460
461again:
462 ret = sem_post(&this_task->ready_for_work);
463 BUG_ON(ret);
464 ret = pthread_mutex_lock(&start_work_mutex);
465 BUG_ON(ret);
466 ret = pthread_mutex_unlock(&start_work_mutex);
467 BUG_ON(ret);
468
469 cpu_usage_0 = get_cpu_usage_nsec_self();
470
471 for (i = 0; i < this_task->nr_events; i++) {
472 this_task->curr_event = i;
473 process_sched_event(this_task, this_task->atoms[i]);
474 }
475
476 cpu_usage_1 = get_cpu_usage_nsec_self();
477 this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
478
479 ret = sem_post(&this_task->work_done_sem);
480 BUG_ON(ret);
481
482 ret = pthread_mutex_lock(&work_done_wait_mutex);
483 BUG_ON(ret);
484 ret = pthread_mutex_unlock(&work_done_wait_mutex);
485 BUG_ON(ret);
486
487 goto again;
488}
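/*
 * thread_func() and wait_for_tasks() build a crude barrier out of the two
 * mutexes: the parent holds start_work_mutex while it collects every
 * ready_for_work post, releases it so all workers start a measurement pass
 * at once, and keeps work_done_wait_mutex locked so the workers park there
 * after posting work_done_sem, until the next call to wait_for_tasks()
 * re-opens the gate for the following repetition.
 */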
489
490static void create_tasks(void)
491{
492 struct task_desc *task;
493 pthread_attr_t attr;
494 unsigned long i;
495 int err;
496
497 err = pthread_attr_init(&attr);
498 BUG_ON(err);
499 err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
500 BUG_ON(err);
501 err = pthread_mutex_lock(&start_work_mutex);
502 BUG_ON(err);
503 err = pthread_mutex_lock(&work_done_wait_mutex);
504 BUG_ON(err);
505 for (i = 0; i < nr_tasks; i++) {
506 task = tasks[i];
507 sem_init(&task->sleep_sem, 0, 0);
508 sem_init(&task->ready_for_work, 0, 0);
509 sem_init(&task->work_done_sem, 0, 0);
510 task->curr_event = 0;
511 err = pthread_create(&task->thread, &attr, thread_func, task);
512 BUG_ON(err);
513 }
514}
515
516static void wait_for_tasks(void)
517{
518 u64 cpu_usage_0, cpu_usage_1;
519 struct task_desc *task;
520 unsigned long i, ret;
521
522 start_time = get_nsecs();
523 cpu_usage = 0;
524 pthread_mutex_unlock(&work_done_wait_mutex);
525
526 for (i = 0; i < nr_tasks; i++) {
527 task = tasks[i];
528 ret = sem_wait(&task->ready_for_work);
529 BUG_ON(ret);
530 sem_init(&task->ready_for_work, 0, 0);
531 }
532 ret = pthread_mutex_lock(&work_done_wait_mutex);
533 BUG_ON(ret);
534
535 cpu_usage_0 = get_cpu_usage_nsec_parent();
536
537 pthread_mutex_unlock(&start_work_mutex);
538
539 for (i = 0; i < nr_tasks; i++) {
540 task = tasks[i];
541 ret = sem_wait(&task->work_done_sem);
542 BUG_ON(ret);
543 sem_init(&task->work_done_sem, 0, 0);
544 cpu_usage += task->cpu_usage;
545 task->cpu_usage = 0;
546 }
547
548 cpu_usage_1 = get_cpu_usage_nsec_parent();
549 if (!runavg_cpu_usage)
550 runavg_cpu_usage = cpu_usage;
551 runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
552
553 parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
554 if (!runavg_parent_cpu_usage)
555 runavg_parent_cpu_usage = parent_cpu_usage;
556 runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
557 parent_cpu_usage)/10;
558
559 ret = pthread_mutex_lock(&start_work_mutex);
560 BUG_ON(ret);
561
562 for (i = 0; i < nr_tasks; i++) {
563 task = tasks[i];
564 sem_init(&task->sleep_sem, 0, 0);
565 task->curr_event = 0;
566 }
567}
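/*
 * runavg_cpu_usage and runavg_parent_cpu_usage are exponential moving
 * averages with a weight of 9/10 on the previous value, i.e. roughly a
 * ten-sample smoothing window; run_avg in run_one_test() below is smoothed
 * the same way.
 */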
568
569static void run_one_test(void)
570{
571 u64 T0, T1, delta, avg_delta, fluct, std_dev;
572
573 T0 = get_nsecs();
574 wait_for_tasks();
575 T1 = get_nsecs();
576
577 delta = T1 - T0;
578 sum_runtime += delta;
579 nr_runs++;
580
581 avg_delta = sum_runtime / nr_runs;
582 if (delta < avg_delta)
583 fluct = avg_delta - delta;
584 else
585 fluct = delta - avg_delta;
586 sum_fluct += fluct;
587 std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
588 if (!run_avg)
589 run_avg = delta;
590 run_avg = (run_avg*9 + delta)/10;
591
592 printf("#%-3ld: %0.3f, ",
593 nr_runs, (double)delta/1000000.0);
594
595 printf("ravg: %0.2f, ",
596 (double)run_avg/1e6);
597
598 printf("cpu: %0.2f / %0.2f",
599 (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
600
601#if 0
602 /*
603 * rusage statistics done by the parent, these are less
604 * accurate than the sum_exec_runtime based statistics:
605 */
606 printf(" [%0.2f / %0.2f]",
607 (double)parent_cpu_usage/1e6,
608 (double)runavg_parent_cpu_usage/1e6);
609#endif
610
611 printf("\n");
612
613 if (nr_sleep_corrections)
614 printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
615 nr_sleep_corrections = 0;
616}
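/*
 * std_dev above is computed but not printed in this version: sum_fluct /
 * nr_runs is roughly the mean absolute deviation of the per-pass runtimes,
 * and dividing by sqrt(nr_runs) turns it into a rough standard-error-style
 * figure for the reported average.
 */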
617
618static void test_calibrations(void)
619{
620 u64 T0, T1;
621
622 T0 = get_nsecs();
623 burn_nsecs(1e6);
624 T1 = get_nsecs();
625
626 printf("the run test took %Ld nsecs\n", T1-T0);
627
628 T0 = get_nsecs();
629 sleep_nsecs(1e6);
630 T1 = get_nsecs();
631
632 printf("the sleep test took %Ld nsecs\n", T1-T0);
633}
634
635static int
636process_comm_event(event_t *event, unsigned long offset, unsigned long head)
637{
638 struct thread *thread;
639
640 thread = threads__findnew(event->comm.pid, &threads, &last_match);
641
642 dump_printf("%p [%p]: perf_event_comm: %s:%d\n",
643 (void *)(offset + head),
644 (void *)(long)(event->header.size),
645 event->comm.comm, event->comm.pid);
646
647 if (thread == NULL ||
648 thread__set_comm(thread, event->comm.comm)) {
649 dump_printf("problem processing perf_event_comm, skipping event.\n");
650 return -1;
651 }
652 total_comm++;
653
654 return 0;
655}
656
657
658struct raw_event_sample {
659 u32 size;
660 char data[0];
661};
662
663#define FILL_FIELD(ptr, field, event, data) \
664 ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)
665
666#define FILL_ARRAY(ptr, array, event, data) \
667do { \
668 void *__array = raw_field_ptr(event, #array, data); \
669 memcpy(ptr.array, __array, sizeof(ptr.array)); \
670} while(0)
671
672#define FILL_COMMON_FIELDS(ptr, event, data) \
673do { \
674 FILL_FIELD(ptr, common_type, event, data); \
675 FILL_FIELD(ptr, common_flags, event, data); \
676 FILL_FIELD(ptr, common_preempt_count, event, data); \
677 FILL_FIELD(ptr, common_pid, event, data); \
678 FILL_FIELD(ptr, common_tgid, event, data); \
679} while (0)
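/*
 * The FILL_* helpers pull individual tracepoint fields out of the raw
 * sample payload by name, using the parsed event format.  For example
 * (taken from process_sched_wakeup_event() further down):
 *
 *	FILL_COMMON_FIELDS(wakeup_event, event, raw->data);
 *	FILL_ARRAY(wakeup_event, comm, event, raw->data);
 *	FILL_FIELD(wakeup_event, pid, event, raw->data);
 */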
680
681
682
683struct trace_switch_event {
684 u32 size;
685
686 u16 common_type;
687 u8 common_flags;
688 u8 common_preempt_count;
689 u32 common_pid;
690 u32 common_tgid;
691
692 char prev_comm[16];
693 u32 prev_pid;
694 u32 prev_prio;
695 u64 prev_state;
696 char next_comm[16];
697 u32 next_pid;
698 u32 next_prio;
699};
700
701struct trace_runtime_event {
702 u32 size;
703
704 u16 common_type;
705 u8 common_flags;
706 u8 common_preempt_count;
707 u32 common_pid;
708 u32 common_tgid;
709
710 char comm[16];
711 u32 pid;
712 u64 runtime;
713 u64 vruntime;
714};
715
716struct trace_wakeup_event {
717 u32 size;
718
719 u16 common_type;
720 u8 common_flags;
721 u8 common_preempt_count;
722 u32 common_pid;
723 u32 common_tgid;
724
725 char comm[16];
726 u32 pid;
727
728 u32 prio;
729 u32 success;
730 u32 cpu;
731};
732
733struct trace_fork_event {
734 u32 size;
735
736 u16 common_type;
737 u8 common_flags;
738 u8 common_preempt_count;
739 u32 common_pid;
740 u32 common_tgid;
741
742 char parent_comm[16];
743 u32 parent_pid;
744 char child_comm[16];
745 u32 child_pid;
746};
747
748struct trace_sched_handler {
749 void (*switch_event)(struct trace_switch_event *,
750 struct event *,
751 int cpu,
752 u64 timestamp,
753 struct thread *thread);
754
755 void (*runtime_event)(struct trace_runtime_event *,
756 struct event *,
757 int cpu,
758 u64 timestamp,
759 struct thread *thread);
760
761 void (*wakeup_event)(struct trace_wakeup_event *,
762 struct event *,
763 int cpu,
764 u64 timestamp,
765 struct thread *thread);
766
767 void (*fork_event)(struct trace_fork_event *,
768 struct event *,
769 int cpu,
770 u64 timestamp,
771 struct thread *thread);
772};
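/*
 * Each sub-command installs one of these handler tables: replay_ops feeds
 * the events into the synthetic workload, lat_ops feeds them into the
 * latency state machine, and map_ops only cares about context switches.
 * A NULL callback means the corresponding tracepoint is ignored in that
 * mode.
 */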
773
774
775static void
776replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
777 struct event *event,
778 int cpu __used,
779 u64 timestamp __used,
780 struct thread *thread __used)
781{
782 struct task_desc *waker, *wakee;
783
784 if (verbose) {
785 printf("sched_wakeup event %p\n", event);
786
787 printf(" ... pid %d woke up %s/%d\n",
788 wakeup_event->common_pid,
789 wakeup_event->comm,
790 wakeup_event->pid);
791 }
792
793 waker = register_pid(wakeup_event->common_pid, "<unknown>");
794 wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
795
796 add_sched_event_wakeup(waker, timestamp, wakee);
797}
798
799static u64 cpu_last_switched[MAX_CPUS];
800
801static void
802replay_switch_event(struct trace_switch_event *switch_event,
803 struct event *event,
804 int cpu,
805 u64 timestamp,
806 struct thread *thread __used)
807{
808 struct task_desc *prev, *next;
809 u64 timestamp0;
810 s64 delta;
811
812 if (verbose)
813 printf("sched_switch event %p\n", event);
814
815 if (cpu >= MAX_CPUS || cpu < 0)
816 return;
817
818 timestamp0 = cpu_last_switched[cpu];
819 if (timestamp0)
820 delta = timestamp - timestamp0;
821 else
822 delta = 0;
823
824 if (delta < 0)
825 die("hm, delta: %Ld < 0 ?\n", delta);
826
827 if (verbose) {
828 printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
829 switch_event->prev_comm, switch_event->prev_pid,
830 switch_event->next_comm, switch_event->next_pid,
831 delta);
832 }
833
834 prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
835 next = register_pid(switch_event->next_pid, switch_event->next_comm);
836
837 cpu_last_switched[cpu] = timestamp;
838
839 add_sched_event_run(prev, timestamp, delta);
840 add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
841}
842
843
844static void
845replay_fork_event(struct trace_fork_event *fork_event,
846 struct event *event,
847 int cpu __used,
848 u64 timestamp __used,
849 struct thread *thread __used)
850{
851 if (verbose) {
852 printf("sched_fork event %p\n", event);
853 printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
854 printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
855 }
856 register_pid(fork_event->parent_pid, fork_event->parent_comm);
857 register_pid(fork_event->child_pid, fork_event->child_comm);
858}
859
860static struct trace_sched_handler replay_ops = {
861 .wakeup_event = replay_wakeup_event,
862 .switch_event = replay_switch_event,
863 .fork_event = replay_fork_event,
864};
865
866struct sort_dimension {
867 const char *name;
868 sort_fn_t cmp;
869 struct list_head list;
870};
871
872static LIST_HEAD(cmp_pid);
873
874static int
875thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
876{
877 struct sort_dimension *sort;
878 int ret = 0;
879
880 BUG_ON(list_empty(list));
881
882 list_for_each_entry(sort, list, list) {
883 ret = sort->cmp(l, r);
884 if (ret)
885 return ret;
886 }
887
888 return ret;
889}
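/*
 * thread_lat_cmp() chains the selected sort keys: the first dimension in
 * the list that distinguishes l from r decides the order, later keys only
 * break ties.  The same comparator drives both the rbtree lookups (keyed
 * by cmp_pid) and the final output ordering (keyed by sort_list).
 */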
890
891static struct work_atoms *
892thread_atoms_search(struct rb_root *root, struct thread *thread,
893 struct list_head *sort_list)
894{
895 struct rb_node *node = root->rb_node;
896 struct work_atoms key = { .thread = thread };
897
898 while (node) {
899 struct work_atoms *atoms;
900 int cmp;
901
902 atoms = container_of(node, struct work_atoms, node);
903
904 cmp = thread_lat_cmp(sort_list, &key, atoms);
905 if (cmp > 0)
906 node = node->rb_left;
907 else if (cmp < 0)
908 node = node->rb_right;
909 else {
910 BUG_ON(thread != atoms->thread);
911 return atoms;
912 }
913 }
914 return NULL;
915}
916
917static void
918__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
919 struct list_head *sort_list)
920{
921 struct rb_node **new = &(root->rb_node), *parent = NULL;
922
923 while (*new) {
924 struct work_atoms *this;
925 int cmp;
926
927 this = container_of(*new, struct work_atoms, node);
928 parent = *new;
929
930 cmp = thread_lat_cmp(sort_list, data, this);
931
932 if (cmp > 0)
933 new = &((*new)->rb_left);
934 else
935 new = &((*new)->rb_right);
936 }
937
938 rb_link_node(&data->node, parent, new);
939 rb_insert_color(&data->node, root);
940}
941
942static void thread_atoms_insert(struct thread *thread)
943{
944 struct work_atoms *atoms;
945
946	atoms = calloc(1, sizeof(*atoms));
947 if (!atoms)
948 die("No memory");
949
950 atoms->thread = thread;
951 INIT_LIST_HEAD(&atoms->work_list);
952 __thread_latency_insert(&atom_root, atoms, &cmp_pid);
953}
954
955static void
956latency_fork_event(struct trace_fork_event *fork_event __used,
957 struct event *event __used,
958 int cpu __used,
959 u64 timestamp __used,
960 struct thread *thread __used)
961{
962 /* should insert the newcomer */
963}
964
965__used
966static char sched_out_state(struct trace_switch_event *switch_event)
967{
968 const char *str = TASK_STATE_TO_CHAR_STR;
969
970 return str[switch_event->prev_state];
971}
972
973static void
974add_sched_out_event(struct work_atoms *atoms,
975 char run_state,
976 u64 timestamp)
977{
978 struct work_atom *atom;
979
980	atom = calloc(1, sizeof(*atom));
981	if (!atom)
982		die("No memory");
983
984 atom->sched_out_time = timestamp;
985
986 if (run_state == 'R') {
987 atom->state = THREAD_WAIT_CPU;
988 atom->wake_up_time = atom->sched_out_time;
989 }
990
991 list_add_tail(&atom->list, &atoms->work_list);
992}
993
994static void
995add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
996{
997 struct work_atom *atom;
998
999 BUG_ON(list_empty(&atoms->work_list));
1000
1001 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1002
1003 atom->runtime += delta;
1004 atoms->total_runtime += delta;
1005}
1006
1007static void
1008add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1009{
1010 struct work_atom *atom;
1011 u64 delta;
1012
1013 if (list_empty(&atoms->work_list))
1014 return;
1015
1016 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1017
1018 if (atom->state != THREAD_WAIT_CPU)
1019 return;
1020
1021 if (timestamp < atom->wake_up_time) {
1022 atom->state = THREAD_IGNORE;
1023 return;
1024 }
1025
1026 atom->state = THREAD_SCHED_IN;
1027 atom->sched_in_time = timestamp;
1028
1029 delta = atom->sched_in_time - atom->wake_up_time;
1030 atoms->total_lat += delta;
1031 if (delta > atoms->max_lat)
1032 atoms->max_lat = delta;
1033 atoms->nb_atoms++;
1034}
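/*
 * The wakeup latency of one atom is sched_in_time - wake_up_time, i.e. the
 * time the task spent runnable but off the CPU.  total_lat / nb_atoms and
 * max_lat are what the latency report later prints as the average and
 * maximum delay per thread.
 */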
1035
1036static void
1037latency_switch_event(struct trace_switch_event *switch_event,
1038 struct event *event __used,
1039 int cpu,
1040 u64 timestamp,
1041 struct thread *thread __used)
1042{
1043 struct work_atoms *out_events, *in_events;
1044 struct thread *sched_out, *sched_in;
1045 u64 timestamp0;
1046 s64 delta;
1047
1048 BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1049
1050 timestamp0 = cpu_last_switched[cpu];
1051 cpu_last_switched[cpu] = timestamp;
1052 if (timestamp0)
1053 delta = timestamp - timestamp0;
1054 else
1055 delta = 0;
1056
1057 if (delta < 0)
1058 die("hm, delta: %Ld < 0 ?\n", delta);
1059
1060
1061 sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
1062 sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
1063
1064 out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1065 if (!out_events) {
1066 thread_atoms_insert(sched_out);
1067 out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
1068 if (!out_events)
1069 die("out-event: Internal tree error");
1070 }
1071 add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
1072
1073 in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1074 if (!in_events) {
1075 thread_atoms_insert(sched_in);
1076 in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
1077 if (!in_events)
1078 die("in-event: Internal tree error");
1079 /*
1080		 * A task came in that we have not heard about yet;
1081		 * add an initial atom for it in the runnable state:
1082 */
1083 add_sched_out_event(in_events, 'R', timestamp);
1084 }
1085 add_sched_in_event(in_events, timestamp);
1086}
1087
1088static void
1089latency_runtime_event(struct trace_runtime_event *runtime_event,
1090 struct event *event __used,
1091 int cpu,
1092 u64 timestamp,
1093 struct thread *this_thread __used)
1094{
1095 struct work_atoms *atoms;
1096 struct thread *thread;
1097
1098 BUG_ON(cpu >= MAX_CPUS || cpu < 0);
1099
1100 thread = threads__findnew(runtime_event->pid, &threads, &last_match);
1101 atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1102 if (!atoms) {
1103 thread_atoms_insert(thread);
1104 atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
1105 if (!atoms)
1106 die("in-event: Internal tree error");
1107 add_sched_out_event(atoms, 'R', timestamp);
1108 }
1109
1110 add_runtime_event(atoms, runtime_event->runtime, timestamp);
1111}
1112
1113static void
1114latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
1115 struct event *__event __used,
1116 int cpu __used,
1117 u64 timestamp,
1118 struct thread *thread __used)
1119{
1120 struct work_atoms *atoms;
1121 struct work_atom *atom;
1122 struct thread *wakee;
1123
1124	/* Note for later: it may be interesting to observe the failing cases */
1125 if (!wakeup_event->success)
1126 return;
1127
1128 wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
1129 atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1130 if (!atoms) {
1131 thread_atoms_insert(wakee);
1132 atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
1133 if (!atoms)
1134 die("wakeup-event: Internal tree error");
1135 add_sched_out_event(atoms, 'S', timestamp);
1136 }
1137
1138 BUG_ON(list_empty(&atoms->work_list));
1139
1140 atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1141
1142 if (atom->state != THREAD_SLEEPING)
1143 nr_state_machine_bugs++;
1144
1145 nr_timestamps++;
1146 if (atom->sched_out_time > timestamp) {
1147 nr_unordered_timestamps++;
1148 return;
1149 }
1150
1151 atom->state = THREAD_WAIT_CPU;
1152 atom->wake_up_time = timestamp;
1153}
1154
1155static struct trace_sched_handler lat_ops = {
1156 .wakeup_event = latency_wakeup_event,
1157 .switch_event = latency_switch_event,
1158 .runtime_event = latency_runtime_event,
1159 .fork_event = latency_fork_event,
1160};
1161
1162static void output_lat_thread(struct work_atoms *work_list)
1163{
1164 int i;
1165 int ret;
1166 u64 avg;
1167
1168 if (!work_list->nb_atoms)
1169 return;
1170 /*
1171 * Ignore idle threads:
1172 */
1173 if (!strcmp(work_list->thread->comm, "swapper"))
1174 return;
1175
1176 all_runtime += work_list->total_runtime;
1177 all_count += work_list->nb_atoms;
1178
1179 ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid);
1180
1181 for (i = 0; i < 24 - ret; i++)
1182 printf(" ");
1183
1184 avg = work_list->total_lat / work_list->nb_atoms;
1185
1186 printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
1187 (double)work_list->total_runtime / 1e6,
1188 work_list->nb_atoms, (double)avg / 1e6,
1189 (double)work_list->max_lat / 1e6);
1190}
1191
1192static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
1193{
1194 if (l->thread->pid < r->thread->pid)
1195 return -1;
1196 if (l->thread->pid > r->thread->pid)
1197 return 1;
1198
1199 return 0;
1200}
1201
1202static struct sort_dimension pid_sort_dimension = {
1203 .name = "pid",
1204 .cmp = pid_cmp,
1205};
1206
1207static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
1208{
1209 u64 avgl, avgr;
1210
1211 if (!l->nb_atoms)
1212 return -1;
1213
1214 if (!r->nb_atoms)
1215 return 1;
1216
1217 avgl = l->total_lat / l->nb_atoms;
1218 avgr = r->total_lat / r->nb_atoms;
1219
1220 if (avgl < avgr)
1221 return -1;
1222 if (avgl > avgr)
1223 return 1;
1224
1225 return 0;
1226}
1227
1228static struct sort_dimension avg_sort_dimension = {
1229 .name = "avg",
1230 .cmp = avg_cmp,
1231};
1232
1233static int max_cmp(struct work_atoms *l, struct work_atoms *r)
1234{
1235 if (l->max_lat < r->max_lat)
1236 return -1;
1237 if (l->max_lat > r->max_lat)
1238 return 1;
1239
1240 return 0;
1241}
1242
1243static struct sort_dimension max_sort_dimension = {
1244 .name = "max",
1245 .cmp = max_cmp,
1246};
1247
1248static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
1249{
1250 if (l->nb_atoms < r->nb_atoms)
1251 return -1;
1252 if (l->nb_atoms > r->nb_atoms)
1253 return 1;
1254
1255 return 0;
1256}
1257
1258static struct sort_dimension switch_sort_dimension = {
1259 .name = "switch",
1260 .cmp = switch_cmp,
1261};
1262
1263static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
1264{
1265 if (l->total_runtime < r->total_runtime)
1266 return -1;
1267 if (l->total_runtime > r->total_runtime)
1268 return 1;
1269
1270 return 0;
1271}
1272
1273static struct sort_dimension runtime_sort_dimension = {
1274 .name = "runtime",
1275 .cmp = runtime_cmp,
1276};
1277
1278static struct sort_dimension *available_sorts[] = {
1279 &pid_sort_dimension,
1280 &avg_sort_dimension,
1281 &max_sort_dimension,
1282 &switch_sort_dimension,
1283 &runtime_sort_dimension,
1284};
1285
1286#define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
1287
1288static LIST_HEAD(sort_list);
1289
1290static int sort_dimension__add(char *tok, struct list_head *list)
1291{
1292 int i;
1293
1294 for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
1295 if (!strcmp(available_sorts[i]->name, tok)) {
1296 list_add_tail(&available_sorts[i]->list, list);
1297
1298 return 0;
1299 }
1300 }
1301
1302 return -1;
1303}
1304
1305static void setup_sorting(void);
1306
1307static void sort_lat(void)
1308{
1309 struct rb_node *node;
1310
1311 for (;;) {
1312 struct work_atoms *data;
1313 node = rb_first(&atom_root);
1314 if (!node)
1315 break;
1316
1317 rb_erase(node, &atom_root);
1318 data = rb_entry(node, struct work_atoms, node);
1319 __thread_latency_insert(&sorted_atom_root, data, &sort_list);
1320 }
1321}
1322
1323static struct trace_sched_handler *trace_handler;
1324
1325static void
1326process_sched_wakeup_event(struct raw_event_sample *raw,
1327 struct event *event,
1328 int cpu __used,
1329 u64 timestamp __used,
1330 struct thread *thread __used)
1331{
1332 struct trace_wakeup_event wakeup_event;
1333
1334 FILL_COMMON_FIELDS(wakeup_event, event, raw->data);
1335
1336 FILL_ARRAY(wakeup_event, comm, event, raw->data);
1337 FILL_FIELD(wakeup_event, pid, event, raw->data);
1338 FILL_FIELD(wakeup_event, prio, event, raw->data);
1339 FILL_FIELD(wakeup_event, success, event, raw->data);
1340 FILL_FIELD(wakeup_event, cpu, event, raw->data);
1341
1342 if (trace_handler->wakeup_event)
1343 trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
1344}
1345
1346/*
1347 * Track the current task - that way we can know whether there are any
1348 * weird events, such as a task being switched away that is not current.
1349 */
1350static int max_cpu;
1351
1352static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
1353
1354static struct thread *curr_thread[MAX_CPUS];
1355
1356static char next_shortname1 = 'A';
1357static char next_shortname2 = '0';
1358
1359static void
1360map_switch_event(struct trace_switch_event *switch_event,
1361 struct event *event __used,
1362 int this_cpu,
1363 u64 timestamp,
1364 struct thread *thread __used)
1365{
1366 struct thread *sched_out, *sched_in;
1367 int new_shortname;
1368 u64 timestamp0;
1369 s64 delta;
1370 int cpu;
1371
1372 BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
1373
1374 if (this_cpu > max_cpu)
1375 max_cpu = this_cpu;
1376
1377 timestamp0 = cpu_last_switched[this_cpu];
1378 cpu_last_switched[this_cpu] = timestamp;
1379 if (timestamp0)
1380 delta = timestamp - timestamp0;
1381 else
1382 delta = 0;
1383
1384 if (delta < 0)
1385 die("hm, delta: %Ld < 0 ?\n", delta);
1386
1387
1388 sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
1389 sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
1390
1391 curr_thread[this_cpu] = sched_in;
1392
1393 printf(" ");
1394
1395 new_shortname = 0;
1396 if (!sched_in->shortname[0]) {
1397 sched_in->shortname[0] = next_shortname1;
1398 sched_in->shortname[1] = next_shortname2;
1399
1400 if (next_shortname1 < 'Z') {
1401 next_shortname1++;
1402 } else {
1403 next_shortname1='A';
1404 if (next_shortname2 < '9') {
1405 next_shortname2++;
1406 } else {
1407 next_shortname2='0';
1408 }
1409 }
1410 new_shortname = 1;
1411 }
1412
1413 for (cpu = 0; cpu <= max_cpu; cpu++) {
1414 if (cpu != this_cpu)
1415 printf(" ");
1416 else
1417 printf("*");
1418
1419 if (curr_thread[cpu]) {
1420 if (curr_thread[cpu]->pid)
1421 printf("%2s ", curr_thread[cpu]->shortname);
1422 else
1423 printf(". ");
1424 } else
1425 printf(" ");
1426 }
1427
1428 printf(" %12.6f secs ", (double)timestamp/1e9);
1429 if (new_shortname) {
1430 printf("%s => %s:%d\n",
1431 sched_in->shortname, sched_in->comm, sched_in->pid);
1432 } else {
1433 printf("\n");
1434 }
1435}
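/*
 * The map view assigns every thread a two-character shortname (A0, B0, ...
 * Z0, A1, ...) the first time it is scheduled in, then prints one column
 * per CPU with the current thread's shortname (or '.' for the idle task),
 * marks the column where this switch happened with '*', and appends the
 * timestamp in seconds.
 */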
1436
1437
1438static void
1439process_sched_switch_event(struct raw_event_sample *raw,
1440 struct event *event,
1441 int this_cpu,
1442 u64 timestamp __used,
1443 struct thread *thread __used)
1444{
1445 struct trace_switch_event switch_event;
1446
1447 FILL_COMMON_FIELDS(switch_event, event, raw->data);
1448
1449 FILL_ARRAY(switch_event, prev_comm, event, raw->data);
1450 FILL_FIELD(switch_event, prev_pid, event, raw->data);
1451 FILL_FIELD(switch_event, prev_prio, event, raw->data);
1452 FILL_FIELD(switch_event, prev_state, event, raw->data);
1453 FILL_ARRAY(switch_event, next_comm, event, raw->data);
1454 FILL_FIELD(switch_event, next_pid, event, raw->data);
1455 FILL_FIELD(switch_event, next_prio, event, raw->data);
1456
1457 if (curr_pid[this_cpu] != (u32)-1) {
1458 /*
1459 * Are we trying to switch away a PID that is
1460 * not current?
1461 */
1462 if (curr_pid[this_cpu] != switch_event.prev_pid)
1463 nr_context_switch_bugs++;
1464 }
1465 if (trace_handler->switch_event)
1466 trace_handler->switch_event(&switch_event, event, this_cpu, timestamp, thread);
1467
1468 curr_pid[this_cpu] = switch_event.next_pid;
1469}
1470
1471static void
1472process_sched_runtime_event(struct raw_event_sample *raw,
1473 struct event *event,
1474 int cpu __used,
1475 u64 timestamp __used,
1476 struct thread *thread __used)
1477{
1478 struct trace_runtime_event runtime_event;
1479
1480 FILL_ARRAY(runtime_event, comm, event, raw->data);
1481 FILL_FIELD(runtime_event, pid, event, raw->data);
1482 FILL_FIELD(runtime_event, runtime, event, raw->data);
1483 FILL_FIELD(runtime_event, vruntime, event, raw->data);
1484
1485 if (trace_handler->runtime_event)
1486 trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
1487}
1488
1489static void
1490process_sched_fork_event(struct raw_event_sample *raw,
1491 struct event *event,
1492 int cpu __used,
1493 u64 timestamp __used,
1494 struct thread *thread __used)
1495{
1496 struct trace_fork_event fork_event;
1497
1498 FILL_COMMON_FIELDS(fork_event, event, raw->data);
1499
1500 FILL_ARRAY(fork_event, parent_comm, event, raw->data);
1501 FILL_FIELD(fork_event, parent_pid, event, raw->data);
1502 FILL_ARRAY(fork_event, child_comm, event, raw->data);
1503 FILL_FIELD(fork_event, child_pid, event, raw->data);
1504
1505 if (trace_handler->fork_event)
1506 trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
1507}
1508
1509static void
1510process_sched_exit_event(struct event *event,
1511 int cpu __used,
1512 u64 timestamp __used,
1513 struct thread *thread __used)
1514{
1515 if (verbose)
1516 printf("sched_exit event %p\n", event);
1517}
1518
1519static void
1520process_raw_event(event_t *raw_event __used, void *more_data,
1521 int cpu, u64 timestamp, struct thread *thread)
1522{
1523 struct raw_event_sample *raw = more_data;
1524 struct event *event;
1525 int type;
1526
1527 type = trace_parse_common_type(raw->data);
1528 event = trace_find_event(type);
1529
1530 if (!strcmp(event->name, "sched_switch"))
1531 process_sched_switch_event(raw, event, cpu, timestamp, thread);
1532 if (!strcmp(event->name, "sched_stat_runtime"))
1533 process_sched_runtime_event(raw, event, cpu, timestamp, thread);
1534 if (!strcmp(event->name, "sched_wakeup"))
1535 process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
1536 if (!strcmp(event->name, "sched_wakeup_new"))
1537 process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
1538 if (!strcmp(event->name, "sched_process_fork"))
1539 process_sched_fork_event(raw, event, cpu, timestamp, thread);
1540 if (!strcmp(event->name, "sched_process_exit"))
1541 process_sched_exit_event(event, cpu, timestamp, thread);
1542}
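/*
 * Raw samples are dispatched purely by tracepoint name, so the same
 * perf.data file feeds all three modes; events that 'perf sched record'
 * captures but that are not handled here (sched_stat_wait,
 * sched_stat_sleep, sched_stat_iowait, sched_migrate_task) are simply
 * skipped.
 */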
1543
1544static int
1545process_sample_event(event_t *event, unsigned long offset, unsigned long head)
1546{
1547 char level;
1548 int show = 0;
1549 struct dso *dso = NULL;
1550 struct thread *thread;
1551 u64 ip = event->ip.ip;
1552 u64 timestamp = -1;
1553 u32 cpu = -1;
1554 u64 period = 1;
1555 void *more_data = event->ip.__more_data;
1556 int cpumode;
1557
1558 thread = threads__findnew(event->ip.pid, &threads, &last_match);
1559
1560 if (sample_type & PERF_SAMPLE_TIME) {
1561 timestamp = *(u64 *)more_data;
1562 more_data += sizeof(u64);
1563 }
1564
1565 if (sample_type & PERF_SAMPLE_CPU) {
1566 cpu = *(u32 *)more_data;
1567 more_data += sizeof(u32);
1568 more_data += sizeof(u32); /* reserved */
1569 }
1570
1571 if (sample_type & PERF_SAMPLE_PERIOD) {
1572 period = *(u64 *)more_data;
1573 more_data += sizeof(u64);
1574 }
1575
1576 dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
1577 (void *)(offset + head),
1578 (void *)(long)(event->header.size),
1579 event->header.misc,
1580 event->ip.pid, event->ip.tid,
1581 (void *)(long)ip,
1582 (long long)period);
1583
1584 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
1585
1586 if (thread == NULL) {
1587 eprintf("problem processing %d event, skipping it.\n",
1588 event->header.type);
1589 return -1;
1590 }
1591
1592 cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
1593
1594 if (cpumode == PERF_EVENT_MISC_KERNEL) {
1595 show = SHOW_KERNEL;
1596 level = 'k';
1597
1598 dso = kernel_dso;
1599
1600 dump_printf(" ...... dso: %s\n", dso->name);
1601
1602 } else if (cpumode == PERF_EVENT_MISC_USER) {
1603
1604 show = SHOW_USER;
1605 level = '.';
1606
1607 } else {
1608 show = SHOW_HV;
1609 level = 'H';
1610
1611 dso = hypervisor_dso;
1612
1613 dump_printf(" ...... dso: [hypervisor]\n");
1614 }
1615
1616 if (sample_type & PERF_SAMPLE_RAW)
1617 process_raw_event(event, more_data, cpu, timestamp, thread);
1618
1619 return 0;
1620}
1621
1622static int
1623process_event(event_t *event, unsigned long offset, unsigned long head)
1624{
1625 trace_event(event);
1626
1627 nr_events++;
1628 switch (event->header.type) {
1629 case PERF_EVENT_MMAP:
1630 return 0;
1631 case PERF_EVENT_LOST:
1632 nr_lost_chunks++;
1633 nr_lost_events += event->lost.lost;
1634 return 0;
1635
1636 case PERF_EVENT_COMM:
1637 return process_comm_event(event, offset, head);
1638
1639 case PERF_EVENT_EXIT ... PERF_EVENT_READ:
1640 return 0;
1641
1642 case PERF_EVENT_SAMPLE:
1643 return process_sample_event(event, offset, head);
1644
1645 case PERF_EVENT_MAX:
1646 default:
1647 return -1;
1648 }
1649
1650 return 0;
1651}
1652
1653static int read_events(void)
1654{
1655 int ret, rc = EXIT_FAILURE;
1656 unsigned long offset = 0;
1657 unsigned long head = 0;
1658 struct stat perf_stat;
1659 event_t *event;
1660 uint32_t size;
1661 char *buf;
1662
1663 trace_report();
1664 register_idle_thread(&threads, &last_match);
1665
1666 input = open(input_name, O_RDONLY);
1667 if (input < 0) {
1668 perror("failed to open file");
1669 exit(-1);
1670 }
1671
1672 ret = fstat(input, &perf_stat);
1673 if (ret < 0) {
1674 perror("failed to stat file");
1675 exit(-1);
1676 }
1677
1678 if (!perf_stat.st_size) {
1679 fprintf(stderr, "zero-sized file, nothing to do!\n");
1680 exit(0);
1681 }
1682 header = perf_header__read(input);
1683 head = header->data_offset;
1684 sample_type = perf_header__sample_type(header);
1685
1686 if (!(sample_type & PERF_SAMPLE_RAW))
1687 die("No trace sample to read. Did you call perf record "
1688 "without -R?");
1689
1690 if (load_kernel() < 0) {
1691 perror("failed to load kernel symbols");
1692 return EXIT_FAILURE;
1693 }
1694
1695remap:
1696 buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
1697 MAP_SHARED, input, offset);
1698 if (buf == MAP_FAILED) {
1699 perror("failed to mmap file");
1700 exit(-1);
1701 }
1702
1703more:
1704 event = (event_t *)(buf + head);
1705
1706 size = event->header.size;
1707 if (!size)
1708 size = 8;
1709
1710 if (head + event->header.size >= page_size * mmap_window) {
1711 unsigned long shift = page_size * (head / page_size);
1712 int res;
1713
1714 res = munmap(buf, page_size * mmap_window);
1715 assert(res == 0);
1716
1717 offset += shift;
1718 head -= shift;
1719 goto remap;
1720 }
1721
1722 size = event->header.size;
1723
1724
1725 if (!size || process_event(event, offset, head) < 0) {
1726
1727 /*
1728 * assume we lost track of the stream, check alignment, and
1729		 * increment a single u64 in the hope of catching on again 'soon'.
1730 */
1731
1732 if (unlikely(head & 7))
1733 head &= ~7ULL;
1734
1735 size = 8;
1736 }
1737
1738 head += size;
1739
1740 if (offset + head < (unsigned long)perf_stat.st_size)
1741 goto more;
1742
1743 rc = EXIT_SUCCESS;
1744 close(input);
1745
1746 return rc;
1747}
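/*
 * read_events() walks perf.data through a sliding mmap window of
 * mmap_window (32) pages: whenever the next event header would cross the
 * end of the window, the buffer is unmapped and remapped at a page-aligned
 * offset closer to the current head.
 */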
1748
1749static void print_bad_events(void)
1750{
1751 if (nr_unordered_timestamps && nr_timestamps) {
1752 printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
1753 (double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
1754 nr_unordered_timestamps, nr_timestamps);
1755 }
1756 if (nr_lost_events && nr_events) {
1757 printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
1758 (double)nr_lost_events/(double)nr_events*100.0,
1759 nr_lost_events, nr_events, nr_lost_chunks);
1760 }
1761 if (nr_state_machine_bugs && nr_timestamps) {
1762 printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
1763 (double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
1764 nr_state_machine_bugs, nr_timestamps);
1765 if (nr_lost_events)
1766 printf(" (due to lost events?)");
1767 printf("\n");
1768 }
1769 if (nr_context_switch_bugs && nr_timestamps) {
1770 printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
1771 (double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
1772 nr_context_switch_bugs, nr_timestamps);
1773 if (nr_lost_events)
1774 printf(" (due to lost events?)");
1775 printf("\n");
1776 }
1777}
1778
1779static void __cmd_lat(void)
1780{
1781 struct rb_node *next;
1782
1783 setup_pager();
1784 read_events();
1785 sort_lat();
1786
1787 printf("\n -----------------------------------------------------------------------------------------\n");
1788 printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
1789 printf(" -----------------------------------------------------------------------------------------\n");
1790
1791 next = rb_first(&sorted_atom_root);
1792
1793 while (next) {
1794 struct work_atoms *work_list;
1795
1796 work_list = rb_entry(next, struct work_atoms, node);
1797 output_lat_thread(work_list);
1798 next = rb_next(next);
1799 }
1800
1801 printf(" -----------------------------------------------------------------------------------------\n");
1802 printf(" TOTAL: |%11.3f ms |%9Ld |\n",
1803 (double)all_runtime/1e6, all_count);
1804
1805 printf(" ---------------------------------------------------\n");
1806
1807 print_bad_events();
1808 printf("\n");
1809
1810}
1811
1812static struct trace_sched_handler map_ops = {
1813 .wakeup_event = NULL,
1814 .switch_event = map_switch_event,
1815 .runtime_event = NULL,
1816 .fork_event = NULL,
1817};
1818
1819static void __cmd_map(void)
1820{
1821 max_cpu = sysconf(_SC_NPROCESSORS_CONF);
1822
1823 setup_pager();
1824 read_events();
1825 print_bad_events();
1826}
1827
1828static void __cmd_replay(void)
1829{
1830 unsigned long i;
1831
1832 calibrate_run_measurement_overhead();
1833 calibrate_sleep_measurement_overhead();
1834
1835 test_calibrations();
1836
1837 read_events();
1838
1839 printf("nr_run_events: %ld\n", nr_run_events);
1840 printf("nr_sleep_events: %ld\n", nr_sleep_events);
1841 printf("nr_wakeup_events: %ld\n", nr_wakeup_events);
1842
1843 if (targetless_wakeups)
1844 printf("target-less wakeups: %ld\n", targetless_wakeups);
1845 if (multitarget_wakeups)
1846 printf("multi-target wakeups: %ld\n", multitarget_wakeups);
1847 if (nr_run_events_optimized)
1848 printf("run atoms optimized: %ld\n",
1849 nr_run_events_optimized);
1850
1851 print_task_traces();
1852 add_cross_task_wakeups();
1853
1854 create_tasks();
1855 printf("------------------------------------------------------------\n");
1856 for (i = 0; i < replay_repeat; i++)
1857 run_one_test();
1858}
1859
1860
1861static const char * const sched_usage[] = {
1862 "perf sched [<options>] {record|latency|map|replay|trace}",
1863 NULL
1864};
1865
1866static const struct option sched_options[] = {
1867 OPT_STRING('i', "input", &input_name, "file",
1868 "input file name"),
1869 OPT_BOOLEAN('v', "verbose", &verbose,
1870 "be more verbose (show symbol address, etc)"),
1871 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1872 "dump raw trace in ASCII"),
1873 OPT_END()
1874};
1875
1876static const char * const latency_usage[] = {
1877 "perf sched latency [<options>]",
1878 NULL
1879};
1880
1881static const struct option latency_options[] = {
1882 OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
1883 "sort by key(s): runtime, switch, avg, max"),
1884 OPT_BOOLEAN('v', "verbose", &verbose,
1885 "be more verbose (show symbol address, etc)"),
1886 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1887 "dump raw trace in ASCII"),
1888 OPT_END()
1889};
1890
1891static const char * const replay_usage[] = {
1892 "perf sched replay [<options>]",
1893 NULL
1894};
1895
1896static const struct option replay_options[] = {
1897 OPT_INTEGER('r', "repeat", &replay_repeat,
1898 "repeat the workload replay N times (-1: infinite)"),
1899 OPT_BOOLEAN('v', "verbose", &verbose,
1900 "be more verbose (show symbol address, etc)"),
1901 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1902 "dump raw trace in ASCII"),
1903 OPT_END()
1904};
1905
1906static void setup_sorting(void)
1907{
1908 char *tmp, *tok, *str = strdup(sort_order);
1909
1910 for (tok = strtok_r(str, ", ", &tmp);
1911 tok; tok = strtok_r(NULL, ", ", &tmp)) {
1912 if (sort_dimension__add(tok, &sort_list) < 0) {
1913 error("Unknown --sort key: `%s'", tok);
1914 usage_with_options(latency_usage, latency_options);
1915 }
1916 }
1917
1918 free(str);
1919
1920 sort_dimension__add((char *)"pid", &cmp_pid);
1921}
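/*
 * setup_sorting() turns e.g. "-s avg,max" (default: "avg, max, switch,
 * runtime") into the list of sort_dimension comparators used for the
 * latency output, while the separate cmp_pid list always sorts by pid so
 * that the per-thread rbtree lookups stay stable regardless of the user's
 * --sort choice.
 */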
1922
1923static const char *record_args[] = {
1924 "record",
1925 "-a",
1926 "-R",
1927 "-M",
1928 "-f",
1929 "-m", "1024",
1930 "-c", "1",
1931 "-e", "sched:sched_switch:r",
1932 "-e", "sched:sched_stat_wait:r",
1933 "-e", "sched:sched_stat_sleep:r",
1934 "-e", "sched:sched_stat_iowait:r",
1935 "-e", "sched:sched_stat_runtime:r",
1936 "-e", "sched:sched_process_exit:r",
1937 "-e", "sched:sched_process_fork:r",
1938 "-e", "sched:sched_wakeup:r",
1939 "-e", "sched:sched_migrate_task:r",
1940};
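/*
 * 'perf sched record' is a thin wrapper around 'perf record' with the
 * scheduler tracepoints above.  The ':r' suffix on each event (together
 * with -R) requests raw samples: with the parse-events changes in this
 * patch, 'r' matches the "record" flag and sets PERF_SAMPLE_RAW,
 * PERF_SAMPLE_TIME and PERF_SAMPLE_CPU, which the PERF_SAMPLE_RAW check in
 * read_events() insists on.
 */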
1941
1942static int __cmd_record(int argc, const char **argv)
1943{
1944 unsigned int rec_argc, i, j;
1945 const char **rec_argv;
1946
1947 rec_argc = ARRAY_SIZE(record_args) + argc - 1;
1948 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1949
1950 for (i = 0; i < ARRAY_SIZE(record_args); i++)
1951 rec_argv[i] = strdup(record_args[i]);
1952
1953 for (j = 1; j < (unsigned int)argc; j++, i++)
1954 rec_argv[i] = argv[j];
1955
1956 BUG_ON(i != rec_argc);
1957
1958 return cmd_record(i, rec_argv, NULL);
1959}
1960
1961int cmd_sched(int argc, const char **argv, const char *prefix __used)
1962{
1963 symbol__init();
1964 page_size = getpagesize();
1965
1966 argc = parse_options(argc, argv, sched_options, sched_usage,
1967 PARSE_OPT_STOP_AT_NON_OPTION);
1968 if (!argc)
1969 usage_with_options(sched_usage, sched_options);
1970
1971 if (!strncmp(argv[0], "rec", 3)) {
1972 return __cmd_record(argc, argv);
1973 } else if (!strncmp(argv[0], "lat", 3)) {
1974 trace_handler = &lat_ops;
1975 if (argc > 1) {
1976 argc = parse_options(argc, argv, latency_options, latency_usage, 0);
1977 if (argc)
1978 usage_with_options(latency_usage, latency_options);
1979 }
1980 setup_sorting();
1981 __cmd_lat();
1982 } else if (!strcmp(argv[0], "map")) {
1983 trace_handler = &map_ops;
1984 setup_sorting();
1985 __cmd_map();
1986 } else if (!strncmp(argv[0], "rep", 3)) {
1987 trace_handler = &replay_ops;
1988 if (argc) {
1989 argc = parse_options(argc, argv, replay_options, replay_usage, 0);
1990 if (argc)
1991 usage_with_options(replay_usage, replay_options);
1992 }
1993 __cmd_replay();
1994 } else if (!strcmp(argv[0], "trace")) {
1995 /*
1996 * Aliased to 'perf trace' for now:
1997 */
1998 return cmd_trace(argc, argv, prefix);
1999 } else {
2000 usage_with_options(sched_usage, sched_options);
2001 }
2002
2003 return 0;
2004}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 3a63e41fb44e..b09cadbd76b1 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -16,12 +16,13 @@ extern int check_pager_config(const char *cmd);
16 16
17extern int cmd_annotate(int argc, const char **argv, const char *prefix); 17extern int cmd_annotate(int argc, const char **argv, const char *prefix);
18extern int cmd_help(int argc, const char **argv, const char *prefix); 18extern int cmd_help(int argc, const char **argv, const char *prefix);
19extern int cmd_sched(int argc, const char **argv, const char *prefix);
20extern int cmd_list(int argc, const char **argv, const char *prefix);
19extern int cmd_record(int argc, const char **argv, const char *prefix); 21extern int cmd_record(int argc, const char **argv, const char *prefix);
20extern int cmd_report(int argc, const char **argv, const char *prefix); 22extern int cmd_report(int argc, const char **argv, const char *prefix);
21extern int cmd_stat(int argc, const char **argv, const char *prefix); 23extern int cmd_stat(int argc, const char **argv, const char *prefix);
22extern int cmd_top(int argc, const char **argv, const char *prefix); 24extern int cmd_top(int argc, const char **argv, const char *prefix);
23extern int cmd_version(int argc, const char **argv, const char *prefix);
24extern int cmd_list(int argc, const char **argv, const char *prefix);
25extern int cmd_trace(int argc, const char **argv, const char *prefix); 25extern int cmd_trace(int argc, const char **argv, const char *prefix);
26extern int cmd_version(int argc, const char **argv, const char *prefix);
26 27
27#endif 28#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index eebce30afbc0..3133c74729dd 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -4,7 +4,9 @@
4# 4#
5perf-annotate mainporcelain common 5perf-annotate mainporcelain common
6perf-list mainporcelain common 6perf-list mainporcelain common
7perf-sched mainporcelain common
7perf-record mainporcelain common 8perf-record mainporcelain common
8perf-report mainporcelain common 9perf-report mainporcelain common
9perf-stat mainporcelain common 10perf-stat mainporcelain common
10perf-top mainporcelain common 11perf-top mainporcelain common
12perf-trace mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index fe4589dde950..c972d1c35489 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -293,6 +293,7 @@ static void handle_internal_command(int argc, const char **argv)
293 { "annotate", cmd_annotate, 0 }, 293 { "annotate", cmd_annotate, 0 },
294 { "version", cmd_version, 0 }, 294 { "version", cmd_version, 0 },
295 { "trace", cmd_trace, 0 }, 295 { "trace", cmd_trace, 0 },
296 { "sched", cmd_sched, 0 },
296 }; 297 };
297 unsigned int i; 298 unsigned int i;
298 static const char ext[] = STRIP_EXTENSION; 299 static const char ext[] = STRIP_EXTENSION;
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index fa2d4e91d329..2495529cae7d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -52,7 +52,7 @@ struct lost_event {
52 */ 52 */
53struct read_event { 53struct read_event {
54 struct perf_event_header header; 54 struct perf_event_header header;
55 u32 pid,tid; 55 u32 pid, tid;
56 u64 value; 56 u64 value;
57 u64 time_enabled; 57 u64 time_enabled;
58 u64 time_running; 58 u64 time_running;
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a587d41ae3c9..a9bdcab8c070 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -18,6 +18,12 @@ struct event_symbol {
18 const char *alias; 18 const char *alias;
19}; 19};
20 20
21enum event_result {
22 EVT_FAILED,
23 EVT_HANDLED,
24 EVT_HANDLED_ALL
25};
26
21char debugfs_path[MAXPATHLEN]; 27char debugfs_path[MAXPATHLEN];
22 28
23#define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x 29#define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
@@ -139,7 +145,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
139 (strcmp(evt_dirent.d_name, "..")) && \ 145 (strcmp(evt_dirent.d_name, "..")) && \
140 (!tp_event_has_id(&sys_dirent, &evt_dirent))) 146 (!tp_event_has_id(&sys_dirent, &evt_dirent)))
141 147
142#define MAX_EVENT_LENGTH 30 148#define MAX_EVENT_LENGTH 512
143 149
144int valid_debugfs_mount(const char *debugfs) 150int valid_debugfs_mount(const char *debugfs)
145{ 151{
@@ -344,7 +350,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int
344 return -1; 350 return -1;
345} 351}
346 352
347static int 353static enum event_result
348parse_generic_hw_event(const char **str, struct perf_counter_attr *attr) 354parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
349{ 355{
350 const char *s = *str; 356 const char *s = *str;
@@ -356,7 +362,7 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
356 * then bail out: 362 * then bail out:
357 */ 363 */
358 if (cache_type == -1) 364 if (cache_type == -1)
359 return 0; 365 return EVT_FAILED;
360 366
361 while ((cache_op == -1 || cache_result == -1) && *s == '-') { 367 while ((cache_op == -1 || cache_result == -1) && *s == '-') {
362 ++s; 368 ++s;
@@ -402,27 +408,115 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
402 attr->type = PERF_TYPE_HW_CACHE; 408 attr->type = PERF_TYPE_HW_CACHE;
403 409
404 *str = s; 410 *str = s;
405 return 1; 411 return EVT_HANDLED;
412}
413
414static enum event_result
415parse_single_tracepoint_event(char *sys_name,
416 const char *evt_name,
417 unsigned int evt_length,
418 char *flags,
419 struct perf_counter_attr *attr,
420 const char **strp)
421{
422 char evt_path[MAXPATHLEN];
423 char id_buf[4];
424 u64 id;
425 int fd;
426
427 if (flags) {
428 if (!strncmp(flags, "record", strlen(flags))) {
429 attr->sample_type |= PERF_SAMPLE_RAW;
430 attr->sample_type |= PERF_SAMPLE_TIME;
431 attr->sample_type |= PERF_SAMPLE_CPU;
432 }
433 }
434
435 snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
436 sys_name, evt_name);
437
438 fd = open(evt_path, O_RDONLY);
439 if (fd < 0)
440 return EVT_FAILED;
441
442 if (read(fd, id_buf, sizeof(id_buf)) < 0) {
443 close(fd);
444 return EVT_FAILED;
445 }
446
447 close(fd);
448 id = atoll(id_buf);
449 attr->config = id;
450 attr->type = PERF_TYPE_TRACEPOINT;
451 *strp = evt_name + evt_length;
452
453 return EVT_HANDLED;
454}
455
456/* sys + ':' + event + ':' + flags */
457#define MAX_EVOPT_LEN (MAX_EVENT_LENGTH * 2 + 2 + 128)
458static enum event_result
459parse_subsystem_tracepoint_event(char *sys_name, char *flags)
460{
461 char evt_path[MAXPATHLEN];
462 struct dirent *evt_ent;
463 DIR *evt_dir;
464
465 snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name);
466 evt_dir = opendir(evt_path);
467
468 if (!evt_dir) {
469 perror("Can't open event dir");
470 return EVT_FAILED;
471 }
472
473 while ((evt_ent = readdir(evt_dir))) {
474 char event_opt[MAX_EVOPT_LEN + 1];
475 int len;
476 unsigned int rem = MAX_EVOPT_LEN;
477
478 if (!strcmp(evt_ent->d_name, ".")
479 || !strcmp(evt_ent->d_name, "..")
480 || !strcmp(evt_ent->d_name, "enable")
481 || !strcmp(evt_ent->d_name, "filter"))
482 continue;
483
484 len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s", sys_name,
485 evt_ent->d_name);
486 if (len < 0)
487 return EVT_FAILED;
488
489 rem -= len;
490 if (flags) {
491 if (rem < strlen(flags) + 1)
492 return EVT_FAILED;
493
494 strcat(event_opt, ":");
495 strcat(event_opt, flags);
496 }
497
498 if (parse_events(NULL, event_opt, 0))
499 return EVT_FAILED;
500 }
501
502 return EVT_HANDLED_ALL;
406} 503}
407 504
408static int parse_tracepoint_event(const char **strp, 505
506static enum event_result parse_tracepoint_event(const char **strp,
409 struct perf_counter_attr *attr) 507 struct perf_counter_attr *attr)
410{ 508{
411 const char *evt_name; 509 const char *evt_name;
412 char *flags; 510 char *flags;
413 char sys_name[MAX_EVENT_LENGTH]; 511 char sys_name[MAX_EVENT_LENGTH];
414 char id_buf[4];
415 int fd;
416 unsigned int sys_length, evt_length; 512 unsigned int sys_length, evt_length;
417 u64 id;
418 char evt_path[MAXPATHLEN];
419 513
420 if (valid_debugfs_mount(debugfs_path)) 514 if (valid_debugfs_mount(debugfs_path))
421 return 0; 515 return 0;
422 516
423 evt_name = strchr(*strp, ':'); 517 evt_name = strchr(*strp, ':');
424 if (!evt_name) 518 if (!evt_name)
425 return 0; 519 return EVT_FAILED;
426 520
427 sys_length = evt_name - *strp; 521 sys_length = evt_name - *strp;
428 if (sys_length >= MAX_EVENT_LENGTH) 522 if (sys_length >= MAX_EVENT_LENGTH)
@@ -434,32 +528,22 @@ static int parse_tracepoint_event(const char **strp,
434 528
435 flags = strchr(evt_name, ':'); 529 flags = strchr(evt_name, ':');
436 if (flags) { 530 if (flags) {
437 *flags = '\0'; 531 /* split it out: */
532 evt_name = strndup(evt_name, flags - evt_name);
438 flags++; 533 flags++;
439 if (!strncmp(flags, "record", strlen(flags)))
440 attr->sample_type |= PERF_SAMPLE_RAW;
441 } 534 }
442 535
443 evt_length = strlen(evt_name); 536 evt_length = strlen(evt_name);
444 if (evt_length >= MAX_EVENT_LENGTH) 537 if (evt_length >= MAX_EVENT_LENGTH)
445 return 0; 538 return EVT_FAILED;
446 539
447 snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, 540 if (!strcmp(evt_name, "*")) {
448 sys_name, evt_name); 541 *strp = evt_name + evt_length;
449 fd = open(evt_path, O_RDONLY); 542 return parse_subsystem_tracepoint_event(sys_name, flags);
450 if (fd < 0) 543 } else
451 return 0; 544 return parse_single_tracepoint_event(sys_name, evt_name,
452 545 evt_length, flags,
453 if (read(fd, id_buf, sizeof(id_buf)) < 0) { 546 attr, strp);
454 close(fd);
455 return 0;
456 }
457 close(fd);
458 id = atoll(id_buf);
459 attr->config = id;
460 attr->type = PERF_TYPE_TRACEPOINT;
461 *strp = evt_name + evt_length;
462 return 1;
463} 547}
464 548
465static int check_events(const char *str, unsigned int i) 549static int check_events(const char *str, unsigned int i)
@@ -477,7 +561,7 @@ static int check_events(const char *str, unsigned int i)
477 return 0; 561 return 0;
478} 562}
479 563
480static int 564static enum event_result
481parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) 565parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
482{ 566{
483 const char *str = *strp; 567 const char *str = *strp;
@@ -490,31 +574,32 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
490 attr->type = event_symbols[i].type; 574 attr->type = event_symbols[i].type;
491 attr->config = event_symbols[i].config; 575 attr->config = event_symbols[i].config;
492 *strp = str + n; 576 *strp = str + n;
493 return 1; 577 return EVT_HANDLED;
494 } 578 }
495 } 579 }
496 return 0; 580 return EVT_FAILED;
497} 581}
498 582
499static int parse_raw_event(const char **strp, struct perf_counter_attr *attr) 583static enum event_result
584parse_raw_event(const char **strp, struct perf_counter_attr *attr)
500{ 585{
501 const char *str = *strp; 586 const char *str = *strp;
502 u64 config; 587 u64 config;
503 int n; 588 int n;
504 589
505 if (*str != 'r') 590 if (*str != 'r')
506 return 0; 591 return EVT_FAILED;
507 n = hex2u64(str + 1, &config); 592 n = hex2u64(str + 1, &config);
508 if (n > 0) { 593 if (n > 0) {
509 *strp = str + n + 1; 594 *strp = str + n + 1;
510 attr->type = PERF_TYPE_RAW; 595 attr->type = PERF_TYPE_RAW;
511 attr->config = config; 596 attr->config = config;
512 return 1; 597 return EVT_HANDLED;
513 } 598 }
514 return 0; 599 return EVT_FAILED;
515} 600}
516 601
517static int 602static enum event_result
518parse_numeric_event(const char **strp, struct perf_counter_attr *attr) 603parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
519{ 604{
520 const char *str = *strp; 605 const char *str = *strp;
@@ -530,13 +615,13 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
530 attr->type = type; 615 attr->type = type;
531 attr->config = config; 616 attr->config = config;
532 *strp = endp; 617 *strp = endp;
533 return 1; 618 return EVT_HANDLED;
534 } 619 }
535 } 620 }
536 return 0; 621 return EVT_FAILED;
537} 622}
538 623
539static int 624static enum event_result
540parse_event_modifier(const char **strp, struct perf_counter_attr *attr) 625parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
541{ 626{
542 const char *str = *strp; 627 const char *str = *strp;
@@ -569,37 +654,60 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
569 * Each event can have multiple symbolic names. 654 * Each event can have multiple symbolic names.
570 * Symbolic names are (almost) exactly matched. 655 * Symbolic names are (almost) exactly matched.
571 */ 656 */
572static int parse_event_symbols(const char **str, struct perf_counter_attr *attr) 657static enum event_result
658parse_event_symbols(const char **str, struct perf_counter_attr *attr)
573{ 659{
574 if (!(parse_tracepoint_event(str, attr) || 660 enum event_result ret;
575 parse_raw_event(str, attr) || 661
576 parse_numeric_event(str, attr) || 662 ret = parse_tracepoint_event(str, attr);
577 parse_symbolic_event(str, attr) || 663 if (ret != EVT_FAILED)
578 parse_generic_hw_event(str, attr))) 664 goto modifier;
579 return 0; 665
666 ret = parse_raw_event(str, attr);
667 if (ret != EVT_FAILED)
668 goto modifier;
580 669
670 ret = parse_numeric_event(str, attr);
671 if (ret != EVT_FAILED)
672 goto modifier;
673
674 ret = parse_symbolic_event(str, attr);
675 if (ret != EVT_FAILED)
676 goto modifier;
677
678 ret = parse_generic_hw_event(str, attr);
679 if (ret != EVT_FAILED)
680 goto modifier;
681
682 return EVT_FAILED;
683
684modifier:
581 parse_event_modifier(str, attr); 685 parse_event_modifier(str, attr);
582 686
583 return 1; 687 return ret;
584} 688}
585 689
586int parse_events(const struct option *opt __used, const char *str, int unset __used) 690int parse_events(const struct option *opt __used, const char *str, int unset __used)
587{ 691{
588 struct perf_counter_attr attr; 692 struct perf_counter_attr attr;
693 enum event_result ret;
589 694
590 for (;;) { 695 for (;;) {
591 if (nr_counters == MAX_COUNTERS) 696 if (nr_counters == MAX_COUNTERS)
592 return -1; 697 return -1;
593 698
594 memset(&attr, 0, sizeof(attr)); 699 memset(&attr, 0, sizeof(attr));
595 if (!parse_event_symbols(&str, &attr)) 700 ret = parse_event_symbols(&str, &attr);
701 if (ret == EVT_FAILED)
596 return -1; 702 return -1;
597 703
598 if (!(*str == 0 || *str == ',' || isspace(*str))) 704 if (!(*str == 0 || *str == ',' || isspace(*str)))
599 return -1; 705 return -1;
600 706
601 attrs[nr_counters] = attr; 707 if (ret != EVT_HANDLED_ALL) {
602 nr_counters++; 708 attrs[nr_counters] = attr;
709 nr_counters++;
710 }
603 711
604 if (*str == 0) 712 if (*str == 0)
605 break; 713 break;
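
With every parser now returning enum event_result instead of an int, parse_event_symbols() is a fixed try-in-order chain, and the top-level loop only appends attr when the result is not EVT_HANDLED_ALL (that value means the subsystem expansion already pushed its counters itself). A table-driven equivalent of the chain is sketched below purely as a reading aid; perf keeps the explicit if/goto form shown above, and the parser functions referenced here are the ones defined earlier in parse-events.c, so this fragment only compiles inside that file.

typedef enum event_result
(*event_parser_t)(const char **strp, struct perf_counter_attr *attr);

static enum event_result
parse_event_symbols_sketch(const char **str, struct perf_counter_attr *attr)
{
	/* same order as the if/goto chain above */
	static const event_parser_t parsers[] = {
		parse_tracepoint_event,
		parse_raw_event,
		parse_numeric_event,
		parse_symbolic_event,
		parse_generic_hw_event,
	};
	unsigned int i;

	for (i = 0; i < sizeof(parsers) / sizeof(parsers[0]); i++) {
		enum event_result ret = parsers[i](str, attr);

		if (ret != EVT_FAILED) {
			/* event modifiers, if any, apply to whatever was matched */
			parse_event_modifier(str, attr);
			return ret;
		}
	}
	return EVT_FAILED;
}
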
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 7635928ca278..45efb5db0d19 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -8,7 +8,7 @@
8 8
9static struct thread *thread__new(pid_t pid) 9static struct thread *thread__new(pid_t pid)
10{ 10{
11 struct thread *self = malloc(sizeof(*self)); 11 struct thread *self = calloc(1, sizeof(*self));
12 12
13 if (self != NULL) { 13 if (self != NULL) {
14 self->pid = pid; 14 self->pid = pid;
@@ -85,7 +85,7 @@ register_idle_thread(struct rb_root *threads, struct thread **last_match)
85{ 85{
86 struct thread *thread = threads__findnew(0, threads, last_match); 86 struct thread *thread = threads__findnew(0, threads, last_match);
87 87
88 if (!thread || thread__set_comm(thread, "[init]")) { 88 if (!thread || thread__set_comm(thread, "swapper")) {
89 fprintf(stderr, "problem inserting idle task.\n"); 89 fprintf(stderr, "problem inserting idle task.\n");
90 exit(-1); 90 exit(-1);
91 } 91 }
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 634f2809a342..32aea3c1c2ad 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -4,10 +4,11 @@
4#include "symbol.h" 4#include "symbol.h"
5 5
6struct thread { 6struct thread {
7 struct rb_node rb_node; 7 struct rb_node rb_node;
8 struct list_head maps; 8 struct list_head maps;
9 pid_t pid; 9 pid_t pid;
10 char *comm; 10 char shortname[3];
11 char *comm;
11}; 12};
12 13
13int thread__set_comm(struct thread *self, const char *comm); 14int thread__set_comm(struct thread *self, const char *comm);
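
The two thread changes belong together: struct thread grows a three-byte shortname, and thread__new() switches from malloc() to calloc() so the new field (and comm) start out zeroed; presumably later code tests shortname[0] to decide whether a short display name has been assigned yet, but that consumer is outside the quoted hunks. The idle task is also registered under its canonical kernel name, "swapper", instead of "[init]". A minimal illustration of why the zero fill matters, using a trimmed-down stand-in for struct thread:

#include <stdio.h>
#include <stdlib.h>

/* trimmed-down analogue of struct thread after this patch */
struct thread_sketch {
	int	pid;
	char	shortname[3];
	char	*comm;
};

static struct thread_sketch *thread_sketch__new(int pid)
{
	/* calloc, not malloc: shortname[] and comm must start as 0/NULL */
	struct thread_sketch *self = calloc(1, sizeof(*self));

	if (self != NULL)
		self->pid = pid;

	return self;
}

int main(void)
{
	struct thread_sketch *t = thread_sketch__new(0);

	if (!t)
		return 1;

	/* safe only because of the zero fill above; "i0" is a made-up name */
	if (!t->shortname[0])
		snprintf(t->shortname, sizeof(t->shortname), "%.2s", "i0");

	printf("pid %d shortname %s\n", t->pid, t->shortname);
	free(t);
	return 0;
}
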
diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 6c9302a7274c..1fd824c1f1c4 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -458,7 +458,7 @@ static void read_proc_kallsyms(void)
458static void read_ftrace_printk(void) 458static void read_ftrace_printk(void)
459{ 459{
460 unsigned int size, check_size; 460 unsigned int size, check_size;
461 const char *path; 461 char *path;
462 struct stat st; 462 struct stat st;
463 int ret; 463 int ret;
464 464
@@ -468,14 +468,15 @@ static void read_ftrace_printk(void)
468 /* not found */ 468 /* not found */
469 size = 0; 469 size = 0;
470 write_or_die(&size, 4); 470 write_or_die(&size, 4);
471 return; 471 goto out;
472 } 472 }
473 size = get_size(path); 473 size = get_size(path);
474 write_or_die(&size, 4); 474 write_or_die(&size, 4);
475 check_size = copy_file(path); 475 check_size = copy_file(path);
476 if (size != check_size) 476 if (size != check_size)
477 die("error in size of file '%s'", path); 477 die("error in size of file '%s'", path);
478 478out:
479 put_tracing_file(path);
479} 480}
480 481
481static struct tracepoint_path * 482static struct tracepoint_path *
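
This hunk is a small resource-handling fix: path loses its const and is now released through put_tracing_file() on both the early-out and the normal path, hence the goto out. The path is obtained above the quoted hunk, presumably from a matching get_tracing_file() helper that hands back heap-allocated memory; the sketch below assumes exactly that and shows the same acquire/compose/release pairing with hypothetical stand-ins.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>

/* hypothetical stand-ins; the real helpers resolve the tracing dir at runtime */
static char *get_tracing_file_sketch(const char *name)
{
	char *file = NULL;

	if (asprintf(&file, "/sys/kernel/debug/tracing/%s", name) < 0)
		return NULL;
	return file;
}

static void put_tracing_file_sketch(char *file)
{
	free(file);
}

int main(void)
{
	/* "printk_formats" used as an example tracing file name */
	char *path = get_tracing_file_sketch("printk_formats");

	if (!path)
		return 1;

	printf("%s\n", path);

	/* every get_...() is paired with a put_...(), as in the hunk above */
	put_tracing_file_sketch(path);
	return 0;
}
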
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 629e602d9405..f6a8437141c8 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -1776,6 +1776,29 @@ static unsigned long long read_size(void *ptr, int size)
1776 } 1776 }
1777} 1777}
1778 1778
1779unsigned long long
1780raw_field_value(struct event *event, const char *name, void *data)
1781{
1782 struct format_field *field;
1783
1784 field = find_any_field(event, name);
1785 if (!field)
1786 return 0ULL;
1787
1788 return read_size(data + field->offset, field->size);
1789}
1790
1791void *raw_field_ptr(struct event *event, const char *name, void *data)
1792{
1793 struct format_field *field;
1794
1795 field = find_any_field(event, name);
1796 if (!field)
1797 return NULL;
1798
1799 return data + field->offset;
1800}
1801
1779static int get_common_info(const char *type, int *offset, int *size) 1802static int get_common_info(const char *type, int *offset, int *size)
1780{ 1803{
1781 struct event *event; 1804 struct event *event;
@@ -1799,7 +1822,7 @@ static int get_common_info(const char *type, int *offset, int *size)
1799 return 0; 1822 return 0;
1800} 1823}
1801 1824
1802static int parse_common_type(void *data) 1825int trace_parse_common_type(void *data)
1803{ 1826{
1804 static int type_offset; 1827 static int type_offset;
1805 static int type_size; 1828 static int type_size;
@@ -1832,7 +1855,7 @@ static int parse_common_pid(void *data)
1832 return read_size(data + pid_offset, pid_size); 1855 return read_size(data + pid_offset, pid_size);
1833} 1856}
1834 1857
1835static struct event *find_event(int id) 1858struct event *trace_find_event(int id)
1836{ 1859{
1837 struct event *event; 1860 struct event *event;
1838 1861
@@ -2420,8 +2443,8 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
2420 int type; 2443 int type;
2421 int pid; 2444 int pid;
2422 2445
2423 type = parse_common_type(next->data); 2446 type = trace_parse_common_type(next->data);
2424 event = find_event(type); 2447 event = trace_find_event(type);
2425 if (!event) 2448 if (!event)
2426 return NULL; 2449 return NULL;
2427 2450
@@ -2502,8 +2525,8 @@ print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec)
2502 int type; 2525 int type;
2503 int i; 2526 int i;
2504 2527
2505 type = parse_common_type(ret_rec->data); 2528 type = trace_parse_common_type(ret_rec->data);
2506 ret_event = find_event(type); 2529 ret_event = trace_find_event(type);
2507 2530
2508 field = find_field(ret_event, "rettime"); 2531 field = find_field(ret_event, "rettime");
2509 if (!field) 2532 if (!field)
@@ -2696,11 +2719,13 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
2696 nsecs -= secs * NSECS_PER_SEC; 2719 nsecs -= secs * NSECS_PER_SEC;
2697 usecs = nsecs / NSECS_PER_USEC; 2720 usecs = nsecs / NSECS_PER_USEC;
2698 2721
2699 type = parse_common_type(data); 2722 type = trace_parse_common_type(data);
2700 2723
2701 event = find_event(type); 2724 event = trace_find_event(type);
2702 if (!event) 2725 if (!event) {
2703 die("ug! no event found for type %d", type); 2726 printf("ug! no event found for type %d\n", type);
2727 return;
2728 }
2704 2729
2705 pid = parse_common_pid(data); 2730 pid = parse_common_pid(data);
2706 2731
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index a1217a10632f..1b5c847d2c22 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -458,12 +458,13 @@ struct record *trace_read_data(int cpu)
458 return data; 458 return data;
459} 459}
460 460
461void trace_report (void) 461void trace_report(void)
462{ 462{
463 const char *input_file = "trace.info"; 463 const char *input_file = "trace.info";
464 char buf[BUFSIZ]; 464 char buf[BUFSIZ];
465 char test[] = { 23, 8, 68 }; 465 char test[] = { 23, 8, 68 };
466 char *version; 466 char *version;
467 int show_version = 0;
467 int show_funcs = 0; 468 int show_funcs = 0;
468 int show_printk = 0; 469 int show_printk = 0;
469 470
@@ -480,7 +481,8 @@ void trace_report (void)
480 die("not a trace file (missing tracing)"); 481 die("not a trace file (missing tracing)");
481 482
482 version = read_string(); 483 version = read_string();
483 printf("version = %s\n", version); 484 if (show_version)
485 printf("version = %s\n", version);
484 free(version); 486 free(version);
485 487
486 read_or_die(buf, 1); 488 read_or_die(buf, 1);
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 420294a5773e..d35ebf1e29ff 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -234,6 +234,11 @@ extern int header_page_data_offset;
234extern int header_page_data_size; 234extern int header_page_data_size;
235 235
236int parse_header_page(char *buf, unsigned long size); 236int parse_header_page(char *buf, unsigned long size);
237int trace_parse_common_type(void *data);
238struct event *trace_find_event(int id);
239unsigned long long
240raw_field_value(struct event *event, const char *name, void *data);
241void *raw_field_ptr(struct event *event, const char *name, void *data);
237 242
238void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters); 243void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
239 244