aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf
diff options
context:
space:
mode:
authorJosef Bacik <jbacik@fb.com>2015-05-22 09:18:40 -0400
committerArnaldo Carvalho de Melo <acme@redhat.com>2015-05-27 11:21:45 -0400
commit2f80dd4488c204a4850554746eb31f25f5a84405 (patch)
treee424e67e0fde7c9d9890187ba298063b3d33ae38 /tools/perf
parente8b7ea4356fdd3c4de5478f3418eb84f8dce2b61 (diff)
perf sched: Add option to merge like comms to lat output
Sometimes when debugging large multi-threaded applications it is helpful to collate all of the latency numbers into one bulk record to get an idea of what is going on. This patch does this by merging any entries that belong to the same comm into one entry and then spits out those totals. I've also slightly changed the output so you can see how many threads were merged in the processing. Here is the new default output format ----------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------- chrome:(23) | 740.878 ms | 2612 | avg: 0.022 ms | max: 0.845 ms | max at: 7935.254223 s pulseaudio:1523 | 94.440 ms | 597 | avg: 0.027 ms | max: 0.110 ms | max at: 7934.668372 s threaded-ml:6042 | 72.554 ms | 386 | avg: 0.035 ms | max: 1.186 ms | max at: 7935.330911 s Chrome_IOThread:3832 | 52.388 ms | 456 | avg: 0.021 ms | max: 1.365 ms | max at: 7935.330602 s Chrome_ChildIOT:(7) | 50.694 ms | 743 | avg: 0.021 ms | max: 1.448 ms | max at: 7935.256659 s Compositor:5510 | 30.012 ms | 192 | avg: 0.019 ms | max: 0.131 ms | max at: 7936.636815 s plugin_audio_th:6043 | 24.828 ms | 314 | avg: 0.018 ms | max: 0.143 ms | max at: 7936.205994 s CompositorTileW:(2) | 14.099 ms | 45 | avg: 0.022 ms | max: 0.153 ms | max at: 7937.521800 s the (#) after the task is the number of tasks merged, and then if there were no tasks merged it just shows the pid. Here is the same trace file with the -p option to print the per-pid latency numbers ----------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------- chrome:5500 | 386.872 ms | 387 | avg: 0.023 ms | max: 0.241 ms | max at: 7936.001694 s pulseaudio:1523 | 94.440 ms | 597 | avg: 0.027 ms | max: 0.110 ms | max at: 7934.668372 s threaded-ml:6042 | 72.554 ms | 386 | avg: 0.035 ms | max: 1.186 ms | max at: 7935.330911 s chrome:10226 | 69.710 ms | 251 | avg: 0.023 ms | max: 0.764 ms | max at: 7935.992305 s chrome:4267 | 64.551 ms | 418 | avg: 0.021 ms | max: 0.294 ms | max at: 7937.862427 s chrome:4827 | 62.268 ms | 54 | avg: 0.029 ms | max: 0.666 ms | max at: 7935.992813 s Chrome_IOThread:3832 | 52.388 ms | 456 | avg: 0.021 ms | max: 1.365 ms | max at: 7935.330602 s chrome:3776 | 46.150 ms | 349 | avg: 0.023 ms | max: 0.845 ms | max at: 7935.254223 s Signed-off-by: Josef Bacik <jbacik@fb.com> Acked-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/1432300720-30478-1-git-send-email-jbacik@fb.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Diffstat (limited to 'tools/perf')
-rw-r--r--tools/perf/builtin-sched.c77
1 files changed, 72 insertions, 5 deletions
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 79273ecf92eb..33962612a5e9 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
95 u64 total_lat; 95 u64 total_lat;
96 u64 nb_atoms; 96 u64 nb_atoms;
97 u64 total_runtime; 97 u64 total_runtime;
98 int num_merged;
98}; 99};
99 100
100typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *); 101typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
168 u64 all_runtime; 169 u64 all_runtime;
169 u64 all_count; 170 u64 all_count;
170 u64 cpu_last_switched[MAX_CPUS]; 171 u64 cpu_last_switched[MAX_CPUS];
171 struct rb_root atom_root, sorted_atom_root; 172 struct rb_root atom_root, sorted_atom_root, merged_atom_root;
172 struct list_head sort_list, cmp_pid; 173 struct list_head sort_list, cmp_pid;
173 bool force; 174 bool force;
175 bool skip_merge;
174}; 176};
175 177
176static u64 get_nsecs(void) 178static u64 get_nsecs(void)
@@ -1182,7 +1184,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
1182 sched->all_runtime += work_list->total_runtime; 1184 sched->all_runtime += work_list->total_runtime;
1183 sched->all_count += work_list->nb_atoms; 1185 sched->all_count += work_list->nb_atoms;
1184 1186
1185 ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid); 1187 if (work_list->num_merged > 1)
1188 ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
1189 else
1190 ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1186 1191
1187 for (i = 0; i < 24 - ret; i++) 1192 for (i = 0; i < 24 - ret; i++)
1188 printf(" "); 1193 printf(" ");
@@ -1302,17 +1307,22 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
1302static void perf_sched__sort_lat(struct perf_sched *sched) 1307static void perf_sched__sort_lat(struct perf_sched *sched)
1303{ 1308{
1304 struct rb_node *node; 1309 struct rb_node *node;
1305 1310 struct rb_root *root = &sched->atom_root;
1311again:
1306 for (;;) { 1312 for (;;) {
1307 struct work_atoms *data; 1313 struct work_atoms *data;
1308 node = rb_first(&sched->atom_root); 1314 node = rb_first(root);
1309 if (!node) 1315 if (!node)
1310 break; 1316 break;
1311 1317
1312 rb_erase(node, &sched->atom_root); 1318 rb_erase(node, root);
1313 data = rb_entry(node, struct work_atoms, node); 1319 data = rb_entry(node, struct work_atoms, node);
1314 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list); 1320 __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1315 } 1321 }
1322 if (root == &sched->atom_root) {
1323 root = &sched->merged_atom_root;
1324 goto again;
1325 }
1316} 1326}
1317 1327
1318static int process_sched_wakeup_event(struct perf_tool *tool, 1328static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1572,6 +1582,59 @@ static void print_bad_events(struct perf_sched *sched)
1572 } 1582 }
1573} 1583}
1574 1584
1585static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
1586{
1587 struct rb_node **new = &(root->rb_node), *parent = NULL;
1588 struct work_atoms *this;
1589 const char *comm = thread__comm_str(data->thread), *this_comm;
1590
1591 while (*new) {
1592 int cmp;
1593
1594 this = container_of(*new, struct work_atoms, node);
1595 parent = *new;
1596
1597 this_comm = thread__comm_str(this->thread);
1598 cmp = strcmp(comm, this_comm);
1599 if (cmp > 0) {
1600 new = &((*new)->rb_left);
1601 } else if (cmp < 0) {
1602 new = &((*new)->rb_right);
1603 } else {
1604 this->num_merged++;
1605 this->total_runtime += data->total_runtime;
1606 this->nb_atoms += data->nb_atoms;
1607 this->total_lat += data->total_lat;
1608 list_splice(&data->work_list, &this->work_list);
1609 if (this->max_lat < data->max_lat) {
1610 this->max_lat = data->max_lat;
1611 this->max_lat_at = data->max_lat_at;
1612 }
1613 zfree(&data);
1614 return;
1615 }
1616 }
1617
1618 data->num_merged++;
1619 rb_link_node(&data->node, parent, new);
1620 rb_insert_color(&data->node, root);
1621}
1622
1623static void perf_sched__merge_lat(struct perf_sched *sched)
1624{
1625 struct work_atoms *data;
1626 struct rb_node *node;
1627
1628 if (sched->skip_merge)
1629 return;
1630
1631 while ((node = rb_first(&sched->atom_root))) {
1632 rb_erase(node, &sched->atom_root);
1633 data = rb_entry(node, struct work_atoms, node);
1634 __merge_work_atoms(&sched->merged_atom_root, data);
1635 }
1636}
1637
1575static int perf_sched__lat(struct perf_sched *sched) 1638static int perf_sched__lat(struct perf_sched *sched)
1576{ 1639{
1577 struct rb_node *next; 1640 struct rb_node *next;
@@ -1581,6 +1644,7 @@ static int perf_sched__lat(struct perf_sched *sched)
1581 if (perf_sched__read_events(sched)) 1644 if (perf_sched__read_events(sched))
1582 return -1; 1645 return -1;
1583 1646
1647 perf_sched__merge_lat(sched);
1584 perf_sched__sort_lat(sched); 1648 perf_sched__sort_lat(sched);
1585 1649
1586 printf("\n -----------------------------------------------------------------------------------------------------------------\n"); 1650 printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1732,6 +1796,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1732 .profile_cpu = -1, 1796 .profile_cpu = -1,
1733 .next_shortname1 = 'A', 1797 .next_shortname1 = 'A',
1734 .next_shortname2 = '0', 1798 .next_shortname2 = '0',
1799 .skip_merge = 0,
1735 }; 1800 };
1736 const struct option latency_options[] = { 1801 const struct option latency_options[] = {
1737 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]", 1802 OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1742,6 +1807,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
1742 "CPU to profile on"), 1807 "CPU to profile on"),
1743 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, 1808 OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1744 "dump raw trace in ASCII"), 1809 "dump raw trace in ASCII"),
1810 OPT_BOOLEAN('p', "pids", &sched.skip_merge,
1811 "latency stats per pid instead of per comm"),
1745 OPT_END() 1812 OPT_END()
1746 }; 1813 };
1747 const struct option replay_options[] = { 1814 const struct option replay_options[] = {