perf sched: Add option to merge like comms to lat output

Sometimes when debugging large multi-threaded applications it is helpful to collate all of the latency numbers into one bulk record to get an idea of what is going on. This patch does this by merging any entries that belong to the same comm into one entry and then spits out those totals. I've also slightly changed the output so you can see how many threads were merged in the processing. Here is the new default output format ----------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------- chrome:(23) | 740.878 ms | 2612 | avg: 0.022 ms | max: 0.845 ms | max at: 7935.254223 s pulseaudio:1523 | 94.440 ms | 597 | avg: 0.027 ms | max: 0.110 ms | max at: 7934.668372 s threaded-ml:6042 | 72.554 ms | 386 | avg: 0.035 ms | max: 1.186 ms | max at: 7935.330911 s Chrome_IOThread:3832 | 52.388 ms | 456 | avg: 0.021 ms | max: 1.365 ms | max at: 7935.330602 s Chrome_ChildIOT:(7) | 50.694 ms | 743 | avg: 0.021 ms | max: 1.448 ms | max at: 7935.256659 s Compositor:5510 | 30.012 ms | 192 | avg: 0.019 ms | max: 0.131 ms | max at: 7936.636815 s plugin_audio_th:6043 | 24.828 ms | 314 | avg: 0.018 ms | max: 0.143 ms | max at: 7936.205994 s CompositorTileW:(2) | 14.099 ms | 45 | avg: 0.022 ms | max: 0.153 ms | max at: 7937.521800 s the (#) after the task is the number of tasks merged, and then if there were no tasks merged it just shows the pid. Here is the same trace file with the -p option to print the per-pid latency numbers ----------------------------------------------------------------------------------------------------------- Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at | ----------------------------------------------------------------------------------------------------------- chrome:5500 | 386.872 ms | 387 | avg: 0.023 ms | max: 0.241 ms | max at: 7936.001694 s pulseaudio:1523 | 94.440 ms | 597 | avg: 0.027 ms | max: 0.110 ms | max at: 7934.668372 s threaded-ml:6042 | 72.554 ms | 386 | avg: 0.035 ms | max: 1.186 ms | max at: 7935.330911 s chrome:10226 | 69.710 ms | 251 | avg: 0.023 ms | max: 0.764 ms | max at: 7935.992305 s chrome:4267 | 64.551 ms | 418 | avg: 0.021 ms | max: 0.294 ms | max at: 7937.862427 s chrome:4827 | 62.268 ms | 54 | avg: 0.029 ms | max: 0.666 ms | max at: 7935.992813 s Chrome_IOThread:3832 | 52.388 ms | 456 | avg: 0.021 ms | max: 1.365 ms | max at: 7935.330602 s chrome:3776 | 46.150 ms | 349 | avg: 0.023 ms | max: 0.845 ms | max at: 7935.254223 s Signed-off-by: Josef Bacik <jbacik@fb.com> Acked-by: Ingo Molnar <mingo@kernel.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: kernel-team@fb.com Link: http://lkml.kernel.org/r/1432300720-30478-1-git-send-email-jbacik@fb.com Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
author: Josef Bacik <jbacik@fb.com> 2015-05-22 09:18:40 -0400
committer: Arnaldo Carvalho de Melo <acme@redhat.com> 2015-05-27 11:21:45 -0400
commit: 2f80dd4488c204a4850554746eb31f25f5a84405 (patch)
tree: e424e67e0fde7c9d9890187ba298063b3d33ae38 /tools/perf
parent: e8b7ea4356fdd3c4de5478f3418eb84f8dce2b61 (diff)
1 files changed, 72 insertions, 5 deletions
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 79273ecf92eb..33962612a5e9 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
        u64                     total_lat;
        u64                     nb_atoms;
        u64                     total_runtime;
+        int                     num_merged;
 };
 typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
        u64              all_runtime;
        u64              all_count;
        u64              cpu_last_switched[MAX_CPUS];
-        struct rb_root   atom_root, sorted_atom_root;
+        struct rb_root   atom_root, sorted_atom_root, merged_atom_root;
        struct list_head sort_list, cmp_pid;
        bool force;
+        bool skip_merge;
 };
 static u64 get_nsecs(void)
@@ -1182,7 +1184,10 @@ static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_
        sched->all_runtime += work_list->total_runtime;
        sched->all_count   += work_list->nb_atoms;
-        ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
+        if (work_list->num_merged > 1)
+                ret = printf("  %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
+        else
+                ret = printf("  %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
        for (i = 0; i < 24 - ret; i++)
                printf(" ");
@@ -1302,17 +1307,22 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
 static void perf_sched__sort_lat(struct perf_sched *sched)
 {
        struct rb_node *node;
+        struct rb_root *root = &sched->atom_root;
+again:
        for (;;) {
                struct work_atoms *data;
-                node = rb_first(&sched->atom_root);
+                node = rb_first(root);
                if (!node)
                        break;
-                rb_erase(node, &sched->atom_root);
+                rb_erase(node, root);
                data = rb_entry(node, struct work_atoms, node);
                __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
        }
+        if (root == &sched->atom_root) {
+                root = &sched->merged_atom_root;
+                goto again;
+        }
 }
 static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1572,6 +1582,59 @@ static void print_bad_events(struct perf_sched *sched)
        }
 }
+static void __merge_work_atoms(struct rb_root *root, struct work_atoms *data)
+{
+        struct rb_node **new = &(root->rb_node), *parent = NULL;
+        struct work_atoms *this;
+        const char *comm = thread__comm_str(data->thread), *this_comm;
+        while (*new) {
+                int cmp;
+                this = container_of(*new, struct work_atoms, node);
+                parent = *new;
+                this_comm = thread__comm_str(this->thread);
+                cmp = strcmp(comm, this_comm);
+                if (cmp > 0) {
+                        new = &((*new)->rb_left);
+                } else if (cmp < 0) {
+                        new = &((*new)->rb_right);
+                } else {
+                        this->num_merged++;
+                        this->total_runtime += data->total_runtime;
+                        this->nb_atoms += data->nb_atoms;
+                        this->total_lat += data->total_lat;
+                        list_splice(&data->work_list, &this->work_list);
+                        if (this->max_lat < data->max_lat) {
+                                this->max_lat = data->max_lat;
+                                this->max_lat_at = data->max_lat_at;
+                        }
+                        zfree(&data);
+                        return;
+                }
+        }
+        data->num_merged++;
+        rb_link_node(&data->node, parent, new);
+        rb_insert_color(&data->node, root);
+}
+static void perf_sched__merge_lat(struct perf_sched *sched)
+{
+        struct work_atoms *data;
+        struct rb_node *node;
+        if (sched->skip_merge)
+                return;
+        while ((node = rb_first(&sched->atom_root))) {
+                rb_erase(node, &sched->atom_root);
+                data = rb_entry(node, struct work_atoms, node);
+                __merge_work_atoms(&sched->merged_atom_root, data);
+        }
+}
 static int perf_sched__lat(struct perf_sched *sched)
 {
        struct rb_node *next;
@@ -1581,6 +1644,7 @@ static int perf_sched__lat(struct perf_sched *sched)
        if (perf_sched__read_events(sched))
                return -1;
+        perf_sched__merge_lat(sched);
        perf_sched__sort_lat(sched);
        printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1732,6 +1796,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                .profile_cpu          = -1,
                .next_shortname1      = 'A',
                .next_shortname2      = '0',
+                .skip_merge           = 0,
        };
        const struct option latency_options[] = {
        OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1742,6 +1807,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
                    "CPU to profile on"),
        OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
                    "dump raw trace in ASCII"),
+        OPT_BOOLEAN('p', "pids", &sched.skip_merge,
+                    "latency stats per pid instead of per comm"),
        OPT_END()
        };
        const struct option replay_options[] = {
author	Josef Bacik <jbacik@fb.com>	2015-05-22 09:18:40 -0400
committer	Arnaldo Carvalho de Melo <acme@redhat.com>	2015-05-27 11:21:45 -0400
commit	2f80dd4488c204a4850554746eb31f25f5a84405 (patch)
tree	e424e67e0fde7c9d9890187ba298063b3d33ae38 /tools/perf
parent	e8b7ea4356fdd3c4de5478f3418eb84f8dce2b61 (diff)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 79273ecf92eb..33962612a5e9 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c
@@ -95,6 +95,7 @@ struct work_atoms {
95	u64 total_lat;	95	u64 total_lat;
96	u64 nb_atoms;	96	u64 nb_atoms;
97	u64 total_runtime;	97	u64 total_runtime;
		98	int num_merged;
98	};	99	};
99		100
100	typedef int (sort_fn_t)(struct work_atoms , struct work_atoms *);	101	typedef int (sort_fn_t)(struct work_atoms , struct work_atoms *);
@@ -168,9 +169,10 @@ struct perf_sched {
168	u64 all_runtime;	169	u64 all_runtime;
169	u64 all_count;	170	u64 all_count;
170	u64 cpu_last_switched[MAX_CPUS];	171	u64 cpu_last_switched[MAX_CPUS];
171	struct rb_root atom_root, sorted_atom_root;	172	struct rb_root atom_root, sorted_atom_root, merged_atom_root;
172	struct list_head sort_list, cmp_pid;	173	struct list_head sort_list, cmp_pid;
173	bool force;	174	bool force;
		175	bool skip_merge;
174	};	176	};
175		177
176	static u64 get_nsecs(void)	178	static u64 get_nsecs(void)
@@ -1182,7 +1184,10 @@ static void output_lat_thread(struct perf_sched sched, struct work_atoms work_
1182	sched->all_runtime += work_list->total_runtime;	1184	sched->all_runtime += work_list->total_runtime;
1183	sched->all_count += work_list->nb_atoms;	1185	sched->all_count += work_list->nb_atoms;
1184		1186
1185	ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);	1187	if (work_list->num_merged > 1)
		1188	ret = printf(" %s:(%d) ", thread__comm_str(work_list->thread), work_list->num_merged);
		1189	else
		1190	ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
1186		1191
1187	for (i = 0; i < 24 - ret; i++)	1192	for (i = 0; i < 24 - ret; i++)
1188	printf(" ");	1193	printf(" ");
@@ -1302,17 +1307,22 @@ static int sort_dimension__add(const char tok, struct list_head list)
1302	static void perf_sched__sort_lat(struct perf_sched *sched)	1307	static void perf_sched__sort_lat(struct perf_sched *sched)
1303	{	1308	{
1304	struct rb_node *node;	1309	struct rb_node *node;
1305		1310	struct rb_root *root = &sched->atom_root;
		1311	again:
1306	for (;;) {	1312	for (;;) {
1307	struct work_atoms *data;	1313	struct work_atoms *data;
1308	node = rb_first(&sched->atom_root);	1314	node = rb_first(root);
1309	if (!node)	1315	if (!node)
1310	break;	1316	break;
1311		1317
1312	rb_erase(node, &sched->atom_root);	1318	rb_erase(node, root);
1313	data = rb_entry(node, struct work_atoms, node);	1319	data = rb_entry(node, struct work_atoms, node);
1314	__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);	1320	__thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
1315	}	1321	}
		1322	if (root == &sched->atom_root) {
		1323	root = &sched->merged_atom_root;
		1324	goto again;
		1325	}
1316	}	1326	}
1317		1327
1318	static int process_sched_wakeup_event(struct perf_tool *tool,	1328	static int process_sched_wakeup_event(struct perf_tool *tool,
@@ -1572,6 +1582,59 @@ static void print_bad_events(struct perf_sched *sched)
1572	}	1582	}
1573	}	1583	}
1574		1584
		1585	static void __merge_work_atoms(struct rb_root root, struct work_atoms data)
		1586	{
		1587	struct rb_node *new = &(root->rb_node), parent = NULL;
		1588	struct work_atoms *this;
		1589	const char comm = thread__comm_str(data->thread), this_comm;
		1590
		1591	while (*new) {
		1592	int cmp;
		1593
		1594	this = container_of(*new, struct work_atoms, node);
		1595	parent = *new;
		1596
		1597	this_comm = thread__comm_str(this->thread);
		1598	cmp = strcmp(comm, this_comm);
		1599	if (cmp > 0) {
		1600	new = &((*new)->rb_left);
		1601	} else if (cmp < 0) {
		1602	new = &((*new)->rb_right);
		1603	} else {
		1604	this->num_merged++;
		1605	this->total_runtime += data->total_runtime;
		1606	this->nb_atoms += data->nb_atoms;
		1607	this->total_lat += data->total_lat;
		1608	list_splice(&data->work_list, &this->work_list);
		1609	if (this->max_lat < data->max_lat) {
		1610	this->max_lat = data->max_lat;
		1611	this->max_lat_at = data->max_lat_at;
		1612	}
		1613	zfree(&data);
		1614	return;
		1615	}
		1616	}
		1617
		1618	data->num_merged++;
		1619	rb_link_node(&data->node, parent, new);
		1620	rb_insert_color(&data->node, root);
		1621	}
		1622
		1623	static void perf_sched__merge_lat(struct perf_sched *sched)
		1624	{
		1625	struct work_atoms *data;
		1626	struct rb_node *node;
		1627
		1628	if (sched->skip_merge)
		1629	return;
		1630
		1631	while ((node = rb_first(&sched->atom_root))) {
		1632	rb_erase(node, &sched->atom_root);
		1633	data = rb_entry(node, struct work_atoms, node);
		1634	__merge_work_atoms(&sched->merged_atom_root, data);
		1635	}
		1636	}
		1637
1575	static int perf_sched__lat(struct perf_sched *sched)	1638	static int perf_sched__lat(struct perf_sched *sched)
1576	{	1639	{
1577	struct rb_node *next;	1640	struct rb_node *next;
@@ -1581,6 +1644,7 @@ static int perf_sched__lat(struct perf_sched *sched)
1581	if (perf_sched__read_events(sched))	1644	if (perf_sched__read_events(sched))
1582	return -1;	1645	return -1;
1583		1646
		1647	perf_sched__merge_lat(sched);
1584	perf_sched__sort_lat(sched);	1648	perf_sched__sort_lat(sched);
1585		1649
1586	printf("\n -----------------------------------------------------------------------------------------------------------------\n");	1650	printf("\n -----------------------------------------------------------------------------------------------------------------\n");
@@ -1732,6 +1796,7 @@ int cmd_sched(int argc, const char *argv, const char prefix __maybe_unused)
1732	.profile_cpu = -1,	1796	.profile_cpu = -1,
1733	.next_shortname1 = 'A',	1797	.next_shortname1 = 'A',
1734	.next_shortname2 = '0',	1798	.next_shortname2 = '0',
		1799	.skip_merge = 0,
1735	};	1800	};
1736	const struct option latency_options[] = {	1801	const struct option latency_options[] = {
1737	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",	1802	OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
@@ -1742,6 +1807,8 @@ int cmd_sched(int argc, const char *argv, const char prefix __maybe_unused)
1742	"CPU to profile on"),	1807	"CPU to profile on"),
1743	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,	1808	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
1744	"dump raw trace in ASCII"),	1809	"dump raw trace in ASCII"),
		1810	OPT_BOOLEAN('p', "pids", &sched.skip_merge,
		1811	"latency stats per pid instead of per comm"),
1745	OPT_END()	1812	OPT_END()
1746	};	1813	};
1747	const struct option replay_options[] = {	1814	const struct option replay_options[] = {