perf lock: Drop the buffers multiplexing dependency

We need to deal with time ordered events to build a correct state machine of lock events. This is why we multiplex the lock events buffers. But the ordering is done from the kernel, on the tracing fast path, leading to high contention between cpus. Without multiplexing, the events appears in a weak order. If we have four events, each split per cpu, perf record will read the events buffers in the following order: [ CPU0 ev0, CPU0 ev1, CPU0 ev3, CPU0 ev4, CPU1 ev0, CPU1 ev0....] To handle a post processing reordering, we could just read and sort the whole in memory, but it just doesn't scale with high amounts of events: lock events can fill huge amounts in few times. Basically we need to sort in memory and find a "grace period" point when we know that a given slice of previously sorted events can be committed for post-processing, so that we can unload the memory usage step by step and keep a scalable sorting list. There is no strong rules about how to define such "grace period". What does this patch is: We define a FLUSH_PERIOD value that defines a grace period in seconds. We want to have a slice of events covering 2 * FLUSH_PERIOD in our sorted list. If FLUSH_PERIOD is big enough, it ensures every events that occured in the first half of the timeslice have all been buffered and there are none remaining and there won't be further to put inside this first timeslice. Then once we reach the 2 * FLUSH_PERIOD timeslice, we flush the first half to be gentle with the memory (the second half can still get new events in the middle, so wait another period to flush it) FLUSH_PERIOD is defined to 5 seconds. Say the first event started on time t0. We can safely assume that at the time we are processing events of t0 + 10 seconds, ther won't be anymore events to read from perf.data that occured between t0 and t0 + 5 seconds. Hence we can safely flush the first half. To point out funky bugs, we have a guardian that checks a new event timestamp is not below the last event's timestamp flushed and that displays a warning in this case. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> Cc: Li Zefan <lizf@cn.fujitsu.com> Cc: Lai Jiangshan <laijs@cn.fujitsu.com> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Jens Axboe <jens.axboe@oracle.com>
author: Frederic Weisbecker <fweisbec@gmail.com> 2010-02-03 03:09:33 -0500
committer: Frederic Weisbecker <fweisbec@gmail.com> 2010-02-27 11:06:19 -0500
commit: b67577dfb45580c498bfdb1bc76c00c3b2ad6310 (patch)
tree: 6d3c8a49706fea39d4279382618b84a9e8625951 /tools/perf/builtin-lock.c
parent: 84c6f88fc8265d7a712d7d6ed8fc1a878dfc84d1 (diff)
1 files changed, 146 insertions, 2 deletions
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index fb9ab2ad3f9..e12c844df1e 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -460,6 +460,150 @@ process_raw_event(void *data, int cpu,
                process_lock_release_event(data, event, cpu, timestamp, thread);
 }
+struct raw_event_queue {
+        u64                     timestamp;
+        int                     cpu;
+        void                    *data;
+        struct thread           *thread;
+        struct list_head        list;
+};
+static LIST_HEAD(raw_event_head);
+#define FLUSH_PERIOD    (5 * NSEC_PER_SEC)
+static u64 flush_limit = ULLONG_MAX;
+static u64 last_flush = 0;
+struct raw_event_queue *last_inserted;
+static void flush_raw_event_queue(u64 limit)
+{
+        struct raw_event_queue *tmp, *iter;
+        list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
+                if (iter->timestamp > limit)
+                        return;
+                if (iter == last_inserted)
+                        last_inserted = NULL;
+                process_raw_event(iter->data, iter->cpu, iter->timestamp,
+                                  iter->thread);
+                last_flush = iter->timestamp;
+                list_del(&iter->list);
+                free(iter->data);
+                free(iter);
+        }
+}
+static void __queue_raw_event_end(struct raw_event_queue *new)
+{
+        struct raw_event_queue *iter;
+        list_for_each_entry_reverse(iter, &raw_event_head, list) {
+                if (iter->timestamp < new->timestamp) {
+                        list_add(&new->list, &iter->list);
+                        return;
+                }
+        }
+        list_add(&new->list, &raw_event_head);
+}
+static void __queue_raw_event_before(struct raw_event_queue *new,
+                                     struct raw_event_queue *iter)
+{
+        list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
+                if (iter->timestamp < new->timestamp) {
+                        list_add(&new->list, &iter->list);
+                        return;
+                }
+        }
+        list_add(&new->list, &raw_event_head);
+}
+static void __queue_raw_event_after(struct raw_event_queue *new,
+                                     struct raw_event_queue *iter)
+{
+        list_for_each_entry_continue(iter, &raw_event_head, list) {
+                if (iter->timestamp > new->timestamp) {
+                        list_add_tail(&new->list, &iter->list);
+                        return;
+                }
+        }
+        list_add_tail(&new->list, &raw_event_head);
+}
+/* The queue is ordered by time */
+static void __queue_raw_event(struct raw_event_queue *new)
+{
+        if (!last_inserted) {
+                __queue_raw_event_end(new);
+                return;
+        }
+        /*
+         * Most of the time the current event has a timestamp
+         * very close to the last event inserted, unless we just switched
+         * to another event buffer. Having a sorting based on a list and
+         * on the last inserted event that is close to the current one is
+         * probably more efficient than an rbtree based sorting.
+         */
+        if (last_inserted->timestamp >= new->timestamp)
+                __queue_raw_event_before(new, last_inserted);
+        else
+                __queue_raw_event_after(new, last_inserted);
+}
+static void queue_raw_event(void *data, int raw_size, int cpu,
+                            u64 timestamp, struct thread *thread)
+{
+        struct raw_event_queue *new;
+        if (flush_limit == ULLONG_MAX)
+                flush_limit = timestamp + FLUSH_PERIOD;
+        if (timestamp < last_flush) {
+                printf("Warning: Timestamp below last timeslice flush\n");
+                return;
+        }
+        new = malloc(sizeof(*new));
+        if (!new)
+                die("Not enough memory\n");
+        new->timestamp = timestamp;
+        new->cpu = cpu;
+        new->thread = thread;
+        new->data = malloc(raw_size);
+        if (!new->data)
+                die("Not enough memory\n");
+        memcpy(new->data, data, raw_size);
+        __queue_raw_event(new);
+        last_inserted = new;
+        /*
+         * We want to have a slice of events covering 2 * FLUSH_PERIOD
+         * If FLUSH_PERIOD is big enough, it ensures every events that occured
+         * in the first half of the timeslice have all been buffered and there
+         * are none remaining (we need that because of the weakly ordered
+         * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
+         * timeslice, we flush the first half to be gentle with the memory
+         * (the second half can still get new events in the middle, so wait
+         * another period to flush it)
+         */
+        if (new->timestamp > flush_limit &&
+                new->timestamp - flush_limit > FLUSH_PERIOD) {
+                flush_limit += FLUSH_PERIOD;
+                flush_raw_event_queue(flush_limit);
+        }
+}
 static int process_sample_event(event_t *event, struct perf_session *session)
 {
        struct thread *thread;
@@ -480,7 +624,7 @@ static int process_sample_event(event_t *event, struct perf_session *session)
        if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
                return 0;
-        process_raw_event(data.raw_data, data.cpu, data.time, thread);
+        queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
        return 0;
 }
@@ -576,6 +720,7 @@ static void __cmd_report(void)
        setup_pager();
        select_key();
        read_events();
+        flush_raw_event_queue(ULLONG_MAX);
        sort_result();
        print_result();
 }
@@ -608,7 +753,6 @@ static const char *record_args[] = {
        "record",
        "-a",
        "-R",
-        "-M",
        "-f",
        "-m", "1024",
        "-c", "1",
author	Frederic Weisbecker <fweisbec@gmail.com>	2010-02-03 03:09:33 -0500
committer	Frederic Weisbecker <fweisbec@gmail.com>	2010-02-27 11:06:19 -0500
commit	b67577dfb45580c498bfdb1bc76c00c3b2ad6310 (patch)
tree	6d3c8a49706fea39d4279382618b84a9e8625951 /tools/perf/builtin-lock.c
parent	84c6f88fc8265d7a712d7d6ed8fc1a878dfc84d1 (diff)

diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c index fb9ab2ad3f9..e12c844df1e 100644 --- a/tools/perf/builtin-lock.c +++ b/tools/perf/builtin-lock.c
@@ -460,6 +460,150 @@ process_raw_event(void *data, int cpu,
460	process_lock_release_event(data, event, cpu, timestamp, thread);	460	process_lock_release_event(data, event, cpu, timestamp, thread);
461	}	461	}
462		462
		463	struct raw_event_queue {
		464	u64 timestamp;
		465	int cpu;
		466	void *data;
		467	struct thread *thread;
		468	struct list_head list;
		469	};
		470
		471	static LIST_HEAD(raw_event_head);
		472
		473	#define FLUSH_PERIOD (5 * NSEC_PER_SEC)
		474
		475	static u64 flush_limit = ULLONG_MAX;
		476	static u64 last_flush = 0;
		477	struct raw_event_queue *last_inserted;
		478
		479	static void flush_raw_event_queue(u64 limit)
		480	{
		481	struct raw_event_queue tmp, iter;
		482
		483	list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
		484	if (iter->timestamp > limit)
		485	return;
		486
		487	if (iter == last_inserted)
		488	last_inserted = NULL;
		489
		490	process_raw_event(iter->data, iter->cpu, iter->timestamp,
		491	iter->thread);
		492
		493	last_flush = iter->timestamp;
		494	list_del(&iter->list);
		495	free(iter->data);
		496	free(iter);
		497	}
		498	}
		499
		500	static void __queue_raw_event_end(struct raw_event_queue *new)
		501	{
		502	struct raw_event_queue *iter;
		503
		504	list_for_each_entry_reverse(iter, &raw_event_head, list) {
		505	if (iter->timestamp < new->timestamp) {
		506	list_add(&new->list, &iter->list);
		507	return;
		508	}
		509	}
		510
		511	list_add(&new->list, &raw_event_head);
		512	}
		513
		514	static void __queue_raw_event_before(struct raw_event_queue *new,
		515	struct raw_event_queue *iter)
		516	{
		517	list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
		518	if (iter->timestamp < new->timestamp) {
		519	list_add(&new->list, &iter->list);
		520	return;
		521	}
		522	}
		523
		524	list_add(&new->list, &raw_event_head);
		525	}
		526
		527	static void __queue_raw_event_after(struct raw_event_queue *new,
		528	struct raw_event_queue *iter)
		529	{
		530	list_for_each_entry_continue(iter, &raw_event_head, list) {
		531	if (iter->timestamp > new->timestamp) {
		532	list_add_tail(&new->list, &iter->list);
		533	return;
		534	}
		535	}
		536	list_add_tail(&new->list, &raw_event_head);
		537	}
		538
		539	/* The queue is ordered by time */
		540	static void __queue_raw_event(struct raw_event_queue *new)
		541	{
		542	if (!last_inserted) {
		543	__queue_raw_event_end(new);
		544	return;
		545	}
		546
		547	/*
		548	* Most of the time the current event has a timestamp
		549	* very close to the last event inserted, unless we just switched
		550	* to another event buffer. Having a sorting based on a list and
		551	* on the last inserted event that is close to the current one is
		552	* probably more efficient than an rbtree based sorting.
		553	*/
		554	if (last_inserted->timestamp >= new->timestamp)
		555	__queue_raw_event_before(new, last_inserted);
		556	else
		557	__queue_raw_event_after(new, last_inserted);
		558	}
		559
		560	static void queue_raw_event(void *data, int raw_size, int cpu,
		561	u64 timestamp, struct thread *thread)
		562	{
		563	struct raw_event_queue *new;
		564
		565	if (flush_limit == ULLONG_MAX)
		566	flush_limit = timestamp + FLUSH_PERIOD;
		567
		568	if (timestamp < last_flush) {
		569	printf("Warning: Timestamp below last timeslice flush\n");
		570	return;
		571	}
		572
		573	new = malloc(sizeof(*new));
		574	if (!new)
		575	die("Not enough memory\n");
		576
		577	new->timestamp = timestamp;
		578	new->cpu = cpu;
		579	new->thread = thread;
		580
		581	new->data = malloc(raw_size);
		582	if (!new->data)
		583	die("Not enough memory\n");
		584
		585	memcpy(new->data, data, raw_size);
		586
		587	__queue_raw_event(new);
		588	last_inserted = new;
		589
		590	/*
		591	* We want to have a slice of events covering 2 * FLUSH_PERIOD
		592	* If FLUSH_PERIOD is big enough, it ensures every events that occured
		593	* in the first half of the timeslice have all been buffered and there
		594	* are none remaining (we need that because of the weakly ordered
		595	* event recording we have). Then once we reach the 2 * FLUSH_PERIOD
		596	* timeslice, we flush the first half to be gentle with the memory
		597	* (the second half can still get new events in the middle, so wait
		598	* another period to flush it)
		599	*/
		600	if (new->timestamp > flush_limit &&
		601	new->timestamp - flush_limit > FLUSH_PERIOD) {
		602	flush_limit += FLUSH_PERIOD;
		603	flush_raw_event_queue(flush_limit);
		604	}
		605	}
		606
463	static int process_sample_event(event_t event, struct perf_session session)	607	static int process_sample_event(event_t event, struct perf_session session)
464	{	608	{
465	struct thread *thread;	609	struct thread *thread;
@@ -480,7 +624,7 @@ static int process_sample_event(event_t event, struct perf_session session)
480	if (profile_cpu != -1 && profile_cpu != (int) data.cpu)	624	if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
481	return 0;	625	return 0;
482		626
483	process_raw_event(data.raw_data, data.cpu, data.time, thread);	627	queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
484		628
485	return 0;	629	return 0;
486	}	630	}
@@ -576,6 +720,7 @@ static void __cmd_report(void)
576	setup_pager();	720	setup_pager();
577	select_key();	721	select_key();
578	read_events();	722	read_events();
		723	flush_raw_event_queue(ULLONG_MAX);
579	sort_result();	724	sort_result();
580	print_result();	725	print_result();
581	}	726	}
@@ -608,7 +753,6 @@ static const char *record_args[] = {
608	"record",	753	"record",
609	"-a",	754	"-a",
610	"-R",	755	"-R",
611	"-M",
612	"-f",	756	"-f",
613	"-m", "1024",	757	"-m", "1024",
614	"-c", "1",	758	"-c", "1",