author		Frederic Weisbecker <fweisbec@gmail.com>	2010-04-23 18:04:12 -0400
committer	Frederic Weisbecker <fweisbec@gmail.com>	2010-04-23 21:49:58 -0400
commit		c61e52ee705f938596d307625dce00cc4345aaf0 (patch)
tree		6bb8a1d2662790c6b5ee8d09e0b94d91c97d1da0 /tools/perf/util/session.c
parent		5710fcad7c367adefe5634dc998f1f88780a8457 (diff)
perf: Generalize perf lock's sample event reordering to the session layer
The sample events recorded by perf record are not time-ordered
because we have one buffer per cpu for each event (even demultiplexed
per task/per cpu for task-bound events). But when we read trace
events we want them ordered by time, because many state machines are
involved.
There are currently two ways perf tools deal with that:

- use -M to multiplex all buffers (perf sched, perf kmem).
  But this creates a lot of contention on SMP machines at
  record time.

- use post-processing time reordering (perf timechart, perf lock).
  The reordering used by timechart is simple but doesn't scale
  well with a huge flow of events, in terms of performance and
  memory use (unusable with perf lock, for example).
Perf lock has its own sample reordering that flushes its memory
use on a regular basis and sorts based on the previously queued
event (a new event to be queued is close to the previous one most
of the time).
This patch exports perf lock's sample reordering facility to the
session layer that reads the events. If a tool wants to get
time-ordered sample events, it only needs to set its
struct perf_event_ops::ordered_samples to true, and that's it.
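
For a tool, opting in looks something like the sketch below (a
minimal illustration: the handler and its name are hypothetical;
only the .sample callback and the .ordered_samples flag come from
this patch):

	/* Hypothetical tool code: ask the session layer for time-ordered
	 * samples instead of multiplexing buffers with -M at record time. */
	static int process_sample_event(event_t *event, struct perf_session *session)
	{
		/* Samples arrive here sorted by timestamp. */
		return 0;
	}

	static struct perf_event_ops event_ops = {
		.sample		 = process_sample_event,
		.ordered_samples = true,	/* opt in to session-layer reordering */
	};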
This prepares tracing-based perf tools to get rid of the need to
use buffer multiplexing (-M) or to implement their own reordering.

Also lower the flush period to 2 seconds, as that is already
sufficient.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Diffstat (limited to 'tools/perf/util/session.c')
-rw-r--r--	tools/perf/util/session.c	179
1 file changed, 178 insertions(+), 1 deletion(-)
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d88ae5c270f..b7aade2184b2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,6 +98,8 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->cwdlen = 0;
 	self->unknown_events = 0;
 	self->kerninfo_root = RB_ROOT;
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
 	if (mode == O_RDONLY) {
 		if (perf_session__open(self, force) < 0)
@@ -351,6 +353,178 @@ static event__swap_op event__swap_ops[] = {
 	[PERF_RECORD_HEADER_MAX] = NULL,
 };
 
+struct sample_queue {
+	u64			timestamp;
+	struct sample_event	*event;
+	struct list_head	list;
+};
+
+#define FLUSH_PERIOD	(2 * NSEC_PER_SEC)
+
+static void flush_sample_queue(struct perf_session *s,
+			       struct perf_event_ops *ops)
+{
+	struct list_head *head = &s->ordered_samples.samples_head;
+	u64 limit = s->ordered_samples.flush_limit;
+	struct sample_queue *tmp, *iter;
+
+	if (!ops->ordered_samples)
+		return;
+
+	list_for_each_entry_safe(iter, tmp, head, list) {
+		if (iter->timestamp > limit)
+			return;
+
+		if (iter == s->ordered_samples.last_inserted)
+			s->ordered_samples.last_inserted = NULL;
+
+		ops->sample((event_t *)iter->event, s);
+
+		s->ordered_samples.last_flush = iter->timestamp;
+		list_del(&iter->list);
+		free(iter->event);
+		free(iter);
+	}
+}
+
+static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
+{
+	struct sample_queue *iter;
+
+	list_for_each_entry_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_before(struct sample_queue *new,
+				  struct sample_queue *iter,
+				  struct list_head *head)
+{
+	list_for_each_entry_continue_reverse(iter, head, list) {
+		if (iter->timestamp < new->timestamp) {
+			list_add(&new->list, &iter->list);
+			return;
+		}
+	}
+
+	list_add(&new->list, head);
+}
+
+static void __queue_sample_after(struct sample_queue *new,
+				 struct sample_queue *iter,
+				 struct list_head *head)
+{
+	list_for_each_entry_continue(iter, head, list) {
+		if (iter->timestamp > new->timestamp) {
+			list_add_tail(&new->list, &iter->list);
+			return;
+		}
+	}
+	list_add_tail(&new->list, head);
+}
+
+/* The queue is ordered by time */
+static void __queue_sample_event(struct sample_queue *new,
+				 struct perf_session *s)
+{
+	struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
+	struct list_head *head = &s->ordered_samples.samples_head;
+
+
+	if (!last_inserted) {
+		__queue_sample_end(new, head);
+		return;
+	}
+
+	/*
+	 * Most of the time the current event has a timestamp
+	 * very close to the last event inserted, unless we just switched
+	 * to another event buffer. Having a sorting based on a list and
+	 * on the last inserted event that is close to the current one is
+	 * probably more efficient than an rbtree based sorting.
+	 */
+	if (last_inserted->timestamp >= new->timestamp)
+		__queue_sample_before(new, last_inserted, head);
+	else
+		__queue_sample_after(new, last_inserted, head);
+}
+
+static int queue_sample_event(event_t *event, struct sample_data *data,
+			      struct perf_session *s,
+			      struct perf_event_ops *ops)
+{
+	u64 timestamp = data->time;
+	struct sample_queue *new;
+	u64 flush_limit;
+
+
+	if (s->ordered_samples.flush_limit == ULLONG_MAX)
+		s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
+
+	if (timestamp < s->ordered_samples.last_flush) {
+		printf("Warning: Timestamp below last timeslice flush\n");
+		return -EINVAL;
+	}
+
+	new = malloc(sizeof(*new));
+	if (!new)
+		return -ENOMEM;
+
+	new->timestamp = timestamp;
+
+	new->event = malloc(event->header.size);
+	if (!new->event) {
+		free(new);
+		return -ENOMEM;
+	}
+
+	memcpy(new->event, event, event->header.size);
+
+	__queue_sample_event(new, s);
+	s->ordered_samples.last_inserted = new;
+
+	/*
+	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
+	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
+	 * in the first half of the timeslice have all been buffered and there
+	 * are none remaining (we need that because of the weakly ordered
+	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
+	 * timeslice, we flush the first half to be gentle with the memory
+	 * (the second half can still get new events in the middle, so wait
+	 * another period to flush it)
+	 */
+	flush_limit = s->ordered_samples.flush_limit;
+
+	if (new->timestamp > flush_limit &&
+	    new->timestamp - flush_limit > FLUSH_PERIOD) {
+		s->ordered_samples.flush_limit += FLUSH_PERIOD;
+		flush_sample_queue(s, ops);
+	}
+
+	return 0;
+}
+
+static int perf_session__process_sample(event_t *event, struct perf_session *s,
+					struct perf_event_ops *ops)
+{
+	struct sample_data data;
+
+	if (!ops->ordered_samples)
+		return ops->sample(event, s);
+
+	bzero(&data, sizeof(struct sample_data));
+	event__parse_sample(event, s->sample_type, &data);
+
+	queue_sample_event(event, &data, s, ops);
+
+	return 0;
+}
+
 static int perf_session__process_event(struct perf_session *self,
 					event_t *event,
 					struct perf_event_ops *ops,
@@ -371,7 +545,7 @@ static int perf_session__process_event(struct perf_session *self,
 
 	switch (event->header.type) {
 	case PERF_RECORD_SAMPLE:
-		return ops->sample(event, self);
+		return perf_session__process_sample(event, self, ops);
 	case PERF_RECORD_MMAP:
 		return ops->mmap(event, self);
 	case PERF_RECORD_COMM:
@@ -611,6 +785,9 @@ more:
 		goto more;
 done:
 	err = 0;
+	/* do the final flush for ordered samples */
+	self->ordered_samples.flush_limit = ULLONG_MAX;
+	flush_sample_queue(self, ops);
 out_err:
 	ui_progress__delete(progress);
 	return err;
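
To see how the flush window in queue_sample_event() behaves, here is a
small standalone sketch (an illustration only: timestamps are bare
integers standing in for nanosecond clocks, and the sorted queue itself
is elided):

	#include <stdio.h>

	#define FLUSH_PERIOD 2ULL	/* the patch uses 2 * NSEC_PER_SEC */

	int main(void)
	{
		/* Timestamps as they might arrive from weakly ordered buffers. */
		unsigned long long stamps[] = { 1, 3, 2, 5, 4, 8, 7, 12 };
		unsigned long long flush_limit = stamps[0] + FLUSH_PERIOD;
		unsigned int i;

		for (i = 0; i < sizeof(stamps) / sizeof(stamps[0]); i++) {
			unsigned long long ts = stamps[i];

			/* (sorted insertion of ts into the queue elided) */

			/* Flush only once the newest timestamp is more than a
			 * full period past the limit, so the flushed half of
			 * the window can no longer receive late events. */
			if (ts > flush_limit && ts - flush_limit > FLUSH_PERIOD) {
				printf("ts=%llu: flush queued events <= %llu\n",
				       ts, flush_limit);
				flush_limit += FLUSH_PERIOD;
			}
		}

		/* At end of file, the session sets flush_limit to ULLONG_MAX
		 * and drains whatever is still queued. */
		return 0;
	}

This prints a flush of events up to timestamp 3 when 8 arrives and up
to 5 when 12 arrives: an event is only delivered once the stream is a
full period past it, which is the memory-versus-ordering trade-off the
comment in queue_sample_event() describes.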