author		Frederic Weisbecker <fweisbec@gmail.com>	2010-04-23 18:04:12 -0400
committer	Frederic Weisbecker <fweisbec@gmail.com>	2010-04-23 21:49:58 -0400
commit		c61e52ee705f938596d307625dce00cc4345aaf0 (patch)
tree		6bb8a1d2662790c6b5ee8d09e0b94d91c97d1da0	/tools/perf/builtin-lock.c
parent		5710fcad7c367adefe5634dc998f1f88780a8457 (diff)
perf: Generalize perf lock's sample event reordering to the session layer
The sample events recorded by perf record are not time ordered, because we have one buffer per cpu for each event (even demultiplexed per task/per cpu for task bound events). But when we read trace events we want them ordered by time, because many state machines are involved.

There are currently two ways perf tools deal with that:

- use -M to multiplex all buffers (perf sched, perf kmem).
  But this creates a lot of contention on SMP machines at record time.

- use post-processing time reordering (perf timechart, perf lock).
  The reordering used by timechart is simple but doesn't scale well with a
  huge flow of events, in terms of both performance and memory use
  (unusable with perf lock, for example). Perf lock has its own sample
  reordering that flushes its memory use on a regular basis and sorts
  based on the previously queued event (a newly queued event is close to
  the previous one most of the time).

This patch exports perf lock's sample reordering facility to the session layer that reads the events. If a tool wants time-ordered sample events, it only needs to set its struct perf_event_ops::ordered_samples to true.

This prepares tracing-based perf tools to get rid of the need to multiplex buffers (-M) or to implement their own reordering.

Also lower the flush period to 2 seconds, as that is already sufficient.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
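For a tool author, the whole opt-in is the new ordered_samples flag in struct perf_event_ops, as the builtin-lock.c hunks below illustrate. The following is a minimal sketch of the consumer side; the sample handler and the ops struct mirror this patch, while the includes, the read_events() driver and the exact perf_session__new()/perf_session__process_events() signatures are assumptions based on the tools/perf tree of that era, not part of this change.

/*
 * Sketch only: a tool opting into session-layer sample reordering.
 * Headers and the read_events() driver are assumed from the tools/perf
 * tree of that era; the ops struct mirrors this patch.
 */
#include <fcntl.h>

#include "util/util.h"
#include "util/event.h"
#include "util/session.h"

static const char *input_name = "perf.data";

static int process_sample_event(event_t *self, struct perf_session *s)
{
	struct sample_data data;

	bzero(&data, sizeof(data));
	event__parse_sample(self, s->sample_type, &data);

	/*
	 * Samples are delivered here already sorted by data.time, so the
	 * tool can feed its state machine directly (builtin-lock.c calls
	 * its process_raw_event() at this point).
	 */
	return 0;
}

static struct perf_event_ops eops = {
	.sample			= process_sample_event,
	.comm			= event__process_comm,
	.ordered_samples	= true,	/* ask the session layer to reorder by time */
};

static int read_events(void)
{
	struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);

	if (!session)
		die("Initializing perf session failed\n");

	return perf_session__process_events(session, &eops);
}

With ordered_samples set, the session layer takes over the queue-and-flush bookkeeping that the hunks below remove from perf lock (queue_raw_event(), flush_raw_event_queue() and friends).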
Diffstat (limited to 'tools/perf/builtin-lock.c')
-rw-r--r--	tools/perf/builtin-lock.c	197
1 file changed, 22 insertions, 175 deletions
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 716d8c544a56..ce276750b140 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -316,8 +316,6 @@ alloc_failed:
 
 static char const *input_name = "perf.data";
 
-static int profile_cpu = -1;
-
 struct raw_event_sample {
 	u32 size;
 	char data[0];
@@ -697,8 +695,7 @@ process_lock_release_event(void *data,
 }
 
 static void
-process_raw_event(void *data, int cpu __used,
-		  u64 timestamp __used, struct thread *thread __used)
+process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
 {
 	struct event *event;
 	int type;
@@ -716,176 +713,6 @@ process_raw_event(void *data, int cpu __used,
 		process_lock_release_event(data, event, cpu, timestamp, thread);
 }
 
-struct raw_event_queue {
-	u64 timestamp;
-	int cpu;
-	void *data;
-	struct thread *thread;
-	struct list_head list;
-};
-
-static LIST_HEAD(raw_event_head);
-
-#define FLUSH_PERIOD (5 * NSEC_PER_SEC)
-
-static u64 flush_limit = ULLONG_MAX;
-static u64 last_flush = 0;
-struct raw_event_queue *last_inserted;
-
-static void flush_raw_event_queue(u64 limit)
-{
-	struct raw_event_queue *tmp, *iter;
-
-	list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
-		if (iter->timestamp > limit)
-			return;
-
-		if (iter == last_inserted)
-			last_inserted = NULL;
-
-		process_raw_event(iter->data, iter->cpu, iter->timestamp,
-				  iter->thread);
-
-		last_flush = iter->timestamp;
-		list_del(&iter->list);
-		free(iter->data);
-		free(iter);
-	}
-}
-
-static void __queue_raw_event_end(struct raw_event_queue *new)
-{
-	struct raw_event_queue *iter;
-
-	list_for_each_entry_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_before(struct raw_event_queue *new,
-				     struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
-		if (iter->timestamp < new->timestamp) {
-			list_add(&new->list, &iter->list);
-			return;
-		}
-	}
-
-	list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_after(struct raw_event_queue *new,
-				    struct raw_event_queue *iter)
-{
-	list_for_each_entry_continue(iter, &raw_event_head, list) {
-		if (iter->timestamp > new->timestamp) {
-			list_add_tail(&new->list, &iter->list);
-			return;
-		}
-	}
-	list_add_tail(&new->list, &raw_event_head);
-}
-
-/* The queue is ordered by time */
-static void __queue_raw_event(struct raw_event_queue *new)
-{
-	if (!last_inserted) {
-		__queue_raw_event_end(new);
-		return;
-	}
-
-	/*
-	 * Most of the time the current event has a timestamp
-	 * very close to the last event inserted, unless we just switched
-	 * to another event buffer. Having a sorting based on a list and
-	 * on the last inserted event that is close to the current one is
-	 * probably more efficient than an rbtree based sorting.
-	 */
-	if (last_inserted->timestamp >= new->timestamp)
-		__queue_raw_event_before(new, last_inserted);
-	else
-		__queue_raw_event_after(new, last_inserted);
-}
-
-static void queue_raw_event(void *data, int raw_size, int cpu,
-			    u64 timestamp, struct thread *thread)
-{
-	struct raw_event_queue *new;
-
-	if (flush_limit == ULLONG_MAX)
-		flush_limit = timestamp + FLUSH_PERIOD;
-
-	if (timestamp < last_flush) {
-		printf("Warning: Timestamp below last timeslice flush\n");
-		return;
-	}
-
-	new = malloc(sizeof(*new));
-	if (!new)
-		die("Not enough memory\n");
-
-	new->timestamp = timestamp;
-	new->cpu = cpu;
-	new->thread = thread;
-
-	new->data = malloc(raw_size);
-	if (!new->data)
-		die("Not enough memory\n");
-
-	memcpy(new->data, data, raw_size);
-
-	__queue_raw_event(new);
-	last_inserted = new;
-
-	/*
-	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
-	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
-	 * in the first half of the timeslice have all been buffered and there
-	 * are none remaining (we need that because of the weakly ordered
-	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
-	 * timeslice, we flush the first half to be gentle with the memory
-	 * (the second half can still get new events in the middle, so wait
-	 * another period to flush it)
-	 */
-	if (new->timestamp > flush_limit &&
-	    new->timestamp - flush_limit > FLUSH_PERIOD) {
-		flush_limit += FLUSH_PERIOD;
-		flush_raw_event_queue(flush_limit);
-	}
-}
-
-static int process_sample_event(event_t *event, struct perf_session *s)
-{
-	struct thread *thread;
-	struct sample_data data;
-
-	bzero(&data, sizeof(struct sample_data));
-	event__parse_sample(event, s->sample_type, &data);
-	/* CAUTION: using tid as thread.pid */
-	thread = perf_session__findnew(s, data.tid);
-
-	if (thread == NULL) {
-		pr_debug("problem processing %d event, skipping it.\n",
-			 event->header.type);
-		return -1;
-	}
-
-	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-	if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
-		return 0;
-
-	queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
-
-	return 0;
-}
-
 /* TODO: various way to print, coloring, nano or milli sec */
 static void print_result(void)
 {
@@ -963,9 +790,30 @@ static void dump_map(void)
 	}
 }
 
+static int process_sample_event(event_t *self, struct perf_session *s)
+{
+	struct sample_data data;
+	struct thread *thread;
+
+	bzero(&data, sizeof(data));
+	event__parse_sample(self, s->sample_type, &data);
+
+	thread = perf_session__findnew(s, data.tid);
+	if (thread == NULL) {
+		pr_debug("problem processing %d event, skipping it.\n",
+			 self->header.type);
+		return -1;
+	}
+
+	process_raw_event(data.raw_data, data.cpu, data.time, thread);
+
+	return 0;
+}
+
 static struct perf_event_ops eops = {
 	.sample			= process_sample_event,
 	.comm			= event__process_comm,
+	.ordered_samples	= true,
 };
 
 static int read_events(void)
@@ -994,7 +842,6 @@ static void __cmd_report(void)
 	setup_pager();
 	select_key();
 	read_events();
-	flush_raw_event_queue(ULLONG_MAX);
 	sort_result();
 	print_result();
 }