aboutsummaryrefslogtreecommitdiffstats
path: root/tools/perf
diff options
context:
space:
mode:
authorFrederic Weisbecker <fweisbec@gmail.com>2010-04-23 18:04:12 -0400
committerFrederic Weisbecker <fweisbec@gmail.com>2010-04-23 21:49:58 -0400
commitc61e52ee705f938596d307625dce00cc4345aaf0 (patch)
tree6bb8a1d2662790c6b5ee8d09e0b94d91c97d1da0 /tools/perf
parent5710fcad7c367adefe5634dc998f1f88780a8457 (diff)
perf: Generalize perf lock's sample event reordering to the session layer
The sample events recorded by perf record are not time ordered because we have one buffer per cpu for each event (even demultiplexed per task/per cpu for task bound events). But when we read trace events we want them to be ordered by time because many state machines are involved. There are currently two ways perf tools deal with that: - use -M to multiplex every buffers (perf sched, perf kmem) But this creates a lot of contention in SMP machines on record time. - use a post-processing time reordering (perf timechart, perf lock) The reordering used by timechart is simple but doesn't scale well with huge flow of events, in terms of performance and memory use (unusable with perf lock for example). Perf lock has its own samples reordering that flushes its memory use in a regular basis and that uses a sorting based on the previous event queued (a new event to be queued is close to the previous one most of the time). This patch proposes to export perf lock's samples reordering facility to the session layer that reads the events. So if a tool wants to get ordered sample events, it needs to set its struct perf_event_ops::ordered_samples to true and that's it. This prepares tracing based perf tools to get rid of the need to use buffers multiplexing (-M) or to implement their own reordering. Also lower the flush period to 2 as it's sufficient already. Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Paul Mackerras <paulus@samba.org> Cc: Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> Cc: Ingo Molnar <mingo@elte.hu> Cc: Masami Hiramatsu <mhiramat@redhat.com> Cc: Tom Zanussi <tzanussi@gmail.com>
Diffstat (limited to 'tools/perf')
-rw-r--r--tools/perf/builtin-lock.c197
-rw-r--r--tools/perf/util/session.c179
-rw-r--r--tools/perf/util/session.h10
3 files changed, 210 insertions, 176 deletions
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index 716d8c544a56..ce276750b140 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -316,8 +316,6 @@ alloc_failed:
316 316
317static char const *input_name = "perf.data"; 317static char const *input_name = "perf.data";
318 318
319static int profile_cpu = -1;
320
321struct raw_event_sample { 319struct raw_event_sample {
322 u32 size; 320 u32 size;
323 char data[0]; 321 char data[0];
@@ -697,8 +695,7 @@ process_lock_release_event(void *data,
697} 695}
698 696
699static void 697static void
700process_raw_event(void *data, int cpu __used, 698process_raw_event(void *data, int cpu, u64 timestamp, struct thread *thread)
701 u64 timestamp __used, struct thread *thread __used)
702{ 699{
703 struct event *event; 700 struct event *event;
704 int type; 701 int type;
@@ -716,176 +713,6 @@ process_raw_event(void *data, int cpu __used,
716 process_lock_release_event(data, event, cpu, timestamp, thread); 713 process_lock_release_event(data, event, cpu, timestamp, thread);
717} 714}
718 715
719struct raw_event_queue {
720 u64 timestamp;
721 int cpu;
722 void *data;
723 struct thread *thread;
724 struct list_head list;
725};
726
727static LIST_HEAD(raw_event_head);
728
729#define FLUSH_PERIOD (5 * NSEC_PER_SEC)
730
731static u64 flush_limit = ULLONG_MAX;
732static u64 last_flush = 0;
733struct raw_event_queue *last_inserted;
734
735static void flush_raw_event_queue(u64 limit)
736{
737 struct raw_event_queue *tmp, *iter;
738
739 list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
740 if (iter->timestamp > limit)
741 return;
742
743 if (iter == last_inserted)
744 last_inserted = NULL;
745
746 process_raw_event(iter->data, iter->cpu, iter->timestamp,
747 iter->thread);
748
749 last_flush = iter->timestamp;
750 list_del(&iter->list);
751 free(iter->data);
752 free(iter);
753 }
754}
755
756static void __queue_raw_event_end(struct raw_event_queue *new)
757{
758 struct raw_event_queue *iter;
759
760 list_for_each_entry_reverse(iter, &raw_event_head, list) {
761 if (iter->timestamp < new->timestamp) {
762 list_add(&new->list, &iter->list);
763 return;
764 }
765 }
766
767 list_add(&new->list, &raw_event_head);
768}
769
770static void __queue_raw_event_before(struct raw_event_queue *new,
771 struct raw_event_queue *iter)
772{
773 list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
774 if (iter->timestamp < new->timestamp) {
775 list_add(&new->list, &iter->list);
776 return;
777 }
778 }
779
780 list_add(&new->list, &raw_event_head);
781}
782
783static void __queue_raw_event_after(struct raw_event_queue *new,
784 struct raw_event_queue *iter)
785{
786 list_for_each_entry_continue(iter, &raw_event_head, list) {
787 if (iter->timestamp > new->timestamp) {
788 list_add_tail(&new->list, &iter->list);
789 return;
790 }
791 }
792 list_add_tail(&new->list, &raw_event_head);
793}
794
795/* The queue is ordered by time */
796static void __queue_raw_event(struct raw_event_queue *new)
797{
798 if (!last_inserted) {
799 __queue_raw_event_end(new);
800 return;
801 }
802
803 /*
804 * Most of the time the current event has a timestamp
805 * very close to the last event inserted, unless we just switched
806 * to another event buffer. Having a sorting based on a list and
807 * on the last inserted event that is close to the current one is
808 * probably more efficient than an rbtree based sorting.
809 */
810 if (last_inserted->timestamp >= new->timestamp)
811 __queue_raw_event_before(new, last_inserted);
812 else
813 __queue_raw_event_after(new, last_inserted);
814}
815
816static void queue_raw_event(void *data, int raw_size, int cpu,
817 u64 timestamp, struct thread *thread)
818{
819 struct raw_event_queue *new;
820
821 if (flush_limit == ULLONG_MAX)
822 flush_limit = timestamp + FLUSH_PERIOD;
823
824 if (timestamp < last_flush) {
825 printf("Warning: Timestamp below last timeslice flush\n");
826 return;
827 }
828
829 new = malloc(sizeof(*new));
830 if (!new)
831 die("Not enough memory\n");
832
833 new->timestamp = timestamp;
834 new->cpu = cpu;
835 new->thread = thread;
836
837 new->data = malloc(raw_size);
838 if (!new->data)
839 die("Not enough memory\n");
840
841 memcpy(new->data, data, raw_size);
842
843 __queue_raw_event(new);
844 last_inserted = new;
845
846 /*
847 * We want to have a slice of events covering 2 * FLUSH_PERIOD
848 * If FLUSH_PERIOD is big enough, it ensures every events that occured
849 * in the first half of the timeslice have all been buffered and there
850 * are none remaining (we need that because of the weakly ordered
851 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
852 * timeslice, we flush the first half to be gentle with the memory
853 * (the second half can still get new events in the middle, so wait
854 * another period to flush it)
855 */
856 if (new->timestamp > flush_limit &&
857 new->timestamp - flush_limit > FLUSH_PERIOD) {
858 flush_limit += FLUSH_PERIOD;
859 flush_raw_event_queue(flush_limit);
860 }
861}
862
863static int process_sample_event(event_t *event, struct perf_session *s)
864{
865 struct thread *thread;
866 struct sample_data data;
867
868 bzero(&data, sizeof(struct sample_data));
869 event__parse_sample(event, s->sample_type, &data);
870 /* CAUTION: using tid as thread.pid */
871 thread = perf_session__findnew(s, data.tid);
872
873 if (thread == NULL) {
874 pr_debug("problem processing %d event, skipping it.\n",
875 event->header.type);
876 return -1;
877 }
878
879 dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
880
881 if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
882 return 0;
883
884 queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
885
886 return 0;
887}
888
889/* TODO: various way to print, coloring, nano or milli sec */ 716/* TODO: various way to print, coloring, nano or milli sec */
890static void print_result(void) 717static void print_result(void)
891{ 718{
@@ -963,9 +790,30 @@ static void dump_map(void)
963 } 790 }
964} 791}
965 792
793static int process_sample_event(event_t *self, struct perf_session *s)
794{
795 struct sample_data data;
796 struct thread *thread;
797
798 bzero(&data, sizeof(data));
799 event__parse_sample(self, s->sample_type, &data);
800
801 thread = perf_session__findnew(s, data.tid);
802 if (thread == NULL) {
803 pr_debug("problem processing %d event, skipping it.\n",
804 self->header.type);
805 return -1;
806 }
807
808 process_raw_event(data.raw_data, data.cpu, data.time, thread);
809
810 return 0;
811}
812
966static struct perf_event_ops eops = { 813static struct perf_event_ops eops = {
967 .sample = process_sample_event, 814 .sample = process_sample_event,
968 .comm = event__process_comm, 815 .comm = event__process_comm,
816 .ordered_samples = true,
969}; 817};
970 818
971static int read_events(void) 819static int read_events(void)
@@ -994,7 +842,6 @@ static void __cmd_report(void)
994 setup_pager(); 842 setup_pager();
995 select_key(); 843 select_key();
996 read_events(); 844 read_events();
997 flush_raw_event_queue(ULLONG_MAX);
998 sort_result(); 845 sort_result();
999 print_result(); 846 print_result();
1000} 847}
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 7d88ae5c270f..b7aade2184b2 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,6 +98,8 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
98 self->cwdlen = 0; 98 self->cwdlen = 0;
99 self->unknown_events = 0; 99 self->unknown_events = 0;
100 self->kerninfo_root = RB_ROOT; 100 self->kerninfo_root = RB_ROOT;
101 self->ordered_samples.flush_limit = ULLONG_MAX;
102 INIT_LIST_HEAD(&self->ordered_samples.samples_head);
101 103
102 if (mode == O_RDONLY) { 104 if (mode == O_RDONLY) {
103 if (perf_session__open(self, force) < 0) 105 if (perf_session__open(self, force) < 0)
@@ -351,6 +353,178 @@ static event__swap_op event__swap_ops[] = {
351 [PERF_RECORD_HEADER_MAX] = NULL, 353 [PERF_RECORD_HEADER_MAX] = NULL,
352}; 354};
353 355
356struct sample_queue {
357 u64 timestamp;
358 struct sample_event *event;
359 struct list_head list;
360};
361
362#define FLUSH_PERIOD (2 * NSEC_PER_SEC)
363
364static void flush_sample_queue(struct perf_session *s,
365 struct perf_event_ops *ops)
366{
367 struct list_head *head = &s->ordered_samples.samples_head;
368 u64 limit = s->ordered_samples.flush_limit;
369 struct sample_queue *tmp, *iter;
370
371 if (!ops->ordered_samples)
372 return;
373
374 list_for_each_entry_safe(iter, tmp, head, list) {
375 if (iter->timestamp > limit)
376 return;
377
378 if (iter == s->ordered_samples.last_inserted)
379 s->ordered_samples.last_inserted = NULL;
380
381 ops->sample((event_t *)iter->event, s);
382
383 s->ordered_samples.last_flush = iter->timestamp;
384 list_del(&iter->list);
385 free(iter->event);
386 free(iter);
387 }
388}
389
390static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
391{
392 struct sample_queue *iter;
393
394 list_for_each_entry_reverse(iter, head, list) {
395 if (iter->timestamp < new->timestamp) {
396 list_add(&new->list, &iter->list);
397 return;
398 }
399 }
400
401 list_add(&new->list, head);
402}
403
404static void __queue_sample_before(struct sample_queue *new,
405 struct sample_queue *iter,
406 struct list_head *head)
407{
408 list_for_each_entry_continue_reverse(iter, head, list) {
409 if (iter->timestamp < new->timestamp) {
410 list_add(&new->list, &iter->list);
411 return;
412 }
413 }
414
415 list_add(&new->list, head);
416}
417
418static void __queue_sample_after(struct sample_queue *new,
419 struct sample_queue *iter,
420 struct list_head *head)
421{
422 list_for_each_entry_continue(iter, head, list) {
423 if (iter->timestamp > new->timestamp) {
424 list_add_tail(&new->list, &iter->list);
425 return;
426 }
427 }
428 list_add_tail(&new->list, head);
429}
430
431/* The queue is ordered by time */
432static void __queue_sample_event(struct sample_queue *new,
433 struct perf_session *s)
434{
435 struct sample_queue *last_inserted = s->ordered_samples.last_inserted;
436 struct list_head *head = &s->ordered_samples.samples_head;
437
438
439 if (!last_inserted) {
440 __queue_sample_end(new, head);
441 return;
442 }
443
444 /*
445 * Most of the time the current event has a timestamp
446 * very close to the last event inserted, unless we just switched
447 * to another event buffer. Having a sorting based on a list and
448 * on the last inserted event that is close to the current one is
449 * probably more efficient than an rbtree based sorting.
450 */
451 if (last_inserted->timestamp >= new->timestamp)
452 __queue_sample_before(new, last_inserted, head);
453 else
454 __queue_sample_after(new, last_inserted, head);
455}
456
457static int queue_sample_event(event_t *event, struct sample_data *data,
458 struct perf_session *s,
459 struct perf_event_ops *ops)
460{
461 u64 timestamp = data->time;
462 struct sample_queue *new;
463 u64 flush_limit;
464
465
466 if (s->ordered_samples.flush_limit == ULLONG_MAX)
467 s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
468
469 if (timestamp < s->ordered_samples.last_flush) {
470 printf("Warning: Timestamp below last timeslice flush\n");
471 return -EINVAL;
472 }
473
474 new = malloc(sizeof(*new));
475 if (!new)
476 return -ENOMEM;
477
478 new->timestamp = timestamp;
479
480 new->event = malloc(event->header.size);
481 if (!new->event) {
482 free(new);
483 return -ENOMEM;
484 }
485
486 memcpy(new->event, event, event->header.size);
487
488 __queue_sample_event(new, s);
489 s->ordered_samples.last_inserted = new;
490
491 /*
492 * We want to have a slice of events covering 2 * FLUSH_PERIOD
493 * If FLUSH_PERIOD is big enough, it ensures every events that occured
494 * in the first half of the timeslice have all been buffered and there
495 * are none remaining (we need that because of the weakly ordered
496 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
497 * timeslice, we flush the first half to be gentle with the memory
498 * (the second half can still get new events in the middle, so wait
499 * another period to flush it)
500 */
501 flush_limit = s->ordered_samples.flush_limit;
502
503 if (new->timestamp > flush_limit &&
504 new->timestamp - flush_limit > FLUSH_PERIOD) {
505 s->ordered_samples.flush_limit += FLUSH_PERIOD;
506 flush_sample_queue(s, ops);
507 }
508
509 return 0;
510}
511
512static int perf_session__process_sample(event_t *event, struct perf_session *s,
513 struct perf_event_ops *ops)
514{
515 struct sample_data data;
516
517 if (!ops->ordered_samples)
518 return ops->sample(event, s);
519
520 bzero(&data, sizeof(struct sample_data));
521 event__parse_sample(event, s->sample_type, &data);
522
523 queue_sample_event(event, &data, s, ops);
524
525 return 0;
526}
527
354static int perf_session__process_event(struct perf_session *self, 528static int perf_session__process_event(struct perf_session *self,
355 event_t *event, 529 event_t *event,
356 struct perf_event_ops *ops, 530 struct perf_event_ops *ops,
@@ -371,7 +545,7 @@ static int perf_session__process_event(struct perf_session *self,
371 545
372 switch (event->header.type) { 546 switch (event->header.type) {
373 case PERF_RECORD_SAMPLE: 547 case PERF_RECORD_SAMPLE:
374 return ops->sample(event, self); 548 return perf_session__process_sample(event, self, ops);
375 case PERF_RECORD_MMAP: 549 case PERF_RECORD_MMAP:
376 return ops->mmap(event, self); 550 return ops->mmap(event, self);
377 case PERF_RECORD_COMM: 551 case PERF_RECORD_COMM:
@@ -611,6 +785,9 @@ more:
611 goto more; 785 goto more;
612done: 786done:
613 err = 0; 787 err = 0;
788 /* do the final flush for ordered samples */
789 self->ordered_samples.flush_limit = ULLONG_MAX;
790 flush_sample_queue(self, ops);
614out_err: 791out_err:
615 ui_progress__delete(progress); 792 ui_progress__delete(progress);
616 return err; 793 return err;
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index 5e47c87b9266..796e2291ebd7 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -8,9 +8,17 @@
8#include <linux/rbtree.h> 8#include <linux/rbtree.h>
9#include "../../../include/linux/perf_event.h" 9#include "../../../include/linux/perf_event.h"
10 10
11struct sample_queue;
11struct ip_callchain; 12struct ip_callchain;
12struct thread; 13struct thread;
13 14
15struct ordered_samples {
16 u64 last_flush;
17 u64 flush_limit;
18 struct list_head samples_head;
19 struct sample_queue *last_inserted;
20};
21
14struct perf_session { 22struct perf_session {
15 struct perf_header header; 23 struct perf_header header;
16 unsigned long size; 24 unsigned long size;
@@ -28,6 +36,7 @@ struct perf_session {
28 bool fd_pipe; 36 bool fd_pipe;
29 int cwdlen; 37 int cwdlen;
30 char *cwd; 38 char *cwd;
39 struct ordered_samples ordered_samples;
31 char filename[0]; 40 char filename[0];
32}; 41};
33 42
@@ -47,6 +56,7 @@ struct perf_event_ops {
47 event_type, 56 event_type,
48 tracing_data, 57 tracing_data,
49 build_id; 58 build_id;
59 bool ordered_samples;
50}; 60};
51 61
52struct perf_session *perf_session__new(const char *filename, int mode, bool force); 62struct perf_session *perf_session__new(const char *filename, int mode, bool force);