author     Frederic Weisbecker <fweisbec@gmail.com>   2010-05-03 09:14:33 -0400
committer  Frederic Weisbecker <fweisbec@gmail.com>   2010-05-09 07:43:42 -0400
commit     d6b17bebd79dae2e3577f2ea27a832af4991a5e6 (patch)
tree       3b7f253ccd54ef9355706224a055dc79e59d954a /tools
parent     984028075794c00cbf4fb1e94bb6233e8be08875 (diff)
perf: Provide a new deterministic events reordering algorithm
The current events reordering algorithm is based on a heuristic that breaks
once we deal with a very fast flow of events. The time-period based flushing
is no longer suitable in the following case, assuming we have a flush period
of two seconds:

           CPU 0          |          CPU 1
                          |
     cnt1 timestamps      |     cnt1 timestamps
           0              |           0
           1              |           1
           2              |           2
           3              |           3
         [...]            |         [...]
                  4 seconds later

If we spend too much time reading the buffers (because there are many events
to record in each buffer, or many CPU buffers to read), then on the next pass
the CPU 0 buffer may contain a slice of several seconds of events. We read
them all and notice we've reached the flush period. In the above example we
flush the first half of the CPU 0 buffer, then we read the CPU 1 buffer,
which still holds events belonging to the flushed slice, and the reordering
fails.

It's simple to reproduce with:

	perf lock record perf bench sched messaging

To solve this, we use a new approach that no longer relies on a heuristic
time slice but on a deterministic basis derived from how perf record does
its job.

perf record saves the buffers through passes. A pass is a tour over every
buffer from every CPU, made in order: for each CPU we read the buffers of
every counter. So the more buffers we visit, the later the timestamps of
their events will be. When perf record finishes a pass, it records a
PERF_RECORD_FINISHED_ROUND pseudo event.

We record the max timestamp t found in pass n. Assuming these timestamps are
monotonic across CPUs, we know that if a buffer still has events with
timestamps below t, they will all be available and read in pass n + 1. Hence
when we start to read pass n + 2, we can safely flush every event with a
timestamp below t.

    ============ PASS n =================
           CPU 0         |   CPU 1
                         |
      cnt1 timestamps    |   cnt2 timestamps
            1            |         2
            2            |         3
            -            |         4  <--- max recorded

    ============ PASS n + 1 ==============
           CPU 0         |   CPU 1
                         |
      cnt1 timestamps    |   cnt2 timestamps
            3            |         5
            4            |         6
            5            |         7  <--- max recorded

      Flush every event below timestamp 4

    ============ PASS n + 2 ==============
           CPU 0         |   CPU 1
                         |
      cnt1 timestamps    |   cnt2 timestamps
            6            |         8
            7            |         9
            -            |         10

      Flush every event below timestamp 7
      etc...

It also works on perf.data versions that don't have the
PERF_RECORD_FINISHED_ROUND pseudo event. The difference is that the events
are then only flushed at the end of the perf.data processing, which consumes
more memory and scales less well with large perf.data files.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
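For illustration only, here is a minimal standalone sketch of the round-based
flushing rule described above. The names (queue_sample(), finished_round(),
flush_below()) and the simple linked list are hypothetical; the real code in
session.c operates on struct sample_queue and struct perf_event_ops, as the
patch below shows.

/*
 * Sketch of the round-based flushing rule (hypothetical names, not the perf
 * implementation): queue timestamps as they arrive, remember the max seen in
 * the current round, and on each "finished round" flush everything below the
 * max recorded one round earlier.
 */
#include <stdio.h>
#include <stdlib.h>

struct sample {
	unsigned long long ts;
	struct sample *next;
};

static struct sample *queue;		/* kept sorted by timestamp */
static unsigned long long next_flush;	/* max timestamp of the previous round */
static unsigned long long max_timestamp;/* max timestamp seen so far */

static void queue_sample(unsigned long long ts)
{
	struct sample **link = &queue;
	struct sample *new = malloc(sizeof(*new));

	new->ts = ts;
	while (*link && (*link)->ts <= ts)	/* insertion sort by timestamp */
		link = &(*link)->next;
	new->next = *link;
	*link = new;

	if (ts > max_timestamp)
		max_timestamp = ts;
}

static void flush_below(unsigned long long limit)
{
	while (queue && queue->ts < limit) {
		struct sample *s = queue;

		printf("deliver %llu\n", s->ts);
		queue = s->next;
		free(s);
	}
}

static void finished_round(void)
{
	flush_below(next_flush);	/* safe: older events can't show up anymore */
	next_flush = max_timestamp;	/* becomes the flush limit one round later */
}

int main(void)
{
	unsigned long long pass_n[]  = { 1, 2, 2, 3, 4 };
	unsigned long long pass_n1[] = { 3, 4, 5, 5, 6, 7 };
	size_t i;

	for (i = 0; i < sizeof(pass_n) / sizeof(pass_n[0]); i++)
		queue_sample(pass_n[i]);
	finished_round();		/* nothing flushed yet, next_flush = 4 */

	for (i = 0; i < sizeof(pass_n1) / sizeof(pass_n1[0]); i++)
		queue_sample(pass_n1[i]);
	finished_round();		/* flushes every event below timestamp 4 */

	flush_below(~0ULL);		/* final flush at end of processing */
	return 0;
}

Running the sketch delivers every queued sample below timestamp 4 at the
second finished_round() call and the remainder at the final flush, mirroring
the PASS n / PASS n + 1 example above.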
Diffstat (limited to 'tools')
-rw-r--r--   tools/perf/util/session.c   106
-rw-r--r--   tools/perf/util/session.h    36
2 files changed, 97 insertions, 45 deletions
diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c
index 5d353e70fe26..9401909fa283 100644
--- a/tools/perf/util/session.c
+++ b/tools/perf/util/session.c
@@ -98,7 +98,6 @@ struct perf_session *perf_session__new(const char *filename, int mode, bool forc
 	self->unknown_events = 0;
 	self->machines = RB_ROOT;
 	self->repipe = repipe;
-	self->ordered_samples.flush_limit = ULLONG_MAX;
 	INIT_LIST_HEAD(&self->ordered_samples.samples_head);
 
 	if (mode == O_RDONLY) {
@@ -194,6 +193,18 @@ static int process_event_stub(event_t *event __used,
 	return 0;
 }
 
+static int process_finished_round_stub(event_t *event __used,
+				       struct perf_session *session __used,
+				       struct perf_event_ops *ops __used)
+{
+	dump_printf(": unhandled!\n");
+	return 0;
+}
+
+static int process_finished_round(event_t *event,
+				  struct perf_session *session,
+				  struct perf_event_ops *ops);
+
 static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 {
 	if (handler->sample == NULL)
@@ -222,6 +233,12 @@ static void perf_event_ops__fill_defaults(struct perf_event_ops *handler)
 		handler->tracing_data = process_event_stub;
 	if (handler->build_id == NULL)
 		handler->build_id = process_event_stub;
+	if (handler->finished_round == NULL) {
+		if (handler->ordered_samples)
+			handler->finished_round = process_finished_round;
+		else
+			handler->finished_round = process_finished_round_stub;
+	}
 }
 
 static const char *event__name[] = {
@@ -359,16 +376,14 @@ struct sample_queue {
 	struct list_head list;
 };
 
-#define FLUSH_PERIOD	(2 * NSEC_PER_SEC)
-
 static void flush_sample_queue(struct perf_session *s,
 			       struct perf_event_ops *ops)
 {
 	struct list_head *head = &s->ordered_samples.samples_head;
-	u64 limit = s->ordered_samples.flush_limit;
+	u64 limit = s->ordered_samples.next_flush;
 	struct sample_queue *tmp, *iter;
 
-	if (!ops->ordered_samples)
+	if (!ops->ordered_samples || !limit)
 		return;
 
 	list_for_each_entry_safe(iter, tmp, head, list) {
@@ -387,6 +402,55 @@ static void flush_sample_queue(struct perf_session *s,
 	}
 }
 
+/*
+ * When perf record finishes a pass on every buffers, it records this pseudo
+ * event.
+ * We record the max timestamp t found in the pass n.
+ * Assuming these timestamps are monotonic across cpus, we know that if
+ * a buffer still has events with timestamps below t, they will be all
+ * available and then read in the pass n + 1.
+ * Hence when we start to read the pass n + 2, we can safely flush every
+ * events with timestamps below t.
+ *
+ *    ============ PASS n =================
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          1          |         2
+ *          2          |         3
+ *          -          |         4  <--- max recorded
+ *
+ *    ============ PASS n + 1 ==============
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          3          |         5
+ *          4          |         6
+ *          5          |         7 <---- max recorded
+ *
+ *      Flush every events below timestamp 4
+ *
+ *    ============ PASS n + 2 ==============
+ *       CPU 0         |   CPU 1
+ *                     |
+ *    cnt1 timestamps  |   cnt2 timestamps
+ *          6          |         8
+ *          7          |         9
+ *          -          |         10
+ *
+ *      Flush every events below timestamp 7
+ *      etc...
+ */
+static int process_finished_round(event_t *event __used,
+				  struct perf_session *session,
+				  struct perf_event_ops *ops)
+{
+	flush_sample_queue(session, ops);
+	session->ordered_samples.next_flush = session->ordered_samples.max_timestamp;
+
+	return 0;
+}
+
 static void __queue_sample_end(struct sample_queue *new, struct list_head *head)
 {
 	struct sample_queue *iter;
@@ -455,16 +519,11 @@ static void __queue_sample_event(struct sample_queue *new,
 }
 
 static int queue_sample_event(event_t *event, struct sample_data *data,
-			      struct perf_session *s,
-			      struct perf_event_ops *ops)
+			      struct perf_session *s)
 {
 	u64 timestamp = data->time;
 	struct sample_queue *new;
-	u64 flush_limit;
-
 
-	if (s->ordered_samples.flush_limit == ULLONG_MAX)
-		s->ordered_samples.flush_limit = timestamp + FLUSH_PERIOD;
 
 	if (timestamp < s->ordered_samples.last_flush) {
 		printf("Warning: Timestamp below last timeslice flush\n");
@@ -488,23 +547,8 @@ static int queue_sample_event(event_t *event, struct sample_data *data,
 	__queue_sample_event(new, s);
 	s->ordered_samples.last_inserted = new;
 
-	/*
-	 * We want to have a slice of events covering 2 * FLUSH_PERIOD
-	 * If FLUSH_PERIOD is big enough, it ensures every events that occured
-	 * in the first half of the timeslice have all been buffered and there
-	 * are none remaining (we need that because of the weakly ordered
-	 * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
-	 * timeslice, we flush the first half to be gentle with the memory
-	 * (the second half can still get new events in the middle, so wait
-	 * another period to flush it)
-	 */
-	flush_limit = s->ordered_samples.flush_limit;
-
-	if (new->timestamp > flush_limit &&
-	    new->timestamp - flush_limit > FLUSH_PERIOD) {
-		s->ordered_samples.flush_limit += FLUSH_PERIOD;
-		flush_sample_queue(s, ops);
-	}
+	if (new->timestamp > s->ordered_samples.max_timestamp)
+		s->ordered_samples.max_timestamp = new->timestamp;
 
 	return 0;
 }
@@ -520,7 +564,7 @@ static int perf_session__process_sample(event_t *event, struct perf_session *s,
 	bzero(&data, sizeof(struct sample_data));
 	event__parse_sample(event, s->sample_type, &data);
 
-	queue_sample_event(event, &data, s, ops);
+	queue_sample_event(event, &data, s);
 
 	return 0;
 }
@@ -572,6 +616,8 @@ static int perf_session__process_event(struct perf_session *self,
 		return ops->tracing_data(event, self);
 	case PERF_RECORD_HEADER_BUILD_ID:
 		return ops->build_id(event, self);
+	case PERF_RECORD_FINISHED_ROUND:
+		return ops->finished_round(event, self, ops);
 	default:
 		self->unknown_events++;
 		return -1;
@@ -786,7 +832,7 @@ more:
 done:
 	err = 0;
 	/* do the final flush for ordered samples */
-	self->ordered_samples.flush_limit = ULLONG_MAX;
+	self->ordered_samples.next_flush = ULLONG_MAX;
 	flush_sample_queue(self, ops);
 out_err:
 	ui_progress__delete(progress);
diff --git a/tools/perf/util/session.h b/tools/perf/util/session.h
index f2b2c6a3a49d..568fd08a478d 100644
--- a/tools/perf/util/session.h
+++ b/tools/perf/util/session.h
@@ -14,7 +14,8 @@ struct thread;
 
 struct ordered_samples {
 	u64			last_flush;
-	u64			flush_limit;
+	u64			next_flush;
+	u64			max_timestamp;
 	struct list_head	samples_head;
 	struct sample_queue	*last_inserted;
 };
@@ -41,23 +42,28 @@ struct perf_session {
 	char			filename[0];
 };
 
+struct perf_event_ops;
+
 typedef int (*event_op)(event_t *self, struct perf_session *session);
+typedef int (*event_op2)(event_t *self, struct perf_session *session,
+			 struct perf_event_ops *ops);
 
 struct perf_event_ops {
 	event_op	sample,
 			mmap,
 			comm,
 			fork,
 			exit,
 			lost,
 			read,
 			throttle,
 			unthrottle,
 			attr,
 			event_type,
 			tracing_data,
 			build_id;
-	bool		ordered_samples;
+	event_op2	finished_round;
+	bool		ordered_samples;
 };
 
 struct perf_session *perf_session__new(const char *filename, int mode, bool force, bool repipe);
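
For reference, a hypothetical consumer sketch (not part of this patch),
assuming the perf tool headers of this tree: a tool that wants reordered
samples only sets .ordered_samples; perf_event_ops__fill_defaults() then
plugs the new process_finished_round() handler into .finished_round, so the
PERF_RECORD_FINISHED_ROUND pseudo events drive the flushes.

#include <stdbool.h>

#include "util/event.h"
#include "util/session.h"

/* samples reach this callback already sorted by timestamp */
static int process_sample_event(event_t *self, struct perf_session *session)
{
	return 0;
}

static struct perf_event_ops event_ops = {
	.sample		 = process_sample_event,
	.ordered_samples = true,
	/* .finished_round left NULL: the default added by this patch is used */
};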