author     Paul Mackerras <paulus@samba.org>   2009-05-22 00:27:22 -0400
committer  Ingo Molnar <mingo@elte.hu>         2009-05-22 06:18:20 -0400
commit     564c2b210add41df9a3a5aaa365c1d97cff6110d (patch)
tree       2fe54df816abfe8c6010d277c79fbe18aa5c4082
parent     a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 (diff)
perf_counter: Optimize context switch between identical inherited contexts
When monitoring a process and its descendants with a set of inherited
counters, we can often get the situation in a context switch where both
the old (outgoing) and new (incoming) process have the same set of
counters, and their values are ultimately going to be added together.
In that situation it doesn't matter which set of counters are used to
count the activity for the new process, so there is really no need to
go through the process of reading the hardware counters and updating
the old task's counters and then setting up the PMU for the new task.

This optimizes the context switch in this situation.  Instead of
scheduling out the perf_counter_context for the old task and scheduling
in the new context, we simply transfer the old context to the new task
and keep using it without interruption.  The new context gets
transferred to the old task.  This means that both tasks still have a
valid perf_counter_context, so no special case is introduced when the
old task gets scheduled in again, either on this CPU or another CPU.

The equivalence of contexts is detected by keeping a pointer in each
cloned context pointing to the context it was cloned from.  To cope
with the situation where a context is changed by adding or removing
counters after it has been cloned, we also keep a generation number on
each context which is incremented every time a context is changed.
When a context is cloned we take a copy of the parent's generation
number, and two cloned contexts are equivalent only if they have the
same parent and the same generation number.  In order that the parent
context pointer remains valid (and is not reused), we increment the
parent context's reference count for each context cloned from it.

Since we don't have individual fds for the counters in a cloned
context, the only thing that can make two clones of a given parent
different after they have been cloned is enabling or disabling all
counters with prctl.  To account for this, we keep a count of the
number of enabled counters in each context.  Two contexts must have the
same number of enabled counters to be considered equivalent.

Here are some measurements of the context switch time as measured with
the lat_ctx benchmark from lmbench, comparing the times obtained with
and without this patch series:

                    -----Unmodified-----       With this patch series
    Counters:       none    2 HW    4H+4S      none    2 HW    4H+4S

    2 processes:
    Average         3.44    6.45    11.24      3.12    3.39    3.60
    St dev          0.04    0.04     0.13      0.05    0.17    0.19

    8 processes:
    Average         6.45    8.79    14.00      5.57    6.23    7.57
    St dev          1.27    1.04     0.88      1.42    1.46    1.42

    32 processes:
    Average         5.56    8.43    13.78      5.28    5.55    7.15
    St dev          0.41    0.47     0.53      0.54    0.57    0.81

The numbers are the mean and standard deviation of 20 runs of lat_ctx.
The "none" columns are lat_ctx run directly without any counters.  The
"2 HW" columns are with lat_ctx run under perfstat, counting cycles and
instructions.  The "4H+4S" columns are lat_ctx run under perfstat with
4 hardware counters and 4 software counters (cycles, instructions,
cache references, cache misses, task clock, context switch, cpu
migrations, and page faults).

[ Impact: performance optimization of counter context-switches ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--   include/linux/perf_counter.h |  12
-rw-r--r--   kernel/perf_counter.c        | 109
-rw-r--r--   kernel/sched.c               |   2
3 files changed, 107 insertions, 16 deletions
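
Before the hunks themselves, a condensed sketch of the fast path this change
adds may help orientation.  It is distilled from the kernel/perf_counter.c
hunks below (all names come from the patch); locking, the NULL-context and
cpuctx->task_ctx checks, and the software context-switch event are left out,
so read it as an illustration of the idea rather than the exact code:

/*
 * Two inherited contexts are interchangeable if they were cloned from
 * the same generation of the same parent context and currently have
 * the same number of enabled counters.
 */
static int context_equiv(struct perf_counter_context *ctx1,
                         struct perf_counter_context *ctx2)
{
        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
                && ctx1->parent_gen == ctx2->parent_gen
                && ctx1->nr_enabled == ctx2->nr_enabled;
}

void perf_counter_task_sched_out(struct task_struct *task,
                                 struct task_struct *next, int cpu)
{
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = task->perf_counter_ctxp;
        struct perf_counter_context *next_ctx = next->perf_counter_ctxp;

        if (next_ctx && context_equiv(ctx, next_ctx)) {
                /*
                 * Fast path: swap the two contexts between the tasks
                 * instead of reading the hardware counters and
                 * reprogramming the PMU.  Both tasks keep a valid
                 * perf_counter_context afterwards.
                 */
                task->perf_counter_ctxp = next_ctx;
                next->perf_counter_ctxp = ctx;
                ctx->task = next;
                next_ctx->task = task;
                return;
        }

        /* Slow path: schedule the outgoing context out as before. */
        __perf_counter_sched_out(ctx, cpuctx);
        cpuctx->task_ctx = NULL;
}

The inheritance path in perf_counter_init_task() only marks the child context
as a clone (setting parent_ctx and parent_gen) when every counter in the
parent context was inherited, which is what makes the equivalence test above
safe to use on the context-switch path.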
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 071309005468..4cae01a50450 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,6 +513,7 @@ struct perf_counter_context {
         struct list_head        event_list;
         int                     nr_counters;
         int                     nr_active;
+        int                     nr_enabled;
         int                     is_active;
         atomic_t                refcount;
         struct task_struct      *task;
@@ -522,6 +523,14 @@ struct perf_counter_context {
          */
         u64                     time;
         u64                     timestamp;
+
+        /*
+         * These fields let us detect when two contexts have both
+         * been cloned (inherited) from a common ancestor.
+         */
+        struct perf_counter_context *parent_ctx;
+        u32                     parent_gen;
+        u32                     generation;
 };
 
 /**
@@ -552,7 +561,8 @@ extern int perf_max_counters;
 extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
-extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task,
+                                        struct task_struct *next, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06ea3eae886e..c10055416dea 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx)
 
 static void put_ctx(struct perf_counter_context *ctx)
 {
-        if (atomic_dec_and_test(&ctx->refcount))
+        if (atomic_dec_and_test(&ctx->refcount)) {
+                if (ctx->parent_ctx)
+                        put_ctx(ctx->parent_ctx);
                 kfree(ctx);
+        }
 }
 
 static void
@@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
         list_add_rcu(&counter->event_entry, &ctx->event_list);
         ctx->nr_counters++;
+        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                ctx->nr_enabled++;
 }
 
 /*
@@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
         if (list_empty(&counter->list_entry))
                 return;
         ctx->nr_counters--;
+        if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                ctx->nr_enabled--;
 
         list_del_init(&counter->list_entry);
         list_del_rcu(&counter->event_entry);
@@ -204,6 +211,22 @@ group_sched_out(struct perf_counter *group_counter,
 }
 
 /*
+ * Mark this context as not being a clone of another.
+ * Called when counters are added to or removed from this context.
+ * We also increment our generation number so that anything that
+ * was cloned from this context before this will not match anything
+ * cloned from this context after this.
+ */
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+        ++ctx->generation;
+        if (!ctx->parent_ctx)
+                return;
+        put_ctx(ctx->parent_ctx);
+        ctx->parent_ctx = NULL;
+}
+
+/*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
@@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
         struct perf_counter_context *ctx = counter->ctx;
         struct task_struct *task = ctx->task;
 
+        unclone_ctx(ctx);
         if (!task) {
                 /*
                  * Per cpu counters are removed via an smp call and
@@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info)
                 else
                         counter_sched_out(counter, cpuctx, ctx);
                 counter->state = PERF_COUNTER_STATE_OFF;
+                ctx->nr_enabled--;
         }
 
         spin_unlock_irqrestore(&ctx->lock, flags);
@@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter)
         if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
                 update_counter_times(counter);
                 counter->state = PERF_COUNTER_STATE_OFF;
+                ctx->nr_enabled--;
         }
 
         spin_unlock_irq(&ctx->lock);
@@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info)
                 goto unlock;
         counter->state = PERF_COUNTER_STATE_INACTIVE;
         counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+        ctx->nr_enabled++;
 
         /*
          * If the counter is in a group and isn't the group leader,
@@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter)
                 counter->state = PERF_COUNTER_STATE_INACTIVE;
                 counter->tstamp_enabled =
                         ctx->time - counter->total_time_enabled;
+                ctx->nr_enabled++;
         }
  out:
         spin_unlock_irq(&ctx->lock);
@@ -862,6 +890,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 }
 
 /*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+                         struct perf_counter_context *ctx2)
+{
+        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+                && ctx1->parent_gen == ctx2->parent_gen
+                && ctx1->nr_enabled == ctx2->nr_enabled;
+}
+
+/*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
  *
@@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
  * accessing the counter control register. If a NMI hits, then it will
  * not restart the counter.
  */
-void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+void perf_counter_task_sched_out(struct task_struct *task,
+                                 struct task_struct *next, int cpu)
 {
         struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
         struct perf_counter_context *ctx = task->perf_counter_ctxp;
+        struct perf_counter_context *next_ctx;
         struct pt_regs *regs;
 
         if (likely(!ctx || !cpuctx->task_ctx))
@@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 
         regs = task_pt_regs(task);
         perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+        next_ctx = next->perf_counter_ctxp;
+        if (next_ctx && context_equiv(ctx, next_ctx)) {
+                task->perf_counter_ctxp = next_ctx;
+                next->perf_counter_ctxp = ctx;
+                ctx->task = next;
+                next_ctx->task = task;
+                return;
+        }
+
         __perf_counter_sched_out(ctx, cpuctx);
 
         cpuctx->task_ctx = NULL;
@@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 
         if (likely(!ctx))
                 return;
+        if (cpuctx->task_ctx == ctx)
+                return;
         __perf_counter_sched_in(ctx, cpuctx, cpu);
         cpuctx->task_ctx = ctx;
 }
@@ -3253,6 +3314,16 @@ inherit_counter(struct perf_counter *parent_counter,
                 return child_counter;
 
         /*
+         * Make the child state follow the state of the parent counter,
+         * not its hw_event.disabled bit. We hold the parent's mutex,
+         * so we won't race with perf_counter_{en,dis}able_family.
+         */
+        if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+                child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+        else
+                child_counter->state = PERF_COUNTER_STATE_OFF;
+
+        /*
          * Link it up in the child's context:
          */
         add_counter_to_ctx(child_counter, child_ctx);
@@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter,
         mutex_lock(&parent_counter->mutex);
         list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 
-        /*
-         * Make the child state follow the state of the parent counter,
-         * not its hw_event.disabled bit. We hold the parent's mutex,
-         * so we won't race with perf_counter_{en,dis}able_family.
-         */
-        if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
-                child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-        else
-                child_counter->state = PERF_COUNTER_STATE_OFF;
-
         mutex_unlock(&parent_counter->mutex);
 
         return child_counter;
@@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child)
         struct perf_counter_context *child_ctx, *parent_ctx;
         struct perf_counter *counter;
         struct task_struct *parent = current;
+        int inherited_all = 1;
 
         child->perf_counter_ctxp = NULL;
 
@@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child)
                 if (counter != counter->group_leader)
                         continue;
 
-                if (!counter->hw_event.inherit)
+                if (!counter->hw_event.inherit) {
+                        inherited_all = 0;
                         continue;
+                }
 
                 if (inherit_group(counter, parent,
-                                  parent_ctx, child, child_ctx))
+                                  parent_ctx, child, child_ctx)) {
+                        inherited_all = 0;
                         break;
+                }
+        }
+
+        if (inherited_all) {
+                /*
+                 * Mark the child context as a clone of the parent
+                 * context, or of whatever the parent is a clone of.
+                 */
+                if (parent_ctx->parent_ctx) {
+                        child_ctx->parent_ctx = parent_ctx->parent_ctx;
+                        child_ctx->parent_gen = parent_ctx->parent_gen;
+                } else {
+                        child_ctx->parent_ctx = parent_ctx;
+                        child_ctx->parent_gen = parent_ctx->generation;
+                }
+                get_ctx(child_ctx->parent_ctx);
         }
 
         mutex_unlock(&parent_ctx->mutex);
diff --git a/kernel/sched.c b/kernel/sched.c
index 419a39d0988f..4c0d58bce6b2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5091,7 +5091,7 @@ need_resched_nonpreemptible:
 
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
-                perf_counter_task_sched_out(prev, cpu);
+                perf_counter_task_sched_out(prev, next, cpu);
 
                 rq->nr_switches++;
                 rq->curr = next;