author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-09-02 10:50:03 -0400
committer	Ingo Molnar <mingo@elte.hu>	2010-09-09 14:46:33 -0400
commit	8dc85d547285668e509f86c177bcd4ea055bcaaf (patch)
tree	5474dd6c31952b342f0a3f1bcec09e3049129264
parent	eb184479874238393ac186c4e054d24311c34aaa (diff)
perf: Multiple task contexts
Provide the infrastructure for multiple task contexts.

A more flexible approach would have resulted in more pointer chases
in the scheduling hot-paths. This approach has the limitation of a
static number of task contexts.

Since I expect most external PMUs to be system wide, or at least node
wide (as per the intel uncore unit) they won't actually need a task
context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus <paulus@samba.org>
Cc: stephane eranian <eranian@googlemail.com>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Lin Ming <ming.m.lin@intel.com>
Cc: Yanmin <yanmin_zhang@linux.intel.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	include/linux/perf_event.h	1
-rw-r--r--	include/linux/sched.h	8
-rw-r--r--	kernel/perf_event.c	336
3 files changed, 239 insertions(+), 106 deletions(-)
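
For readers skimming the diff, the shape of the change is small: each struct pmu names the per-task context slot it uses (task_ctx_nr, negative for system-wide PMUs), task_struct carries a fixed-size array of context pointers indexed by that slot, and the scheduler hooks simply walk every slot. The standalone C sketch below mirrors that pattern outside the kernel; toy_task, toy_pmu and task_context_for() are illustrative stand-ins, not kernel APIs.

#include <stdio.h>
#include <stddef.h>

/* Mirrors enum perf_event_task_context: a static set of per-task slots. */
enum task_context {
	invalid_context = -1,	/* system-wide PMU, never needs a task slot */
	hw_context = 0,		/* hardware PMU slot */
	nr_task_contexts	/* array size; grows only when new slots are added */
};

/* Stand-in for task_struct: one context pointer per slot. */
struct toy_task {
	void *ctxp[nr_task_contexts];
};

/* Stand-in for struct pmu: each PMU names the slot it uses. */
struct toy_pmu {
	const char *name;
	int task_ctx_nr;
};

/* Equivalent of for_each_task_context_nr() in the patch. */
#define for_each_context_nr(ctxn) \
	for ((ctxn) = 0; (ctxn) < nr_task_contexts; (ctxn)++)

/* find_get_context()-style lookup: a negative slot means "no task context". */
static void *task_context_for(struct toy_task *task, struct toy_pmu *pmu)
{
	int ctxn = pmu->task_ctx_nr;

	if (ctxn < 0)
		return NULL;
	return task->ctxp[ctxn];
}

int main(void)
{
	static int hw_ctx;	/* placeholder payload for the hardware slot */
	struct toy_task task = { { 0 } };
	struct toy_pmu cpu_pmu = { "cpu", hw_context };
	struct toy_pmu uncore = { "uncore", invalid_context };
	int ctxn;

	task.ctxp[hw_context] = &hw_ctx;

	printf("%s pmu ctx: %p\n", cpu_pmu.name, task_context_for(&task, &cpu_pmu));
	printf("%s pmu ctx: %p\n", uncore.name, task_context_for(&task, &uncore));

	/* Scheduler-hook style walk over every slot, as the patch does on sched in/out. */
	for_each_context_nr(ctxn)
		printf("slot %d: %s\n", ctxn, task.ctxp[ctxn] ? "present" : "empty");

	return 0;
}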
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 22155ef3b362..9ecfd856ce6e 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -572,6 +572,7 @@ struct pmu {
 
 	int * __percpu			pmu_disable_count;
 	struct perf_cpu_context * __percpu pmu_cpu_context;
+	int				task_ctx_nr;
 
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e2a6db2d7dd..89d6023c6f82 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1160,6 +1160,12 @@ struct sched_rt_entity {
 
 struct rcu_node;
 
+enum perf_event_task_context {
+	perf_invalid_context = -1,
+	perf_hw_context = 0,
+	perf_nr_task_contexts,
+};
+
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
@@ -1431,7 +1437,7 @@ struct task_struct {
 	struct futex_pi_state *pi_state_cache;
 #endif
 #ifdef CONFIG_PERF_EVENTS
-	struct perf_event_context *perf_event_ctxp;
+	struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
 	struct mutex perf_event_mutex;
 	struct list_head perf_event_list;
 #endif
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 13d98d756347..7223ea875861 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -148,13 +148,13 @@ static u64 primary_event_id(struct perf_event *event)
  * the context could get moved to another task.
  */
 static struct perf_event_context *
-perf_lock_task_context(struct task_struct *task, unsigned long *flags)
+perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
 	struct perf_event_context *ctx;
 
 	rcu_read_lock();
 retry:
-	ctx = rcu_dereference(task->perf_event_ctxp);
+	ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
 	if (ctx) {
 		/*
 		 * If this context is a clone of another, it might
@@ -167,7 +167,7 @@ retry:
 		 * can't get swapped on us any more.
 		 */
 		raw_spin_lock_irqsave(&ctx->lock, *flags);
-		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
+		if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
 			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
 			goto retry;
 		}
@@ -186,12 +186,13 @@ retry:
  * can't get swapped to another task. This also increments its
  * reference count so that the context can't get freed.
  */
-static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
+static struct perf_event_context *
+perf_pin_task_context(struct task_struct *task, int ctxn)
 {
 	struct perf_event_context *ctx;
 	unsigned long flags;
 
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1179,28 +1180,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
 	}
 }
 
-/*
- * Called from scheduler to remove the events of the current task,
- * with interrupts disabled.
- *
- * We stop each event and update the event value in event->count.
- *
- * This does not protect us against NMI, but disable()
- * sets the disabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * not restart the event.
- */
-void perf_event_task_sched_out(struct task_struct *task,
-			       struct task_struct *next)
+void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+				  struct task_struct *next)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
-	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
-
 	if (likely(!ctx))
 		return;
 
@@ -1210,7 +1198,7 @@ void perf_event_task_sched_out(struct task_struct *task,
 
 	rcu_read_lock();
 	parent = rcu_dereference(ctx->parent_ctx);
-	next_ctx = next->perf_event_ctxp;
+	next_ctx = next->perf_event_ctxp[ctxn];
 	if (parent && next_ctx &&
 	    rcu_dereference(next_ctx->parent_ctx) == parent) {
 		/*
@@ -1229,8 +1217,8 @@ void perf_event_task_sched_out(struct task_struct *task,
 		 * XXX do we need a memory barrier of sorts
 		 * wrt to rcu_dereference() of perf_event_ctxp
 		 */
-		task->perf_event_ctxp = next_ctx;
-		next->perf_event_ctxp = ctx;
+		task->perf_event_ctxp[ctxn] = next_ctx;
+		next->perf_event_ctxp[ctxn] = ctx;
 		ctx->task = next;
 		next_ctx->task = task;
 		do_switch = 0;
@@ -1248,6 +1236,31 @@ void perf_event_task_sched_out(struct task_struct *task,
 	}
 }
 
+#define for_each_task_context_nr(ctxn)					\
+	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
+
+/*
+ * Called from scheduler to remove the events of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each event and update the event value in event->count.
+ *
+ * This does not protect us against NMI, but disable()
+ * sets the disabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * not restart the event.
+ */
+void perf_event_task_sched_out(struct task_struct *task,
+			       struct task_struct *next)
+{
+	int ctxn;
+
+	perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0);
+
+	for_each_task_context_nr(ctxn)
+		perf_event_context_sched_out(task, ctxn, next);
+}
+
 static void task_ctx_sched_out(struct perf_event_context *ctx,
 			       enum event_type_t event_type)
 {
@@ -1366,38 +1379,23 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type);
 }
 
-static void task_ctx_sched_in(struct task_struct *task,
+static void task_ctx_sched_in(struct perf_event_context *ctx,
 			      enum event_type_t event_type)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
-	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
+	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
+
 	ctx_sched_in(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = ctx;
 }
-/*
- * Called from scheduler to add the events of the current task
- * with interrupts disabled.
- *
- * We restore the event value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of event _before_
- * accessing the event control register. If a NMI hits, then it will
- * keep the event running.
- */
-void perf_event_task_sched_in(struct task_struct *task)
+
+void perf_event_context_sched_in(struct perf_event_context *ctx)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_cpu_context *cpuctx;
 
-	if (likely(!ctx))
-		return;
-
 	cpuctx = __get_cpu_context(ctx);
 	if (cpuctx->task_ctx == ctx)
 		return;
@@ -1422,6 +1420,31 @@ void perf_event_task_sched_in(struct task_struct *task)
 	perf_pmu_rotate_start(ctx->pmu);
 }
 
+/*
+ * Called from scheduler to add the events of the current task
+ * with interrupts disabled.
+ *
+ * We restore the event value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of event _before_
+ * accessing the event control register. If a NMI hits, then it will
+ * keep the event running.
+ */
+void perf_event_task_sched_in(struct task_struct *task)
+{
+	struct perf_event_context *ctx;
+	int ctxn;
+
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (likely(!ctx))
+			continue;
+
+		perf_event_context_sched_in(ctx);
+	}
+}
+
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_event *event, int enable);
@@ -1588,7 +1611,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 {
 	enum hrtimer_restart restart = HRTIMER_NORESTART;
 	struct perf_cpu_context *cpuctx;
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = NULL;
 	int rotate = 0;
 
 	cpuctx = container_of(timer, struct perf_cpu_context, timer);
@@ -1599,7 +1622,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 		rotate = 1;
 	}
 
-	ctx = current->perf_event_ctxp;
+	ctx = cpuctx->task_ctx;
 	if (ctx && ctx->nr_events) {
 		restart = HRTIMER_RESTART;
 		if (ctx->nr_events != ctx->nr_active)
@@ -1623,7 +1646,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer)
 
 	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_in(current, EVENT_FLEXIBLE);
+		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
 
 done:
 	hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval));
@@ -1650,20 +1673,18 @@ static int event_enable_on_exec(struct perf_event *event,
  * Enable all of a task's events that have been marked enable-on-exec.
  * This expects task == current.
  */
-static void perf_event_enable_on_exec(struct task_struct *task)
+static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 {
-	struct perf_event_context *ctx;
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
 	int ret;
 
 	local_irq_save(flags);
-	ctx = task->perf_event_ctxp;
 	if (!ctx || !ctx->nr_events)
 		goto out;
 
-	__perf_event_task_sched_out(ctx);
+	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
 
@@ -1687,7 +1708,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task);
+	perf_event_context_sched_in(ctx);
 out:
 	local_irq_restore(flags);
 }
@@ -1995,7 +2016,7 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
 	unsigned long flags;
-	int err;
+	int ctxn, err;
 
 	if (pid == -1 && cpu != -1) {
 		/* Must be root to operate on a CPU event: */
@@ -2044,8 +2065,13 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
+	err = -EINVAL;
+	ctxn = pmu->task_ctx_nr;
+	if (ctxn < 0)
+		goto errout;
+
 retry:
-	ctx = perf_lock_task_context(task, &flags);
+	ctx = perf_lock_task_context(task, ctxn, &flags);
 	if (ctx) {
 		unclone_ctx(ctx);
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
@@ -2059,7 +2085,7 @@ retry:
 
 		get_ctx(ctx);
 
-		if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) {
+		if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
@@ -3773,19 +3799,26 @@ static void perf_event_task_ctx(struct perf_event_context *ctx,
 
 static void perf_event_task_event(struct perf_task_event *task_event)
 {
-	struct perf_event_context *ctx = task_event->task_ctx;
 	struct perf_cpu_context *cpuctx;
+	struct perf_event_context *ctx;
 	struct pmu *pmu;
+	int ctxn;
 
 	rcu_read_lock_sched();
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_task_ctx(&cpuctx->ctx, task_event);
+
+		ctx = task_event->task_ctx;
+		if (!ctx) {
+			ctxn = pmu->task_ctx_nr;
+			if (ctxn < 0)
+				continue;
+			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		}
+		if (ctx)
+			perf_event_task_ctx(ctx, task_event);
 	}
-	if (!ctx)
-		ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_task_ctx(ctx, task_event);
 	rcu_read_unlock_sched();
 }
 
@@ -3890,9 +3923,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
+	char comm[TASK_COMM_LEN];
 	unsigned int size;
 	struct pmu *pmu;
-	char comm[TASK_COMM_LEN];
+	int ctxn;
 
 	memset(comm, 0, sizeof(comm));
 	strlcpy(comm, comm_event->task->comm, sizeof(comm));
@@ -3907,19 +3941,31 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_comm_ctx(&cpuctx->ctx, comm_event);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			continue;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx)
+			perf_event_comm_ctx(ctx, comm_event);
 	}
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_comm_ctx(ctx, comm_event);
 	rcu_read_unlock_sched();
 }
 
 void perf_event_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
+	struct perf_event_context *ctx;
+	int ctxn;
 
-	if (task->perf_event_ctxp)
-		perf_event_enable_on_exec(task);
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
+
+		perf_event_enable_on_exec(ctx);
+	}
 
 	if (!atomic_read(&nr_comm_events))
 		return;
@@ -4022,6 +4068,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	char *buf = NULL;
 	const char *name;
 	struct pmu *pmu;
+	int ctxn;
 
 	memset(tmp, 0, sizeof(tmp));
 
@@ -4078,10 +4125,17 @@ got_name:
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 		perf_event_mmap_ctx(&cpuctx->ctx, mmap_event,
 					vma->vm_flags & VM_EXEC);
+
+		ctxn = pmu->task_ctx_nr;
+		if (ctxn < 0)
+			continue;
+
+		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+		if (ctx) {
+			perf_event_mmap_ctx(ctx, mmap_event,
+					vma->vm_flags & VM_EXEC);
+		}
 	}
-	ctx = rcu_dereference(current->perf_event_ctxp);
-	if (ctx)
-		perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC);
 	rcu_read_unlock_sched();
 
 	kfree(buf);
@@ -5042,6 +5096,43 @@ static void perf_pmu_cancel_txn(struct pmu *pmu)
 	perf_pmu_enable(pmu);
 }
 
+/*
+ * Ensures all contexts with the same task_ctx_nr have the same
+ * pmu_cpu_context too.
+ */
+static void *find_pmu_context(int ctxn)
+{
+	struct pmu *pmu;
+
+	if (ctxn < 0)
+		return NULL;
+
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->task_ctx_nr == ctxn)
+			return pmu->pmu_cpu_context;
+	}
+
+	return NULL;
+}
+
+static void free_pmu_context(void * __percpu cpu_context)
+{
+	struct pmu *pmu;
+
+	mutex_lock(&pmus_lock);
+	/*
+	 * Like a real lame refcount.
+	 */
+	list_for_each_entry(pmu, &pmus, entry) {
+		if (pmu->pmu_cpu_context == cpu_context)
+			goto out;
+	}
+
+	free_percpu(cpu_context);
+out:
+	mutex_unlock(&pmus_lock);
+}
+
 int perf_pmu_register(struct pmu *pmu)
 {
 	int cpu, ret;
@@ -5052,6 +5143,10 @@ int perf_pmu_register(struct pmu *pmu)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
+	if (pmu->pmu_cpu_context)
+		goto got_cpu_context;
+
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_pdc;
@@ -5067,6 +5162,7 @@ int perf_pmu_register(struct pmu *pmu)
 		cpuctx->timer.function = perf_event_context_tick;
 	}
 
+got_cpu_context:
 	if (!pmu->start_txn) {
 		if (pmu->pmu_enable) {
 			/*
@@ -5114,7 +5210,7 @@ void perf_pmu_unregister(struct pmu *pmu)
 	synchronize_srcu(&pmus_srcu);
 
 	free_percpu(pmu->pmu_disable_count);
-	free_percpu(pmu->pmu_cpu_context);
+	free_pmu_context(pmu->pmu_cpu_context);
 }
 
 struct pmu *perf_init_event(struct perf_event *event)
@@ -5628,16 +5724,13 @@ __perf_event_exit_task(struct perf_event *child_event,
 	}
 }
 
-/*
- * When a child task exits, feed back event values to parent events.
- */
-void perf_event_exit_task(struct task_struct *child)
+static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event *child_event, *tmp;
 	struct perf_event_context *child_ctx;
 	unsigned long flags;
 
-	if (likely(!child->perf_event_ctxp)) {
+	if (likely(!child->perf_event_ctxp[ctxn])) {
 		perf_event_task(child, NULL, 0);
 		return;
 	}
@@ -5649,7 +5742,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * scheduled, so we are now safe from rescheduling changing
 	 * our context.
 	 */
-	child_ctx = child->perf_event_ctxp;
+	child_ctx = child->perf_event_ctxp[ctxn];
 	__perf_event_task_sched_out(child_ctx);
 
 	/*
@@ -5658,7 +5751,7 @@ void perf_event_exit_task(struct task_struct *child)
 	 * incremented the context's refcount before we do put_ctx below.
 	 */
 	raw_spin_lock(&child_ctx->lock);
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 	/*
 	 * If this context is a clone; unclone it so it can't get
 	 * swapped to another process while we're removing all
@@ -5711,6 +5804,17 @@ again:
 	put_ctx(child_ctx);
 }
 
+/*
+ * When a child task exits, feed back event values to parent events.
+ */
+void perf_event_exit_task(struct task_struct *child)
+{
+	int ctxn;
+
+	for_each_task_context_nr(ctxn)
+		perf_event_exit_task_context(child, ctxn);
+}
+
 static void perf_free_event(struct perf_event *event,
 			    struct perf_event_context *ctx)
 {
@@ -5732,32 +5836,37 @@ static void perf_free_event(struct perf_event *event,
 
 /*
  * free an unexposed, unused context as created by inheritance by
- * init_task below, used by fork() in case of fail.
+ * perf_event_init_task below, used by fork() in case of fail.
  */
 void perf_event_free_task(struct task_struct *task)
 {
-	struct perf_event_context *ctx = task->perf_event_ctxp;
+	struct perf_event_context *ctx;
 	struct perf_event *event, *tmp;
+	int ctxn;
 
-	if (!ctx)
-		return;
+	for_each_task_context_nr(ctxn) {
+		ctx = task->perf_event_ctxp[ctxn];
+		if (!ctx)
+			continue;
 
-	mutex_lock(&ctx->mutex);
+		mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
-			group_entry)
-		perf_free_event(event, ctx);
+		list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				group_entry)
+			perf_free_event(event, ctx);
 
-	if (!list_empty(&ctx->pinned_groups) ||
-	    !list_empty(&ctx->flexible_groups))
-		goto again;
+		if (!list_empty(&ctx->pinned_groups) ||
+				!list_empty(&ctx->flexible_groups))
+			goto again;
 
-	mutex_unlock(&ctx->mutex);
+		mutex_unlock(&ctx->mutex);
 
-	put_ctx(ctx);
+		put_ctx(ctx);
+	}
 }
 
 /*
@@ -5863,17 +5972,18 @@ static int inherit_group(struct perf_event *parent_event,
 static int
 inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		   struct perf_event_context *parent_ctx,
-		   struct task_struct *child,
+		   struct task_struct *child, int ctxn,
 		   int *inherited_all)
 {
 	int ret;
-	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+	struct perf_event_context *child_ctx;
 
 	if (!event->attr.inherit) {
 		*inherited_all = 0;
 		return 0;
 	}
 
+	child_ctx = child->perf_event_ctxp[ctxn];
 	if (!child_ctx) {
 		/*
 		 * This is executed from the parent task context, so
@@ -5886,7 +5996,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 		if (!child_ctx)
 			return -ENOMEM;
 
-		child->perf_event_ctxp = child_ctx;
+		child->perf_event_ctxp[ctxn] = child_ctx;
 	}
 
 	ret = inherit_group(event, parent, parent_ctx,
@@ -5901,7 +6011,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
 /*
  * Initialize the perf_event context in task_struct
  */
-int perf_event_init_task(struct task_struct *child)
+int perf_event_init_context(struct task_struct *child, int ctxn)
 {
 	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
@@ -5910,19 +6020,19 @@ int perf_event_init_task(struct task_struct *child)
 	int inherited_all = 1;
 	int ret = 0;
 
-	child->perf_event_ctxp = NULL;
+	child->perf_event_ctxp[ctxn] = NULL;
 
 	mutex_init(&child->perf_event_mutex);
 	INIT_LIST_HEAD(&child->perf_event_list);
 
-	if (likely(!parent->perf_event_ctxp))
+	if (likely(!parent->perf_event_ctxp[ctxn]))
 		return 0;
 
 	/*
 	 * If the parent's context is a clone, pin it so it won't get
 	 * swapped under us.
 	 */
-	parent_ctx = perf_pin_task_context(parent);
+	parent_ctx = perf_pin_task_context(parent, ctxn);
 
 	/*
 	 * No need to check if parent_ctx != NULL here; since we saw
@@ -5942,20 +6052,20 @@ int perf_event_init_task(struct task_struct *child)
 	 * the list, not manipulating it:
 	 */
 	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
 	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
-		ret = inherit_task_group(event, parent, parent_ctx, child,
-					 &inherited_all);
+		ret = inherit_task_group(event, parent, parent_ctx,
+					 child, ctxn, &inherited_all);
 		if (ret)
 			break;
 	}
 
-	child_ctx = child->perf_event_ctxp;
+	child_ctx = child->perf_event_ctxp[ctxn];
 
 	if (child_ctx && inherited_all) {
 		/*
@@ -5984,6 +6094,22 @@ int perf_event_init_task(struct task_struct *child)
 	return ret;
 }
 
+/*
+ * Initialize the perf_event context in task_struct
+ */
+int perf_event_init_task(struct task_struct *child)
+{
+	int ctxn, ret;
+
+	for_each_task_context_nr(ctxn) {
+		ret = perf_event_init_context(child, ctxn);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static void __init perf_event_init_all_cpus(void)
 {
 	struct swevent_htable *swhash;