Diffstat (limited to 'kernel/perf_event.c')
-rw-r--r--	kernel/perf_event.c	744
1 file changed, 464 insertions(+), 280 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..574ee58a3046 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly;
  */
 int sysctl_perf_event_paranoid __read_mostly = 1;
 
-static inline bool perf_paranoid_tracepoint_raw(void)
-{
-	return sysctl_perf_event_paranoid > -1;
-}
-
-static inline bool perf_paranoid_cpu(void)
-{
-	return sysctl_perf_event_paranoid > 0;
-}
-
-static inline bool perf_paranoid_kernel(void)
-{
-	return sysctl_perf_event_paranoid > 1;
-}
-
 int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
 
 /*
@@ -96,13 +81,10 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
 void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
-void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
-void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }
-
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx, int cpu)
+	       struct perf_event_context *ctx)
 {
 	return 0;
 }
@@ -111,25 +93,15 @@ void __weak perf_event_print_debug(void) { }
 
 static DEFINE_PER_CPU(int, perf_disable_count);
 
-void __perf_disable(void)
-{
-	__get_cpu_var(perf_disable_count)++;
-}
-
-bool __perf_enable(void)
-{
-	return !--__get_cpu_var(perf_disable_count);
-}
-
 void perf_disable(void)
 {
-	__perf_disable();
-	hw_perf_disable();
+	if (!__get_cpu_var(perf_disable_count)++)
+		hw_perf_disable();
 }
 
 void perf_enable(void)
 {
-	if (__perf_enable())
-		hw_perf_enable();
+	if (!--__get_cpu_var(perf_disable_count))
+		hw_perf_enable();
 }
 
@@ -248,7 +220,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 
 static inline u64 perf_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return cpu_clock(raw_smp_processor_id());
 }
 
 /*
@@ -289,6 +261,15 @@ static void update_event_times(struct perf_event *event)
 	event->total_time_running = run_end - event->tstamp_running;
 }
 
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+	if (event->attr.pinned)
+		return &ctx->pinned_groups;
+	else
+		return &ctx->flexible_groups;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +284,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * add it straight to the context's event list, or to the group
 	 * leader's sibling list:
 	 */
-	if (group_leader == event)
-		list_add_tail(&event->group_entry, &ctx->group_list);
-	else {
+	if (group_leader == event) {
+		struct list_head *list;
+
+		if (is_software_event(event))
+			event->group_flags |= PERF_GROUP_SOFTWARE;
+
+		list = ctx_group_list(event, ctx);
+		list_add_tail(&event->group_entry, list);
+	} else {
+		if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+		    !is_software_event(event))
+			group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
 		list_add_tail(&event->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
@@ -355,9 +346,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * to the context list directly:
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+		struct list_head *list;
 
-		list_move_tail(&sibling->group_entry, &ctx->group_list);
+		list = ctx_group_list(event, ctx);
+		list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
+
+		/* Inherit group flags from the previous leader */
+		sibling->group_flags = event->group_flags;
 	}
 }
 
@@ -608,14 +604,13 @@ void perf_event_disable(struct perf_event *event)
 static int
 event_sched_in(struct perf_event *event,
 		 struct perf_cpu_context *cpuctx,
-		 struct perf_event_context *ctx,
-		 int cpu)
+		 struct perf_event_context *ctx)
 {
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	event->oncpu = smp_processor_id();
 	/*
 	 * The new state must be visible before we turn it on in the hardware:
 	 */
@@ -642,8 +637,7 @@ event_sched_in(struct perf_event *event,
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx,
-	       int cpu)
+	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group;
 	int ret;
@@ -651,18 +645,18 @@ group_sched_in(struct perf_event *group_event,
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 	if (ret)
 		return ret < 0 ? ret : 0;
 
-	if (event_sched_in(group_event, cpuctx, ctx, cpu))
+	if (event_sched_in(group_event, cpuctx, ctx))
 		return -EAGAIN;
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx, cpu)) {
+		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
@@ -686,24 +680,6 @@ group_error:
 }
 
 /*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	if (!is_software_event(leader))
-		return 0;
-
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		if (!is_software_event(event))
-			return 0;
-
-	return 1;
-}
-
-/*
  * Work out whether we can put this event group on the CPU now.
  */
 static int group_can_go_on(struct perf_event *event,
@@ -713,7 +689,7 @@ static int group_can_go_on(struct perf_event *event,
 	/*
 	 * Groups consisting entirely of software events can always go on.
 	 */
-	if (is_software_only_group(event))
+	if (event->group_flags & PERF_GROUP_SOFTWARE)
 		return 1;
 	/*
 	 * If an exclusive group is already on, no other hardware
@@ -754,7 +730,6 @@ static void __perf_install_in_context(void *info)
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
-	int cpu = smp_processor_id();
 	int err;
 
 	/*
@@ -801,7 +776,7 @@ static void __perf_install_in_context(void *info)
 	if (!group_can_go_on(event, cpuctx, 1))
 		err = -EEXIST;
 	else
-		err = event_sched_in(event, cpuctx, ctx, cpu);
+		err = event_sched_in(event, cpuctx, ctx);
 
 	if (err) {
 		/*
@@ -943,11 +918,9 @@ static void __perf_event_enable(void *info)
 	} else {
 		perf_disable();
 		if (event == leader)
-			err = group_sched_in(event, cpuctx, ctx,
-					     smp_processor_id());
+			err = group_sched_in(event, cpuctx, ctx);
 		else
-			err = event_sched_in(event, cpuctx, ctx,
-					     smp_processor_id());
+			err = event_sched_in(event, cpuctx, ctx);
 		perf_enable();
 	}
 
@@ -1043,8 +1016,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-void __perf_event_sched_out(struct perf_event_context *ctx,
-			    struct perf_cpu_context *cpuctx)
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type)
 {
 	struct perf_event *event;
 
@@ -1055,10 +1035,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active) {
-		list_for_each_entry(event, &ctx->group_list, group_entry)
+	if (!ctx->nr_active)
+		goto out_enable;
+
+	if (event_type & EVENT_PINNED)
+		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-	}
+
+	if (event_type & EVENT_FLEXIBLE)
+		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+			group_sched_out(event, cpuctx, ctx);
+
+ out_enable:
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1158,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
  * not restart the event.
  */
 void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next, int cpu)
+				 struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
@@ -1220,15 +1208,13 @@ void perf_event_task_sched_out(struct task_struct *task,
 	rcu_read_unlock();
 
 	if (do_switch) {
-		__perf_event_sched_out(ctx, cpuctx);
+		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 		cpuctx->task_ctx = NULL;
 	}
 }
 
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
@@ -1238,47 +1224,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	__perf_event_sched_out(ctx, cpuctx);
+	ctx_sched_out(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = NULL;
 }
 
 /*
  * Called with IRQs disabled
  */
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 {
-	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
+	task_ctx_sched_out(ctx, EVENT_ALL);
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type)
+{
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
 static void
-__perf_event_sched_in(struct perf_event_context *ctx,
-			struct perf_cpu_context *cpuctx, int cpu)
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+		    struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
-	int can_add_hw = 1;
-
-	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	if (likely(!ctx->nr_events))
-		goto out;
-
-	ctx->timestamp = perf_clock();
 
-	perf_disable();
-
-	/*
-	 * First go through the list and put on any pinned groups
-	 * in order to give them the best chance of going on.
-	 */
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    !event->attr.pinned)
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx, cpu);
+			group_sched_in(event, cpuctx, ctx);
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1269,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
+}
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		/*
-		 * Ignore events in OFF or ERROR state, and
-		 * ignore pinned events since we did them already.
-		 */
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    event->attr.pinned)
-			continue;
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *event;
+	int can_add_hw = 1;
 
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		/* Ignore events in OFF or ERROR state */
+		if (event->state <= PERF_EVENT_STATE_OFF)
+			continue;
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, can_add_hw))
-			if (group_sched_in(event, cpuctx, ctx, cpu))
+			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
 	}
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+	     struct perf_cpu_context *cpuctx,
+	     enum event_type_t event_type)
+{
+	raw_spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_events))
+		goto out;
+
+	ctx->timestamp = perf_clock();
+
+	perf_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	if (event_type & EVENT_PINNED)
+		ctx_pinned_sched_in(ctx, cpuctx);
+
+	/* Then walk through the lower prio flexible groups */
+	if (event_type & EVENT_FLEXIBLE)
+		ctx_flexible_sched_in(ctx, cpuctx);
+
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type)
+{
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	ctx_sched_in(ctx, cpuctx, event_type);
+}
+
+static void task_ctx_sched_in(struct task_struct *task,
+			      enum event_type_t event_type)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+
+	if (likely(!ctx))
+		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
+	ctx_sched_in(ctx, cpuctx, event_type);
+	cpuctx->task_ctx = ctx;
+}
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -1326,38 +1357,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 
 	if (likely(!ctx))
 		return;
+
 	if (cpuctx->task_ctx == ctx)
 		return;
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+
+	/*
+	 * We want to keep the following priority order:
+	 * cpu pinned (that don't need to move), task pinned,
+	 * cpu flexible, task flexible.
+	 */
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	u64 frequency = event->attr.sample_freq;
+	u64 sec = NSEC_PER_SEC;
+	u64 divisor, dividend;
+
+	int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+	count_fls = fls64(count);
+	nsec_fls = fls64(nsec);
+	frequency_fls = fls64(frequency);
+	sec_fls = 30;
+
+	/*
+	 * We got @count in @nsec, with a target of sample_freq HZ
+	 * the target period becomes:
+	 *
+	 *             @count * 10^9
+	 * period = -------------------
+	 *          @nsec * sample_freq
+	 *
+	 */
+
+	/*
+	 * Reduce accuracy by one bit such that @a and @b converge
+	 * to a similar magnitude.
+	 */
+#define REDUCE_FLS(a, b) 		\
+do {					\
+	if (a##_fls > b##_fls) {	\
+		a >>= 1;		\
+		a##_fls--;		\
+	} else {			\
+		b >>= 1;		\
+		b##_fls--;		\
+	}				\
+} while (0)
+
+	/*
+	 * Reduce accuracy until either term fits in a u64, then proceed with
+	 * the other, so that finally we can do a u64/u64 division.
+	 */
+	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+		REDUCE_FLS(nsec, frequency);
+		REDUCE_FLS(sec, count);
+	}
+
+	if (count_fls + sec_fls > 64) {
+		divisor = nsec * frequency;
+
+		while (count_fls + sec_fls > 64) {
+			REDUCE_FLS(count, sec);
+			divisor >>= 1;
+		}
+
+		dividend = count * sec;
+	} else {
+		dividend = count * sec;
 
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+		while (nsec_fls + frequency_fls > 64) {
+			REDUCE_FLS(nsec, frequency);
+			dividend >>= 1;
+		}
+
+		divisor = nsec * frequency;
+	}
+
+	return div64_u64(dividend, divisor);
 }
 
-#define MAX_INTERRUPTS (~0ULL)
+static void perf_event_stop(struct perf_event *event)
+{
+	if (!event->pmu->stop)
+		return event->pmu->disable(event);
 
-static void perf_log_throttle(struct perf_event *event, int enable);
+	return event->pmu->stop(event);
+}
 
-static void perf_adjust_period(struct perf_event *event, u64 events)
+static int perf_event_start(struct perf_event *event)
+{
+	if (!event->pmu->start)
+		return event->pmu->enable(event);
+
+	return event->pmu->start(event);
+}
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period, sample_period;
 	s64 delta;
 
-	events *= hwc->sample_period;
-	period = div64_u64(events, event->attr.sample_freq);
+	period = perf_calculate_period(event, nsec, count);
 
 	delta = (s64)(period - hwc->sample_period);
 	delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1489,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
 		sample_period = 1;
 
 	hwc->sample_period = sample_period;
+
+	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+		perf_disable();
+		perf_event_stop(event);
+		atomic64_set(&hwc->period_left, 0);
+		perf_event_start(event);
+		perf_enable();
+	}
 }
 
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	u64 interrupts, freq;
+	u64 interrupts, now;
+	s64 delta;
 
 	raw_spin_lock(&ctx->lock);
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1394,45 +1524,23 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
+			perf_disable();
 			event->pmu->unthrottle(event);
-			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
+			perf_enable();
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		/*
-		 * if the specified freq < HZ then we need to skip ticks
-		 */
-		if (event->attr.sample_freq < HZ) {
-			freq = event->attr.sample_freq;
-
-			hwc->freq_count += freq;
-			hwc->freq_interrupts += interrupts;
-
-			if (hwc->freq_count < HZ)
-				continue;
-
-			interrupts = hwc->freq_interrupts;
-			hwc->freq_interrupts = 0;
-			hwc->freq_count -= HZ;
-		} else
-			freq = HZ;
-
-		perf_adjust_period(event, freq * interrupts);
+		perf_disable();
+		event->pmu->read(event);
+		now = atomic64_read(&event->count);
+		delta = now - hwc->freq_count_stamp;
+		hwc->freq_count_stamp = now;
 
-		/*
-		 * In order to avoid being stalled by an (accidental) huge
-		 * sample period, force reset the sample period if we didn't
-		 * get any events in this freq period.
-		 */
-		if (!interrupts) {
-			perf_disable();
-			event->pmu->disable(event);
-			atomic64_set(&hwc->period_left, 0);
-			event->pmu->enable(event);
-			perf_enable();
-		}
+		if (delta > 0)
+			perf_adjust_period(event, TICK_NSEC, delta);
+		perf_enable();
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1442,51 +1550,67 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
  */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-	struct perf_event *event;
-
-	if (!ctx->nr_events)
-		return;
-
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Rotate the first entry last (works just fine for group events too):
-	 */
-	perf_disable();
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->group_list);
-		break;
-	}
-	perf_enable();
+
+	/* Rotate the first entry last of non-pinned groups */
+	list_rotate_left(&ctx->flexible_groups);
 
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+void perf_event_task_tick(struct task_struct *curr)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
+	int rotate = 0;
 
 	if (!atomic_read(&nr_events))
 		return;
 
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	cpuctx = &__get_cpu_var(perf_cpu_context);
+	if (cpuctx->ctx.nr_events &&
+	    cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
+		rotate = 1;
+
 	ctx = curr->perf_event_ctxp;
+	if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active)
+		rotate = 1;
 
 	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
 
-	perf_event_cpu_sched_out(cpuctx);
+	if (!rotate)
+		return;
+
+	perf_disable();
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		__perf_event_task_sched_out(ctx);
+		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	perf_event_cpu_sched_in(cpuctx, cpu);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		perf_event_task_sched_in(curr, cpu);
+		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+	perf_enable();
+}
+
+static int event_enable_on_exec(struct perf_event *event,
+				struct perf_event_context *ctx)
+{
+	if (!event->attr.enable_on_exec)
+		return 0;
+
+	event->attr.enable_on_exec = 0;
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		return 0;
+
+	__perf_event_mark_enabled(event, ctx);
+
+	return 1;
 }
 
 /*
@@ -1499,6 +1623,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
+	int ret;
 
 	local_irq_save(flags);
 	ctx = task->perf_event_ctxp;
@@ -1509,14 +1634,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_lock(&ctx->lock);
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (!event->attr.enable_on_exec)
-			continue;
-		event->attr.enable_on_exec = 0;
-		if (event->state >= PERF_EVENT_STATE_INACTIVE)
-			continue;
-		__perf_event_mark_enabled(event, ctx);
-		enabled = 1;
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
+	}
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
 	}
 
 	/*
@@ -1527,7 +1654,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task, smp_processor_id());
+	perf_event_task_sched_in(task);
  out:
 	local_irq_restore(flags);
 }
@@ -1590,7 +1717,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 {
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->group_list);
+	INIT_LIST_HEAD(&ctx->pinned_groups);
+	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 	ctx->task = task;
@@ -2462,7 +2590,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (user_locked > user_lock_limit)
 		extra = user_locked - user_lock_limit;
 
-	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit = rlimit(RLIMIT_MEMLOCK);
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->locked_vm + extra;
 
@@ -2658,6 +2786,13 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 	return NULL;
 }
 
+#ifdef CONFIG_EVENT_TRACING
+__weak
+void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+{
+}
+#endif
+
 /*
  * Output
  */
@@ -3608,7 +3743,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 			/* .tid */
 			.start  = vma->vm_start,
 			.len    = vma->vm_end - vma->vm_start,
-			.pgoff  = vma->vm_pgoff,
+			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
 		},
 	};
 
@@ -3688,12 +3823,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
-		s64 delta = now - hwc->freq_stamp;
+		s64 delta = now - hwc->freq_time_stamp;
 
-		hwc->freq_stamp = now;
+		hwc->freq_time_stamp = now;
 
-		if (delta > 0 && delta < TICK_NSEC)
-			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+		if (delta > 0 && delta < 2*TICK_NSEC)
+			perf_adjust_period(event, delta, hwc->last_period);
 	}
 
 	/*
@@ -3975,8 +4110,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
 	if (rctx < 0)
 		return;
 
-	data.addr = addr;
-	data.raw = NULL;
+	perf_sample_data_init(&data, addr);
 
 	do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs);
 
@@ -4021,11 +4155,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	struct perf_event *event;
 	u64 period;
 
 	event = container_of(hrtimer, struct perf_event, hw.hrtimer);
 	event->pmu->read(event);
 
-	data.addr = 0;
-	data.raw = NULL;
+	perf_sample_data_init(&data, 0);
 	data.period = event->hw.last_period;
 	regs = get_irq_regs();
 	/*
@@ -4184,29 +4317,23 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
-			  int entry_size)
+		   int entry_size, struct pt_regs *regs)
 {
+	struct perf_sample_data data;
 	struct perf_raw_record raw = {
 		.size = entry_size,
 		.data = record,
 	};
 
-	struct perf_sample_data data = {
-		.addr = addr,
-		.raw = &raw,
-	};
-
-	struct pt_regs *regs = get_irq_regs();
-
-	if (!regs)
-		regs = task_pt_regs(current);
+	perf_sample_data_init(&data, addr);
+	data.raw = &raw;
 
 	/* Trace events already protected against recursion */
 	do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
 			 &data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tp_event);
 
@@ -4222,7 +4349,7 @@ static int perf_tp_event_match(struct perf_event *event,
 
 static void tp_perf_event_destroy(struct perf_event *event)
 {
-	ftrace_profile_disable(event->attr.config);
+	perf_trace_disable(event->attr.config);
 }
 
 static const struct pmu *tp_perf_event_init(struct perf_event *event)
@@ -4236,7 +4363,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event)
 	    !capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	if (ftrace_profile_enable(event->attr.config))
+	if (perf_trace_enable(event->attr.config))
 		return NULL;
 
 	event->destroy = tp_perf_event_destroy;
@@ -4289,7 +4416,7 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 static void bp_perf_event_destroy(struct perf_event *event)
@@ -4315,8 +4442,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
 	struct perf_sample_data sample;
 	struct pt_regs *regs = data;
 
-	sample.raw = NULL;
-	sample.addr = bp->attr.bp_addr;
+	perf_sample_data_init(&sample, bp->attr.bp_addr);
 
 	if (!perf_exclude_event(bp, regs))
 		perf_swevent_add(bp, 1, 1, &sample, regs);
@@ -4870,8 +4996,15 @@ inherit_event(struct perf_event *parent_event,
 	else
 		child_event->state = PERF_EVENT_STATE_OFF;
 
-	if (parent_event->attr.freq)
-		child_event->hw.sample_period = parent_event->hw.sample_period;
+	if (parent_event->attr.freq) {
+		u64 sample_period = parent_event->hw.sample_period;
+		struct hw_perf_event *hwc = &child_event->hw;
+
+		hwc->sample_period = sample_period;
+		hwc->last_period   = sample_period;
+
+		atomic64_set(&hwc->period_left, sample_period);
+	}
 
 	child_event->overflow_handler = parent_event->overflow_handler;
 
@@ -5039,7 +5172,11 @@ void perf_event_exit_task(struct task_struct *child)
 	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
-	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
+				 group_entry)
+		__perf_event_exit_task(child_event, child_ctx, child);
+
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
 				 group_entry)
 		__perf_event_exit_task(child_event, child_ctx, child);
 
@@ -5048,7 +5185,8 @@ again:
 	 * its siblings to the list, but we obtained 'tmp' before that which
 	 * will still point to the list head terminating the iteration.
 	 */
-	if (!list_empty(&child_ctx->group_list))
+	if (!list_empty(&child_ctx->pinned_groups) ||
+	    !list_empty(&child_ctx->flexible_groups))
 		goto again;
 
 	mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5194,24 @@ again:
 	put_ctx(child_ctx);
 }
 
+static void perf_free_event(struct perf_event *event,
+			    struct perf_event_context *ctx)
+{
+	struct perf_event *parent = event->parent;
+
+	if (WARN_ON_ONCE(!parent))
+		return;
+
+	mutex_lock(&parent->child_mutex);
+	list_del_init(&event->child_list);
+	mutex_unlock(&parent->child_mutex);
+
+	fput(parent->filp);
+
+	list_del_event(event, ctx);
+	free_event(event);
+}
+
 /*
  * free an unexposed, unused context as created by inheritance by
  * init_task below, used by fork() in case of fail.
@@ -5070,36 +5226,70 @@ void perf_event_free_task(struct task_struct *task)
 
 	mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
-		struct perf_event *parent = event->parent;
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		perf_free_event(event, ctx);
 
-		if (WARN_ON_ONCE(!parent))
-			continue;
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				 group_entry)
+		perf_free_event(event, ctx);
 
-		mutex_lock(&parent->child_mutex);
-		list_del_init(&event->child_list);
-		mutex_unlock(&parent->child_mutex);
+	if (!list_empty(&ctx->pinned_groups) ||
+	    !list_empty(&ctx->flexible_groups))
+		goto again;
 
-		fput(parent->filp);
+	mutex_unlock(&ctx->mutex);
 
-		list_del_event(event, ctx);
-		free_event(event);
+	put_ctx(ctx);
+}
+
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+		   struct perf_event_context *parent_ctx,
+		   struct task_struct *child,
+		   int *inherited_all)
+{
+	int ret;
+	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+
+	if (!event->attr.inherit) {
+		*inherited_all = 0;
+		return 0;
 	}
 
-	if (!list_empty(&ctx->group_list))
-		goto again;
+	if (!child_ctx) {
+		/*
+		 * This is executed from the parent task context, so
+		 * inherit events that have been marked for cloning.
+		 * First allocate and initialize a context for the
+		 * child.
+		 */
 
-	mutex_unlock(&ctx->mutex);
+		child_ctx = kzalloc(sizeof(struct perf_event_context),
+				    GFP_KERNEL);
+		if (!child_ctx)
+			return -ENOMEM;
 
-	put_ctx(ctx);
+		__perf_event_init_context(child_ctx, child);
+		child->perf_event_ctxp = child_ctx;
+		get_task_struct(child);
+	}
+
+	ret = inherit_group(event, parent, parent_ctx,
+			    child, child_ctx);
+
+	if (ret)
+		*inherited_all = 0;
+
+	return ret;
 }
 
+
 /*
  * Initialize the perf_event context in task_struct
  */
 int perf_event_init_task(struct task_struct *child)
 {
-	struct perf_event_context *child_ctx = NULL, *parent_ctx;
+	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
 	struct perf_event *event;
 	struct task_struct *parent = current;
@@ -5137,41 +5327,22 @@ int perf_event_init_task(struct task_struct *child)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
-
-		if (!event->attr.inherit) {
-			inherited_all = 0;
-			continue;
-		}
-
-		if (!child->perf_event_ctxp) {
-			/*
-			 * This is executed from the parent task context, so
-			 * inherit events that have been marked for cloning.
-			 * First allocate and initialize a context for the
-			 * child.
-			 */
-
-			child_ctx = kzalloc(sizeof(struct perf_event_context),
-					    GFP_KERNEL);
-			if (!child_ctx) {
-				ret = -ENOMEM;
-				break;
-			}
-
-			__perf_event_init_context(child_ctx, child);
-			child->perf_event_ctxp = child_ctx;
-			get_task_struct(child);
-		}
+	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
+	}
 
-		ret = inherit_group(event, parent, parent_ctx,
-					   child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
-			break;
-		}
+	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
 	}
 
+	child_ctx = child->perf_event_ctxp;
+
 	if (child_ctx && inherited_all) {
 		/*
 		 * Mark the child context as a clone of the parent
@@ -5199,18 +5370,26 @@ int perf_event_init_task(struct task_struct *child)
 	return ret;
 }
 
+static void __init perf_event_init_all_cpus(void)
+{
+	int cpu;
+	struct perf_cpu_context *cpuctx;
+
+	for_each_possible_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		__perf_event_init_context(&cpuctx->ctx, NULL);
+	}
+}
+
 static void __cpuinit perf_event_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	__perf_event_init_context(&cpuctx->ctx, NULL);
 
 	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_events - perf_reserved_percpu;
 	spin_unlock(&perf_resource_lock);
-
-	hw_perf_event_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5220,7 +5399,9 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		__perf_event_remove_from_context(event);
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
@@ -5248,11 +5429,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_init_cpu(cpu);
 		break;
 
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		hw_perf_event_setup_online(cpu);
-		break;
-
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		perf_event_exit_cpu(cpu);
@@ -5275,6 +5451,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 
 void __init perf_event_init(void)
 {
+	perf_event_init_all_cpus();
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
@@ -5282,13 +5459,16 @@ void __init perf_event_init(void)
 	register_cpu_notifier(&perf_cpu_nb);
 }
 
-static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class,
+					struct sysdev_class_attribute *attr,
+					char *buf)
 {
 	return sprintf(buf, "%d\n", perf_reserved_percpu);
 }
 
 static ssize_t
 perf_set_reserve_percpu(struct sysdev_class *class,
+			struct sysdev_class_attribute *attr,
 			const char *buf,
 			size_t count)
 {
@@ -5317,13 +5497,17 @@ perf_set_reserve_percpu(struct sysdev_class *class,
 	return count;
 }
 
-static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+static ssize_t perf_show_overcommit(struct sysdev_class *class,
+				    struct sysdev_class_attribute *attr,
+				    char *buf)
 {
 	return sprintf(buf, "%d\n", perf_overcommit);
 }
 
 static ssize_t
-perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+perf_set_overcommit(struct sysdev_class *class,
+		    struct sysdev_class_attribute *attr,
+		    const char *buf, size_t count)
 {
 	unsigned long val;
 	int err;