Diffstat (limited to 'kernel/perf_event.c')

 kernel/perf_event.c | 627
 1 file changed, 413 insertions(+), 214 deletions(-)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..a661e7991865 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
 
 void __weak hw_perf_event_setup(int cpu) { barrier(); }
 void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
+void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
 
 int __weak
 hw_perf_group_sched_in(struct perf_event *group_leader,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx, int cpu)
+	       struct perf_event_context *ctx)
 {
 	return 0;
 }
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
 
 static inline u64 perf_clock(void)
 {
-	return cpu_clock(smp_processor_id());
+	return cpu_clock(raw_smp_processor_id());
 }
 
 /*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
 	event->total_time_running = run_end - event->tstamp_running;
 }
 
+static struct list_head *
+ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
+{
+	if (event->attr.pinned)
+		return &ctx->pinned_groups;
+	else
+		return &ctx->flexible_groups;
+}
+
 /*
  * Add a event from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * add it straight to the context's event list, or to the group
 	 * leader's sibling list:
 	 */
-	if (group_leader == event)
-		list_add_tail(&event->group_entry, &ctx->group_list);
-	else {
+	if (group_leader == event) {
+		struct list_head *list;
+
+		if (is_software_event(event))
+			event->group_flags |= PERF_GROUP_SOFTWARE;
+
+		list = ctx_group_list(event, ctx);
+		list_add_tail(&event->group_entry, list);
+	} else {
+		if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
+		    !is_software_event(event))
+			group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
+
 		list_add_tail(&event->group_entry, &group_leader->sibling_list);
 		group_leader->nr_siblings++;
 	}
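
A note on the group_flags logic above: rather than walking every sibling at schedule time to decide whether a group is purely software (the old is_software_only_group(), removed further down), the list_add path now maintains the answer incrementally, so group_can_go_on() becomes a single flag test. A minimal userspace sketch of the same invariant, using hypothetical stand-ins for struct perf_event and PERF_GROUP_SOFTWARE:

#include <stdbool.h>

struct event { bool is_software; };

struct group {
	bool all_software;	/* plays the role of PERF_GROUP_SOFTWARE */
	int  nr_members;
};

/* The leader seeds the invariant from its own type. */
static void group_init(struct group *g, const struct event *leader)
{
	g->all_software = leader->is_software;
	g->nr_members   = 1;
}

/* Adding a member can only clear the flag, never set it, which is what
 * makes the O(1) test in group_can_go_on() sound. */
static void group_add(struct group *g, const struct event *e)
{
	if (!e->is_software)
		g->all_software = false;
	g->nr_members++;
}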
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 * to the context list directly:
 	 */
 	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
+		struct list_head *list;
 
-		list_move_tail(&sibling->group_entry, &ctx->group_list);
+		list = ctx_group_list(event, ctx);
+		list_move_tail(&sibling->group_entry, list);
 		sibling->group_leader = sibling;
+
+		/* Inherit group flags from the previous leader */
+		sibling->group_flags = event->group_flags;
 	}
 }
 
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
 static int
 event_sched_in(struct perf_event *event,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx,
-	       int cpu)
+	       struct perf_event_context *ctx)
 {
 	if (event->state <= PERF_EVENT_STATE_OFF)
 		return 0;
 
 	event->state = PERF_EVENT_STATE_ACTIVE;
-	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	event->oncpu = smp_processor_id();
 	/*
 	 * The new state must be visible before we turn it on in the hardware:
 	 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
 static int
 group_sched_in(struct perf_event *group_event,
 	       struct perf_cpu_context *cpuctx,
-	       struct perf_event_context *ctx,
-	       int cpu)
+	       struct perf_event_context *ctx)
 {
 	struct perf_event *event, *partial_group;
 	int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
 	if (group_event->state == PERF_EVENT_STATE_OFF)
 		return 0;
 
-	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
+	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
 	if (ret)
 		return ret < 0 ? ret : 0;
 
-	if (event_sched_in(group_event, cpuctx, ctx, cpu))
+	if (event_sched_in(group_event, cpuctx, ctx))
 		return -EAGAIN;
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
-		if (event_sched_in(event, cpuctx, ctx, cpu)) {
+		if (event_sched_in(event, cpuctx, ctx)) {
 			partial_group = event;
 			goto group_error;
 		}
@@ -686,24 +709,6 @@ group_error:
 }
 
 /*
- * Return 1 for a group consisting entirely of software events,
- * 0 if the group contains any hardware events.
- */
-static int is_software_only_group(struct perf_event *leader)
-{
-	struct perf_event *event;
-
-	if (!is_software_event(leader))
-		return 0;
-
-	list_for_each_entry(event, &leader->sibling_list, group_entry)
-		if (!is_software_event(event))
-			return 0;
-
-	return 1;
-}
-
-/*
  * Work out whether we can put this event group on the CPU now.
  */
 static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
 	/*
 	 * Groups consisting entirely of software events can always go on.
 	 */
-	if (is_software_only_group(event))
+	if (event->group_flags & PERF_GROUP_SOFTWARE)
 		return 1;
 	/*
 	 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *leader = event->group_leader;
-	int cpu = smp_processor_id();
 	int err;
 
 	/*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
 	if (!group_can_go_on(event, cpuctx, 1))
 		err = -EEXIST;
 	else
-		err = event_sched_in(event, cpuctx, ctx, cpu);
+		err = event_sched_in(event, cpuctx, ctx);
 
 	if (err) {
 		/*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
 	} else {
 		perf_disable();
 		if (event == leader)
-			err = group_sched_in(event, cpuctx, ctx,
-					     smp_processor_id());
+			err = group_sched_in(event, cpuctx, ctx);
 		else
-			err = event_sched_in(event, cpuctx, ctx,
-					    smp_processor_id());
+			err = event_sched_in(event, cpuctx, ctx);
 		perf_enable();
 	}
 
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 	return 0;
 }
 
-void __perf_event_sched_out(struct perf_event_context *ctx,
-			    struct perf_cpu_context *cpuctx)
+enum event_type_t {
+	EVENT_FLEXIBLE = 0x1,
+	EVENT_PINNED = 0x2,
+	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
+};
+
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type)
 {
 	struct perf_event *event;
 
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active) {
-		list_for_each_entry(event, &ctx->group_list, group_entry)
+	if (!ctx->nr_active)
+		goto out_enable;
+
+	if (event_type & EVENT_PINNED)
+		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
-	}
+
+	if (event_type & EVENT_FLEXIBLE)
+		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
+			group_sched_out(event, cpuctx, ctx);
+
+ out_enable:
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
  * not restart the event.
  */
 void perf_event_task_sched_out(struct task_struct *task,
-			       struct task_struct *next, int cpu)
+			       struct task_struct *next)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 	struct perf_event_context *next_ctx;
 	struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
 	rcu_read_unlock();
 
 	if (do_switch) {
-		__perf_event_sched_out(ctx, cpuctx);
+		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 		cpuctx->task_ctx = NULL;
 	}
 }
 
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+static void task_ctx_sched_out(struct perf_event_context *ctx,
+			       enum event_type_t event_type)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	__perf_event_sched_out(ctx, cpuctx);
+	ctx_sched_out(ctx, cpuctx, event_type);
 	cpuctx->task_ctx = NULL;
 }
 
 /*
  * Called with IRQs disabled
  */
-static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx)
+static void __perf_event_task_sched_out(struct perf_event_context *ctx)
+{
+	task_ctx_sched_out(ctx, EVENT_ALL);
+}
+
+/*
+ * Called with IRQs disabled
+ */
+static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			      enum event_type_t event_type)
 {
-	__perf_event_sched_out(&cpuctx->ctx, cpuctx);
+	ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
 }
 
 static void
-__perf_event_sched_in(struct perf_event_context *ctx,
-			struct perf_cpu_context *cpuctx, int cpu)
+ctx_pinned_sched_in(struct perf_event_context *ctx,
+		    struct perf_cpu_context *cpuctx)
 {
 	struct perf_event *event;
-	int can_add_hw = 1;
-
-	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
-	if (likely(!ctx->nr_events))
-		goto out;
 
-	ctx->timestamp = perf_clock();
-
-	perf_disable();
-
-	/*
-	 * First go through the list and put on any pinned groups
-	 * in order to give them the best chance of going on.
-	 */
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    !event->attr.pinned)
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		if (event->state <= PERF_EVENT_STATE_OFF)
 			continue;
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, 1))
-			group_sched_in(event, cpuctx, ctx, cpu);
+			group_sched_in(event, cpuctx, ctx);
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 			event->state = PERF_EVENT_STATE_ERROR;
 		}
 	}
+}
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		/*
-		 * Ignore events in OFF or ERROR state, and
-		 * ignore pinned events since we did them already.
-		 */
-		if (event->state <= PERF_EVENT_STATE_OFF ||
-		    event->attr.pinned)
-			continue;
+static void
+ctx_flexible_sched_in(struct perf_event_context *ctx,
+		      struct perf_cpu_context *cpuctx)
+{
+	struct perf_event *event;
+	int can_add_hw = 1;
 
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		/* Ignore events in OFF or ERROR state */
+		if (event->state <= PERF_EVENT_STATE_OFF)
+			continue;
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of events:
 		 */
-		if (event->cpu != -1 && event->cpu != cpu)
+		if (event->cpu != -1 && event->cpu != smp_processor_id())
 			continue;
 
 		if (group_can_go_on(event, cpuctx, can_add_hw))
-			if (group_sched_in(event, cpuctx, ctx, cpu))
+			if (group_sched_in(event, cpuctx, ctx))
 				can_add_hw = 0;
 	}
+}
+
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+	     struct perf_cpu_context *cpuctx,
+	     enum event_type_t event_type)
+{
+	raw_spin_lock(&ctx->lock);
+	ctx->is_active = 1;
+	if (likely(!ctx->nr_events))
+		goto out;
+
+	ctx->timestamp = perf_clock();
+
+	perf_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	if (event_type & EVENT_PINNED)
+		ctx_pinned_sched_in(ctx, cpuctx);
+
+	/* Then walk through the lower prio flexible groups */
+	if (event_type & EVENT_FLEXIBLE)
+		ctx_flexible_sched_in(ctx, cpuctx);
+
 	perf_enable();
  out:
 	raw_spin_unlock(&ctx->lock);
 }
 
+static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
+			     enum event_type_t event_type)
+{
+	struct perf_event_context *ctx = &cpuctx->ctx;
+
+	ctx_sched_in(ctx, cpuctx, event_type);
+}
+
+static void task_ctx_sched_in(struct task_struct *task,
+			      enum event_type_t event_type)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_event_context *ctx = task->perf_event_ctxp;
+
+	if (likely(!ctx))
+		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
+	ctx_sched_in(ctx, cpuctx, event_type);
+	cpuctx->task_ctx = ctx;
+}
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void perf_event_task_sched_in(struct task_struct *task, int cpu)
+void perf_event_task_sched_in(struct task_struct *task)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_event_context *ctx = task->perf_event_ctxp;
 
 	if (likely(!ctx))
 		return;
+
 	if (cpuctx->task_ctx == ctx)
 		return;
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+
+	/*
+	 * We want to keep the following priority order:
+	 * cpu pinned (that don't need to move), task pinned,
+	 * cpu flexible, task flexible.
+	 */
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
+
+	ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+
 	cpuctx->task_ctx = ctx;
 }
 
-static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_event *event, int enable);
+
+static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
 {
-	struct perf_event_context *ctx = &cpuctx->ctx;
+	u64 frequency = event->attr.sample_freq;
+	u64 sec = NSEC_PER_SEC;
+	u64 divisor, dividend;
+
+	int count_fls, nsec_fls, frequency_fls, sec_fls;
+
+	count_fls = fls64(count);
+	nsec_fls = fls64(nsec);
+	frequency_fls = fls64(frequency);
+	sec_fls = 30;
 
-	__perf_event_sched_in(ctx, cpuctx, cpu);
+	/*
+	 * We got @count in @nsec, with a target of sample_freq HZ
+	 * the target period becomes:
+	 *
+	 *             @count * 10^9
+	 * period = -------------------
+	 *          @nsec * sample_freq
+	 *
+	 */
+
+	/*
+	 * Reduce accuracy by one bit such that @a and @b converge
+	 * to a similar magnitude.
+	 */
+#define REDUCE_FLS(a, b)		\
+do {					\
+	if (a##_fls > b##_fls) {	\
+		a >>= 1;		\
+		a##_fls--;		\
+	} else {			\
+		b >>= 1;		\
+		b##_fls--;		\
+	}				\
+} while (0)
+
+	/*
+	 * Reduce accuracy until either term fits in a u64, then proceed with
+	 * the other, so that finally we can do a u64/u64 division.
+	 */
+	while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
+		REDUCE_FLS(nsec, frequency);
+		REDUCE_FLS(sec, count);
+	}
+
+	if (count_fls + sec_fls > 64) {
+		divisor = nsec * frequency;
+
+		while (count_fls + sec_fls > 64) {
+			REDUCE_FLS(count, sec);
+			divisor >>= 1;
+		}
+
+		dividend = count * sec;
+	} else {
+		dividend = count * sec;
+
+		while (nsec_fls + frequency_fls > 64) {
+			REDUCE_FLS(nsec, frequency);
+			dividend >>= 1;
+		}
+
+		divisor = nsec * frequency;
+	}
+
+	return div64_u64(dividend, divisor);
 }
 
-#define MAX_INTERRUPTS (~0ULL)
+static void perf_event_stop(struct perf_event *event)
+{
+	if (!event->pmu->stop)
+		return event->pmu->disable(event);
 
-static void perf_log_throttle(struct perf_event *event, int enable);
+	return event->pmu->stop(event);
+}
+
+static int perf_event_start(struct perf_event *event)
+{
+	if (!event->pmu->start)
+		return event->pmu->enable(event);
 
-static void perf_adjust_period(struct perf_event *event, u64 events)
+	return event->pmu->start(event);
+}
+
+static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period, sample_period;
 	s64 delta;
 
-	events *= hwc->sample_period;
-	period = div64_u64(events, event->attr.sample_freq);
+	period = perf_calculate_period(event, nsec, count);
 
 	delta = (s64)(period - hwc->sample_period);
 	delta = (delta + 7) / 8; /* low pass filter */
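
To put numbers on the period formula in the comment above: suppose an event counted @count = 10,000 events over a tick of @nsec = 10^7 ns (10 ms) and the user asked for sample_freq = 1000 samples/sec. Then period = 10^4 * 10^9 / (10^7 * 10^3) = 1000, i.e. the event fires 10^6 times per second, so sampling every 1000th firing yields the requested 1000 samples/sec. The REDUCE_FLS() shifting exists only because both count * NSEC_PER_SEC and nsec * frequency can overflow 64 bits; it throws away low bits until a plain u64/u64 division is safe. A standalone cross-check under those assumptions (not kernel code; uses gcc's unsigned __int128 to compute the exact value the bit-reduction approximates):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t count = 10000;		/* events counted this tick	*/
	uint64_t nsec  = 10000000;	/* tick length: 10 ms		*/
	uint64_t freq  = 1000;		/* attr.sample_freq, in Hz	*/

	unsigned __int128 dividend = (unsigned __int128)count * 1000000000u;
	unsigned __int128 divisor  = (unsigned __int128)nsec * freq;

	/* Prints "period = 1000" -- one sample every 1000 events. */
	printf("period = %llu\n", (unsigned long long)(dividend / divisor));
	return 0;
}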
@@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
 		sample_period = 1;
 
 	hwc->sample_period = sample_period;
+
+	if (atomic64_read(&hwc->period_left) > 8*sample_period) {
+		perf_disable();
+		perf_event_stop(event);
+		atomic64_set(&hwc->period_left, 0);
+		perf_event_start(event);
+		perf_enable();
+	}
 }
 
 static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 {
 	struct perf_event *event;
 	struct hw_perf_event *hwc;
-	u64 interrupts, freq;
+	u64 interrupts, now;
+	s64 delta;
 
 	raw_spin_lock(&ctx->lock);
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
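
Two guards in the hunk above deserve a gloss. First, delta = (delta + 7) / 8 is a low-pass filter: each tick moves sample_period only about an eighth of the way toward the freshly computed target, so one noisy measurement cannot yank the period around. Starting from sample_period = 200 with a steady target of 1000, successive ticks give 300, 388, 465, 532, ..., converging geometrically. Second, the new stop/reset/start block covers the case where a previously huge period left more than eight periods' worth of budget in period_left; without the reset the event could stay silent long after the period was corrected. A tiny sketch of the filter's convergence (illustrative only, one iteration per tick as in perf_ctx_adjust_freq()):

#include <stdio.h>

int main(void)
{
	long long sample_period = 200, target = 1000;

	for (int tick = 1; tick <= 8; tick++) {
		long long delta = target - sample_period;

		delta = (delta + 7) / 8;	/* the kernel's low pass filter */
		sample_period += delta;
		printf("tick %d: sample_period = %lld\n", tick, sample_period);
	}
	return 0;
}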
@@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(event, 1);
 			event->pmu->unthrottle(event);
-			interrupts = 2*sysctl_perf_event_sample_rate/HZ;
 		}
 
 		if (!event->attr.freq || !event->attr.sample_freq)
 			continue;
 
-		/*
-		 * if the specified freq < HZ then we need to skip ticks
-		 */
-		if (event->attr.sample_freq < HZ) {
-			freq = event->attr.sample_freq;
-
-			hwc->freq_count += freq;
-			hwc->freq_interrupts += interrupts;
-
-			if (hwc->freq_count < HZ)
-				continue;
-
-			interrupts = hwc->freq_interrupts;
-			hwc->freq_interrupts = 0;
-			hwc->freq_count -= HZ;
-		} else
-			freq = HZ;
-
-		perf_adjust_period(event, freq * interrupts);
+		event->pmu->read(event);
+		now = atomic64_read(&event->count);
+		delta = now - hwc->freq_count_stamp;
+		hwc->freq_count_stamp = now;
 
-		/*
-		 * In order to avoid being stalled by an (accidental) huge
-		 * sample period, force reset the sample period if we didn't
-		 * get any events in this freq period.
-		 */
-		if (!interrupts) {
-			perf_disable();
-			event->pmu->disable(event);
-			atomic64_set(&hwc->period_left, 0);
-			event->pmu->enable(event);
-			perf_enable();
-		}
 	}
 	raw_spin_unlock(&ctx->lock);
 }
@@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
  */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-	struct perf_event *event;
-
 	if (!ctx->nr_events)
 		return;
 
 	raw_spin_lock(&ctx->lock);
-	/*
-	 * Rotate the first entry last (works just fine for group events too):
-	 */
-	perf_disable();
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		list_move_tail(&event->group_entry, &ctx->group_list);
-		break;
-	}
-	perf_enable();
+
+	/* Rotate the first entry last of non-pinned groups */
+	list_rotate_left(&ctx->flexible_groups);
 
 	raw_spin_unlock(&ctx->lock);
 }
 
-void perf_event_task_tick(struct task_struct *curr, int cpu)
+void perf_event_task_tick(struct task_struct *curr)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_event_context *ctx;
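
rotate_ctx() now delegates to list_rotate_left(), which moves the first entry of a circular doubly-linked list to its tail in O(1); over successive ticks every flexible group therefore gets a turn at the front of the scheduling order, while pinned groups are deliberately never rotated. A self-contained sketch of the operation on a simplified list_head (the kernel's real helper lives in include/linux/list.h):

struct list_head { struct list_head *next, *prev; };

static void list_move_tail_(struct list_head *entry, struct list_head *head)
{
	entry->prev->next = entry->next;	/* unlink entry		  */
	entry->next->prev = entry->prev;
	entry->prev = head->prev;		/* splice in just before  */
	entry->next = head;			/* head, i.e. at the tail */
	head->prev->next = entry;
	head->prev = entry;
}

/* Equivalent of list_rotate_left(): first entry becomes the last. */
static void list_rotate_left_(struct list_head *head)
{
	if (head->next != head)			/* non-empty list */
		list_move_tail_(head->next, head);
}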
@@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
 	if (!atomic_read(&nr_events))
 		return;
 
-	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	cpuctx = &__get_cpu_var(perf_cpu_context);
 	ctx = curr->perf_event_ctxp;
 
+	perf_disable();
+
 	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
 		perf_ctx_adjust_freq(ctx);
 
-	perf_event_cpu_sched_out(cpuctx);
+	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		__perf_event_task_sched_out(ctx);
+		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	perf_event_cpu_sched_in(cpuctx, cpu);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		perf_event_task_sched_in(curr, cpu);
+		task_ctx_sched_in(curr, EVENT_FLEXIBLE);
+
+	perf_enable();
+}
+
+static int event_enable_on_exec(struct perf_event *event,
+				struct perf_event_context *ctx)
+{
+	if (!event->attr.enable_on_exec)
+		return 0;
+
+	event->attr.enable_on_exec = 0;
+	if (event->state >= PERF_EVENT_STATE_INACTIVE)
+		return 0;
+
+	__perf_event_mark_enabled(event, ctx);
+
+	return 1;
 }
 
 /*
@@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 	struct perf_event *event;
 	unsigned long flags;
 	int enabled = 0;
+	int ret;
 
 	local_irq_save(flags);
 	ctx = task->perf_event_ctxp;
@@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_lock(&ctx->lock);
 
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
-		if (!event->attr.enable_on_exec)
-			continue;
-		event->attr.enable_on_exec = 0;
-		if (event->state >= PERF_EVENT_STATE_INACTIVE)
-			continue;
-		__perf_event_mark_enabled(event, ctx);
-		enabled = 1;
+	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
+	}
+
+	list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
+		ret = event_enable_on_exec(event, ctx);
+		if (ret)
+			enabled = 1;
 	}
 
 	/*
@@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
 
 	raw_spin_unlock(&ctx->lock);
 
-	perf_event_task_sched_in(task, smp_processor_id());
+	perf_event_task_sched_in(task);
  out:
 	local_irq_restore(flags);
 }
@@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
 {
 	raw_spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->group_list);
+	INIT_LIST_HEAD(&ctx->pinned_groups);
+	INIT_LIST_HEAD(&ctx->flexible_groups);
 	INIT_LIST_HEAD(&ctx->event_list);
 	atomic_set(&ctx->refcount, 1);
 	ctx->task = task;
@@ -3608,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
 			/* .tid */
 			.start  = vma->vm_start,
 			.len    = vma->vm_end - vma->vm_start,
-			.pgoff  = vma->vm_pgoff,
+			.pgoff  = (u64)vma->vm_pgoff << PAGE_SHIFT,
 		},
 	};
 
@@ -3688,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 
 	if (event->attr.freq) {
 		u64 now = perf_clock();
-		s64 delta = now - hwc->freq_stamp;
+		s64 delta = now - hwc->freq_time_stamp;
 
-		hwc->freq_stamp = now;
+		hwc->freq_time_stamp = now;
 
-		if (delta > 0 && delta < TICK_NSEC)
-			perf_adjust_period(event, NSEC_PER_SEC / (int)delta);
+		if (delta > 0 && delta < 2*TICK_NSEC)
+			perf_adjust_period(event, delta, hwc->last_period);
 	}
 
 	/*
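
The overflow-path retuning above now feeds real measurements into perf_calculate_period(): delta is the wall-clock time the last period actually took, and hwc->last_period is how many events that was. Worked example: if the last period was 1000 events and completed in delta = 500,000 ns with sample_freq = 1000 Hz, the new period is 1000 * 10^9 / (5 * 10^5 * 10^3) = 2000; samples were arriving at 2 kHz, twice the requested rate, so the period doubles. The old code first rounded NSEC_PER_SEC / delta to an integer rate, which is coarse for deltas approaching a full tick, while the single exact division keeps full precision; widening the window to 2*TICK_NSEC lets the adjustment still run when a sample lands slightly more than one tick after the previous one.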
@@ -4184,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
 	.read		= task_clock_perf_event_read,
 };
 
-#ifdef CONFIG_EVENT_PROFILE
+#ifdef CONFIG_EVENT_TRACING
 
 void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
 		   int entry_size)
@@ -4289,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
 {
 }
 
-#endif /* CONFIG_EVENT_PROFILE */
+#endif /* CONFIG_EVENT_TRACING */
 
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
 	else
 		child_event->state = PERF_EVENT_STATE_OFF;
 
-	if (parent_event->attr.freq)
-		child_event->hw.sample_period = parent_event->hw.sample_period;
+	if (parent_event->attr.freq) {
+		u64 sample_period = parent_event->hw.sample_period;
+		struct hw_perf_event *hwc = &child_event->hw;
+
+		hwc->sample_period = sample_period;
+		hwc->last_period   = sample_period;
+
+		atomic64_set(&hwc->period_left, sample_period);
+	}
 
 	child_event->overflow_handler = parent_event->overflow_handler;
 
@@ -5039,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
 	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
-	list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list,
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
+				 group_entry)
+		__perf_event_exit_task(child_event, child_ctx, child);
+
+	list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
 				 group_entry)
 		__perf_event_exit_task(child_event, child_ctx, child);
 
@@ -5048,7 +5207,8 @@ again:
 	 * its siblings to the list, but we obtained 'tmp' before that which
 	 * will still point to the list head terminating the iteration.
 	 */
-	if (!list_empty(&child_ctx->group_list))
+	if (!list_empty(&child_ctx->pinned_groups) ||
+	    !list_empty(&child_ctx->flexible_groups))
 		goto again;
 
 	mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5216,24 @@ again:
 	put_ctx(child_ctx);
 }
 
+static void perf_free_event(struct perf_event *event,
+			    struct perf_event_context *ctx)
+{
+	struct perf_event *parent = event->parent;
+
+	if (WARN_ON_ONCE(!parent))
+		return;
+
+	mutex_lock(&parent->child_mutex);
+	list_del_init(&event->child_list);
+	mutex_unlock(&parent->child_mutex);
+
+	fput(parent->filp);
+
+	list_del_event(event, ctx);
+	free_event(event);
+}
+
 /*
  * free an unexposed, unused context as created by inheritance by
  * init_task below, used by fork() in case of fail.
@@ -5070,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
 
 	mutex_lock(&ctx->mutex);
 again:
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) {
-		struct perf_event *parent = event->parent;
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		perf_free_event(event, ctx);
 
-		if (WARN_ON_ONCE(!parent))
-			continue;
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
+				 group_entry)
+		perf_free_event(event, ctx);
 
-		mutex_lock(&parent->child_mutex);
-		list_del_init(&event->child_list);
-		mutex_unlock(&parent->child_mutex);
+	if (!list_empty(&ctx->pinned_groups) ||
+	    !list_empty(&ctx->flexible_groups))
+		goto again;
 
-		fput(parent->filp);
+	mutex_unlock(&ctx->mutex);
 
-		list_del_event(event, ctx);
-		free_event(event);
+	put_ctx(ctx);
+}
+
+static int
+inherit_task_group(struct perf_event *event, struct task_struct *parent,
+		   struct perf_event_context *parent_ctx,
+		   struct task_struct *child,
+		   int *inherited_all)
+{
+	int ret;
+	struct perf_event_context *child_ctx = child->perf_event_ctxp;
+
+	if (!event->attr.inherit) {
+		*inherited_all = 0;
+		return 0;
 	}
 
-	if (!list_empty(&ctx->group_list))
-		goto again;
+	if (!child_ctx) {
+		/*
+		 * This is executed from the parent task context, so
+		 * inherit events that have been marked for cloning.
+		 * First allocate and initialize a context for the
+		 * child.
+		 */
 
-	mutex_unlock(&ctx->mutex);
+		child_ctx = kzalloc(sizeof(struct perf_event_context),
+				    GFP_KERNEL);
+		if (!child_ctx)
+			return -ENOMEM;
 
-	put_ctx(ctx);
+		__perf_event_init_context(child_ctx, child);
+		child->perf_event_ctxp = child_ctx;
+		get_task_struct(child);
+	}
+
+	ret = inherit_group(event, parent, parent_ctx,
+			    child, child_ctx);
+
+	if (ret)
+		*inherited_all = 0;
+
+	return ret;
 }
 
+
 /*
  * Initialize the perf_event context in task_struct
  */
 int perf_event_init_task(struct task_struct *child)
 {
-	struct perf_event_context *child_ctx = NULL, *parent_ctx;
+	struct perf_event_context *child_ctx, *parent_ctx;
 	struct perf_event_context *cloned_ctx;
 	struct perf_event *event;
 	struct task_struct *parent = current;
@@ -5137,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
-
-		if (!event->attr.inherit) {
-			inherited_all = 0;
-			continue;
-		}
-
-		if (!child->perf_event_ctxp) {
-			/*
-			 * This is executed from the parent task context, so
-			 * inherit events that have been marked for cloning.
-			 * First allocate and initialize a context for the
-			 * child.
-			 */
-
-			child_ctx = kzalloc(sizeof(struct perf_event_context),
-					    GFP_KERNEL);
-			if (!child_ctx) {
-				ret = -ENOMEM;
-				break;
-			}
-
-			__perf_event_init_context(child_ctx, child);
-			child->perf_event_ctxp = child_ctx;
-			get_task_struct(child);
-		}
+	list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
+	}
 
-		ret = inherit_group(event, parent, parent_ctx,
-				    child, child_ctx);
-		if (ret) {
-			inherited_all = 0;
-			break;
-		}
+	list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
+		ret = inherit_task_group(event, parent, parent_ctx, child,
+					 &inherited_all);
+		if (ret)
+			break;
 	}
 
+	child_ctx = child->perf_event_ctxp;
+
 	if (child_ctx && inherited_all) {
 		/*
 		 * Mark the child context as a clone of the parent
@@ -5220,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
 	struct perf_event_context *ctx = &cpuctx->ctx;
 	struct perf_event *event, *tmp;
 
-	list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry)
+	list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
+		__perf_event_remove_from_context(event);
+	list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
 		__perf_event_remove_from_context(event);
 }
 static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 		perf_event_exit_cpu(cpu);
 		break;
 
+	case CPU_DEAD:
+		hw_perf_event_setup_offline(cpu);
+		break;
+
 	default:
 		break;
 	}