aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/perf_event.c627
-rw-r--r--kernel/sched.c12
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--kernel/trace/ftrace.c54
-rw-r--r--kernel/trace/trace_event_profile.c52
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_kprobe.c196
-rw-r--r--kernel/trace/trace_syscalls.c76
9 files changed, 580 insertions, 479 deletions
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b7df302a0204..ccec774c716d 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -44,6 +44,7 @@
44#include <linux/debugfs.h> 44#include <linux/debugfs.h>
45#include <linux/kdebug.h> 45#include <linux/kdebug.h>
46#include <linux/memory.h> 46#include <linux/memory.h>
47#include <linux/ftrace.h>
47 48
48#include <asm-generic/sections.h> 49#include <asm-generic/sections.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
@@ -93,6 +94,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = {
93 {"native_get_debugreg",}, 94 {"native_get_debugreg",},
94 {"irq_entries_start",}, 95 {"irq_entries_start",},
95 {"common_interrupt",}, 96 {"common_interrupt",},
97 {"mcount",}, /* mcount can be called from everywhere */
96 {NULL} /* Terminator */ 98 {NULL} /* Terminator */
97}; 99};
98 100
@@ -124,30 +126,6 @@ static LIST_HEAD(kprobe_insn_pages);
124static int kprobe_garbage_slots; 126static int kprobe_garbage_slots;
125static int collect_garbage_slots(void); 127static int collect_garbage_slots(void);
126 128
127static int __kprobes check_safety(void)
128{
129 int ret = 0;
130#if defined(CONFIG_PREEMPT) && defined(CONFIG_FREEZER)
131 ret = freeze_processes();
132 if (ret == 0) {
133 struct task_struct *p, *q;
134 do_each_thread(p, q) {
135 if (p != current && p->state == TASK_RUNNING &&
136 p->pid != 0) {
137 printk("Check failed: %s is running\n",p->comm);
138 ret = -1;
139 goto loop_end;
140 }
141 } while_each_thread(p, q);
142 }
143loop_end:
144 thaw_processes();
145#else
146 synchronize_sched();
147#endif
148 return ret;
149}
150
151/** 129/**
152 * __get_insn_slot() - Find a slot on an executable page for an instruction. 130 * __get_insn_slot() - Find a slot on an executable page for an instruction.
153 * We allocate an executable page if there's no room on existing ones. 131 * We allocate an executable page if there's no room on existing ones.
@@ -235,9 +213,8 @@ static int __kprobes collect_garbage_slots(void)
235{ 213{
236 struct kprobe_insn_page *kip, *next; 214 struct kprobe_insn_page *kip, *next;
237 215
238 /* Ensure no-one is preepmted on the garbages */ 216 /* Ensure no-one is interrupted on the garbages */
239 if (check_safety()) 217 synchronize_sched();
240 return -EAGAIN;
241 218
242 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { 219 list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) {
243 int i; 220 int i;
@@ -728,7 +705,8 @@ int __kprobes register_kprobe(struct kprobe *p)
728 705
729 preempt_disable(); 706 preempt_disable();
730 if (!kernel_text_address((unsigned long) p->addr) || 707 if (!kernel_text_address((unsigned long) p->addr) ||
731 in_kprobes_functions((unsigned long) p->addr)) { 708 in_kprobes_functions((unsigned long) p->addr) ||
709 ftrace_text_reserved(p->addr, p->addr)) {
732 preempt_enable(); 710 preempt_enable();
733 return -EINVAL; 711 return -EINVAL;
734 } 712 }
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 2ae7409bf38f..a661e7991865 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -98,11 +98,12 @@ void __weak hw_perf_enable(void) { barrier(); }
98 98
99void __weak hw_perf_event_setup(int cpu) { barrier(); } 99void __weak hw_perf_event_setup(int cpu) { barrier(); }
100void __weak hw_perf_event_setup_online(int cpu) { barrier(); } 100void __weak hw_perf_event_setup_online(int cpu) { barrier(); }
101void __weak hw_perf_event_setup_offline(int cpu) { barrier(); }
101 102
102int __weak 103int __weak
103hw_perf_group_sched_in(struct perf_event *group_leader, 104hw_perf_group_sched_in(struct perf_event *group_leader,
104 struct perf_cpu_context *cpuctx, 105 struct perf_cpu_context *cpuctx,
105 struct perf_event_context *ctx, int cpu) 106 struct perf_event_context *ctx)
106{ 107{
107 return 0; 108 return 0;
108} 109}
@@ -248,7 +249,7 @@ static void perf_unpin_context(struct perf_event_context *ctx)
248 249
249static inline u64 perf_clock(void) 250static inline u64 perf_clock(void)
250{ 251{
251 return cpu_clock(smp_processor_id()); 252 return cpu_clock(raw_smp_processor_id());
252} 253}
253 254
254/* 255/*
@@ -289,6 +290,15 @@ static void update_event_times(struct perf_event *event)
289 event->total_time_running = run_end - event->tstamp_running; 290 event->total_time_running = run_end - event->tstamp_running;
290} 291}
291 292
293static struct list_head *
294ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
295{
296 if (event->attr.pinned)
297 return &ctx->pinned_groups;
298 else
299 return &ctx->flexible_groups;
300}
301
292/* 302/*
293 * Add a event from the lists for its context. 303 * Add a event from the lists for its context.
294 * Must be called with ctx->mutex and ctx->lock held. 304 * Must be called with ctx->mutex and ctx->lock held.
@@ -303,9 +313,19 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
303 * add it straight to the context's event list, or to the group 313 * add it straight to the context's event list, or to the group
304 * leader's sibling list: 314 * leader's sibling list:
305 */ 315 */
306 if (group_leader == event) 316 if (group_leader == event) {
307 list_add_tail(&event->group_entry, &ctx->group_list); 317 struct list_head *list;
308 else { 318
319 if (is_software_event(event))
320 event->group_flags |= PERF_GROUP_SOFTWARE;
321
322 list = ctx_group_list(event, ctx);
323 list_add_tail(&event->group_entry, list);
324 } else {
325 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
326 !is_software_event(event))
327 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
328
309 list_add_tail(&event->group_entry, &group_leader->sibling_list); 329 list_add_tail(&event->group_entry, &group_leader->sibling_list);
310 group_leader->nr_siblings++; 330 group_leader->nr_siblings++;
311 } 331 }
@@ -355,9 +375,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
355 * to the context list directly: 375 * to the context list directly:
356 */ 376 */
357 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { 377 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
378 struct list_head *list;
358 379
359 list_move_tail(&sibling->group_entry, &ctx->group_list); 380 list = ctx_group_list(event, ctx);
381 list_move_tail(&sibling->group_entry, list);
360 sibling->group_leader = sibling; 382 sibling->group_leader = sibling;
383
384 /* Inherit group flags from the previous leader */
385 sibling->group_flags = event->group_flags;
361 } 386 }
362} 387}
363 388
@@ -608,14 +633,13 @@ void perf_event_disable(struct perf_event *event)
608static int 633static int
609event_sched_in(struct perf_event *event, 634event_sched_in(struct perf_event *event,
610 struct perf_cpu_context *cpuctx, 635 struct perf_cpu_context *cpuctx,
611 struct perf_event_context *ctx, 636 struct perf_event_context *ctx)
612 int cpu)
613{ 637{
614 if (event->state <= PERF_EVENT_STATE_OFF) 638 if (event->state <= PERF_EVENT_STATE_OFF)
615 return 0; 639 return 0;
616 640
617 event->state = PERF_EVENT_STATE_ACTIVE; 641 event->state = PERF_EVENT_STATE_ACTIVE;
618 event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ 642 event->oncpu = smp_processor_id();
619 /* 643 /*
620 * The new state must be visible before we turn it on in the hardware: 644 * The new state must be visible before we turn it on in the hardware:
621 */ 645 */
@@ -642,8 +666,7 @@ event_sched_in(struct perf_event *event,
642static int 666static int
643group_sched_in(struct perf_event *group_event, 667group_sched_in(struct perf_event *group_event,
644 struct perf_cpu_context *cpuctx, 668 struct perf_cpu_context *cpuctx,
645 struct perf_event_context *ctx, 669 struct perf_event_context *ctx)
646 int cpu)
647{ 670{
648 struct perf_event *event, *partial_group; 671 struct perf_event *event, *partial_group;
649 int ret; 672 int ret;
@@ -651,18 +674,18 @@ group_sched_in(struct perf_event *group_event,
651 if (group_event->state == PERF_EVENT_STATE_OFF) 674 if (group_event->state == PERF_EVENT_STATE_OFF)
652 return 0; 675 return 0;
653 676
654 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); 677 ret = hw_perf_group_sched_in(group_event, cpuctx, ctx);
655 if (ret) 678 if (ret)
656 return ret < 0 ? ret : 0; 679 return ret < 0 ? ret : 0;
657 680
658 if (event_sched_in(group_event, cpuctx, ctx, cpu)) 681 if (event_sched_in(group_event, cpuctx, ctx))
659 return -EAGAIN; 682 return -EAGAIN;
660 683
661 /* 684 /*
662 * Schedule in siblings as one group (if any): 685 * Schedule in siblings as one group (if any):
663 */ 686 */
664 list_for_each_entry(event, &group_event->sibling_list, group_entry) { 687 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
665 if (event_sched_in(event, cpuctx, ctx, cpu)) { 688 if (event_sched_in(event, cpuctx, ctx)) {
666 partial_group = event; 689 partial_group = event;
667 goto group_error; 690 goto group_error;
668 } 691 }
@@ -686,24 +709,6 @@ group_error:
686} 709}
687 710
688/* 711/*
689 * Return 1 for a group consisting entirely of software events,
690 * 0 if the group contains any hardware events.
691 */
692static int is_software_only_group(struct perf_event *leader)
693{
694 struct perf_event *event;
695
696 if (!is_software_event(leader))
697 return 0;
698
699 list_for_each_entry(event, &leader->sibling_list, group_entry)
700 if (!is_software_event(event))
701 return 0;
702
703 return 1;
704}
705
706/*
707 * Work out whether we can put this event group on the CPU now. 712 * Work out whether we can put this event group on the CPU now.
708 */ 713 */
709static int group_can_go_on(struct perf_event *event, 714static int group_can_go_on(struct perf_event *event,
@@ -713,7 +718,7 @@ static int group_can_go_on(struct perf_event *event,
713 /* 718 /*
714 * Groups consisting entirely of software events can always go on. 719 * Groups consisting entirely of software events can always go on.
715 */ 720 */
716 if (is_software_only_group(event)) 721 if (event->group_flags & PERF_GROUP_SOFTWARE)
717 return 1; 722 return 1;
718 /* 723 /*
719 * If an exclusive group is already on, no other hardware 724 * If an exclusive group is already on, no other hardware
@@ -754,7 +759,6 @@ static void __perf_install_in_context(void *info)
754 struct perf_event *event = info; 759 struct perf_event *event = info;
755 struct perf_event_context *ctx = event->ctx; 760 struct perf_event_context *ctx = event->ctx;
756 struct perf_event *leader = event->group_leader; 761 struct perf_event *leader = event->group_leader;
757 int cpu = smp_processor_id();
758 int err; 762 int err;
759 763
760 /* 764 /*
@@ -801,7 +805,7 @@ static void __perf_install_in_context(void *info)
801 if (!group_can_go_on(event, cpuctx, 1)) 805 if (!group_can_go_on(event, cpuctx, 1))
802 err = -EEXIST; 806 err = -EEXIST;
803 else 807 else
804 err = event_sched_in(event, cpuctx, ctx, cpu); 808 err = event_sched_in(event, cpuctx, ctx);
805 809
806 if (err) { 810 if (err) {
807 /* 811 /*
@@ -943,11 +947,9 @@ static void __perf_event_enable(void *info)
943 } else { 947 } else {
944 perf_disable(); 948 perf_disable();
945 if (event == leader) 949 if (event == leader)
946 err = group_sched_in(event, cpuctx, ctx, 950 err = group_sched_in(event, cpuctx, ctx);
947 smp_processor_id());
948 else 951 else
949 err = event_sched_in(event, cpuctx, ctx, 952 err = event_sched_in(event, cpuctx, ctx);
950 smp_processor_id());
951 perf_enable(); 953 perf_enable();
952 } 954 }
953 955
@@ -1043,8 +1045,15 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
1043 return 0; 1045 return 0;
1044} 1046}
1045 1047
1046void __perf_event_sched_out(struct perf_event_context *ctx, 1048enum event_type_t {
1047 struct perf_cpu_context *cpuctx) 1049 EVENT_FLEXIBLE = 0x1,
1050 EVENT_PINNED = 0x2,
1051 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
1052};
1053
1054static void ctx_sched_out(struct perf_event_context *ctx,
1055 struct perf_cpu_context *cpuctx,
1056 enum event_type_t event_type)
1048{ 1057{
1049 struct perf_event *event; 1058 struct perf_event *event;
1050 1059
@@ -1055,10 +1064,18 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
1055 update_context_time(ctx); 1064 update_context_time(ctx);
1056 1065
1057 perf_disable(); 1066 perf_disable();
1058 if (ctx->nr_active) { 1067 if (!ctx->nr_active)
1059 list_for_each_entry(event, &ctx->group_list, group_entry) 1068 goto out_enable;
1069
1070 if (event_type & EVENT_PINNED)
1071 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
1060 group_sched_out(event, cpuctx, ctx); 1072 group_sched_out(event, cpuctx, ctx);
1061 } 1073
1074 if (event_type & EVENT_FLEXIBLE)
1075 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
1076 group_sched_out(event, cpuctx, ctx);
1077
1078 out_enable:
1062 perf_enable(); 1079 perf_enable();
1063 out: 1080 out:
1064 raw_spin_unlock(&ctx->lock); 1081 raw_spin_unlock(&ctx->lock);
@@ -1170,9 +1187,9 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
1170 * not restart the event. 1187 * not restart the event.
1171 */ 1188 */
1172void perf_event_task_sched_out(struct task_struct *task, 1189void perf_event_task_sched_out(struct task_struct *task,
1173 struct task_struct *next, int cpu) 1190 struct task_struct *next)
1174{ 1191{
1175 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1192 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1176 struct perf_event_context *ctx = task->perf_event_ctxp; 1193 struct perf_event_context *ctx = task->perf_event_ctxp;
1177 struct perf_event_context *next_ctx; 1194 struct perf_event_context *next_ctx;
1178 struct perf_event_context *parent; 1195 struct perf_event_context *parent;
@@ -1220,15 +1237,13 @@ void perf_event_task_sched_out(struct task_struct *task,
1220 rcu_read_unlock(); 1237 rcu_read_unlock();
1221 1238
1222 if (do_switch) { 1239 if (do_switch) {
1223 __perf_event_sched_out(ctx, cpuctx); 1240 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
1224 cpuctx->task_ctx = NULL; 1241 cpuctx->task_ctx = NULL;
1225 } 1242 }
1226} 1243}
1227 1244
1228/* 1245static void task_ctx_sched_out(struct perf_event_context *ctx,
1229 * Called with IRQs disabled 1246 enum event_type_t event_type)
1230 */
1231static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1232{ 1247{
1233 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); 1248 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1234 1249
@@ -1238,47 +1253,41 @@ static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1238 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) 1253 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
1239 return; 1254 return;
1240 1255
1241 __perf_event_sched_out(ctx, cpuctx); 1256 ctx_sched_out(ctx, cpuctx, event_type);
1242 cpuctx->task_ctx = NULL; 1257 cpuctx->task_ctx = NULL;
1243} 1258}
1244 1259
1245/* 1260/*
1246 * Called with IRQs disabled 1261 * Called with IRQs disabled
1247 */ 1262 */
1248static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) 1263static void __perf_event_task_sched_out(struct perf_event_context *ctx)
1264{
1265 task_ctx_sched_out(ctx, EVENT_ALL);
1266}
1267
1268/*
1269 * Called with IRQs disabled
1270 */
1271static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
1272 enum event_type_t event_type)
1249{ 1273{
1250 __perf_event_sched_out(&cpuctx->ctx, cpuctx); 1274 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
1251} 1275}
1252 1276
1253static void 1277static void
1254__perf_event_sched_in(struct perf_event_context *ctx, 1278ctx_pinned_sched_in(struct perf_event_context *ctx,
1255 struct perf_cpu_context *cpuctx, int cpu) 1279 struct perf_cpu_context *cpuctx)
1256{ 1280{
1257 struct perf_event *event; 1281 struct perf_event *event;
1258 int can_add_hw = 1;
1259
1260 raw_spin_lock(&ctx->lock);
1261 ctx->is_active = 1;
1262 if (likely(!ctx->nr_events))
1263 goto out;
1264 1282
1265 ctx->timestamp = perf_clock(); 1283 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1266 1284 if (event->state <= PERF_EVENT_STATE_OFF)
1267 perf_disable();
1268
1269 /*
1270 * First go through the list and put on any pinned groups
1271 * in order to give them the best chance of going on.
1272 */
1273 list_for_each_entry(event, &ctx->group_list, group_entry) {
1274 if (event->state <= PERF_EVENT_STATE_OFF ||
1275 !event->attr.pinned)
1276 continue; 1285 continue;
1277 if (event->cpu != -1 && event->cpu != cpu) 1286 if (event->cpu != -1 && event->cpu != smp_processor_id())
1278 continue; 1287 continue;
1279 1288
1280 if (group_can_go_on(event, cpuctx, 1)) 1289 if (group_can_go_on(event, cpuctx, 1))
1281 group_sched_in(event, cpuctx, ctx, cpu); 1290 group_sched_in(event, cpuctx, ctx);
1282 1291
1283 /* 1292 /*
1284 * If this pinned group hasn't been scheduled, 1293 * If this pinned group hasn't been scheduled,
@@ -1289,32 +1298,83 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1289 event->state = PERF_EVENT_STATE_ERROR; 1298 event->state = PERF_EVENT_STATE_ERROR;
1290 } 1299 }
1291 } 1300 }
1301}
1292 1302
1293 list_for_each_entry(event, &ctx->group_list, group_entry) { 1303static void
1294 /* 1304ctx_flexible_sched_in(struct perf_event_context *ctx,
1295 * Ignore events in OFF or ERROR state, and 1305 struct perf_cpu_context *cpuctx)
1296 * ignore pinned events since we did them already. 1306{
1297 */ 1307 struct perf_event *event;
1298 if (event->state <= PERF_EVENT_STATE_OFF || 1308 int can_add_hw = 1;
1299 event->attr.pinned)
1300 continue;
1301 1309
1310 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1311 /* Ignore events in OFF or ERROR state */
1312 if (event->state <= PERF_EVENT_STATE_OFF)
1313 continue;
1302 /* 1314 /*
1303 * Listen to the 'cpu' scheduling filter constraint 1315 * Listen to the 'cpu' scheduling filter constraint
1304 * of events: 1316 * of events:
1305 */ 1317 */
1306 if (event->cpu != -1 && event->cpu != cpu) 1318 if (event->cpu != -1 && event->cpu != smp_processor_id())
1307 continue; 1319 continue;
1308 1320
1309 if (group_can_go_on(event, cpuctx, can_add_hw)) 1321 if (group_can_go_on(event, cpuctx, can_add_hw))
1310 if (group_sched_in(event, cpuctx, ctx, cpu)) 1322 if (group_sched_in(event, cpuctx, ctx))
1311 can_add_hw = 0; 1323 can_add_hw = 0;
1312 } 1324 }
1325}
1326
1327static void
1328ctx_sched_in(struct perf_event_context *ctx,
1329 struct perf_cpu_context *cpuctx,
1330 enum event_type_t event_type)
1331{
1332 raw_spin_lock(&ctx->lock);
1333 ctx->is_active = 1;
1334 if (likely(!ctx->nr_events))
1335 goto out;
1336
1337 ctx->timestamp = perf_clock();
1338
1339 perf_disable();
1340
1341 /*
1342 * First go through the list and put on any pinned groups
1343 * in order to give them the best chance of going on.
1344 */
1345 if (event_type & EVENT_PINNED)
1346 ctx_pinned_sched_in(ctx, cpuctx);
1347
1348 /* Then walk through the lower prio flexible groups */
1349 if (event_type & EVENT_FLEXIBLE)
1350 ctx_flexible_sched_in(ctx, cpuctx);
1351
1313 perf_enable(); 1352 perf_enable();
1314 out: 1353 out:
1315 raw_spin_unlock(&ctx->lock); 1354 raw_spin_unlock(&ctx->lock);
1316} 1355}
1317 1356
1357static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
1358 enum event_type_t event_type)
1359{
1360 struct perf_event_context *ctx = &cpuctx->ctx;
1361
1362 ctx_sched_in(ctx, cpuctx, event_type);
1363}
1364
1365static void task_ctx_sched_in(struct task_struct *task,
1366 enum event_type_t event_type)
1367{
1368 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1369 struct perf_event_context *ctx = task->perf_event_ctxp;
1370
1371 if (likely(!ctx))
1372 return;
1373 if (cpuctx->task_ctx == ctx)
1374 return;
1375 ctx_sched_in(ctx, cpuctx, event_type);
1376 cpuctx->task_ctx = ctx;
1377}
1318/* 1378/*
1319 * Called from scheduler to add the events of the current task 1379 * Called from scheduler to add the events of the current task
1320 * with interrupts disabled. 1380 * with interrupts disabled.
@@ -1326,38 +1386,128 @@ __perf_event_sched_in(struct perf_event_context *ctx,
1326 * accessing the event control register. If a NMI hits, then it will 1386 * accessing the event control register. If a NMI hits, then it will
1327 * keep the event running. 1387 * keep the event running.
1328 */ 1388 */
1329void perf_event_task_sched_in(struct task_struct *task, int cpu) 1389void perf_event_task_sched_in(struct task_struct *task)
1330{ 1390{
1331 struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); 1391 struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
1332 struct perf_event_context *ctx = task->perf_event_ctxp; 1392 struct perf_event_context *ctx = task->perf_event_ctxp;
1333 1393
1334 if (likely(!ctx)) 1394 if (likely(!ctx))
1335 return; 1395 return;
1396
1336 if (cpuctx->task_ctx == ctx) 1397 if (cpuctx->task_ctx == ctx)
1337 return; 1398 return;
1338 __perf_event_sched_in(ctx, cpuctx, cpu); 1399
1400 /*
1401 * We want to keep the following priority order:
1402 * cpu pinned (that don't need to move), task pinned,
1403 * cpu flexible, task flexible.
1404 */
1405 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1406
1407 ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
1408 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1409 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
1410
1339 cpuctx->task_ctx = ctx; 1411 cpuctx->task_ctx = ctx;
1340} 1412}
1341 1413
1342static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) 1414#define MAX_INTERRUPTS (~0ULL)
1415
1416static void perf_log_throttle(struct perf_event *event, int enable);
1417
1418static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
1343{ 1419{
1344 struct perf_event_context *ctx = &cpuctx->ctx; 1420 u64 frequency = event->attr.sample_freq;
1421 u64 sec = NSEC_PER_SEC;
1422 u64 divisor, dividend;
1423
1424 int count_fls, nsec_fls, frequency_fls, sec_fls;
1425
1426 count_fls = fls64(count);
1427 nsec_fls = fls64(nsec);
1428 frequency_fls = fls64(frequency);
1429 sec_fls = 30;
1345 1430
1346 __perf_event_sched_in(ctx, cpuctx, cpu); 1431 /*
1432 * We got @count in @nsec, with a target of sample_freq HZ
1433 * the target period becomes:
1434 *
1435 * @count * 10^9
1436 * period = -------------------
1437 * @nsec * sample_freq
1438 *
1439 */
1440
1441 /*
1442 * Reduce accuracy by one bit such that @a and @b converge
1443 * to a similar magnitude.
1444 */
1445#define REDUCE_FLS(a, b) \
1446do { \
1447 if (a##_fls > b##_fls) { \
1448 a >>= 1; \
1449 a##_fls--; \
1450 } else { \
1451 b >>= 1; \
1452 b##_fls--; \
1453 } \
1454} while (0)
1455
1456 /*
1457 * Reduce accuracy until either term fits in a u64, then proceed with
1458 * the other, so that finally we can do a u64/u64 division.
1459 */
1460 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
1461 REDUCE_FLS(nsec, frequency);
1462 REDUCE_FLS(sec, count);
1463 }
1464
1465 if (count_fls + sec_fls > 64) {
1466 divisor = nsec * frequency;
1467
1468 while (count_fls + sec_fls > 64) {
1469 REDUCE_FLS(count, sec);
1470 divisor >>= 1;
1471 }
1472
1473 dividend = count * sec;
1474 } else {
1475 dividend = count * sec;
1476
1477 while (nsec_fls + frequency_fls > 64) {
1478 REDUCE_FLS(nsec, frequency);
1479 dividend >>= 1;
1480 }
1481
1482 divisor = nsec * frequency;
1483 }
1484
1485 return div64_u64(dividend, divisor);
1347} 1486}
1348 1487
1349#define MAX_INTERRUPTS (~0ULL) 1488static void perf_event_stop(struct perf_event *event)
1489{
1490 if (!event->pmu->stop)
1491 return event->pmu->disable(event);
1350 1492
1351static void perf_log_throttle(struct perf_event *event, int enable); 1493 return event->pmu->stop(event);
1494}
1495
1496static int perf_event_start(struct perf_event *event)
1497{
1498 if (!event->pmu->start)
1499 return event->pmu->enable(event);
1352 1500
1353static void perf_adjust_period(struct perf_event *event, u64 events) 1501 return event->pmu->start(event);
1502}
1503
1504static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count)
1354{ 1505{
1355 struct hw_perf_event *hwc = &event->hw; 1506 struct hw_perf_event *hwc = &event->hw;
1356 u64 period, sample_period; 1507 u64 period, sample_period;
1357 s64 delta; 1508 s64 delta;
1358 1509
1359 events *= hwc->sample_period; 1510 period = perf_calculate_period(event, nsec, count);
1360 period = div64_u64(events, event->attr.sample_freq);
1361 1511
1362 delta = (s64)(period - hwc->sample_period); 1512 delta = (s64)(period - hwc->sample_period);
1363 delta = (delta + 7) / 8; /* low pass filter */ 1513 delta = (delta + 7) / 8; /* low pass filter */
@@ -1368,13 +1518,22 @@ static void perf_adjust_period(struct perf_event *event, u64 events)
1368 sample_period = 1; 1518 sample_period = 1;
1369 1519
1370 hwc->sample_period = sample_period; 1520 hwc->sample_period = sample_period;
1521
1522 if (atomic64_read(&hwc->period_left) > 8*sample_period) {
1523 perf_disable();
1524 perf_event_stop(event);
1525 atomic64_set(&hwc->period_left, 0);
1526 perf_event_start(event);
1527 perf_enable();
1528 }
1371} 1529}
1372 1530
1373static void perf_ctx_adjust_freq(struct perf_event_context *ctx) 1531static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1374{ 1532{
1375 struct perf_event *event; 1533 struct perf_event *event;
1376 struct hw_perf_event *hwc; 1534 struct hw_perf_event *hwc;
1377 u64 interrupts, freq; 1535 u64 interrupts, now;
1536 s64 delta;
1378 1537
1379 raw_spin_lock(&ctx->lock); 1538 raw_spin_lock(&ctx->lock);
1380 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 1539 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
@@ -1395,44 +1554,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1395 if (interrupts == MAX_INTERRUPTS) { 1554 if (interrupts == MAX_INTERRUPTS) {
1396 perf_log_throttle(event, 1); 1555 perf_log_throttle(event, 1);
1397 event->pmu->unthrottle(event); 1556 event->pmu->unthrottle(event);
1398 interrupts = 2*sysctl_perf_event_sample_rate/HZ;
1399 } 1557 }
1400 1558
1401 if (!event->attr.freq || !event->attr.sample_freq) 1559 if (!event->attr.freq || !event->attr.sample_freq)
1402 continue; 1560 continue;
1403 1561
1404 /* 1562 event->pmu->read(event);
1405 * if the specified freq < HZ then we need to skip ticks 1563 now = atomic64_read(&event->count);
1406 */ 1564 delta = now - hwc->freq_count_stamp;
1407 if (event->attr.sample_freq < HZ) { 1565 hwc->freq_count_stamp = now;
1408 freq = event->attr.sample_freq;
1409
1410 hwc->freq_count += freq;
1411 hwc->freq_interrupts += interrupts;
1412
1413 if (hwc->freq_count < HZ)
1414 continue;
1415
1416 interrupts = hwc->freq_interrupts;
1417 hwc->freq_interrupts = 0;
1418 hwc->freq_count -= HZ;
1419 } else
1420 freq = HZ;
1421
1422 perf_adjust_period(event, freq * interrupts);
1423 1566
1424 /* 1567 if (delta > 0)
1425 * In order to avoid being stalled by an (accidental) huge 1568 perf_adjust_period(event, TICK_NSEC, delta);
1426 * sample period, force reset the sample period if we didn't
1427 * get any events in this freq period.
1428 */
1429 if (!interrupts) {
1430 perf_disable();
1431 event->pmu->disable(event);
1432 atomic64_set(&hwc->period_left, 0);
1433 event->pmu->enable(event);
1434 perf_enable();
1435 }
1436 } 1569 }
1437 raw_spin_unlock(&ctx->lock); 1570 raw_spin_unlock(&ctx->lock);
1438} 1571}
@@ -1442,26 +1575,18 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
1442 */ 1575 */
1443static void rotate_ctx(struct perf_event_context *ctx) 1576static void rotate_ctx(struct perf_event_context *ctx)
1444{ 1577{
1445 struct perf_event *event;
1446
1447 if (!ctx->nr_events) 1578 if (!ctx->nr_events)
1448 return; 1579 return;
1449 1580
1450 raw_spin_lock(&ctx->lock); 1581 raw_spin_lock(&ctx->lock);
1451 /* 1582
1452 * Rotate the first entry last (works just fine for group events too): 1583 /* Rotate the first entry last of non-pinned groups */
1453 */ 1584 list_rotate_left(&ctx->flexible_groups);
1454 perf_disable();
1455 list_for_each_entry(event, &ctx->group_list, group_entry) {
1456 list_move_tail(&event->group_entry, &ctx->group_list);
1457 break;
1458 }
1459 perf_enable();
1460 1585
1461 raw_spin_unlock(&ctx->lock); 1586 raw_spin_unlock(&ctx->lock);
1462} 1587}
1463 1588
1464void perf_event_task_tick(struct task_struct *curr, int cpu) 1589void perf_event_task_tick(struct task_struct *curr)
1465{ 1590{
1466 struct perf_cpu_context *cpuctx; 1591 struct perf_cpu_context *cpuctx;
1467 struct perf_event_context *ctx; 1592 struct perf_event_context *ctx;
@@ -1469,24 +1594,43 @@ void perf_event_task_tick(struct task_struct *curr, int cpu)
1469 if (!atomic_read(&nr_events)) 1594 if (!atomic_read(&nr_events))
1470 return; 1595 return;
1471 1596
1472 cpuctx = &per_cpu(perf_cpu_context, cpu); 1597 cpuctx = &__get_cpu_var(perf_cpu_context);
1473 ctx = curr->perf_event_ctxp; 1598 ctx = curr->perf_event_ctxp;
1474 1599
1600 perf_disable();
1601
1475 perf_ctx_adjust_freq(&cpuctx->ctx); 1602 perf_ctx_adjust_freq(&cpuctx->ctx);
1476 if (ctx) 1603 if (ctx)
1477 perf_ctx_adjust_freq(ctx); 1604 perf_ctx_adjust_freq(ctx);
1478 1605
1479 perf_event_cpu_sched_out(cpuctx); 1606 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
1480 if (ctx) 1607 if (ctx)
1481 __perf_event_task_sched_out(ctx); 1608 task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
1482 1609
1483 rotate_ctx(&cpuctx->ctx); 1610 rotate_ctx(&cpuctx->ctx);
1484 if (ctx) 1611 if (ctx)
1485 rotate_ctx(ctx); 1612 rotate_ctx(ctx);
1486 1613
1487 perf_event_cpu_sched_in(cpuctx, cpu); 1614 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
1488 if (ctx) 1615 if (ctx)
1489 perf_event_task_sched_in(curr, cpu); 1616 task_ctx_sched_in(curr, EVENT_FLEXIBLE);
1617
1618 perf_enable();
1619}
1620
1621static int event_enable_on_exec(struct perf_event *event,
1622 struct perf_event_context *ctx)
1623{
1624 if (!event->attr.enable_on_exec)
1625 return 0;
1626
1627 event->attr.enable_on_exec = 0;
1628 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1629 return 0;
1630
1631 __perf_event_mark_enabled(event, ctx);
1632
1633 return 1;
1490} 1634}
1491 1635
1492/* 1636/*
@@ -1499,6 +1643,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1499 struct perf_event *event; 1643 struct perf_event *event;
1500 unsigned long flags; 1644 unsigned long flags;
1501 int enabled = 0; 1645 int enabled = 0;
1646 int ret;
1502 1647
1503 local_irq_save(flags); 1648 local_irq_save(flags);
1504 ctx = task->perf_event_ctxp; 1649 ctx = task->perf_event_ctxp;
@@ -1509,14 +1654,16 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1509 1654
1510 raw_spin_lock(&ctx->lock); 1655 raw_spin_lock(&ctx->lock);
1511 1656
1512 list_for_each_entry(event, &ctx->group_list, group_entry) { 1657 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
1513 if (!event->attr.enable_on_exec) 1658 ret = event_enable_on_exec(event, ctx);
1514 continue; 1659 if (ret)
1515 event->attr.enable_on_exec = 0; 1660 enabled = 1;
1516 if (event->state >= PERF_EVENT_STATE_INACTIVE) 1661 }
1517 continue; 1662
1518 __perf_event_mark_enabled(event, ctx); 1663 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
1519 enabled = 1; 1664 ret = event_enable_on_exec(event, ctx);
1665 if (ret)
1666 enabled = 1;
1520 } 1667 }
1521 1668
1522 /* 1669 /*
@@ -1527,7 +1674,7 @@ static void perf_event_enable_on_exec(struct task_struct *task)
1527 1674
1528 raw_spin_unlock(&ctx->lock); 1675 raw_spin_unlock(&ctx->lock);
1529 1676
1530 perf_event_task_sched_in(task, smp_processor_id()); 1677 perf_event_task_sched_in(task);
1531 out: 1678 out:
1532 local_irq_restore(flags); 1679 local_irq_restore(flags);
1533} 1680}
@@ -1590,7 +1737,8 @@ __perf_event_init_context(struct perf_event_context *ctx,
1590{ 1737{
1591 raw_spin_lock_init(&ctx->lock); 1738 raw_spin_lock_init(&ctx->lock);
1592 mutex_init(&ctx->mutex); 1739 mutex_init(&ctx->mutex);
1593 INIT_LIST_HEAD(&ctx->group_list); 1740 INIT_LIST_HEAD(&ctx->pinned_groups);
1741 INIT_LIST_HEAD(&ctx->flexible_groups);
1594 INIT_LIST_HEAD(&ctx->event_list); 1742 INIT_LIST_HEAD(&ctx->event_list);
1595 atomic_set(&ctx->refcount, 1); 1743 atomic_set(&ctx->refcount, 1);
1596 ctx->task = task; 1744 ctx->task = task;
@@ -3608,7 +3756,7 @@ void __perf_event_mmap(struct vm_area_struct *vma)
3608 /* .tid */ 3756 /* .tid */
3609 .start = vma->vm_start, 3757 .start = vma->vm_start,
3610 .len = vma->vm_end - vma->vm_start, 3758 .len = vma->vm_end - vma->vm_start,
3611 .pgoff = vma->vm_pgoff, 3759 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
3612 }, 3760 },
3613 }; 3761 };
3614 3762
@@ -3688,12 +3836,12 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
3688 3836
3689 if (event->attr.freq) { 3837 if (event->attr.freq) {
3690 u64 now = perf_clock(); 3838 u64 now = perf_clock();
3691 s64 delta = now - hwc->freq_stamp; 3839 s64 delta = now - hwc->freq_time_stamp;
3692 3840
3693 hwc->freq_stamp = now; 3841 hwc->freq_time_stamp = now;
3694 3842
3695 if (delta > 0 && delta < TICK_NSEC) 3843 if (delta > 0 && delta < 2*TICK_NSEC)
3696 perf_adjust_period(event, NSEC_PER_SEC / (int)delta); 3844 perf_adjust_period(event, delta, hwc->last_period);
3697 } 3845 }
3698 3846
3699 /* 3847 /*
@@ -4184,7 +4332,7 @@ static const struct pmu perf_ops_task_clock = {
4184 .read = task_clock_perf_event_read, 4332 .read = task_clock_perf_event_read,
4185}; 4333};
4186 4334
4187#ifdef CONFIG_EVENT_PROFILE 4335#ifdef CONFIG_EVENT_TRACING
4188 4336
4189void perf_tp_event(int event_id, u64 addr, u64 count, void *record, 4337void perf_tp_event(int event_id, u64 addr, u64 count, void *record,
4190 int entry_size) 4338 int entry_size)
@@ -4289,7 +4437,7 @@ static void perf_event_free_filter(struct perf_event *event)
4289{ 4437{
4290} 4438}
4291 4439
4292#endif /* CONFIG_EVENT_PROFILE */ 4440#endif /* CONFIG_EVENT_TRACING */
4293 4441
4294#ifdef CONFIG_HAVE_HW_BREAKPOINT 4442#ifdef CONFIG_HAVE_HW_BREAKPOINT
4295static void bp_perf_event_destroy(struct perf_event *event) 4443static void bp_perf_event_destroy(struct perf_event *event)
@@ -4870,8 +5018,15 @@ inherit_event(struct perf_event *parent_event,
4870 else 5018 else
4871 child_event->state = PERF_EVENT_STATE_OFF; 5019 child_event->state = PERF_EVENT_STATE_OFF;
4872 5020
4873 if (parent_event->attr.freq) 5021 if (parent_event->attr.freq) {
4874 child_event->hw.sample_period = parent_event->hw.sample_period; 5022 u64 sample_period = parent_event->hw.sample_period;
5023 struct hw_perf_event *hwc = &child_event->hw;
5024
5025 hwc->sample_period = sample_period;
5026 hwc->last_period = sample_period;
5027
5028 atomic64_set(&hwc->period_left, sample_period);
5029 }
4875 5030
4876 child_event->overflow_handler = parent_event->overflow_handler; 5031 child_event->overflow_handler = parent_event->overflow_handler;
4877 5032
@@ -5039,7 +5194,11 @@ void perf_event_exit_task(struct task_struct *child)
5039 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); 5194 mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
5040 5195
5041again: 5196again:
5042 list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, 5197 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
5198 group_entry)
5199 __perf_event_exit_task(child_event, child_ctx, child);
5200
5201 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
5043 group_entry) 5202 group_entry)
5044 __perf_event_exit_task(child_event, child_ctx, child); 5203 __perf_event_exit_task(child_event, child_ctx, child);
5045 5204
@@ -5048,7 +5207,8 @@ again:
5048 * its siblings to the list, but we obtained 'tmp' before that which 5207 * its siblings to the list, but we obtained 'tmp' before that which
5049 * will still point to the list head terminating the iteration. 5208 * will still point to the list head terminating the iteration.
5050 */ 5209 */
5051 if (!list_empty(&child_ctx->group_list)) 5210 if (!list_empty(&child_ctx->pinned_groups) ||
5211 !list_empty(&child_ctx->flexible_groups))
5052 goto again; 5212 goto again;
5053 5213
5054 mutex_unlock(&child_ctx->mutex); 5214 mutex_unlock(&child_ctx->mutex);
@@ -5056,6 +5216,24 @@ again:
5056 put_ctx(child_ctx); 5216 put_ctx(child_ctx);
5057} 5217}
5058 5218
5219static void perf_free_event(struct perf_event *event,
5220 struct perf_event_context *ctx)
5221{
5222 struct perf_event *parent = event->parent;
5223
5224 if (WARN_ON_ONCE(!parent))
5225 return;
5226
5227 mutex_lock(&parent->child_mutex);
5228 list_del_init(&event->child_list);
5229 mutex_unlock(&parent->child_mutex);
5230
5231 fput(parent->filp);
5232
5233 list_del_event(event, ctx);
5234 free_event(event);
5235}
5236
5059/* 5237/*
5060 * free an unexposed, unused context as created by inheritance by 5238 * free an unexposed, unused context as created by inheritance by
5061 * init_task below, used by fork() in case of fail. 5239 * init_task below, used by fork() in case of fail.
@@ -5070,36 +5248,70 @@ void perf_event_free_task(struct task_struct *task)
5070 5248
5071 mutex_lock(&ctx->mutex); 5249 mutex_lock(&ctx->mutex);
5072again: 5250again:
5073 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { 5251 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5074 struct perf_event *parent = event->parent; 5252 perf_free_event(event, ctx);
5075 5253
5076 if (WARN_ON_ONCE(!parent)) 5254 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
5077 continue; 5255 group_entry)
5256 perf_free_event(event, ctx);
5078 5257
5079 mutex_lock(&parent->child_mutex); 5258 if (!list_empty(&ctx->pinned_groups) ||
5080 list_del_init(&event->child_list); 5259 !list_empty(&ctx->flexible_groups))
5081 mutex_unlock(&parent->child_mutex); 5260 goto again;
5082 5261
5083 fput(parent->filp); 5262 mutex_unlock(&ctx->mutex);
5084 5263
5085 list_del_event(event, ctx); 5264 put_ctx(ctx);
5086 free_event(event); 5265}
5266
5267static int
5268inherit_task_group(struct perf_event *event, struct task_struct *parent,
5269 struct perf_event_context *parent_ctx,
5270 struct task_struct *child,
5271 int *inherited_all)
5272{
5273 int ret;
5274 struct perf_event_context *child_ctx = child->perf_event_ctxp;
5275
5276 if (!event->attr.inherit) {
5277 *inherited_all = 0;
5278 return 0;
5087 } 5279 }
5088 5280
5089 if (!list_empty(&ctx->group_list)) 5281 if (!child_ctx) {
5090 goto again; 5282 /*
5283 * This is executed from the parent task context, so
5284 * inherit events that have been marked for cloning.
5285 * First allocate and initialize a context for the
5286 * child.
5287 */
5091 5288
5092 mutex_unlock(&ctx->mutex); 5289 child_ctx = kzalloc(sizeof(struct perf_event_context),
5290 GFP_KERNEL);
5291 if (!child_ctx)
5292 return -ENOMEM;
5093 5293
5094 put_ctx(ctx); 5294 __perf_event_init_context(child_ctx, child);
5295 child->perf_event_ctxp = child_ctx;
5296 get_task_struct(child);
5297 }
5298
5299 ret = inherit_group(event, parent, parent_ctx,
5300 child, child_ctx);
5301
5302 if (ret)
5303 *inherited_all = 0;
5304
5305 return ret;
5095} 5306}
5096 5307
5308
5097/* 5309/*
5098 * Initialize the perf_event context in task_struct 5310 * Initialize the perf_event context in task_struct
5099 */ 5311 */
5100int perf_event_init_task(struct task_struct *child) 5312int perf_event_init_task(struct task_struct *child)
5101{ 5313{
5102 struct perf_event_context *child_ctx = NULL, *parent_ctx; 5314 struct perf_event_context *child_ctx, *parent_ctx;
5103 struct perf_event_context *cloned_ctx; 5315 struct perf_event_context *cloned_ctx;
5104 struct perf_event *event; 5316 struct perf_event *event;
5105 struct task_struct *parent = current; 5317 struct task_struct *parent = current;
@@ -5137,41 +5349,22 @@ int perf_event_init_task(struct task_struct *child)
5137 * We dont have to disable NMIs - we are only looking at 5349 * We dont have to disable NMIs - we are only looking at
5138 * the list, not manipulating it: 5350 * the list, not manipulating it:
5139 */ 5351 */
5140 list_for_each_entry(event, &parent_ctx->group_list, group_entry) { 5352 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
5141 5353 ret = inherit_task_group(event, parent, parent_ctx, child,
5142 if (!event->attr.inherit) { 5354 &inherited_all);
5143 inherited_all = 0; 5355 if (ret)
5144 continue; 5356 break;
5145 } 5357 }
5146
5147 if (!child->perf_event_ctxp) {
5148 /*
5149 * This is executed from the parent task context, so
5150 * inherit events that have been marked for cloning.
5151 * First allocate and initialize a context for the
5152 * child.
5153 */
5154
5155 child_ctx = kzalloc(sizeof(struct perf_event_context),
5156 GFP_KERNEL);
5157 if (!child_ctx) {
5158 ret = -ENOMEM;
5159 break;
5160 }
5161
5162 __perf_event_init_context(child_ctx, child);
5163 child->perf_event_ctxp = child_ctx;
5164 get_task_struct(child);
5165 }
5166 5358
5167 ret = inherit_group(event, parent, parent_ctx, 5359 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
5168 child, child_ctx); 5360 ret = inherit_task_group(event, parent, parent_ctx, child,
5169 if (ret) { 5361 &inherited_all);
5170 inherited_all = 0; 5362 if (ret)
5171 break; 5363 break;
5172 }
5173 } 5364 }
5174 5365
5366 child_ctx = child->perf_event_ctxp;
5367
5175 if (child_ctx && inherited_all) { 5368 if (child_ctx && inherited_all) {
5176 /* 5369 /*
5177 * Mark the child context as a clone of the parent 5370 * Mark the child context as a clone of the parent
@@ -5220,7 +5413,9 @@ static void __perf_event_exit_cpu(void *info)
5220 struct perf_event_context *ctx = &cpuctx->ctx; 5413 struct perf_event_context *ctx = &cpuctx->ctx;
5221 struct perf_event *event, *tmp; 5414 struct perf_event *event, *tmp;
5222 5415
5223 list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) 5416 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
5417 __perf_event_remove_from_context(event);
5418 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
5224 __perf_event_remove_from_context(event); 5419 __perf_event_remove_from_context(event);
5225} 5420}
5226static void perf_event_exit_cpu(int cpu) 5421static void perf_event_exit_cpu(int cpu)
@@ -5258,6 +5453,10 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
5258 perf_event_exit_cpu(cpu); 5453 perf_event_exit_cpu(cpu);
5259 break; 5454 break;
5260 5455
5456 case CPU_DEAD:
5457 hw_perf_event_setup_offline(cpu);
5458 break;
5459
5261 default: 5460 default:
5262 break; 5461 break;
5263 } 5462 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 3a8fb30a91b1..3e71ebb101c2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2794,7 +2794,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2794 */ 2794 */
2795 prev_state = prev->state; 2795 prev_state = prev->state;
2796 finish_arch_switch(prev); 2796 finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq)); 2797#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2798 local_irq_disable();
2799#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2800 perf_event_task_sched_in(current);
2801#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2802 local_irq_enable();
2803#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2798 finish_lock_switch(rq, prev); 2804 finish_lock_switch(rq, prev);
2799 2805
2800 fire_sched_in_preempt_notifiers(current); 2806 fire_sched_in_preempt_notifiers(current);
@@ -5309,7 +5315,7 @@ void scheduler_tick(void)
5309 curr->sched_class->task_tick(rq, curr, 0); 5315 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock); 5316 raw_spin_unlock(&rq->lock);
5311 5317
5312 perf_event_task_tick(curr, cpu); 5318 perf_event_task_tick(curr);
5313 5319
5314#ifdef CONFIG_SMP 5320#ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu); 5321 rq->idle_at_tick = idle_cpu(cpu);
@@ -5523,7 +5529,7 @@ need_resched_nonpreemptible:
5523 5529
5524 if (likely(prev != next)) { 5530 if (likely(prev != next)) {
5525 sched_info_switch(prev, next); 5531 sched_info_switch(prev, next);
5526 perf_event_task_sched_out(prev, next, cpu); 5532 perf_event_task_sched_out(prev, next);
5527 5533
5528 rq->nr_switches++; 5534 rq->nr_switches++;
5529 rq->curr = next; 5535 rq->curr = next;
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index cd9ecd89ec77..d00c6fe23f54 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -51,7 +51,9 @@ endif
51obj-$(CONFIG_EVENT_TRACING) += trace_events.o 51obj-$(CONFIG_EVENT_TRACING) += trace_events.o
52obj-$(CONFIG_EVENT_TRACING) += trace_export.o 52obj-$(CONFIG_EVENT_TRACING) += trace_export.o
53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o 53obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
54obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o 54ifeq ($(CONFIG_PERF_EVENTS),y)
55obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o
56endif
55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 57obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 58obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
57obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o 59obj-$(CONFIG_KSYM_TRACER) += trace_ksym.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e6640f80454..1904797f4a8a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,7 +22,6 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/kprobes.h>
26#include <linux/ftrace.h> 25#include <linux/ftrace.h>
27#include <linux/sysctl.h> 26#include <linux/sysctl.h>
28#include <linux/ctype.h> 27#include <linux/ctype.h>
@@ -898,36 +897,6 @@ static struct dyn_ftrace *ftrace_free_records;
898 } \ 897 } \
899 } 898 }
900 899
901#ifdef CONFIG_KPROBES
902
903static int frozen_record_count;
904
905static inline void freeze_record(struct dyn_ftrace *rec)
906{
907 if (!(rec->flags & FTRACE_FL_FROZEN)) {
908 rec->flags |= FTRACE_FL_FROZEN;
909 frozen_record_count++;
910 }
911}
912
913static inline void unfreeze_record(struct dyn_ftrace *rec)
914{
915 if (rec->flags & FTRACE_FL_FROZEN) {
916 rec->flags &= ~FTRACE_FL_FROZEN;
917 frozen_record_count--;
918 }
919}
920
921static inline int record_frozen(struct dyn_ftrace *rec)
922{
923 return rec->flags & FTRACE_FL_FROZEN;
924}
925#else
926# define freeze_record(rec) ({ 0; })
927# define unfreeze_record(rec) ({ 0; })
928# define record_frozen(rec) ({ 0; })
929#endif /* CONFIG_KPROBES */
930
931static void ftrace_free_rec(struct dyn_ftrace *rec) 900static void ftrace_free_rec(struct dyn_ftrace *rec)
932{ 901{
933 rec->freelist = ftrace_free_records; 902 rec->freelist = ftrace_free_records;
@@ -1025,6 +994,21 @@ static void ftrace_bug(int failed, unsigned long ip)
1025} 994}
1026 995
1027 996
997/* Return 1 if the address range is reserved for ftrace */
998int ftrace_text_reserved(void *start, void *end)
999{
1000 struct dyn_ftrace *rec;
1001 struct ftrace_page *pg;
1002
1003 do_for_each_ftrace_rec(pg, rec) {
1004 if (rec->ip <= (unsigned long)end &&
1005 rec->ip + MCOUNT_INSN_SIZE > (unsigned long)start)
1006 return 1;
1007 } while_for_each_ftrace_rec();
1008 return 0;
1009}
1010
1011
1028static int 1012static int
1029__ftrace_replace_code(struct dyn_ftrace *rec, int enable) 1013__ftrace_replace_code(struct dyn_ftrace *rec, int enable)
1030{ 1014{
@@ -1076,14 +1060,6 @@ static void ftrace_replace_code(int enable)
1076 !(rec->flags & FTRACE_FL_CONVERTED)) 1060 !(rec->flags & FTRACE_FL_CONVERTED))
1077 continue; 1061 continue;
1078 1062
1079 /* ignore updates to this record's mcount site */
1080 if (get_kprobe((void *)rec->ip)) {
1081 freeze_record(rec);
1082 continue;
1083 } else {
1084 unfreeze_record(rec);
1085 }
1086
1087 failed = __ftrace_replace_code(rec, enable); 1063 failed = __ftrace_replace_code(rec, enable);
1088 if (failed) { 1064 if (failed) {
1089 rec->flags |= FTRACE_FL_FAILED; 1065 rec->flags |= FTRACE_FL_FAILED;
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 9e25573242cf..f0d693005075 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -6,14 +6,12 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/kprobes.h>
9#include "trace.h" 10#include "trace.h"
10 11
11 12
12char *perf_trace_buf; 13static char *perf_trace_buf;
13EXPORT_SYMBOL_GPL(perf_trace_buf); 14static char *perf_trace_buf_nmi;
14
15char *perf_trace_buf_nmi;
16EXPORT_SYMBOL_GPL(perf_trace_buf_nmi);
17 15
18typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; 16typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ;
19 17
@@ -120,3 +118,47 @@ void ftrace_profile_disable(int event_id)
120 } 118 }
121 mutex_unlock(&event_mutex); 119 mutex_unlock(&event_mutex);
122} 120}
121
122__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type,
123 int *rctxp, unsigned long *irq_flags)
124{
125 struct trace_entry *entry;
126 char *trace_buf, *raw_data;
127 int pc, cpu;
128
129 pc = preempt_count();
130
131 /* Protect the per cpu buffer, begin the rcu read side */
132 local_irq_save(*irq_flags);
133
134 *rctxp = perf_swevent_get_recursion_context();
135 if (*rctxp < 0)
136 goto err_recursion;
137
138 cpu = smp_processor_id();
139
140 if (in_nmi())
141 trace_buf = rcu_dereference(perf_trace_buf_nmi);
142 else
143 trace_buf = rcu_dereference(perf_trace_buf);
144
145 if (!trace_buf)
146 goto err;
147
148 raw_data = per_cpu_ptr(trace_buf, cpu);
149
150 /* zero the dead bytes from align to not leak stack to user */
151 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
152
153 entry = (struct trace_entry *)raw_data;
154 tracing_generic_entry_update(entry, *irq_flags, pc);
155 entry->type = type;
156
157 return raw_data;
158err:
159 perf_swevent_put_recursion_context(*rctxp);
160err_recursion:
161 local_irq_restore(*irq_flags);
162 return NULL;
163}
164EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e42af9aad69f..4615f62a04f1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1371,7 +1371,7 @@ out_unlock:
1371 return err; 1371 return err;
1372} 1372}
1373 1373
1374#ifdef CONFIG_EVENT_PROFILE 1374#ifdef CONFIG_PERF_EVENTS
1375 1375
1376void ftrace_profile_free_filter(struct perf_event *event) 1376void ftrace_profile_free_filter(struct perf_event *event)
1377{ 1377{
@@ -1439,5 +1439,5 @@ out_unlock:
1439 return err; 1439 return err;
1440} 1440}
1441 1441
1442#endif /* CONFIG_EVENT_PROFILE */ 1442#endif /* CONFIG_PERF_EVENTS */
1443 1443
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 50b1b8239806..356c10227c98 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -91,11 +91,6 @@ static __kprobes unsigned long fetch_memory(struct pt_regs *regs, void *addr)
91 return retval; 91 return retval;
92} 92}
93 93
94static __kprobes unsigned long fetch_argument(struct pt_regs *regs, void *num)
95{
96 return regs_get_argument_nth(regs, (unsigned int)((unsigned long)num));
97}
98
99static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs, 94static __kprobes unsigned long fetch_retvalue(struct pt_regs *regs,
100 void *dummy) 95 void *dummy)
101{ 96{
@@ -231,9 +226,7 @@ static int probe_arg_string(char *buf, size_t n, struct fetch_func *ff)
231{ 226{
232 int ret = -EINVAL; 227 int ret = -EINVAL;
233 228
234 if (ff->func == fetch_argument) 229 if (ff->func == fetch_register) {
235 ret = snprintf(buf, n, "$arg%lu", (unsigned long)ff->data);
236 else if (ff->func == fetch_register) {
237 const char *name; 230 const char *name;
238 name = regs_query_register_name((unsigned int)((long)ff->data)); 231 name = regs_query_register_name((unsigned int)((long)ff->data));
239 ret = snprintf(buf, n, "%%%s", name); 232 ret = snprintf(buf, n, "%%%s", name);
@@ -489,14 +482,6 @@ static int parse_probe_vars(char *arg, struct fetch_func *ff, int is_return)
489 } 482 }
490 } else 483 } else
491 ret = -EINVAL; 484 ret = -EINVAL;
492 } else if (strncmp(arg, "arg", 3) == 0 && isdigit(arg[3])) {
493 ret = strict_strtoul(arg + 3, 10, &param);
494 if (ret || param > PARAM_MAX_ARGS)
495 ret = -EINVAL;
496 else {
497 ff->func = fetch_argument;
498 ff->data = (void *)param;
499 }
500 } else 485 } else
501 ret = -EINVAL; 486 ret = -EINVAL;
502 return ret; 487 return ret;
@@ -611,7 +596,6 @@ static int create_trace_probe(int argc, char **argv)
611 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS] 596 * - Add kprobe: p[:[GRP/]EVENT] KSYM[+OFFS]|KADDR [FETCHARGS]
612 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS] 597 * - Add kretprobe: r[:[GRP/]EVENT] KSYM[+0] [FETCHARGS]
613 * Fetch args: 598 * Fetch args:
614 * $argN : fetch Nth of function argument. (N:0-)
615 * $retval : fetch return value 599 * $retval : fetch return value
616 * $stack : fetch stack address 600 * $stack : fetch stack address
617 * $stackN : fetch Nth of stack (N:0-) 601 * $stackN : fetch Nth of stack (N:0-)
@@ -958,7 +942,7 @@ static const struct file_operations kprobe_profile_ops = {
958}; 942};
959 943
960/* Kprobe handler */ 944/* Kprobe handler */
961static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) 945static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
962{ 946{
963 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 947 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
964 struct kprobe_trace_entry *entry; 948 struct kprobe_trace_entry *entry;
@@ -978,7 +962,7 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
978 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 962 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
979 irq_flags, pc); 963 irq_flags, pc);
980 if (!event) 964 if (!event)
981 return 0; 965 return;
982 966
983 entry = ring_buffer_event_data(event); 967 entry = ring_buffer_event_data(event);
984 entry->nargs = tp->nr_args; 968 entry->nargs = tp->nr_args;
@@ -988,11 +972,10 @@ static __kprobes int kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs)
988 972
989 if (!filter_current_check_discard(buffer, call, entry, event)) 973 if (!filter_current_check_discard(buffer, call, entry, event))
990 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 974 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
991 return 0;
992} 975}
993 976
994/* Kretprobe handler */ 977/* Kretprobe handler */
995static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri, 978static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri,
996 struct pt_regs *regs) 979 struct pt_regs *regs)
997{ 980{
998 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 981 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
@@ -1011,7 +994,7 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1011 event = trace_current_buffer_lock_reserve(&buffer, call->id, size, 994 event = trace_current_buffer_lock_reserve(&buffer, call->id, size,
1012 irq_flags, pc); 995 irq_flags, pc);
1013 if (!event) 996 if (!event)
1014 return 0; 997 return;
1015 998
1016 entry = ring_buffer_event_data(event); 999 entry = ring_buffer_event_data(event);
1017 entry->nargs = tp->nr_args; 1000 entry->nargs = tp->nr_args;
@@ -1022,8 +1005,6 @@ static __kprobes int kretprobe_trace_func(struct kretprobe_instance *ri,
1022 1005
1023 if (!filter_current_check_discard(buffer, call, entry, event)) 1006 if (!filter_current_check_discard(buffer, call, entry, event))
1024 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc); 1007 trace_nowake_buffer_unlock_commit(buffer, event, irq_flags, pc);
1025
1026 return 0;
1027} 1008}
1028 1009
1029/* Event entry printers */ 1010/* Event entry printers */
@@ -1250,137 +1231,67 @@ static int kretprobe_event_show_format(struct ftrace_event_call *call,
1250 ", REC->" FIELD_STRING_RETIP); 1231 ", REC->" FIELD_STRING_RETIP);
1251} 1232}
1252 1233
1253#ifdef CONFIG_EVENT_PROFILE 1234#ifdef CONFIG_PERF_EVENTS
1254 1235
1255/* Kprobe profile handler */ 1236/* Kprobe profile handler */
1256static __kprobes int kprobe_profile_func(struct kprobe *kp, 1237static __kprobes void kprobe_profile_func(struct kprobe *kp,
1257 struct pt_regs *regs) 1238 struct pt_regs *regs)
1258{ 1239{
1259 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); 1240 struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp);
1260 struct ftrace_event_call *call = &tp->call; 1241 struct ftrace_event_call *call = &tp->call;
1261 struct kprobe_trace_entry *entry; 1242 struct kprobe_trace_entry *entry;
1262 struct trace_entry *ent; 1243 int size, __size, i;
1263 int size, __size, i, pc, __cpu;
1264 unsigned long irq_flags; 1244 unsigned long irq_flags;
1265 char *trace_buf;
1266 char *raw_data;
1267 int rctx; 1245 int rctx;
1268 1246
1269 pc = preempt_count();
1270 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); 1247 __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args);
1271 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1248 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1272 size -= sizeof(u32); 1249 size -= sizeof(u32);
1273 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1250 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1274 "profile buffer not large enough")) 1251 "profile buffer not large enough"))
1275 return 0; 1252 return;
1276
1277 /*
1278 * Protect the non nmi buffer
1279 * This also protects the rcu read side
1280 */
1281 local_irq_save(irq_flags);
1282 1253
1283 rctx = perf_swevent_get_recursion_context(); 1254 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1284 if (rctx < 0) 1255 if (!entry)
1285 goto end_recursion; 1256 return;
1286
1287 __cpu = smp_processor_id();
1288
1289 if (in_nmi())
1290 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1291 else
1292 trace_buf = rcu_dereference(perf_trace_buf);
1293
1294 if (!trace_buf)
1295 goto end;
1296
1297 raw_data = per_cpu_ptr(trace_buf, __cpu);
1298
1299 /* Zero dead bytes from alignment to avoid buffer leak to userspace */
1300 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
1301 entry = (struct kprobe_trace_entry *)raw_data;
1302 ent = &entry->ent;
1303 1257
1304 tracing_generic_entry_update(ent, irq_flags, pc);
1305 ent->type = call->id;
1306 entry->nargs = tp->nr_args; 1258 entry->nargs = tp->nr_args;
1307 entry->ip = (unsigned long)kp->addr; 1259 entry->ip = (unsigned long)kp->addr;
1308 for (i = 0; i < tp->nr_args; i++) 1260 for (i = 0; i < tp->nr_args; i++)
1309 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1261 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1310 perf_tp_event(call->id, entry->ip, 1, entry, size);
1311 1262
1312end: 1263 ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags);
1313 perf_swevent_put_recursion_context(rctx);
1314end_recursion:
1315 local_irq_restore(irq_flags);
1316
1317 return 0;
1318} 1264}
1319 1265
1320/* Kretprobe profile handler */ 1266/* Kretprobe profile handler */
1321static __kprobes int kretprobe_profile_func(struct kretprobe_instance *ri, 1267static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri,
1322 struct pt_regs *regs) 1268 struct pt_regs *regs)
1323{ 1269{
1324 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); 1270 struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp);
1325 struct ftrace_event_call *call = &tp->call; 1271 struct ftrace_event_call *call = &tp->call;
1326 struct kretprobe_trace_entry *entry; 1272 struct kretprobe_trace_entry *entry;
1327 struct trace_entry *ent; 1273 int size, __size, i;
1328 int size, __size, i, pc, __cpu;
1329 unsigned long irq_flags; 1274 unsigned long irq_flags;
1330 char *trace_buf;
1331 char *raw_data;
1332 int rctx; 1275 int rctx;
1333 1276
1334 pc = preempt_count();
1335 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); 1277 __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args);
1336 size = ALIGN(__size + sizeof(u32), sizeof(u64)); 1278 size = ALIGN(__size + sizeof(u32), sizeof(u64));
1337 size -= sizeof(u32); 1279 size -= sizeof(u32);
1338 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, 1280 if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE,
1339 "profile buffer not large enough")) 1281 "profile buffer not large enough"))
1340 return 0; 1282 return;
1341
1342 /*
1343 * Protect the non nmi buffer
1344 * This also protects the rcu read side
1345 */
1346 local_irq_save(irq_flags);
1347
1348 rctx = perf_swevent_get_recursion_context();
1349 if (rctx < 0)
1350 goto end_recursion;
1351
1352 __cpu = smp_processor_id();
1353
1354 if (in_nmi())
1355 trace_buf = rcu_dereference(perf_trace_buf_nmi);
1356 else
1357 trace_buf = rcu_dereference(perf_trace_buf);
1358
1359 if (!trace_buf)
1360 goto end;
1361
1362 raw_data = per_cpu_ptr(trace_buf, __cpu);
1363 1283
1364 /* Zero dead bytes from alignment to avoid buffer leak to userspace */ 1284 entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags);
1365 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; 1285 if (!entry)
1366 entry = (struct kretprobe_trace_entry *)raw_data; 1286 return;
1367 ent = &entry->ent;
1368 1287
1369 tracing_generic_entry_update(ent, irq_flags, pc);
1370 ent->type = call->id;
1371 entry->nargs = tp->nr_args; 1288 entry->nargs = tp->nr_args;
1372 entry->func = (unsigned long)tp->rp.kp.addr; 1289 entry->func = (unsigned long)tp->rp.kp.addr;
1373 entry->ret_ip = (unsigned long)ri->ret_addr; 1290 entry->ret_ip = (unsigned long)ri->ret_addr;
1374 for (i = 0; i < tp->nr_args; i++) 1291 for (i = 0; i < tp->nr_args; i++)
1375 entry->args[i] = call_fetch(&tp->args[i].fetch, regs); 1292 entry->args[i] = call_fetch(&tp->args[i].fetch, regs);
1376 perf_tp_event(call->id, entry->ret_ip, 1, entry, size);
1377
1378end:
1379 perf_swevent_put_recursion_context(rctx);
1380end_recursion:
1381 local_irq_restore(irq_flags);
1382 1293
1383 return 0; 1294 ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags);
1384} 1295}
1385 1296
1386static int probe_profile_enable(struct ftrace_event_call *call) 1297static int probe_profile_enable(struct ftrace_event_call *call)
@@ -1408,7 +1319,7 @@ static void probe_profile_disable(struct ftrace_event_call *call)
1408 disable_kprobe(&tp->rp.kp); 1319 disable_kprobe(&tp->rp.kp);
1409 } 1320 }
1410} 1321}
1411#endif /* CONFIG_EVENT_PROFILE */ 1322#endif /* CONFIG_PERF_EVENTS */
1412 1323
1413 1324
1414static __kprobes 1325static __kprobes
@@ -1418,10 +1329,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
1418 1329
1419 if (tp->flags & TP_FLAG_TRACE) 1330 if (tp->flags & TP_FLAG_TRACE)
1420 kprobe_trace_func(kp, regs); 1331 kprobe_trace_func(kp, regs);
1421#ifdef CONFIG_EVENT_PROFILE 1332#ifdef CONFIG_PERF_EVENTS
1422 if (tp->flags & TP_FLAG_PROFILE) 1333 if (tp->flags & TP_FLAG_PROFILE)
1423 kprobe_profile_func(kp, regs); 1334 kprobe_profile_func(kp, regs);
1424#endif /* CONFIG_EVENT_PROFILE */ 1335#endif
1425 return 0; /* We don't tweek kernel, so just return 0 */ 1336 return 0; /* We don't tweek kernel, so just return 0 */
1426} 1337}
1427 1338
@@ -1432,10 +1343,10 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
1432 1343
1433 if (tp->flags & TP_FLAG_TRACE) 1344 if (tp->flags & TP_FLAG_TRACE)
1434 kretprobe_trace_func(ri, regs); 1345 kretprobe_trace_func(ri, regs);
1435#ifdef CONFIG_EVENT_PROFILE 1346#ifdef CONFIG_PERF_EVENTS
1436 if (tp->flags & TP_FLAG_PROFILE) 1347 if (tp->flags & TP_FLAG_PROFILE)
1437 kretprobe_profile_func(ri, regs); 1348 kretprobe_profile_func(ri, regs);
1438#endif /* CONFIG_EVENT_PROFILE */ 1349#endif
1439 return 0; /* We don't tweek kernel, so just return 0 */ 1350 return 0; /* We don't tweek kernel, so just return 0 */
1440} 1351}
1441 1352
@@ -1464,7 +1375,7 @@ static int register_probe_event(struct trace_probe *tp)
1464 call->regfunc = probe_event_enable; 1375 call->regfunc = probe_event_enable;
1465 call->unregfunc = probe_event_disable; 1376 call->unregfunc = probe_event_disable;
1466 1377
1467#ifdef CONFIG_EVENT_PROFILE 1378#ifdef CONFIG_PERF_EVENTS
1468 call->profile_enable = probe_profile_enable; 1379 call->profile_enable = probe_profile_enable;
1469 call->profile_disable = probe_profile_disable; 1380 call->profile_disable = probe_profile_disable;
1470#endif 1381#endif
@@ -1523,28 +1434,67 @@ static int kprobe_trace_selftest_target(int a1, int a2, int a3,
1523 1434
1524static __init int kprobe_trace_self_tests_init(void) 1435static __init int kprobe_trace_self_tests_init(void)
1525{ 1436{
1526 int ret; 1437 int ret, warn = 0;
1527 int (*target)(int, int, int, int, int, int); 1438 int (*target)(int, int, int, int, int, int);
1439 struct trace_probe *tp;
1528 1440
1529 target = kprobe_trace_selftest_target; 1441 target = kprobe_trace_selftest_target;
1530 1442
1531 pr_info("Testing kprobe tracing: "); 1443 pr_info("Testing kprobe tracing: ");
1532 1444
1533 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target " 1445 ret = command_trace_probe("p:testprobe kprobe_trace_selftest_target "
1534 "$arg1 $arg2 $arg3 $arg4 $stack $stack0"); 1446 "$stack $stack0 +0($stack)");
1535 if (WARN_ON_ONCE(ret)) 1447 if (WARN_ON_ONCE(ret)) {
1536 pr_warning("error enabling function entry\n"); 1448 pr_warning("error on probing function entry.\n");
1449 warn++;
1450 } else {
1451 /* Enable trace point */
1452 tp = find_probe_event("testprobe", KPROBE_EVENT_SYSTEM);
1453 if (WARN_ON_ONCE(tp == NULL)) {
1454 pr_warning("error on getting new probe.\n");
1455 warn++;
1456 } else
1457 probe_event_enable(&tp->call);
1458 }
1537 1459
1538 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target " 1460 ret = command_trace_probe("r:testprobe2 kprobe_trace_selftest_target "
1539 "$retval"); 1461 "$retval");
1540 if (WARN_ON_ONCE(ret)) 1462 if (WARN_ON_ONCE(ret)) {
1541 pr_warning("error enabling function return\n"); 1463 pr_warning("error on probing function return.\n");
1464 warn++;
1465 } else {
1466 /* Enable trace point */
1467 tp = find_probe_event("testprobe2", KPROBE_EVENT_SYSTEM);
1468 if (WARN_ON_ONCE(tp == NULL)) {
1469 pr_warning("error on getting new probe.\n");
1470 warn++;
1471 } else
1472 probe_event_enable(&tp->call);
1473 }
1474
1475 if (warn)
1476 goto end;
1542 1477
1543 ret = target(1, 2, 3, 4, 5, 6); 1478 ret = target(1, 2, 3, 4, 5, 6);
1544 1479
1545 cleanup_all_probes(); 1480 ret = command_trace_probe("-:testprobe");
1481 if (WARN_ON_ONCE(ret)) {
1482 pr_warning("error on deleting a probe.\n");
1483 warn++;
1484 }
1485
1486 ret = command_trace_probe("-:testprobe2");
1487 if (WARN_ON_ONCE(ret)) {
1488 pr_warning("error on deleting a probe.\n");
1489 warn++;
1490 }
1546 1491
1547 pr_cont("OK\n"); 1492end:
1493 cleanup_all_probes();
1494 if (warn)
1495 pr_cont("NG: Some tests are failed. Please check them.\n");
1496 else
1497 pr_cont("OK\n");
1548 return 0; 1498 return 0;
1549} 1499}
1550 1500
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 75289f372dd2..4e332b9e449c 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -421,7 +421,7 @@ int __init init_ftrace_syscalls(void)
421} 421}
422core_initcall(init_ftrace_syscalls); 422core_initcall(init_ftrace_syscalls);
423 423
424#ifdef CONFIG_EVENT_PROFILE 424#ifdef CONFIG_PERF_EVENTS
425 425
426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); 426static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls);
427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); 427static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls);
@@ -433,12 +433,9 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
433 struct syscall_metadata *sys_data; 433 struct syscall_metadata *sys_data;
434 struct syscall_trace_enter *rec; 434 struct syscall_trace_enter *rec;
435 unsigned long flags; 435 unsigned long flags;
436 char *trace_buf;
437 char *raw_data;
438 int syscall_nr; 436 int syscall_nr;
439 int rctx; 437 int rctx;
440 int size; 438 int size;
441 int cpu;
442 439
443 syscall_nr = syscall_get_nr(current, regs); 440 syscall_nr = syscall_get_nr(current, regs);
444 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) 441 if (!test_bit(syscall_nr, enabled_prof_enter_syscalls))
@@ -457,37 +454,15 @@ static void prof_syscall_enter(struct pt_regs *regs, long id)
457 "profile buffer not large enough")) 454 "profile buffer not large enough"))
458 return; 455 return;
459 456
460 /* Protect the per cpu buffer, begin the rcu read side */ 457 rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size,
461 local_irq_save(flags); 458 sys_data->enter_event->id, &rctx, &flags);
462 459 if (!rec)
463 rctx = perf_swevent_get_recursion_context(); 460 return;
464 if (rctx < 0)
465 goto end_recursion;
466
467 cpu = smp_processor_id();
468
469 trace_buf = rcu_dereference(perf_trace_buf);
470
471 if (!trace_buf)
472 goto end;
473
474 raw_data = per_cpu_ptr(trace_buf, cpu);
475
476 /* zero the dead bytes from align to not leak stack to user */
477 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
478 461
479 rec = (struct syscall_trace_enter *) raw_data;
480 tracing_generic_entry_update(&rec->ent, 0, 0);
481 rec->ent.type = sys_data->enter_event->id;
482 rec->nr = syscall_nr; 462 rec->nr = syscall_nr;
483 syscall_get_arguments(current, regs, 0, sys_data->nb_args, 463 syscall_get_arguments(current, regs, 0, sys_data->nb_args,
484 (unsigned long *)&rec->args); 464 (unsigned long *)&rec->args);
485 perf_tp_event(sys_data->enter_event->id, 0, 1, rec, size); 465 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
486
487end:
488 perf_swevent_put_recursion_context(rctx);
489end_recursion:
490 local_irq_restore(flags);
491} 466}
492 467
493int prof_sysenter_enable(struct ftrace_event_call *call) 468int prof_sysenter_enable(struct ftrace_event_call *call)
@@ -531,11 +506,8 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
531 struct syscall_trace_exit *rec; 506 struct syscall_trace_exit *rec;
532 unsigned long flags; 507 unsigned long flags;
533 int syscall_nr; 508 int syscall_nr;
534 char *trace_buf;
535 char *raw_data;
536 int rctx; 509 int rctx;
537 int size; 510 int size;
538 int cpu;
539 511
540 syscall_nr = syscall_get_nr(current, regs); 512 syscall_nr = syscall_get_nr(current, regs);
541 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) 513 if (!test_bit(syscall_nr, enabled_prof_exit_syscalls))
@@ -557,38 +529,15 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret)
557 "exit event has grown above profile buffer size")) 529 "exit event has grown above profile buffer size"))
558 return; 530 return;
559 531
560 /* Protect the per cpu buffer, begin the rcu read side */ 532 rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size,
561 local_irq_save(flags); 533 sys_data->exit_event->id, &rctx, &flags);
562 534 if (!rec)
563 rctx = perf_swevent_get_recursion_context(); 535 return;
564 if (rctx < 0)
565 goto end_recursion;
566
567 cpu = smp_processor_id();
568
569 trace_buf = rcu_dereference(perf_trace_buf);
570
571 if (!trace_buf)
572 goto end;
573
574 raw_data = per_cpu_ptr(trace_buf, cpu);
575
576 /* zero the dead bytes from align to not leak stack to user */
577 *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL;
578
579 rec = (struct syscall_trace_exit *)raw_data;
580 536
581 tracing_generic_entry_update(&rec->ent, 0, 0);
582 rec->ent.type = sys_data->exit_event->id;
583 rec->nr = syscall_nr; 537 rec->nr = syscall_nr;
584 rec->ret = syscall_get_return_value(current, regs); 538 rec->ret = syscall_get_return_value(current, regs);
585 539
586 perf_tp_event(sys_data->exit_event->id, 0, 1, rec, size); 540 ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags);
587
588end:
589 perf_swevent_put_recursion_context(rctx);
590end_recursion:
591 local_irq_restore(flags);
592} 541}
593 542
594int prof_sysexit_enable(struct ftrace_event_call *call) 543int prof_sysexit_enable(struct ftrace_event_call *call)
@@ -626,6 +575,5 @@ void prof_sysexit_disable(struct ftrace_event_call *call)
626 mutex_unlock(&syscall_trace_lock); 575 mutex_unlock(&syscall_trace_lock);
627} 576}
628 577
629#endif 578#endif /* CONFIG_PERF_EVENTS */
630
631 579