diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 1 | ||||
-rw-r--r-- | kernel/exit.c | 13 | ||||
-rw-r--r-- | kernel/fork.c | 1 | ||||
-rw-r--r-- | kernel/perf_counter.c | 1686 | ||||
-rw-r--r-- | kernel/sched.c | 76 | ||||
-rw-r--r-- | kernel/sys.c | 7 | ||||
-rw-r--r-- | kernel/sys_ni.c | 3 |
7 files changed, 1780 insertions, 7 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 2921d90ce32f..8b2628c7914b 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o | |||
89 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ | 89 | obj-$(CONFIG_FUNCTION_TRACER) += trace/ |
90 | obj-$(CONFIG_TRACING) += trace/ | 90 | obj-$(CONFIG_TRACING) += trace/ |
91 | obj-$(CONFIG_SMP) += sched_cpupri.o | 91 | obj-$(CONFIG_SMP) += sched_cpupri.o |
92 | obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o | ||
92 | 93 | ||
93 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) | 94 | ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) |
94 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 95 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/exit.c b/kernel/exit.c index c7740fa3252c..cbdb39a498eb 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -159,6 +159,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
159 | { | 159 | { |
160 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); | 160 | struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); |
161 | 161 | ||
162 | #ifdef CONFIG_PERF_COUNTERS | ||
163 | WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); | ||
164 | #endif | ||
162 | trace_sched_process_free(tsk); | 165 | trace_sched_process_free(tsk); |
163 | put_task_struct(tsk); | 166 | put_task_struct(tsk); |
164 | } | 167 | } |
@@ -1093,10 +1096,6 @@ NORET_TYPE void do_exit(long code) | |||
1093 | tsk->mempolicy = NULL; | 1096 | tsk->mempolicy = NULL; |
1094 | #endif | 1097 | #endif |
1095 | #ifdef CONFIG_FUTEX | 1098 | #ifdef CONFIG_FUTEX |
1096 | /* | ||
1097 | * This must happen late, after the PID is not | ||
1098 | * hashed anymore: | ||
1099 | */ | ||
1100 | if (unlikely(!list_empty(&tsk->pi_state_list))) | 1099 | if (unlikely(!list_empty(&tsk->pi_state_list))) |
1101 | exit_pi_state_list(tsk); | 1100 | exit_pi_state_list(tsk); |
1102 | if (unlikely(current->pi_state_cache)) | 1101 | if (unlikely(current->pi_state_cache)) |
@@ -1361,6 +1360,12 @@ static int wait_task_zombie(struct task_struct *p, int options, | |||
1361 | */ | 1360 | */ |
1362 | read_unlock(&tasklist_lock); | 1361 | read_unlock(&tasklist_lock); |
1363 | 1362 | ||
1363 | /* | ||
1364 | * Flush inherited counters to the parent - before the parent | ||
1365 | * gets woken up by child-exit notifications. | ||
1366 | */ | ||
1367 | perf_counter_exit_task(p); | ||
1368 | |||
1364 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; | 1369 | retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; |
1365 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) | 1370 | status = (p->signal->flags & SIGNAL_GROUP_EXIT) |
1366 | ? p->signal->group_exit_code : p->exit_code; | 1371 | ? p->signal->group_exit_code : p->exit_code; |
diff --git a/kernel/fork.c b/kernel/fork.c index 1d68f1255dd8..b1f8609287eb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -985,6 +985,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
985 | goto fork_out; | 985 | goto fork_out; |
986 | 986 | ||
987 | rt_mutex_init_task(p); | 987 | rt_mutex_init_task(p); |
988 | perf_counter_init_task(p); | ||
988 | 989 | ||
989 | #ifdef CONFIG_PROVE_LOCKING | 990 | #ifdef CONFIG_PROVE_LOCKING |
990 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); | 991 | DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); |
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 000000000000..37f771691f93 --- /dev/null +++ b/kernel/perf_counter.c | |||
@@ -0,0 +1,1686 @@ | |||
1 | /* | ||
2 | * Performance counter core code | ||
3 | * | ||
4 | * Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de> | ||
5 | * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar | ||
6 | * | ||
7 | * For licencing details see kernel-base/COPYING | ||
8 | */ | ||
9 | |||
10 | #include <linux/fs.h> | ||
11 | #include <linux/cpu.h> | ||
12 | #include <linux/smp.h> | ||
13 | #include <linux/file.h> | ||
14 | #include <linux/poll.h> | ||
15 | #include <linux/sysfs.h> | ||
16 | #include <linux/ptrace.h> | ||
17 | #include <linux/percpu.h> | ||
18 | #include <linux/uaccess.h> | ||
19 | #include <linux/syscalls.h> | ||
20 | #include <linux/anon_inodes.h> | ||
21 | #include <linux/kernel_stat.h> | ||
22 | #include <linux/perf_counter.h> | ||
23 | |||
24 | /* | ||
25 | * Each CPU has a list of per CPU counters: | ||
26 | */ | ||
27 | DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); | ||
28 | |||
/*
 * Global counter limits. perf_max_counters and perf_reserved_percpu feed
 * the per-CPU 'max_pertask' computation when a CPU counter is removed.
 * NOTE(review): perf_overcommit is not referenced in the visible part of
 * this file; presumably it is a sysadmin tunable - confirm against the
 * sysfs code.
 */
29 | int perf_max_counters __read_mostly = 1; | ||
30 | static int perf_reserved_percpu __read_mostly; | ||
31 | static int perf_overcommit __read_mostly = 1; | ||
32 | |||
33 | /* | ||
34 | * Mutex for (sysadmin-configurable) counter reservations: | ||
35 | */ | ||
36 | static DEFINE_MUTEX(perf_resource_mutex); | ||
37 | |||
38 | /* | ||
39 | * Architecture provided APIs - weak aliases: | ||
40 | */ | ||
41 | extern __weak const struct hw_perf_counter_ops * | ||
42 | hw_perf_counter_init(struct perf_counter *counter) | ||
43 | { | ||
44 | return ERR_PTR(-EINVAL); | ||
45 | } | ||
46 | |||
47 | u64 __weak hw_perf_save_disable(void) { return 0; } | ||
48 | void __weak hw_perf_restore(u64 ctrl) { barrier(); } | ||
49 | void __weak hw_perf_counter_setup(void) { barrier(); } | ||
50 | |||
51 | static void | ||
52 | list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | ||
53 | { | ||
54 | struct perf_counter *group_leader = counter->group_leader; | ||
55 | |||
56 | /* | ||
57 | * Depending on whether it is a standalone or sibling counter, | ||
58 | * add it straight to the context's counter list, or to the group | ||
59 | * leader's sibling list: | ||
60 | */ | ||
61 | if (counter->group_leader == counter) | ||
62 | list_add_tail(&counter->list_entry, &ctx->counter_list); | ||
63 | else | ||
64 | list_add_tail(&counter->list_entry, &group_leader->sibling_list); | ||
65 | } | ||
66 | |||
67 | static void | ||
68 | list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) | ||
69 | { | ||
70 | struct perf_counter *sibling, *tmp; | ||
71 | |||
72 | list_del_init(&counter->list_entry); | ||
73 | |||
74 | /* | ||
75 | * If this was a group counter with sibling counters then | ||
76 | * upgrade the siblings to singleton counters by adding them | ||
77 | * to the context list directly: | ||
78 | */ | ||
79 | list_for_each_entry_safe(sibling, tmp, | ||
80 | &counter->sibling_list, list_entry) { | ||
81 | |||
82 | list_del_init(&sibling->list_entry); | ||
83 | list_add_tail(&sibling->list_entry, &ctx->counter_list); | ||
84 | sibling->group_leader = sibling; | ||
85 | } | ||
86 | } | ||
87 | |||
88 | /* | ||
89 | * Cross CPU call to remove a performance counter | ||
90 | * | ||
91 | * We disable the counter on the hardware level first. After that we | ||
92 | * remove it from the context list. | ||
93 | */ | ||
94 | static void __perf_counter_remove_from_context(void *info) | ||
95 | { | ||
96 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
97 | struct perf_counter *counter = info; | ||
98 | struct perf_counter_context *ctx = counter->ctx; | ||
99 | unsigned long flags; | ||
100 | u64 perf_flags; | ||
101 | |||
102 | /* | ||
103 | * If this is a task context, we need to check whether it is | ||
104 | * the current task context of this cpu. If not it has been | ||
105 | * scheduled out before the smp call arrived. | ||
106 | */ | ||
107 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
108 | return; | ||
109 | |||
110 | curr_rq_lock_irq_save(&flags); | ||
111 | spin_lock(&ctx->lock); | ||
112 | |||
113 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
114 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
115 | counter->hw_ops->disable(counter); | ||
116 | ctx->nr_active--; | ||
117 | cpuctx->active_oncpu--; | ||
118 | counter->task = NULL; | ||
119 | counter->oncpu = -1; | ||
120 | } | ||
121 | ctx->nr_counters--; | ||
122 | |||
123 | /* | ||
124 | * Protect the list operation against NMI by disabling the | ||
125 | * counters on a global level. NOP for non NMI based counters. | ||
126 | */ | ||
127 | perf_flags = hw_perf_save_disable(); | ||
128 | list_del_counter(counter, ctx); | ||
129 | hw_perf_restore(perf_flags); | ||
130 | |||
131 | if (!ctx->task) { | ||
132 | /* | ||
133 | * Allow more per task counters with respect to the | ||
134 | * reservation: | ||
135 | */ | ||
136 | cpuctx->max_pertask = | ||
137 | min(perf_max_counters - ctx->nr_counters, | ||
138 | perf_max_counters - perf_reserved_percpu); | ||
139 | } | ||
140 | |||
141 | spin_unlock(&ctx->lock); | ||
142 | curr_rq_unlock_irq_restore(&flags); | ||
143 | } | ||
144 | |||
145 | |||
146 | /* | ||
147 | * Remove the counter from a task's (or a CPU's) list of counters. | ||
148 | * | ||
149 | * Must be called with counter->mutex held. | ||
150 | * | ||
151 | * CPU counters are removed with a smp call. For task counters we only | ||
152 | * call when the task is on a CPU. | ||
153 | */ | ||
154 | static void perf_counter_remove_from_context(struct perf_counter *counter) | ||
155 | { | ||
156 | struct perf_counter_context *ctx = counter->ctx; | ||
157 | struct task_struct *task = ctx->task; | ||
158 | |||
159 | if (!task) { | ||
160 | /* | ||
161 | * Per cpu counters are removed via an smp call and | ||
162 | * the removal is always sucessful. | ||
163 | */ | ||
164 | smp_call_function_single(counter->cpu, | ||
165 | __perf_counter_remove_from_context, | ||
166 | counter, 1); | ||
167 | return; | ||
168 | } | ||
169 | |||
170 | retry: | ||
171 | task_oncpu_function_call(task, __perf_counter_remove_from_context, | ||
172 | counter); | ||
173 | |||
174 | spin_lock_irq(&ctx->lock); | ||
175 | /* | ||
176 | * If the context is active we need to retry the smp call. | ||
177 | */ | ||
178 | if (ctx->nr_active && !list_empty(&counter->list_entry)) { | ||
179 | spin_unlock_irq(&ctx->lock); | ||
180 | goto retry; | ||
181 | } | ||
182 | |||
183 | /* | ||
184 | * The lock prevents that this context is scheduled in so we | ||
185 | * can remove the counter safely, if the call above did not | ||
186 | * succeed. | ||
187 | */ | ||
188 | if (!list_empty(&counter->list_entry)) { | ||
189 | ctx->nr_counters--; | ||
190 | list_del_counter(counter, ctx); | ||
191 | counter->task = NULL; | ||
192 | } | ||
193 | spin_unlock_irq(&ctx->lock); | ||
194 | } | ||
195 | |||
196 | static int | ||
197 | counter_sched_in(struct perf_counter *counter, | ||
198 | struct perf_cpu_context *cpuctx, | ||
199 | struct perf_counter_context *ctx, | ||
200 | int cpu) | ||
201 | { | ||
202 | if (counter->state == PERF_COUNTER_STATE_OFF) | ||
203 | return 0; | ||
204 | |||
205 | counter->state = PERF_COUNTER_STATE_ACTIVE; | ||
206 | counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ | ||
207 | /* | ||
208 | * The new state must be visible before we turn it on in the hardware: | ||
209 | */ | ||
210 | smp_wmb(); | ||
211 | |||
212 | if (counter->hw_ops->enable(counter)) { | ||
213 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
214 | counter->oncpu = -1; | ||
215 | return -EAGAIN; | ||
216 | } | ||
217 | |||
218 | cpuctx->active_oncpu++; | ||
219 | ctx->nr_active++; | ||
220 | |||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | /* | ||
225 | * Cross CPU call to install and enable a performance counter | ||
226 | */ | ||
227 | static void __perf_install_in_context(void *info) | ||
228 | { | ||
229 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
230 | struct perf_counter *counter = info; | ||
231 | struct perf_counter_context *ctx = counter->ctx; | ||
232 | int cpu = smp_processor_id(); | ||
233 | unsigned long flags; | ||
234 | u64 perf_flags; | ||
235 | |||
236 | /* | ||
237 | * If this is a task context, we need to check whether it is | ||
238 | * the current task context of this cpu. If not it has been | ||
239 | * scheduled out before the smp call arrived. | ||
240 | */ | ||
241 | if (ctx->task && cpuctx->task_ctx != ctx) | ||
242 | return; | ||
243 | |||
244 | curr_rq_lock_irq_save(&flags); | ||
245 | spin_lock(&ctx->lock); | ||
246 | |||
247 | /* | ||
248 | * Protect the list operation against NMI by disabling the | ||
249 | * counters on a global level. NOP for non NMI based counters. | ||
250 | */ | ||
251 | perf_flags = hw_perf_save_disable(); | ||
252 | |||
253 | list_add_counter(counter, ctx); | ||
254 | ctx->nr_counters++; | ||
255 | |||
256 | counter_sched_in(counter, cpuctx, ctx, cpu); | ||
257 | |||
258 | if (!ctx->task && cpuctx->max_pertask) | ||
259 | cpuctx->max_pertask--; | ||
260 | |||
261 | hw_perf_restore(perf_flags); | ||
262 | |||
263 | spin_unlock(&ctx->lock); | ||
264 | curr_rq_unlock_irq_restore(&flags); | ||
265 | } | ||
266 | |||
267 | /* | ||
268 | * Attach a performance counter to a context | ||
269 | * | ||
270 | * First we add the counter to the list with the hardware enable bit | ||
271 | * in counter->hw_config cleared. | ||
272 | * | ||
273 | * If the counter is attached to a task which is on a CPU we use a smp | ||
274 | * call to enable it in the task context. The task might have been | ||
275 | * scheduled away, but we check this in the smp call again. | ||
276 | */ | ||
277 | static void | ||
278 | perf_install_in_context(struct perf_counter_context *ctx, | ||
279 | struct perf_counter *counter, | ||
280 | int cpu) | ||
281 | { | ||
282 | struct task_struct *task = ctx->task; | ||
283 | |||
284 | counter->ctx = ctx; | ||
285 | if (!task) { | ||
286 | /* | ||
287 | * Per cpu counters are installed via an smp call and | ||
288 | * the install is always sucessful. | ||
289 | */ | ||
290 | smp_call_function_single(cpu, __perf_install_in_context, | ||
291 | counter, 1); | ||
292 | return; | ||
293 | } | ||
294 | |||
295 | counter->task = task; | ||
296 | retry: | ||
297 | task_oncpu_function_call(task, __perf_install_in_context, | ||
298 | counter); | ||
299 | |||
300 | spin_lock_irq(&ctx->lock); | ||
301 | /* | ||
302 | * we need to retry the smp call. | ||
303 | */ | ||
304 | if (ctx->nr_active && list_empty(&counter->list_entry)) { | ||
305 | spin_unlock_irq(&ctx->lock); | ||
306 | goto retry; | ||
307 | } | ||
308 | |||
309 | /* | ||
310 | * The lock prevents that this context is scheduled in so we | ||
311 | * can add the counter safely, if it the call above did not | ||
312 | * succeed. | ||
313 | */ | ||
314 | if (list_empty(&counter->list_entry)) { | ||
315 | list_add_counter(counter, ctx); | ||
316 | ctx->nr_counters++; | ||
317 | } | ||
318 | spin_unlock_irq(&ctx->lock); | ||
319 | } | ||
320 | |||
321 | static void | ||
322 | counter_sched_out(struct perf_counter *counter, | ||
323 | struct perf_cpu_context *cpuctx, | ||
324 | struct perf_counter_context *ctx) | ||
325 | { | ||
326 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) | ||
327 | return; | ||
328 | |||
329 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
330 | counter->hw_ops->disable(counter); | ||
331 | counter->oncpu = -1; | ||
332 | |||
333 | cpuctx->active_oncpu--; | ||
334 | ctx->nr_active--; | ||
335 | } | ||
336 | |||
337 | static void | ||
338 | group_sched_out(struct perf_counter *group_counter, | ||
339 | struct perf_cpu_context *cpuctx, | ||
340 | struct perf_counter_context *ctx) | ||
341 | { | ||
342 | struct perf_counter *counter; | ||
343 | |||
344 | counter_sched_out(group_counter, cpuctx, ctx); | ||
345 | |||
346 | /* | ||
347 | * Schedule out siblings (if any): | ||
348 | */ | ||
349 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) | ||
350 | counter_sched_out(counter, cpuctx, ctx); | ||
351 | } | ||
352 | |||
353 | void __perf_counter_sched_out(struct perf_counter_context *ctx, | ||
354 | struct perf_cpu_context *cpuctx) | ||
355 | { | ||
356 | struct perf_counter *counter; | ||
357 | |||
358 | if (likely(!ctx->nr_counters)) | ||
359 | return; | ||
360 | |||
361 | spin_lock(&ctx->lock); | ||
362 | if (ctx->nr_active) { | ||
363 | list_for_each_entry(counter, &ctx->counter_list, list_entry) | ||
364 | group_sched_out(counter, cpuctx, ctx); | ||
365 | } | ||
366 | spin_unlock(&ctx->lock); | ||
367 | } | ||
368 | |||
369 | /* | ||
370 | * Called from scheduler to remove the counters of the current task, | ||
371 | * with interrupts disabled. | ||
372 | * | ||
373 | * We stop each counter and update the counter value in counter->count. | ||
374 | * | ||
375 | * This does not protect us against NMI, but disable() | ||
376 | * sets the disabled bit in the control field of counter _before_ | ||
377 | * accessing the counter control register. If a NMI hits, then it will | ||
378 | * not restart the counter. | ||
379 | */ | ||
380 | void perf_counter_task_sched_out(struct task_struct *task, int cpu) | ||
381 | { | ||
382 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
383 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
384 | |||
385 | if (likely(!cpuctx->task_ctx)) | ||
386 | return; | ||
387 | |||
388 | __perf_counter_sched_out(ctx, cpuctx); | ||
389 | |||
390 | cpuctx->task_ctx = NULL; | ||
391 | } | ||
392 | |||
393 | static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) | ||
394 | { | ||
395 | __perf_counter_sched_out(&cpuctx->ctx, cpuctx); | ||
396 | } | ||
397 | |||
398 | static int | ||
399 | group_sched_in(struct perf_counter *group_counter, | ||
400 | struct perf_cpu_context *cpuctx, | ||
401 | struct perf_counter_context *ctx, | ||
402 | int cpu) | ||
403 | { | ||
404 | struct perf_counter *counter, *partial_group; | ||
405 | int ret = 0; | ||
406 | |||
407 | if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) | ||
408 | return -EAGAIN; | ||
409 | |||
410 | /* | ||
411 | * Schedule in siblings as one group (if any): | ||
412 | */ | ||
413 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { | ||
414 | if (counter_sched_in(counter, cpuctx, ctx, cpu)) { | ||
415 | partial_group = counter; | ||
416 | goto group_error; | ||
417 | } | ||
418 | ret = -EAGAIN; | ||
419 | } | ||
420 | |||
421 | return ret; | ||
422 | |||
423 | group_error: | ||
424 | /* | ||
425 | * Groups can be scheduled in as one unit only, so undo any | ||
426 | * partial group before returning: | ||
427 | */ | ||
428 | list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { | ||
429 | if (counter == partial_group) | ||
430 | break; | ||
431 | counter_sched_out(counter, cpuctx, ctx); | ||
432 | } | ||
433 | counter_sched_out(group_counter, cpuctx, ctx); | ||
434 | |||
435 | return -EAGAIN; | ||
436 | } | ||
437 | |||
438 | static void | ||
439 | __perf_counter_sched_in(struct perf_counter_context *ctx, | ||
440 | struct perf_cpu_context *cpuctx, int cpu) | ||
441 | { | ||
442 | struct perf_counter *counter; | ||
443 | |||
444 | if (likely(!ctx->nr_counters)) | ||
445 | return; | ||
446 | |||
447 | spin_lock(&ctx->lock); | ||
448 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
449 | /* | ||
450 | * Listen to the 'cpu' scheduling filter constraint | ||
451 | * of counters: | ||
452 | */ | ||
453 | if (counter->cpu != -1 && counter->cpu != cpu) | ||
454 | continue; | ||
455 | |||
456 | /* | ||
457 | * If we scheduled in a group atomically and | ||
458 | * exclusively, break out: | ||
459 | */ | ||
460 | if (group_sched_in(counter, cpuctx, ctx, cpu)) | ||
461 | break; | ||
462 | } | ||
463 | spin_unlock(&ctx->lock); | ||
464 | } | ||
465 | |||
466 | /* | ||
467 | * Called from scheduler to add the counters of the current task | ||
468 | * with interrupts disabled. | ||
469 | * | ||
470 | * We restore the counter value and then enable it. | ||
471 | * | ||
472 | * This does not protect us against NMI, but enable() | ||
473 | * sets the enabled bit in the control field of counter _before_ | ||
474 | * accessing the counter control register. If a NMI hits, then it will | ||
475 | * keep the counter running. | ||
476 | */ | ||
477 | void perf_counter_task_sched_in(struct task_struct *task, int cpu) | ||
478 | { | ||
479 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
480 | struct perf_counter_context *ctx = &task->perf_counter_ctx; | ||
481 | |||
482 | __perf_counter_sched_in(ctx, cpuctx, cpu); | ||
483 | cpuctx->task_ctx = ctx; | ||
484 | } | ||
485 | |||
486 | static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) | ||
487 | { | ||
488 | struct perf_counter_context *ctx = &cpuctx->ctx; | ||
489 | |||
490 | __perf_counter_sched_in(ctx, cpuctx, cpu); | ||
491 | } | ||
492 | |||
493 | int perf_counter_task_disable(void) | ||
494 | { | ||
495 | struct task_struct *curr = current; | ||
496 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
497 | struct perf_counter *counter; | ||
498 | unsigned long flags; | ||
499 | u64 perf_flags; | ||
500 | int cpu; | ||
501 | |||
502 | if (likely(!ctx->nr_counters)) | ||
503 | return 0; | ||
504 | |||
505 | curr_rq_lock_irq_save(&flags); | ||
506 | cpu = smp_processor_id(); | ||
507 | |||
508 | /* force the update of the task clock: */ | ||
509 | __task_delta_exec(curr, 1); | ||
510 | |||
511 | perf_counter_task_sched_out(curr, cpu); | ||
512 | |||
513 | spin_lock(&ctx->lock); | ||
514 | |||
515 | /* | ||
516 | * Disable all the counters: | ||
517 | */ | ||
518 | perf_flags = hw_perf_save_disable(); | ||
519 | |||
520 | list_for_each_entry(counter, &ctx->counter_list, list_entry) | ||
521 | counter->state = PERF_COUNTER_STATE_OFF; | ||
522 | |||
523 | hw_perf_restore(perf_flags); | ||
524 | |||
525 | spin_unlock(&ctx->lock); | ||
526 | |||
527 | curr_rq_unlock_irq_restore(&flags); | ||
528 | |||
529 | return 0; | ||
530 | } | ||
531 | |||
532 | int perf_counter_task_enable(void) | ||
533 | { | ||
534 | struct task_struct *curr = current; | ||
535 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
536 | struct perf_counter *counter; | ||
537 | unsigned long flags; | ||
538 | u64 perf_flags; | ||
539 | int cpu; | ||
540 | |||
541 | if (likely(!ctx->nr_counters)) | ||
542 | return 0; | ||
543 | |||
544 | curr_rq_lock_irq_save(&flags); | ||
545 | cpu = smp_processor_id(); | ||
546 | |||
547 | /* force the update of the task clock: */ | ||
548 | __task_delta_exec(curr, 1); | ||
549 | |||
550 | perf_counter_task_sched_out(curr, cpu); | ||
551 | |||
552 | spin_lock(&ctx->lock); | ||
553 | |||
554 | /* | ||
555 | * Disable all the counters: | ||
556 | */ | ||
557 | perf_flags = hw_perf_save_disable(); | ||
558 | |||
559 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
560 | if (counter->state != PERF_COUNTER_STATE_OFF) | ||
561 | continue; | ||
562 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
563 | counter->hw_event.disabled = 0; | ||
564 | } | ||
565 | hw_perf_restore(perf_flags); | ||
566 | |||
567 | spin_unlock(&ctx->lock); | ||
568 | |||
569 | perf_counter_task_sched_in(curr, cpu); | ||
570 | |||
571 | curr_rq_unlock_irq_restore(&flags); | ||
572 | |||
573 | return 0; | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Round-robin a context's counters: | ||
578 | */ | ||
579 | static void rotate_ctx(struct perf_counter_context *ctx) | ||
580 | { | ||
581 | struct perf_counter *counter; | ||
582 | u64 perf_flags; | ||
583 | |||
584 | if (!ctx->nr_counters) | ||
585 | return; | ||
586 | |||
587 | spin_lock(&ctx->lock); | ||
588 | /* | ||
589 | * Rotate the first entry last (works just fine for group counters too): | ||
590 | */ | ||
591 | perf_flags = hw_perf_save_disable(); | ||
592 | list_for_each_entry(counter, &ctx->counter_list, list_entry) { | ||
593 | list_del(&counter->list_entry); | ||
594 | list_add_tail(&counter->list_entry, &ctx->counter_list); | ||
595 | break; | ||
596 | } | ||
597 | hw_perf_restore(perf_flags); | ||
598 | |||
599 | spin_unlock(&ctx->lock); | ||
600 | } | ||
601 | |||
602 | void perf_counter_task_tick(struct task_struct *curr, int cpu) | ||
603 | { | ||
604 | struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
605 | struct perf_counter_context *ctx = &curr->perf_counter_ctx; | ||
606 | const int rotate_percpu = 0; | ||
607 | |||
608 | if (rotate_percpu) | ||
609 | perf_counter_cpu_sched_out(cpuctx); | ||
610 | perf_counter_task_sched_out(curr, cpu); | ||
611 | |||
612 | if (rotate_percpu) | ||
613 | rotate_ctx(&cpuctx->ctx); | ||
614 | rotate_ctx(ctx); | ||
615 | |||
616 | if (rotate_percpu) | ||
617 | perf_counter_cpu_sched_in(cpuctx, cpu); | ||
618 | perf_counter_task_sched_in(curr, cpu); | ||
619 | } | ||
620 | |||
621 | /* | ||
622 | * Cross CPU call to read the hardware counter | ||
623 | */ | ||
624 | static void __read(void *info) | ||
625 | { | ||
626 | struct perf_counter *counter = info; | ||
627 | unsigned long flags; | ||
628 | |||
629 | curr_rq_lock_irq_save(&flags); | ||
630 | counter->hw_ops->read(counter); | ||
631 | curr_rq_unlock_irq_restore(&flags); | ||
632 | } | ||
633 | |||
634 | static u64 perf_counter_read(struct perf_counter *counter) | ||
635 | { | ||
636 | /* | ||
637 | * If counter is enabled and currently active on a CPU, update the | ||
638 | * value in the counter structure: | ||
639 | */ | ||
640 | if (counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
641 | smp_call_function_single(counter->oncpu, | ||
642 | __read, counter, 1); | ||
643 | } | ||
644 | |||
645 | return atomic64_read(&counter->count); | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * Cross CPU call to switch performance data pointers | ||
650 | */ | ||
651 | static void __perf_switch_irq_data(void *info) | ||
652 | { | ||
653 | struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); | ||
654 | struct perf_counter *counter = info; | ||
655 | struct perf_counter_context *ctx = counter->ctx; | ||
656 | struct perf_data *oldirqdata = counter->irqdata; | ||
657 | |||
658 | /* | ||
659 | * If this is a task context, we need to check whether it is | ||
660 | * the current task context of this cpu. If not it has been | ||
661 | * scheduled out before the smp call arrived. | ||
662 | */ | ||
663 | if (ctx->task) { | ||
664 | if (cpuctx->task_ctx != ctx) | ||
665 | return; | ||
666 | spin_lock(&ctx->lock); | ||
667 | } | ||
668 | |||
669 | /* Change the pointer NMI safe */ | ||
670 | atomic_long_set((atomic_long_t *)&counter->irqdata, | ||
671 | (unsigned long) counter->usrdata); | ||
672 | counter->usrdata = oldirqdata; | ||
673 | |||
674 | if (ctx->task) | ||
675 | spin_unlock(&ctx->lock); | ||
676 | } | ||
677 | |||
/*
 * Swap the counter's irqdata and usrdata buffers and return the buffer
 * that user space should read from next.
 *
 * Per-CPU counters: the swap is done by a synchronous smp call on
 * counter->cpu and the new counter->usrdata is returned.
 *
 * Task counters: if the counter is not active the swap is done right
 * here under ctx->lock; otherwise it is delegated to the CPU the task
 * runs on and retried until the irqdata pointer is seen to change.
 */
678 | static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) | ||
679 | { | ||
680 | struct perf_counter_context *ctx = counter->ctx; | ||
/* Snapshot taken once, before any retry; used to detect a completed swap. */
681 | struct perf_data *oldirqdata = counter->irqdata; | ||
682 | struct task_struct *task = ctx->task; | ||
683 | |||
684 | if (!task) { | ||
685 | smp_call_function_single(counter->cpu, | ||
686 | __perf_switch_irq_data, | ||
687 | counter, 1); | ||
688 | return counter->usrdata; | ||
689 | } | ||
690 | |||
691 | retry: | ||
692 | spin_lock_irq(&ctx->lock); | ||
/* Counter not running: swap the two buffer pointers directly under the lock. */
693 | if (counter->state != PERF_COUNTER_STATE_ACTIVE) { | ||
694 | counter->irqdata = counter->usrdata; | ||
695 | counter->usrdata = oldirqdata; | ||
696 | spin_unlock_irq(&ctx->lock); | ||
697 | return oldirqdata; | ||
698 | } | ||
699 | spin_unlock_irq(&ctx->lock); | ||
700 | task_oncpu_function_call(task, __perf_switch_irq_data, counter); | ||
701 | /* Might have failed, because task was scheduled out */ | ||
702 | if (counter->irqdata == oldirqdata) | ||
703 | goto retry; | ||
704 | |||
705 | return counter->usrdata; | ||
706 | } | ||
707 | |||
708 | static void put_context(struct perf_counter_context *ctx) | ||
709 | { | ||
710 | if (ctx->task) | ||
711 | put_task_struct(ctx->task); | ||
712 | } | ||
713 | |||
714 | static struct perf_counter_context *find_get_context(pid_t pid, int cpu) | ||
715 | { | ||
716 | struct perf_cpu_context *cpuctx; | ||
717 | struct perf_counter_context *ctx; | ||
718 | struct task_struct *task; | ||
719 | |||
720 | /* | ||
721 | * If cpu is not a wildcard then this is a percpu counter: | ||
722 | */ | ||
723 | if (cpu != -1) { | ||
724 | /* Must be root to operate on a CPU counter: */ | ||
725 | if (!capable(CAP_SYS_ADMIN)) | ||
726 | return ERR_PTR(-EACCES); | ||
727 | |||
728 | if (cpu < 0 || cpu > num_possible_cpus()) | ||
729 | return ERR_PTR(-EINVAL); | ||
730 | |||
731 | /* | ||
732 | * We could be clever and allow to attach a counter to an | ||
733 | * offline CPU and activate it when the CPU comes up, but | ||
734 | * that's for later. | ||
735 | */ | ||
736 | if (!cpu_isset(cpu, cpu_online_map)) | ||
737 | return ERR_PTR(-ENODEV); | ||
738 | |||
739 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
740 | ctx = &cpuctx->ctx; | ||
741 | |||
742 | return ctx; | ||
743 | } | ||
744 | |||
745 | rcu_read_lock(); | ||
746 | if (!pid) | ||
747 | task = current; | ||
748 | else | ||
749 | task = find_task_by_vpid(pid); | ||
750 | if (task) | ||
751 | get_task_struct(task); | ||
752 | rcu_read_unlock(); | ||
753 | |||
754 | if (!task) | ||
755 | return ERR_PTR(-ESRCH); | ||
756 | |||
757 | ctx = &task->perf_counter_ctx; | ||
758 | ctx->task = task; | ||
759 | |||
760 | /* Reuse ptrace permission checks for now. */ | ||
761 | if (!ptrace_may_access(task, PTRACE_MODE_READ)) { | ||
762 | put_context(ctx); | ||
763 | return ERR_PTR(-EACCES); | ||
764 | } | ||
765 | |||
766 | return ctx; | ||
767 | } | ||
768 | |||
769 | /* | ||
770 | * Called when the last reference to the file is gone. | ||
771 | */ | ||
772 | static int perf_release(struct inode *inode, struct file *file) | ||
773 | { | ||
774 | struct perf_counter *counter = file->private_data; | ||
775 | struct perf_counter_context *ctx = counter->ctx; | ||
776 | |||
777 | file->private_data = NULL; | ||
778 | |||
779 | mutex_lock(&counter->mutex); | ||
780 | |||
781 | perf_counter_remove_from_context(counter); | ||
782 | put_context(ctx); | ||
783 | |||
784 | mutex_unlock(&counter->mutex); | ||
785 | |||
786 | kfree(counter); | ||
787 | |||
788 | return 0; | ||
789 | } | ||
790 | |||
791 | /* | ||
792 | * Read the performance counter - simple non blocking version for now | ||
793 | */ | ||
794 | static ssize_t | ||
795 | perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) | ||
796 | { | ||
797 | u64 cntval; | ||
798 | |||
799 | if (count != sizeof(cntval)) | ||
800 | return -EINVAL; | ||
801 | |||
802 | mutex_lock(&counter->mutex); | ||
803 | cntval = perf_counter_read(counter); | ||
804 | mutex_unlock(&counter->mutex); | ||
805 | |||
806 | return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); | ||
807 | } | ||
808 | |||
809 | static ssize_t | ||
810 | perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) | ||
811 | { | ||
812 | if (!usrdata->len) | ||
813 | return 0; | ||
814 | |||
815 | count = min(count, (size_t)usrdata->len); | ||
816 | if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) | ||
817 | return -EFAULT; | ||
818 | |||
819 | /* Adjust the counters */ | ||
820 | usrdata->len -= count; | ||
821 | if (!usrdata->len) | ||
822 | usrdata->rd_idx = 0; | ||
823 | else | ||
824 | usrdata->rd_idx += count; | ||
825 | |||
826 | return count; | ||
827 | } | ||
828 | |||
/*
 * Read 'count' bytes of recorded data for a counter.
 *
 * Data is only handed out once usrdata and irqdata together hold at
 * least 'count' bytes. If they do not:
 *   - a non-blocking caller gets -EAGAIN,
 *   - a blocking caller sleeps interruptibly on counter->waitq and
 *     gets -ERESTARTSYS if it leaves the wait loop (signal pending)
 *     without enough data.
 *
 * The pending usrdata buffer is drained first; if that does not
 * satisfy the request, the irq and user buffers are switched and the
 * remainder is copied from the new user buffer.
 *
 * Returns the number of bytes read or a negative error code.
 */
829 | static ssize_t | ||
830 | perf_read_irq_data(struct perf_counter *counter, | ||
831 | char __user *buf, | ||
832 | size_t count, | ||
833 | int nonblocking) | ||
834 | { | ||
835 | struct perf_data *irqdata, *usrdata; | ||
836 | DECLARE_WAITQUEUE(wait, current); | ||
837 | ssize_t res; | ||
838 | |||
/*
 * NOTE(review): both buffer pointers are sampled here, without
 * counter->mutex held, and reused in the wait loop below; if another
 * reader switches the buffers meanwhile these locals go stale - confirm
 * whether concurrent readers are possible.
 */
839 | irqdata = counter->irqdata; | ||
840 | usrdata = counter->usrdata; | ||
841 | |||
842 | if (usrdata->len + irqdata->len >= count) | ||
843 | goto read_pending; | ||
844 | |||
845 | if (nonblocking) | ||
846 | return -EAGAIN; | ||
847 | |||
/* Open-coded interruptible wait, using the waitqueue's own lock. */
848 | spin_lock_irq(&counter->waitq.lock); | ||
849 | __add_wait_queue(&counter->waitq, &wait); | ||
850 | for (;;) { | ||
851 | set_current_state(TASK_INTERRUPTIBLE); | ||
852 | if (usrdata->len + irqdata->len >= count) | ||
853 | break; | ||
854 | |||
855 | if (signal_pending(current)) | ||
856 | break; | ||
857 | |||
858 | spin_unlock_irq(&counter->waitq.lock); | ||
859 | schedule(); | ||
860 | spin_lock_irq(&counter->waitq.lock); | ||
861 | } | ||
862 | __remove_wait_queue(&counter->waitq, &wait); | ||
863 | __set_current_state(TASK_RUNNING); | ||
864 | spin_unlock_irq(&counter->waitq.lock); | ||
865 | |||
/* Left the loop without enough data: a signal interrupted the wait. */
866 | if (usrdata->len + irqdata->len < count) | ||
867 | return -ERESTARTSYS; | ||
868 | read_pending: | ||
869 | mutex_lock(&counter->mutex); | ||
870 | |||
871 | /* Drain pending data first: */ | ||
872 | res = perf_copy_usrdata(usrdata, buf, count); | ||
873 | if (res < 0 || res == count) | ||
874 | goto out; | ||
875 | |||
876 | /* Switch irq buffer: */ | ||
877 | usrdata = perf_switch_irq_data(counter); | ||
/*
 * If the second copy faults, the bytes already copied (res) are
 * reported, or -EFAULT when nothing was copied before.
 * NOTE(review): on success res is set to 'count' without checking how
 * many bytes the second copy actually returned - confirm that a short
 * second copy cannot happen here.
 */
878 | if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { | ||
879 | if (!res) | ||
880 | res = -EFAULT; | ||
881 | } else { | ||
882 | res = count; | ||
883 | } | ||
884 | out: | ||
885 | mutex_unlock(&counter->mutex); | ||
886 | |||
887 | return res; | ||
888 | } | ||
889 | |||
890 | static ssize_t | ||
891 | perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) | ||
892 | { | ||
893 | struct perf_counter *counter = file->private_data; | ||
894 | |||
895 | switch (counter->hw_event.record_type) { | ||
896 | case PERF_RECORD_SIMPLE: | ||
897 | return perf_read_hw(counter, buf, count); | ||
898 | |||
899 | case PERF_RECORD_IRQ: | ||
900 | case PERF_RECORD_GROUP: | ||
901 | return perf_read_irq_data(counter, buf, count, | ||
902 | file->f_flags & O_NONBLOCK); | ||
903 | } | ||
904 | return -EINVAL; | ||
905 | } | ||
906 | |||
907 | static unsigned int perf_poll(struct file *file, poll_table *wait) | ||
908 | { | ||
909 | struct perf_counter *counter = file->private_data; | ||
910 | unsigned int events = 0; | ||
911 | unsigned long flags; | ||
912 | |||
913 | poll_wait(file, &counter->waitq, wait); | ||
914 | |||
915 | spin_lock_irqsave(&counter->waitq.lock, flags); | ||
916 | if (counter->usrdata->len || counter->irqdata->len) | ||
917 | events |= POLLIN; | ||
918 | spin_unlock_irqrestore(&counter->waitq.lock, flags); | ||
919 | |||
920 | return events; | ||
921 | } | ||
922 | |||
923 | static const struct file_operations perf_fops = { | ||
924 | .release = perf_release, | ||
925 | .read = perf_read, | ||
926 | .poll = perf_poll, | ||
927 | }; | ||
928 | |||
929 | static int cpu_clock_perf_counter_enable(struct perf_counter *counter) | ||
930 | { | ||
931 | return 0; | ||
932 | } | ||
933 | |||
934 | static void cpu_clock_perf_counter_disable(struct perf_counter *counter) | ||
935 | { | ||
936 | } | ||
937 | |||
938 | static void cpu_clock_perf_counter_read(struct perf_counter *counter) | ||
939 | { | ||
940 | int cpu = raw_smp_processor_id(); | ||
941 | |||
942 | atomic64_set(&counter->count, cpu_clock(cpu)); | ||
943 | } | ||
944 | |||
945 | static const struct hw_perf_counter_ops perf_ops_cpu_clock = { | ||
946 | .enable = cpu_clock_perf_counter_enable, | ||
947 | .disable = cpu_clock_perf_counter_disable, | ||
948 | .read = cpu_clock_perf_counter_read, | ||
949 | }; | ||
950 | |||
951 | /* | ||
952 | * Called from within the scheduler: | ||
953 | */ | ||
954 | static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) | ||
955 | { | ||
956 | struct task_struct *curr = counter->task; | ||
957 | u64 delta; | ||
958 | |||
959 | delta = __task_delta_exec(curr, update); | ||
960 | |||
961 | return curr->se.sum_exec_runtime + delta; | ||
962 | } | ||
963 | |||
964 | static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) | ||
965 | { | ||
966 | u64 prev; | ||
967 | s64 delta; | ||
968 | |||
969 | prev = atomic64_read(&counter->hw.prev_count); | ||
970 | |||
971 | atomic64_set(&counter->hw.prev_count, now); | ||
972 | |||
973 | delta = now - prev; | ||
974 | |||
975 | atomic64_add(delta, &counter->count); | ||
976 | } | ||
977 | |||
978 | static void task_clock_perf_counter_read(struct perf_counter *counter) | ||
979 | { | ||
980 | u64 now = task_clock_perf_counter_val(counter, 1); | ||
981 | |||
982 | task_clock_perf_counter_update(counter, now); | ||
983 | } | ||
984 | |||
985 | static int task_clock_perf_counter_enable(struct perf_counter *counter) | ||
986 | { | ||
987 | u64 now = task_clock_perf_counter_val(counter, 0); | ||
988 | |||
989 | atomic64_set(&counter->hw.prev_count, now); | ||
990 | |||
991 | return 0; | ||
992 | } | ||
993 | |||
994 | static void task_clock_perf_counter_disable(struct perf_counter *counter) | ||
995 | { | ||
996 | u64 now = task_clock_perf_counter_val(counter, 0); | ||
997 | |||
998 | task_clock_perf_counter_update(counter, now); | ||
999 | } | ||
1000 | |||
1001 | static const struct hw_perf_counter_ops perf_ops_task_clock = { | ||
1002 | .enable = task_clock_perf_counter_enable, | ||
1003 | .disable = task_clock_perf_counter_disable, | ||
1004 | .read = task_clock_perf_counter_read, | ||
1005 | }; | ||
1006 | |||
1007 | static u64 get_page_faults(void) | ||
1008 | { | ||
1009 | struct task_struct *curr = current; | ||
1010 | |||
1011 | return curr->maj_flt + curr->min_flt; | ||
1012 | } | ||
1013 | |||
1014 | static void page_faults_perf_counter_update(struct perf_counter *counter) | ||
1015 | { | ||
1016 | u64 prev, now; | ||
1017 | s64 delta; | ||
1018 | |||
1019 | prev = atomic64_read(&counter->hw.prev_count); | ||
1020 | now = get_page_faults(); | ||
1021 | |||
1022 | atomic64_set(&counter->hw.prev_count, now); | ||
1023 | |||
1024 | delta = now - prev; | ||
1025 | |||
1026 | atomic64_add(delta, &counter->count); | ||
1027 | } | ||
1028 | |||
1029 | static void page_faults_perf_counter_read(struct perf_counter *counter) | ||
1030 | { | ||
1031 | page_faults_perf_counter_update(counter); | ||
1032 | } | ||
1033 | |||
1034 | static int page_faults_perf_counter_enable(struct perf_counter *counter) | ||
1035 | { | ||
1036 | /* | ||
1037 | * page-faults is a per-task value already, | ||
1038 | * so we dont have to clear it on switch-in. | ||
1039 | */ | ||
1040 | |||
1041 | return 0; | ||
1042 | } | ||
1043 | |||
1044 | static void page_faults_perf_counter_disable(struct perf_counter *counter) | ||
1045 | { | ||
1046 | page_faults_perf_counter_update(counter); | ||
1047 | } | ||
1048 | |||
1049 | static const struct hw_perf_counter_ops perf_ops_page_faults = { | ||
1050 | .enable = page_faults_perf_counter_enable, | ||
1051 | .disable = page_faults_perf_counter_disable, | ||
1052 | .read = page_faults_perf_counter_read, | ||
1053 | }; | ||
1054 | |||
1055 | static u64 get_context_switches(void) | ||
1056 | { | ||
1057 | struct task_struct *curr = current; | ||
1058 | |||
1059 | return curr->nvcsw + curr->nivcsw; | ||
1060 | } | ||
1061 | |||
1062 | static void context_switches_perf_counter_update(struct perf_counter *counter) | ||
1063 | { | ||
1064 | u64 prev, now; | ||
1065 | s64 delta; | ||
1066 | |||
1067 | prev = atomic64_read(&counter->hw.prev_count); | ||
1068 | now = get_context_switches(); | ||
1069 | |||
1070 | atomic64_set(&counter->hw.prev_count, now); | ||
1071 | |||
1072 | delta = now - prev; | ||
1073 | |||
1074 | atomic64_add(delta, &counter->count); | ||
1075 | } | ||
1076 | |||
1077 | static void context_switches_perf_counter_read(struct perf_counter *counter) | ||
1078 | { | ||
1079 | context_switches_perf_counter_update(counter); | ||
1080 | } | ||
1081 | |||
1082 | static int context_switches_perf_counter_enable(struct perf_counter *counter) | ||
1083 | { | ||
1084 | /* | ||
1085 | * ->nvcsw + curr->nivcsw is a per-task value already, | ||
1086 | * so we dont have to clear it on switch-in. | ||
1087 | */ | ||
1088 | |||
1089 | return 0; | ||
1090 | } | ||
1091 | |||
1092 | static void context_switches_perf_counter_disable(struct perf_counter *counter) | ||
1093 | { | ||
1094 | context_switches_perf_counter_update(counter); | ||
1095 | } | ||
1096 | |||
1097 | static const struct hw_perf_counter_ops perf_ops_context_switches = { | ||
1098 | .enable = context_switches_perf_counter_enable, | ||
1099 | .disable = context_switches_perf_counter_disable, | ||
1100 | .read = context_switches_perf_counter_read, | ||
1101 | }; | ||
1102 | |||
1103 | static inline u64 get_cpu_migrations(void) | ||
1104 | { | ||
1105 | return current->se.nr_migrations; | ||
1106 | } | ||
1107 | |||
1108 | static void cpu_migrations_perf_counter_update(struct perf_counter *counter) | ||
1109 | { | ||
1110 | u64 prev, now; | ||
1111 | s64 delta; | ||
1112 | |||
1113 | prev = atomic64_read(&counter->hw.prev_count); | ||
1114 | now = get_cpu_migrations(); | ||
1115 | |||
1116 | atomic64_set(&counter->hw.prev_count, now); | ||
1117 | |||
1118 | delta = now - prev; | ||
1119 | |||
1120 | atomic64_add(delta, &counter->count); | ||
1121 | } | ||
1122 | |||
1123 | static void cpu_migrations_perf_counter_read(struct perf_counter *counter) | ||
1124 | { | ||
1125 | cpu_migrations_perf_counter_update(counter); | ||
1126 | } | ||
1127 | |||
1128 | static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) | ||
1129 | { | ||
1130 | /* | ||
1131 | * se.nr_migrations is a per-task value already, | ||
1132 | * so we dont have to clear it on switch-in. | ||
1133 | */ | ||
1134 | |||
1135 | return 0; | ||
1136 | } | ||
1137 | |||
1138 | static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) | ||
1139 | { | ||
1140 | cpu_migrations_perf_counter_update(counter); | ||
1141 | } | ||
1142 | |||
1143 | static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { | ||
1144 | .enable = cpu_migrations_perf_counter_enable, | ||
1145 | .disable = cpu_migrations_perf_counter_disable, | ||
1146 | .read = cpu_migrations_perf_counter_read, | ||
1147 | }; | ||
1148 | |||
1149 | static const struct hw_perf_counter_ops * | ||
1150 | sw_perf_counter_init(struct perf_counter *counter) | ||
1151 | { | ||
1152 | const struct hw_perf_counter_ops *hw_ops = NULL; | ||
1153 | |||
1154 | switch (counter->hw_event.type) { | ||
1155 | case PERF_COUNT_CPU_CLOCK: | ||
1156 | hw_ops = &perf_ops_cpu_clock; | ||
1157 | break; | ||
1158 | case PERF_COUNT_TASK_CLOCK: | ||
1159 | hw_ops = &perf_ops_task_clock; | ||
1160 | break; | ||
1161 | case PERF_COUNT_PAGE_FAULTS: | ||
1162 | hw_ops = &perf_ops_page_faults; | ||
1163 | break; | ||
1164 | case PERF_COUNT_CONTEXT_SWITCHES: | ||
1165 | hw_ops = &perf_ops_context_switches; | ||
1166 | break; | ||
1167 | case PERF_COUNT_CPU_MIGRATIONS: | ||
1168 | hw_ops = &perf_ops_cpu_migrations; | ||
1169 | break; | ||
1170 | default: | ||
1171 | break; | ||
1172 | } | ||
1173 | return hw_ops; | ||
1174 | } | ||
1175 | |||
1176 | /* | ||
1177 | * Allocate and initialize a counter structure | ||
1178 | */ | ||
1179 | static struct perf_counter * | ||
1180 | perf_counter_alloc(struct perf_counter_hw_event *hw_event, | ||
1181 | int cpu, | ||
1182 | struct perf_counter *group_leader, | ||
1183 | gfp_t gfpflags) | ||
1184 | { | ||
1185 | const struct hw_perf_counter_ops *hw_ops; | ||
1186 | struct perf_counter *counter; | ||
1187 | |||
1188 | counter = kzalloc(sizeof(*counter), gfpflags); | ||
1189 | if (!counter) | ||
1190 | return NULL; | ||
1191 | |||
1192 | /* | ||
1193 | * Single counters are their own group leaders, with an | ||
1194 | * empty sibling list: | ||
1195 | */ | ||
1196 | if (!group_leader) | ||
1197 | group_leader = counter; | ||
1198 | |||
1199 | mutex_init(&counter->mutex); | ||
1200 | INIT_LIST_HEAD(&counter->list_entry); | ||
1201 | INIT_LIST_HEAD(&counter->sibling_list); | ||
1202 | init_waitqueue_head(&counter->waitq); | ||
1203 | |||
1204 | counter->irqdata = &counter->data[0]; | ||
1205 | counter->usrdata = &counter->data[1]; | ||
1206 | counter->cpu = cpu; | ||
1207 | counter->hw_event = *hw_event; | ||
1208 | counter->wakeup_pending = 0; | ||
1209 | counter->group_leader = group_leader; | ||
1210 | counter->hw_ops = NULL; | ||
1211 | |||
1212 | counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
1213 | if (hw_event->disabled) | ||
1214 | counter->state = PERF_COUNTER_STATE_OFF; | ||
1215 | |||
1216 | hw_ops = NULL; | ||
1217 | if (!hw_event->raw && hw_event->type < 0) | ||
1218 | hw_ops = sw_perf_counter_init(counter); | ||
1219 | if (!hw_ops) | ||
1220 | hw_ops = hw_perf_counter_init(counter); | ||
1221 | |||
1222 | if (!hw_ops) { | ||
1223 | kfree(counter); | ||
1224 | return NULL; | ||
1225 | } | ||
1226 | counter->hw_ops = hw_ops; | ||
1227 | |||
1228 | return counter; | ||
1229 | } | ||
1230 | |||
1231 | /** | ||
1232 | * sys_perf_task_open - open a performance counter, associate it to a task/cpu | ||
1233 | * | ||
1234 | * @hw_event_uptr: event type attributes for monitoring/sampling | ||
1235 | * @pid: target pid | ||
1236 | * @cpu: target cpu | ||
1237 | * @group_fd: group leader counter fd | ||
1238 | */ | ||
1239 | asmlinkage int | ||
1240 | sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, | ||
1241 | pid_t pid, int cpu, int group_fd) | ||
1242 | { | ||
1243 | struct perf_counter *counter, *group_leader; | ||
1244 | struct perf_counter_hw_event hw_event; | ||
1245 | struct perf_counter_context *ctx; | ||
1246 | struct file *counter_file = NULL; | ||
1247 | struct file *group_file = NULL; | ||
1248 | int fput_needed = 0; | ||
1249 | int fput_needed2 = 0; | ||
1250 | int ret; | ||
1251 | |||
1252 | if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) | ||
1253 | return -EFAULT; | ||
1254 | |||
1255 | /* | ||
1256 | * Get the target context (task or percpu): | ||
1257 | */ | ||
1258 | ctx = find_get_context(pid, cpu); | ||
1259 | if (IS_ERR(ctx)) | ||
1260 | return PTR_ERR(ctx); | ||
1261 | |||
1262 | /* | ||
1263 | * Look up the group leader (we will attach this counter to it): | ||
1264 | */ | ||
1265 | group_leader = NULL; | ||
1266 | if (group_fd != -1) { | ||
1267 | ret = -EINVAL; | ||
1268 | group_file = fget_light(group_fd, &fput_needed); | ||
1269 | if (!group_file) | ||
1270 | goto err_put_context; | ||
1271 | if (group_file->f_op != &perf_fops) | ||
1272 | goto err_put_context; | ||
1273 | |||
1274 | group_leader = group_file->private_data; | ||
1275 | /* | ||
1276 | * Do not allow a recursive hierarchy (this new sibling | ||
1277 | * becoming part of another group-sibling): | ||
1278 | */ | ||
1279 | if (group_leader->group_leader != group_leader) | ||
1280 | goto err_put_context; | ||
1281 | /* | ||
1282 | * Do not allow to attach to a group in a different | ||
1283 | * task or CPU context: | ||
1284 | */ | ||
1285 | if (group_leader->ctx != ctx) | ||
1286 | goto err_put_context; | ||
1287 | } | ||
1288 | |||
1289 | ret = -EINVAL; | ||
1290 | counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); | ||
1291 | if (!counter) | ||
1292 | goto err_put_context; | ||
1293 | |||
1294 | ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); | ||
1295 | if (ret < 0) | ||
1296 | goto err_free_put_context; | ||
1297 | |||
1298 | counter_file = fget_light(ret, &fput_needed2); | ||
1299 | if (!counter_file) | ||
1300 | goto err_free_put_context; | ||
1301 | |||
1302 | counter->filp = counter_file; | ||
1303 | perf_install_in_context(ctx, counter, cpu); | ||
1304 | |||
1305 | fput_light(counter_file, fput_needed2); | ||
1306 | |||
1307 | out_fput: | ||
1308 | fput_light(group_file, fput_needed); | ||
1309 | |||
1310 | return ret; | ||
1311 | |||
1312 | err_free_put_context: | ||
1313 | kfree(counter); | ||
1314 | |||
1315 | err_put_context: | ||
1316 | put_context(ctx); | ||
1317 | |||
1318 | goto out_fput; | ||
1319 | } | ||
1320 | |||
1321 | /* | ||
1322 | * Initialize the perf_counter context in a task_struct: | ||
1323 | */ | ||
1324 | static void | ||
1325 | __perf_counter_init_context(struct perf_counter_context *ctx, | ||
1326 | struct task_struct *task) | ||
1327 | { | ||
1328 | memset(ctx, 0, sizeof(*ctx)); | ||
1329 | spin_lock_init(&ctx->lock); | ||
1330 | INIT_LIST_HEAD(&ctx->counter_list); | ||
1331 | ctx->task = task; | ||
1332 | } | ||
1333 | |||
1334 | /* | ||
1335 | * inherit a counter from parent task to child task: | ||
1336 | */ | ||
1337 | static int | ||
1338 | inherit_counter(struct perf_counter *parent_counter, | ||
1339 | struct task_struct *parent, | ||
1340 | struct perf_counter_context *parent_ctx, | ||
1341 | struct task_struct *child, | ||
1342 | struct perf_counter_context *child_ctx) | ||
1343 | { | ||
1344 | struct perf_counter *child_counter; | ||
1345 | |||
1346 | child_counter = perf_counter_alloc(&parent_counter->hw_event, | ||
1347 | parent_counter->cpu, NULL, | ||
1348 | GFP_ATOMIC); | ||
1349 | if (!child_counter) | ||
1350 | return -ENOMEM; | ||
1351 | |||
1352 | /* | ||
1353 | * Link it up in the child's context: | ||
1354 | */ | ||
1355 | child_counter->ctx = child_ctx; | ||
1356 | child_counter->task = child; | ||
1357 | list_add_counter(child_counter, child_ctx); | ||
1358 | child_ctx->nr_counters++; | ||
1359 | |||
1360 | child_counter->parent = parent_counter; | ||
1361 | /* | ||
1362 | * inherit into child's child as well: | ||
1363 | */ | ||
1364 | child_counter->hw_event.inherit = 1; | ||
1365 | |||
1366 | /* | ||
1367 | * Get a reference to the parent filp - we will fput it | ||
1368 | * when the child counter exits. This is safe to do because | ||
1369 | * we are in the parent and we know that the filp still | ||
1370 | * exists and has a nonzero count: | ||
1371 | */ | ||
1372 | atomic_long_inc(&parent_counter->filp->f_count); | ||
1373 | |||
1374 | return 0; | ||
1375 | } | ||
1376 | |||
1377 | static void | ||
1378 | __perf_counter_exit_task(struct task_struct *child, | ||
1379 | struct perf_counter *child_counter, | ||
1380 | struct perf_counter_context *child_ctx) | ||
1381 | { | ||
1382 | struct perf_counter *parent_counter; | ||
1383 | u64 parent_val, child_val; | ||
1384 | |||
1385 | /* | ||
1386 | * If we do not self-reap then we have to wait for the | ||
1387 | * child task to unschedule (it will happen for sure), | ||
1388 | * so that its counter is at its final count. (This | ||
1389 | * condition triggers rarely - child tasks usually get | ||
1390 | * off their CPU before the parent has a chance to | ||
1391 | * get this far into the reaping action) | ||
1392 | */ | ||
1393 | if (child != current) { | ||
1394 | wait_task_inactive(child, 0); | ||
1395 | list_del_init(&child_counter->list_entry); | ||
1396 | } else { | ||
1397 | struct perf_cpu_context *cpuctx; | ||
1398 | unsigned long flags; | ||
1399 | u64 perf_flags; | ||
1400 | |||
1401 | /* | ||
1402 | * Disable and unlink this counter. | ||
1403 | * | ||
1404 | * Be careful about zapping the list - IRQ/NMI context | ||
1405 | * could still be processing it: | ||
1406 | */ | ||
1407 | curr_rq_lock_irq_save(&flags); | ||
1408 | perf_flags = hw_perf_save_disable(); | ||
1409 | |||
1410 | cpuctx = &__get_cpu_var(perf_cpu_context); | ||
1411 | |||
1412 | if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { | ||
1413 | child_counter->state = PERF_COUNTER_STATE_INACTIVE; | ||
1414 | child_counter->hw_ops->disable(child_counter); | ||
1415 | cpuctx->active_oncpu--; | ||
1416 | child_ctx->nr_active--; | ||
1417 | child_counter->oncpu = -1; | ||
1418 | } | ||
1419 | |||
1420 | list_del_init(&child_counter->list_entry); | ||
1421 | |||
1422 | child_ctx->nr_counters--; | ||
1423 | |||
1424 | hw_perf_restore(perf_flags); | ||
1425 | curr_rq_unlock_irq_restore(&flags); | ||
1426 | } | ||
1427 | |||
1428 | parent_counter = child_counter->parent; | ||
1429 | /* | ||
1430 | * It can happen that parent exits first, and has counters | ||
1431 | * that are still around due to the child reference. These | ||
1432 | * counters need to be zapped - but otherwise linger. | ||
1433 | */ | ||
1434 | if (!parent_counter) | ||
1435 | return; | ||
1436 | |||
1437 | parent_val = atomic64_read(&parent_counter->count); | ||
1438 | child_val = atomic64_read(&child_counter->count); | ||
1439 | |||
1440 | /* | ||
1441 | * Add back the child's count to the parent's count: | ||
1442 | */ | ||
1443 | atomic64_add(child_val, &parent_counter->count); | ||
1444 | |||
1445 | fput(parent_counter->filp); | ||
1446 | |||
1447 | kfree(child_counter); | ||
1448 | } | ||
1449 | |||
1450 | /* | ||
1451 | * When a child task exist, feed back counter values to parent counters. | ||
1452 | * | ||
1453 | * Note: we are running in child context, but the PID is not hashed | ||
1454 | * anymore so new counters will not be added. | ||
1455 | */ | ||
1456 | void perf_counter_exit_task(struct task_struct *child) | ||
1457 | { | ||
1458 | struct perf_counter *child_counter, *tmp; | ||
1459 | struct perf_counter_context *child_ctx; | ||
1460 | |||
1461 | child_ctx = &child->perf_counter_ctx; | ||
1462 | |||
1463 | if (likely(!child_ctx->nr_counters)) | ||
1464 | return; | ||
1465 | |||
1466 | list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, | ||
1467 | list_entry) | ||
1468 | __perf_counter_exit_task(child, child_counter, child_ctx); | ||
1469 | } | ||
1470 | |||
1471 | /* | ||
1472 | * Initialize the perf_counter context in task_struct | ||
1473 | */ | ||
1474 | void perf_counter_init_task(struct task_struct *child) | ||
1475 | { | ||
1476 | struct perf_counter_context *child_ctx, *parent_ctx; | ||
1477 | struct perf_counter *counter, *parent_counter; | ||
1478 | struct task_struct *parent = current; | ||
1479 | unsigned long flags; | ||
1480 | |||
1481 | child_ctx = &child->perf_counter_ctx; | ||
1482 | parent_ctx = &parent->perf_counter_ctx; | ||
1483 | |||
1484 | __perf_counter_init_context(child_ctx, child); | ||
1485 | |||
1486 | /* | ||
1487 | * This is executed from the parent task context, so inherit | ||
1488 | * counters that have been marked for cloning: | ||
1489 | */ | ||
1490 | |||
1491 | if (likely(!parent_ctx->nr_counters)) | ||
1492 | return; | ||
1493 | |||
1494 | /* | ||
1495 | * Lock the parent list. No need to lock the child - not PID | ||
1496 | * hashed yet and not running, so nobody can access it. | ||
1497 | */ | ||
1498 | spin_lock_irqsave(&parent_ctx->lock, flags); | ||
1499 | |||
1500 | /* | ||
1501 | * We dont have to disable NMIs - we are only looking at | ||
1502 | * the list, not manipulating it: | ||
1503 | */ | ||
1504 | list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { | ||
1505 | if (!counter->hw_event.inherit || counter->group_leader != counter) | ||
1506 | continue; | ||
1507 | |||
1508 | /* | ||
1509 | * Instead of creating recursive hierarchies of counters, | ||
1510 | * we link inheritd counters back to the original parent, | ||
1511 | * which has a filp for sure, which we use as the reference | ||
1512 | * count: | ||
1513 | */ | ||
1514 | parent_counter = counter; | ||
1515 | if (counter->parent) | ||
1516 | parent_counter = counter->parent; | ||
1517 | |||
1518 | if (inherit_counter(parent_counter, parent, | ||
1519 | parent_ctx, child, child_ctx)) | ||
1520 | break; | ||
1521 | } | ||
1522 | |||
1523 | spin_unlock_irqrestore(&parent_ctx->lock, flags); | ||
1524 | } | ||
1525 | |||
1526 | static void __cpuinit perf_counter_init_cpu(int cpu) | ||
1527 | { | ||
1528 | struct perf_cpu_context *cpuctx; | ||
1529 | |||
1530 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
1531 | __perf_counter_init_context(&cpuctx->ctx, NULL); | ||
1532 | |||
1533 | mutex_lock(&perf_resource_mutex); | ||
1534 | cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; | ||
1535 | mutex_unlock(&perf_resource_mutex); | ||
1536 | |||
1537 | hw_perf_counter_setup(); | ||
1538 | } | ||
1539 | |||
#ifdef CONFIG_HOTPLUG_CPU
/*
 * Remove every counter from the per-CPU context of the CPU this runs on.
 */
static void __perf_counter_exit_cpu(void *info)
{
	struct perf_cpu_context *this_cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_counter_context *ctx = &this_cpuctx->ctx;
	struct perf_counter *counter, *next;

	list_for_each_entry_safe(counter, next, &ctx->counter_list, list_entry) {
		__perf_counter_remove_from_context(counter);
	}
}

/*
 * Run the removal on @cpu itself and wait for it to finish.
 */
static void perf_counter_exit_cpu(int cpu)
{
	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
}
#else
/* Without CPU hotplug there is nothing to tear down: */
static inline void perf_counter_exit_cpu(int cpu) { }
#endif
1558 | |||
1559 | static int __cpuinit | ||
1560 | perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | ||
1561 | { | ||
1562 | unsigned int cpu = (long)hcpu; | ||
1563 | |||
1564 | switch (action) { | ||
1565 | |||
1566 | case CPU_UP_PREPARE: | ||
1567 | case CPU_UP_PREPARE_FROZEN: | ||
1568 | perf_counter_init_cpu(cpu); | ||
1569 | break; | ||
1570 | |||
1571 | case CPU_DOWN_PREPARE: | ||
1572 | case CPU_DOWN_PREPARE_FROZEN: | ||
1573 | perf_counter_exit_cpu(cpu); | ||
1574 | break; | ||
1575 | |||
1576 | default: | ||
1577 | break; | ||
1578 | } | ||
1579 | |||
1580 | return NOTIFY_OK; | ||
1581 | } | ||
1582 | |||
1583 | static struct notifier_block __cpuinitdata perf_cpu_nb = { | ||
1584 | .notifier_call = perf_cpu_notify, | ||
1585 | }; | ||
1586 | |||
1587 | static int __init perf_counter_init(void) | ||
1588 | { | ||
1589 | perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, | ||
1590 | (void *)(long)smp_processor_id()); | ||
1591 | register_cpu_notifier(&perf_cpu_nb); | ||
1592 | |||
1593 | return 0; | ||
1594 | } | ||
1595 | early_initcall(perf_counter_init); | ||
1596 | |||
1597 | static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) | ||
1598 | { | ||
1599 | return sprintf(buf, "%d\n", perf_reserved_percpu); | ||
1600 | } | ||
1601 | |||
1602 | static ssize_t | ||
1603 | perf_set_reserve_percpu(struct sysdev_class *class, | ||
1604 | const char *buf, | ||
1605 | size_t count) | ||
1606 | { | ||
1607 | struct perf_cpu_context *cpuctx; | ||
1608 | unsigned long val; | ||
1609 | int err, cpu, mpt; | ||
1610 | |||
1611 | err = strict_strtoul(buf, 10, &val); | ||
1612 | if (err) | ||
1613 | return err; | ||
1614 | if (val > perf_max_counters) | ||
1615 | return -EINVAL; | ||
1616 | |||
1617 | mutex_lock(&perf_resource_mutex); | ||
1618 | perf_reserved_percpu = val; | ||
1619 | for_each_online_cpu(cpu) { | ||
1620 | cpuctx = &per_cpu(perf_cpu_context, cpu); | ||
1621 | spin_lock_irq(&cpuctx->ctx.lock); | ||
1622 | mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, | ||
1623 | perf_max_counters - perf_reserved_percpu); | ||
1624 | cpuctx->max_pertask = mpt; | ||
1625 | spin_unlock_irq(&cpuctx->ctx.lock); | ||
1626 | } | ||
1627 | mutex_unlock(&perf_resource_mutex); | ||
1628 | |||
1629 | return count; | ||
1630 | } | ||
1631 | |||
1632 | static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) | ||
1633 | { | ||
1634 | return sprintf(buf, "%d\n", perf_overcommit); | ||
1635 | } | ||
1636 | |||
1637 | static ssize_t | ||
1638 | perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) | ||
1639 | { | ||
1640 | unsigned long val; | ||
1641 | int err; | ||
1642 | |||
1643 | err = strict_strtoul(buf, 10, &val); | ||
1644 | if (err) | ||
1645 | return err; | ||
1646 | if (val > 1) | ||
1647 | return -EINVAL; | ||
1648 | |||
1649 | mutex_lock(&perf_resource_mutex); | ||
1650 | perf_overcommit = val; | ||
1651 | mutex_unlock(&perf_resource_mutex); | ||
1652 | |||
1653 | return count; | ||
1654 | } | ||
1655 | |||
1656 | static SYSDEV_CLASS_ATTR( | ||
1657 | reserve_percpu, | ||
1658 | 0644, | ||
1659 | perf_show_reserve_percpu, | ||
1660 | perf_set_reserve_percpu | ||
1661 | ); | ||
1662 | |||
1663 | static SYSDEV_CLASS_ATTR( | ||
1664 | overcommit, | ||
1665 | 0644, | ||
1666 | perf_show_overcommit, | ||
1667 | perf_set_overcommit | ||
1668 | ); | ||
1669 | |||
1670 | static struct attribute *perfclass_attrs[] = { | ||
1671 | &attr_reserve_percpu.attr, | ||
1672 | &attr_overcommit.attr, | ||
1673 | NULL | ||
1674 | }; | ||
1675 | |||
1676 | static struct attribute_group perfclass_attr_group = { | ||
1677 | .attrs = perfclass_attrs, | ||
1678 | .name = "perf_counters", | ||
1679 | }; | ||
1680 | |||
1681 | static int __init perf_counter_sysfs_init(void) | ||
1682 | { | ||
1683 | return sysfs_create_group(&cpu_sysdev_class.kset.kobj, | ||
1684 | &perfclass_attr_group); | ||
1685 | } | ||
1686 | device_initcall(perf_counter_sysfs_init); | ||
diff --git a/kernel/sched.c b/kernel/sched.c index deb5ac8c12f3..43fd21233b93 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -665,7 +665,7 @@ static inline int cpu_of(struct rq *rq) | |||
665 | #define task_rq(p) cpu_rq(task_cpu(p)) | 665 | #define task_rq(p) cpu_rq(task_cpu(p)) |
666 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 666 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
667 | 667 | ||
668 | static inline void update_rq_clock(struct rq *rq) | 668 | inline void update_rq_clock(struct rq *rq) |
669 | { | 669 | { |
670 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 670 | rq->clock = sched_clock_cpu(cpu_of(rq)); |
671 | } | 671 | } |
@@ -976,6 +976,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | |||
976 | } | 976 | } |
977 | } | 977 | } |
978 | 978 | ||
979 | void curr_rq_lock_irq_save(unsigned long *flags) | ||
980 | __acquires(rq->lock) | ||
981 | { | ||
982 | struct rq *rq; | ||
983 | |||
984 | local_irq_save(*flags); | ||
985 | rq = cpu_rq(smp_processor_id()); | ||
986 | spin_lock(&rq->lock); | ||
987 | } | ||
988 | |||
989 | void curr_rq_unlock_irq_restore(unsigned long *flags) | ||
990 | __releases(rq->lock) | ||
991 | { | ||
992 | struct rq *rq; | ||
993 | |||
994 | rq = cpu_rq(smp_processor_id()); | ||
995 | spin_unlock(&rq->lock); | ||
996 | local_irq_restore(*flags); | ||
997 | } | ||
998 | |||
979 | void task_rq_unlock_wait(struct task_struct *p) | 999 | void task_rq_unlock_wait(struct task_struct *p) |
980 | { | 1000 | { |
981 | struct rq *rq = task_rq(p); | 1001 | struct rq *rq = task_rq(p); |
@@ -1882,12 +1902,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) | |||
1882 | p->se.sleep_start -= clock_offset; | 1902 | p->se.sleep_start -= clock_offset; |
1883 | if (p->se.block_start) | 1903 | if (p->se.block_start) |
1884 | p->se.block_start -= clock_offset; | 1904 | p->se.block_start -= clock_offset; |
1905 | #endif | ||
1885 | if (old_cpu != new_cpu) { | 1906 | if (old_cpu != new_cpu) { |
1886 | schedstat_inc(p, se.nr_migrations); | 1907 | p->se.nr_migrations++; |
1908 | #ifdef CONFIG_SCHEDSTATS | ||
1887 | if (task_hot(p, old_rq->clock, NULL)) | 1909 | if (task_hot(p, old_rq->clock, NULL)) |
1888 | schedstat_inc(p, se.nr_forced2_migrations); | 1910 | schedstat_inc(p, se.nr_forced2_migrations); |
1889 | } | ||
1890 | #endif | 1911 | #endif |
1912 | } | ||
1891 | p->se.vruntime -= old_cfsrq->min_vruntime - | 1913 | p->se.vruntime -= old_cfsrq->min_vruntime - |
1892 | new_cfsrq->min_vruntime; | 1914 | new_cfsrq->min_vruntime; |
1893 | 1915 | ||
@@ -2239,6 +2261,27 @@ static int sched_balance_self(int cpu, int flag) | |||
2239 | 2261 | ||
2240 | #endif /* CONFIG_SMP */ | 2262 | #endif /* CONFIG_SMP */ |
2241 | 2263 | ||
/**
 * task_oncpu_function_call - call a function on the cpu on which a task runs
 * @p:		the task to evaluate
 * @func:	the function to be called
 * @info:	the function call argument
 *
 * Calls @func with @info on the CPU of @p, but only if @p is currently
 * running there; otherwise nothing is called. The call waits for @func to
 * complete.
 */
void task_oncpu_function_call(struct task_struct *p,
			      void (*func) (void *info), void *info)
{
	int target_cpu;

	preempt_disable();
	/* Sample the task's CPU before checking that it is running: */
	target_cpu = task_cpu(p);
	if (task_curr(p))
		smp_call_function_single(target_cpu, func, info, 1);
	preempt_enable();
}
2284 | |||
2242 | /*** | 2285 | /*** |
2243 | * try_to_wake_up - wake up a thread | 2286 | * try_to_wake_up - wake up a thread |
2244 | * @p: the to-be-woken-up thread | 2287 | * @p: the to-be-woken-up thread |
@@ -2381,6 +2424,7 @@ static void __sched_fork(struct task_struct *p) | |||
2381 | p->se.exec_start = 0; | 2424 | p->se.exec_start = 0; |
2382 | p->se.sum_exec_runtime = 0; | 2425 | p->se.sum_exec_runtime = 0; |
2383 | p->se.prev_sum_exec_runtime = 0; | 2426 | p->se.prev_sum_exec_runtime = 0; |
2427 | p->se.nr_migrations = 0; | ||
2384 | p->se.last_wakeup = 0; | 2428 | p->se.last_wakeup = 0; |
2385 | p->se.avg_overlap = 0; | 2429 | p->se.avg_overlap = 0; |
2386 | 2430 | ||
@@ -2601,6 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
2601 | */ | 2645 | */ |
2602 | prev_state = prev->state; | 2646 | prev_state = prev->state; |
2603 | finish_arch_switch(prev); | 2647 | finish_arch_switch(prev); |
2648 | perf_counter_task_sched_in(current, cpu_of(rq)); | ||
2604 | finish_lock_switch(rq, prev); | 2649 | finish_lock_switch(rq, prev); |
2605 | #ifdef CONFIG_SMP | 2650 | #ifdef CONFIG_SMP |
2606 | if (current->sched_class->post_schedule) | 2651 | if (current->sched_class->post_schedule) |
@@ -4129,6 +4174,29 @@ EXPORT_PER_CPU_SYMBOL(kstat); | |||
4129 | * Return any ns on the sched_clock that have not yet been banked in | 4174 | * Return any ns on the sched_clock that have not yet been banked in |
4130 | * @p in case that task is currently running. | 4175 | * @p in case that task is currently running. |
4131 | */ | 4176 | */ |
4177 | unsigned long long __task_delta_exec(struct task_struct *p, int update) | ||
4178 | { | ||
4179 | s64 delta_exec; | ||
4180 | struct rq *rq; | ||
4181 | |||
4182 | rq = task_rq(p); | ||
4183 | WARN_ON_ONCE(!runqueue_is_locked()); | ||
4184 | WARN_ON_ONCE(!task_current(rq, p)); | ||
4185 | |||
4186 | if (update) | ||
4187 | update_rq_clock(rq); | ||
4188 | |||
4189 | delta_exec = rq->clock - p->se.exec_start; | ||
4190 | |||
4191 | WARN_ON_ONCE(delta_exec < 0); | ||
4192 | |||
4193 | return delta_exec; | ||
4194 | } | ||
4195 | |||
4196 | /* | ||
4197 | * Return any ns on the sched_clock that have not yet been banked in | ||
4198 | * @p in case that task is currently running. | ||
4199 | */ | ||
4132 | unsigned long long task_delta_exec(struct task_struct *p) | 4200 | unsigned long long task_delta_exec(struct task_struct *p) |
4133 | { | 4201 | { |
4134 | unsigned long flags; | 4202 | unsigned long flags; |
@@ -4388,6 +4456,7 @@ void scheduler_tick(void) | |||
4388 | update_rq_clock(rq); | 4456 | update_rq_clock(rq); |
4389 | update_cpu_load(rq); | 4457 | update_cpu_load(rq); |
4390 | curr->sched_class->task_tick(rq, curr, 0); | 4458 | curr->sched_class->task_tick(rq, curr, 0); |
4459 | perf_counter_task_tick(curr, cpu); | ||
4391 | spin_unlock(&rq->lock); | 4460 | spin_unlock(&rq->lock); |
4392 | 4461 | ||
4393 | #ifdef CONFIG_SMP | 4462 | #ifdef CONFIG_SMP |
@@ -4583,6 +4652,7 @@ need_resched_nonpreemptible: | |||
4583 | 4652 | ||
4584 | if (likely(prev != next)) { | 4653 | if (likely(prev != next)) { |
4585 | sched_info_switch(prev, next); | 4654 | sched_info_switch(prev, next); |
4655 | perf_counter_task_sched_out(prev, cpu); | ||
4586 | 4656 | ||
4587 | rq->nr_switches++; | 4657 | rq->nr_switches++; |
4588 | rq->curr = next; | 4658 | rq->curr = next; |
diff --git a/kernel/sys.c b/kernel/sys.c index 763c3c17ded3..c2a951ae4223 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/prctl.h> | 14 | #include <linux/prctl.h> |
15 | #include <linux/highuid.h> | 15 | #include <linux/highuid.h> |
16 | #include <linux/fs.h> | 16 | #include <linux/fs.h> |
17 | #include <linux/perf_counter.h> | ||
17 | #include <linux/resource.h> | 18 | #include <linux/resource.h> |
18 | #include <linux/kernel.h> | 19 | #include <linux/kernel.h> |
19 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
@@ -1797,6 +1798,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1797 | case PR_SET_TSC: | 1798 | case PR_SET_TSC: |
1798 | error = SET_TSC_CTL(arg2); | 1799 | error = SET_TSC_CTL(arg2); |
1799 | break; | 1800 | break; |
1801 | case PR_TASK_PERF_COUNTERS_DISABLE: | ||
1802 | error = perf_counter_task_disable(); | ||
1803 | break; | ||
1804 | case PR_TASK_PERF_COUNTERS_ENABLE: | ||
1805 | error = perf_counter_task_enable(); | ||
1806 | break; | ||
1800 | case PR_GET_TIMERSLACK: | 1807 | case PR_GET_TIMERSLACK: |
1801 | error = current->timer_slack_ns; | 1808 | error = current->timer_slack_ns; |
1802 | break; | 1809 | break; |
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a23281707..4be8bbc7577c 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime); | |||
174 | cond_syscall(compat_sys_timerfd_gettime); | 174 | cond_syscall(compat_sys_timerfd_gettime); |
175 | cond_syscall(sys_eventfd); | 175 | cond_syscall(sys_eventfd); |
176 | cond_syscall(sys_eventfd2); | 176 | cond_syscall(sys_eventfd2); |
177 | |||
178 | /* performance counters: */ | ||
179 | cond_syscall(sys_perf_counter_open); | ||