Diffstat (limited to 'kernel/events')
-rw-r--r--  kernel/events/Makefile         |   2
-rw-r--r--  kernel/events/core.c           | 960
-rw-r--r--  kernel/events/hw_breakpoint.c  |  10
-rw-r--r--  kernel/events/internal.h       |  96
-rw-r--r--  kernel/events/ring_buffer.c    | 380
5 files changed, 756 insertions, 692 deletions
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 1ce23d3d8394..89e5e8aa4c36 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_core.o = -pg
 endif
 
-obj-y := core.o
+obj-y := core.o ring_buffer.o
 obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d863b3c057bb..b8785e26ee1c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -36,6 +36,8 @@
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 
+#include "internal.h"
+
 #include <asm/irq_regs.h>
 
 struct remote_function_call {
@@ -200,6 +202,22 @@ __get_cpu_context(struct perf_event_context *ctx)
 	return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
 }
 
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+			  struct perf_event_context *ctx)
+{
+	raw_spin_lock(&cpuctx->ctx.lock);
+	if (ctx)
+		raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+			    struct perf_event_context *ctx)
+{
+	if (ctx)
+		raw_spin_unlock(&ctx->lock);
+	raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
 #ifdef CONFIG_CGROUP_PERF
 
 /*
@@ -340,11 +358,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 	rcu_read_lock();
 
 	list_for_each_entry_rcu(pmu, &pmus, entry) {
-
 		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 
-		perf_pmu_disable(cpuctx->ctx.pmu);
-
 		/*
 		 * perf_cgroup_events says at least one
 		 * context on this CPU has cgroup events.
@@ -353,6 +368,8 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 		 * events for a context.
 		 */
 		if (cpuctx->ctx.nr_cgroups > 0) {
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+			perf_pmu_disable(cpuctx->ctx.pmu);
 
 			if (mode & PERF_CGROUP_SWOUT) {
 				cpu_ctx_sched_out(cpuctx, EVENT_ALL);
@@ -372,9 +389,9 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
 				cpuctx->cgrp = perf_cgroup_from_task(task);
 				cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
 			}
+			perf_pmu_enable(cpuctx->ctx.pmu);
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 		}
-
-		perf_pmu_enable(cpuctx->ctx.pmu);
 	}
 
 	rcu_read_unlock();
@@ -731,6 +748,7 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
+ * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
@@ -1105,6 +1123,10 @@ static int __perf_remove_from_context(void *info)
 	raw_spin_lock(&ctx->lock);
 	event_sched_out(event, cpuctx, ctx);
 	list_del_event(event, ctx);
+	if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+		ctx->is_active = 0;
+		cpuctx->task_ctx = NULL;
+	}
 	raw_spin_unlock(&ctx->lock);
 
 	return 0;
@@ -1454,8 +1476,24 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void perf_event_context_sched_in(struct perf_event_context *ctx,
-					struct task_struct *tsk);
+static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void
+ctx_sched_in(struct perf_event_context *ctx,
+	     struct perf_cpu_context *cpuctx,
+	     enum event_type_t event_type,
+	     struct task_struct *task);
+
+static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
+				struct perf_event_context *ctx,
+				struct task_struct *task)
+{
+	cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
+	if (ctx)
+		ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+	if (ctx)
+		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+}
 
 /*
  * Cross CPU call to install and enable a performance event
@@ -1466,20 +1504,37 @@ static int __perf_install_in_context(void *info)
 {
 	struct perf_event *event = info;
 	struct perf_event_context *ctx = event->ctx;
-	struct perf_event *leader = event->group_leader;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-	int err;
+	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	struct task_struct *task = current;
+
+	perf_ctx_lock(cpuctx, task_ctx);
+	perf_pmu_disable(cpuctx->ctx.pmu);
 
 	/*
-	 * In case we're installing a new context to an already running task,
-	 * could also happen before perf_event_task_sched_in() on architectures
-	 * which do context switches with IRQs enabled.
+	 * If there was an active task_ctx schedule it out.
 	 */
-	if (ctx->task && !cpuctx->task_ctx)
-		perf_event_context_sched_in(ctx, ctx->task);
+	if (task_ctx)
+		task_ctx_sched_out(task_ctx);
+
+	/*
+	 * If the context we're installing events in is not the
+	 * active task_ctx, flip them.
+	 */
+	if (ctx->task && task_ctx != ctx) {
+		if (task_ctx)
+			raw_spin_unlock(&task_ctx->lock);
+		raw_spin_lock(&ctx->lock);
+		task_ctx = ctx;
+	}
+
+	if (task_ctx) {
+		cpuctx->task_ctx = task_ctx;
+		task = task_ctx->task;
+	}
+
+	cpu_ctx_sched_out(cpuctx, EVENT_ALL);
 
-	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
 	update_context_time(ctx);
 	/*
 	 * update cgrp time only if current cgrp
@@ -1490,43 +1545,13 @@ static int __perf_install_in_context(void *info)
 
 	add_event_to_ctx(event, ctx);
 
-	if (!event_filter_match(event))
-		goto unlock;
-
-	/*
-	 * Don't put the event on if it is disabled or if
-	 * it is in a group and the group isn't on.
-	 */
-	if (event->state != PERF_EVENT_STATE_INACTIVE ||
-	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
-		goto unlock;
-
 	/*
-	 * An exclusive event can't go on if there are already active
-	 * hardware events, and no hardware event can go on if there
-	 * is already an exclusive event on.
+	 * Schedule everything back in
 	 */
-	if (!group_can_go_on(event, cpuctx, 1))
-		err = -EEXIST;
-	else
-		err = event_sched_in(event, cpuctx, ctx);
-
-	if (err) {
-		/*
-		 * This event couldn't go on. If it is in a group
-		 * then we have to pull the whole group off.
-		 * If the event group is pinned then put it in error state.
-		 */
-		if (leader != event)
-			group_sched_out(leader, cpuctx, ctx);
-		if (leader->attr.pinned) {
-			update_group_times(leader);
-			leader->state = PERF_EVENT_STATE_ERROR;
-		}
-	}
+	perf_event_sched_in(cpuctx, task_ctx, task);
 
-unlock:
-	raw_spin_unlock(&ctx->lock);
+	perf_pmu_enable(cpuctx->ctx.pmu);
+	perf_ctx_unlock(cpuctx, task_ctx);
 
 	return 0;
 }
@@ -1739,7 +1764,7 @@ out:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
-static int perf_event_refresh(struct perf_event *event, int refresh)
+int perf_event_refresh(struct perf_event *event, int refresh)
 {
 	/*
 	 * not supported on inherited events
@@ -1752,36 +1777,35 @@ static int perf_event_refresh(struct perf_event *event, int refresh)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(perf_event_refresh);
 
 static void ctx_sched_out(struct perf_event_context *ctx,
 			  struct perf_cpu_context *cpuctx,
 			  enum event_type_t event_type)
 {
 	struct perf_event *event;
+	int is_active = ctx->is_active;
 
-	raw_spin_lock(&ctx->lock);
-	perf_pmu_disable(ctx->pmu);
-	ctx->is_active = 0;
+	ctx->is_active &= ~event_type;
 	if (likely(!ctx->nr_events))
-		goto out;
+		return;
+
 	update_context_time(ctx);
 	update_cgrp_time_from_cpuctx(cpuctx);
-
 	if (!ctx->nr_active)
-		goto out;
+		return;
 
-	if (event_type & EVENT_PINNED) {
+	perf_pmu_disable(ctx->pmu);
+	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
-	if (event_type & EVENT_FLEXIBLE) {
+	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
-out:
 	perf_pmu_enable(ctx->pmu);
-	raw_spin_unlock(&ctx->lock);
 }
 
 /*
@@ -1929,8 +1953,10 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 	rcu_read_unlock();
 
 	if (do_switch) {
+		raw_spin_lock(&ctx->lock);
 		ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 		cpuctx->task_ctx = NULL;
+		raw_spin_unlock(&ctx->lock);
 	}
 }
 
@@ -1965,8 +1991,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_sched_out(task);
 }
 
-static void task_ctx_sched_out(struct perf_event_context *ctx,
-			       enum event_type_t event_type)
+static void task_ctx_sched_out(struct perf_event_context *ctx)
 {
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
@@ -1976,7 +2001,7 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
 		return;
 
-	ctx_sched_out(ctx, cpuctx, event_type);
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
 	cpuctx->task_ctx = NULL;
 }
 
@@ -2055,11 +2080,11 @@ ctx_sched_in(struct perf_event_context *ctx,
 	     struct task_struct *task)
 {
 	u64 now;
+	int is_active = ctx->is_active;
 
-	raw_spin_lock(&ctx->lock);
-	ctx->is_active = 1;
+	ctx->is_active |= event_type;
 	if (likely(!ctx->nr_events))
-		goto out;
+		return;
 
 	now = perf_clock();
 	ctx->timestamp = now;
@@ -2068,15 +2093,12 @@ ctx_sched_in(struct perf_event_context *ctx,
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (event_type & EVENT_PINNED)
+	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
 		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
-	if (event_type & EVENT_FLEXIBLE)
+	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
 		ctx_flexible_sched_in(ctx, cpuctx);
-
-out:
-	raw_spin_unlock(&ctx->lock);
 }
 
 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
@@ -2088,19 +2110,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 	ctx_sched_in(ctx, cpuctx, event_type, task);
 }
 
-static void task_ctx_sched_in(struct perf_event_context *ctx,
-			      enum event_type_t event_type)
-{
-	struct perf_cpu_context *cpuctx;
-
-	cpuctx = __get_cpu_context(ctx);
-	if (cpuctx->task_ctx == ctx)
-		return;
-
-	ctx_sched_in(ctx, cpuctx, event_type, NULL);
-	cpuctx->task_ctx = ctx;
-}
-
 static void perf_event_context_sched_in(struct perf_event_context *ctx,
 					struct task_struct *task)
 {
@@ -2110,6 +2119,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	if (cpuctx->task_ctx == ctx)
 		return;
 
+	perf_ctx_lock(cpuctx, ctx);
 	perf_pmu_disable(ctx->pmu);
 	/*
 	 * We want to keep the following priority order:
@@ -2118,18 +2128,18 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	 */
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 
-	ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
-	ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
+	perf_event_sched_in(cpuctx, ctx, task);
 
 	cpuctx->task_ctx = ctx;
 
+	perf_pmu_enable(ctx->pmu);
+	perf_ctx_unlock(cpuctx, ctx);
+
 	/*
 	 * Since these rotations are per-cpu, we need to ensure the
 	 * cpu-context we got scheduled on is actually rotating.
 	 */
 	perf_pmu_rotate_start(ctx->pmu);
-	perf_pmu_enable(ctx->pmu);
 }
 
 /*
@@ -2269,7 +2279,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 	u64 interrupts, now;
 	s64 delta;
 
-	raw_spin_lock(&ctx->lock);
 	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
@@ -2301,7 +2310,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
 		if (delta > 0)
 			perf_adjust_period(event, period, delta);
 	}
-	raw_spin_unlock(&ctx->lock);
 }
 
 /*
@@ -2309,16 +2317,12 @@
  */
 static void rotate_ctx(struct perf_event_context *ctx)
 {
-	raw_spin_lock(&ctx->lock);
-
 	/*
 	 * Rotate the first entry last of non-pinned groups. Rotation might be
 	 * disabled by the inheritance code.
 	 */
 	if (!ctx->rotate_disable)
 		list_rotate_left(&ctx->flexible_groups);
-
-	raw_spin_unlock(&ctx->lock);
 }
 
 /*
@@ -2345,6 +2349,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 		rotate = 1;
 	}
 
+	perf_ctx_lock(cpuctx, cpuctx->task_ctx);
 	perf_pmu_disable(cpuctx->ctx.pmu);
 	perf_ctx_adjust_freq(&cpuctx->ctx, interval);
 	if (ctx)
@@ -2355,21 +2360,20 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 
 	cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
 	if (ctx)
-		task_ctx_sched_out(ctx, EVENT_FLEXIBLE);
+		ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
 
 	rotate_ctx(&cpuctx->ctx);
 	if (ctx)
 		rotate_ctx(ctx);
 
-	cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
-	if (ctx)
-		task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
+	perf_event_sched_in(cpuctx, ctx, current);
 
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
 
 	perf_pmu_enable(cpuctx->ctx.pmu);
+	perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
 }
 
 void perf_event_task_tick(void)
@@ -2424,9 +2428,9 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
 	 * in.
 	 */
 	perf_cgroup_sched_out(current);
-	task_ctx_sched_out(ctx, EVENT_ALL);
 
 	raw_spin_lock(&ctx->lock);
+	task_ctx_sched_out(ctx);
 
 	list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
 		ret = event_enable_on_exec(event, ctx);
@@ -2835,16 +2839,12 @@ retry:
 		unclone_ctx(ctx);
 		++ctx->pin_count;
 		raw_spin_unlock_irqrestore(&ctx->lock, flags);
-	}
-
-	if (!ctx) {
+	} else {
 		ctx = alloc_perf_context(pmu, task);
 		err = -ENOMEM;
 		if (!ctx)
 			goto errout;
 
-		get_ctx(ctx);
-
 		err = 0;
 		mutex_lock(&task->perf_event_mutex);
 		/*
@@ -2856,14 +2856,14 @@ retry:
 		else if (task->perf_event_ctxp[ctxn])
 			err = -EAGAIN;
 		else {
+			get_ctx(ctx);
 			++ctx->pin_count;
 			rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
 		}
 		mutex_unlock(&task->perf_event_mutex);
 
 		if (unlikely(err)) {
-			put_task_struct(task);
-			kfree(ctx);
+			put_ctx(ctx);
 
 			if (err == -EAGAIN)
 				goto retry;
@@ -2890,7 +2890,7 @@ static void free_event_rcu(struct rcu_head *head)
 	kfree(event);
 }
 
-static void perf_buffer_put(struct perf_buffer *buffer);
+static void ring_buffer_put(struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2913,9 +2913,9 @@ static void free_event(struct perf_event *event)
 		}
 	}
 
-	if (event->buffer) {
-		perf_buffer_put(event->buffer);
-		event->buffer = NULL;
+	if (event->rb) {
+		ring_buffer_put(event->rb);
+		event->rb = NULL;
 	}
 
 	if (is_cgroup_event(event))
@@ -2934,12 +2934,6 @@ int perf_event_release_kernel(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 
-	/*
-	 * Remove from the PMU, can't get re-enabled since we got
-	 * here because the last ref went.
-	 */
-	perf_event_disable(event);
-
 	WARN_ON_ONCE(ctx->parent_ctx);
 	/*
 	 * There are two ways this annotation is useful:
@@ -2956,8 +2950,8 @@ int perf_event_release_kernel(struct perf_event *event)
 	mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
 	raw_spin_lock_irq(&ctx->lock);
 	perf_group_detach(event);
-	list_del_event(event, ctx);
 	raw_spin_unlock_irq(&ctx->lock);
+	perf_remove_from_context(event);
 	mutex_unlock(&ctx->mutex);
 
 	free_event(event);
@@ -3149,13 +3143,13 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_event *event = file->private_data;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (buffer)
-		events = atomic_xchg(&buffer->poll, 0);
+	rb = rcu_dereference(event->rb);
+	if (rb)
+		events = atomic_xchg(&rb->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &event->waitq, wait);
@@ -3358,6 +3352,18 @@ static int perf_event_index(struct perf_event *event)
 	return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET;
 }
 
+static void calc_timer_values(struct perf_event *event,
+				u64 *running,
+				u64 *enabled)
+{
+	u64 now, ctx_time;
+
+	now = perf_clock();
+	ctx_time = event->shadow_ctx_time + now;
+	*enabled = ctx_time - event->tstamp_enabled;
+	*running = ctx_time - event->tstamp_running;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
 * the seqlock logic goes bad. We can not serialize this because the arch
@@ -3366,14 +3372,25 @@ static int perf_event_index(struct perf_event *event)
 void perf_event_update_userpage(struct perf_event *event)
 {
 	struct perf_event_mmap_page *userpg;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
+	u64 enabled, running;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
+	/*
+	 * compute total_time_enabled, total_time_running
+	 * based on snapshot values taken when the event
+	 * was last scheduled in.
+	 *
+	 * we cannot simply called update_context_time()
+	 * because of locking issue as we can be called in
+	 * NMI context
+	 */
+	calc_timer_values(event, &enabled, &running);
+	rb = rcu_dereference(event->rb);
+	if (!rb)
 		goto unlock;
 
-	userpg = buffer->user_page;
+	userpg = rb->user_page;
 
 	/*
 	 * Disable preemption so as to not let the corresponding user-space
@@ -3387,10 +3404,10 @@ void perf_event_update_userpage(struct perf_event *event)
 	if (event->state == PERF_EVENT_STATE_ACTIVE)
 		userpg->offset -= local64_read(&event->hw.prev_count);
 
-	userpg->time_enabled = event->total_time_enabled +
+	userpg->time_enabled = enabled +
 			atomic64_read(&event->child_total_time_enabled);
 
-	userpg->time_running = event->total_time_running +
+	userpg->time_running = running +
 			atomic64_read(&event->child_total_time_running);
 
 	barrier();
@@ -3400,220 +3417,10 @@ unlock:
 	rcu_read_unlock();
 }
 
-static unsigned long perf_data_size(struct perf_buffer *buffer);
-
-static void
-perf_buffer_init(struct perf_buffer *buffer, long watermark, int flags)
-{
-	long max_size = perf_data_size(buffer);
-
-	if (watermark)
-		buffer->watermark = min(max_size, watermark);
-
-	if (!buffer->watermark)
-		buffer->watermark = max_size / 2;
-
-	if (flags & PERF_BUFFER_WRITABLE)
-		buffer->writable = 1;
-
-	atomic_set(&buffer->refcount, 1);
-}
-
-#ifndef CONFIG_PERF_USE_VMALLOC
-
-/*
- * Back perf_mmap() with regular GFP_KERNEL-0 pages.
- */
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
-	if (pgoff > buffer->nr_pages)
-		return NULL;
-
-	if (pgoff == 0)
-		return virt_to_page(buffer->user_page);
-
-	return virt_to_page(buffer->data_pages[pgoff - 1]);
-}
-
-static void *perf_mmap_alloc_page(int cpu)
-{
-	struct page *page;
-	int node;
-
-	node = (cpu == -1) ? cpu : cpu_to_node(cpu);
-	page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
-	if (!page)
-		return NULL;
-
-	return page_address(page);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
-	struct perf_buffer *buffer;
-	unsigned long size;
-	int i;
-
-	size = sizeof(struct perf_buffer);
-	size += nr_pages * sizeof(void *);
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto fail;
-
-	buffer->user_page = perf_mmap_alloc_page(cpu);
-	if (!buffer->user_page)
-		goto fail_user_page;
-
-	for (i = 0; i < nr_pages; i++) {
-		buffer->data_pages[i] = perf_mmap_alloc_page(cpu);
-		if (!buffer->data_pages[i])
-			goto fail_data_pages;
-	}
-
-	buffer->nr_pages = nr_pages;
-
-	perf_buffer_init(buffer, watermark, flags);
-
-	return buffer;
-
-fail_data_pages:
-	for (i--; i >= 0; i--)
-		free_page((unsigned long)buffer->data_pages[i]);
-
-	free_page((unsigned long)buffer->user_page);
-
-fail_user_page:
-	kfree(buffer);
-
-fail:
-	return NULL;
-}
-
-static void perf_mmap_free_page(unsigned long addr)
-{
-	struct page *page = virt_to_page((void *)addr);
-
-	page->mapping = NULL;
-	__free_page(page);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
-	int i;
-
-	perf_mmap_free_page((unsigned long)buffer->user_page);
-	for (i = 0; i < buffer->nr_pages; i++)
-		perf_mmap_free_page((unsigned long)buffer->data_pages[i]);
-	kfree(buffer);
-}
-
-static inline int page_order(struct perf_buffer *buffer)
-{
-	return 0;
-}
-
-#else
-
-/*
- * Back perf_mmap() with vmalloc memory.
- *
- * Required for architectures that have d-cache aliasing issues.
- */
-
-static inline int page_order(struct perf_buffer *buffer)
-{
-	return buffer->page_order;
-}
-
-static struct page *
-perf_mmap_to_page(struct perf_buffer *buffer, unsigned long pgoff)
-{
-	if (pgoff > (1UL << page_order(buffer)))
-		return NULL;
-
-	return vmalloc_to_page((void *)buffer->user_page + pgoff * PAGE_SIZE);
-}
-
-static void perf_mmap_unmark_page(void *addr)
-{
-	struct page *page = vmalloc_to_page(addr);
-
-	page->mapping = NULL;
-}
-
-static void perf_buffer_free_work(struct work_struct *work)
-{
-	struct perf_buffer *buffer;
-	void *base;
-	int i, nr;
-
-	buffer = container_of(work, struct perf_buffer, work);
-	nr = 1 << page_order(buffer);
-
-	base = buffer->user_page;
-	for (i = 0; i < nr + 1; i++)
-		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
-
-	vfree(base);
-	kfree(buffer);
-}
-
-static void perf_buffer_free(struct perf_buffer *buffer)
-{
-	schedule_work(&buffer->work);
-}
-
-static struct perf_buffer *
-perf_buffer_alloc(int nr_pages, long watermark, int cpu, int flags)
-{
-	struct perf_buffer *buffer;
-	unsigned long size;
-	void *all_buf;
-
-	size = sizeof(struct perf_buffer);
-	size += sizeof(void *);
-
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
-		goto fail;
-
-	INIT_WORK(&buffer->work, perf_buffer_free_work);
-
-	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
-	if (!all_buf)
-		goto fail_all_buf;
-
-	buffer->user_page = all_buf;
-	buffer->data_pages[0] = all_buf + PAGE_SIZE;
-	buffer->page_order = ilog2(nr_pages);
-	buffer->nr_pages = 1;
-
-	perf_buffer_init(buffer, watermark, flags);
-
-	return buffer;
-
-fail_all_buf:
-	kfree(buffer);
-
-fail:
-	return NULL;
-}
-
-#endif
-
-static unsigned long perf_data_size(struct perf_buffer *buffer)
-{
-	return buffer->nr_pages << (PAGE_SHIFT + page_order(buffer));
-}
-
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_event *event = vma->vm_file->private_data;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	int ret = VM_FAULT_SIGBUS;
 
 	if (vmf->flags & FAULT_FLAG_MKWRITE) {
@@ -3623,14 +3430,14 @@ static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	}
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
+	rb = rcu_dereference(event->rb);
+	if (!rb)
 		goto unlock;
 
 	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
 		goto unlock;
 
-	vmf->page = perf_mmap_to_page(buffer, vmf->pgoff);
+	vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
 	if (!vmf->page)
 		goto unlock;
 
@@ -3645,35 +3452,35 @@ unlock:
 	return ret;
 }
 
-static void perf_buffer_free_rcu(struct rcu_head *rcu_head)
+static void rb_free_rcu(struct rcu_head *rcu_head)
 {
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 
-	buffer = container_of(rcu_head, struct perf_buffer, rcu_head);
-	perf_buffer_free(buffer);
+	rb = container_of(rcu_head, struct ring_buffer, rcu_head);
+	rb_free(rb);
 }
 
-static struct perf_buffer *perf_buffer_get(struct perf_event *event)
+static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 {
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 
 	rcu_read_lock();
-	buffer = rcu_dereference(event->buffer);
-	if (buffer) {
-		if (!atomic_inc_not_zero(&buffer->refcount))
-			buffer = NULL;
+	rb = rcu_dereference(event->rb);
+	if (rb) {
+		if (!atomic_inc_not_zero(&rb->refcount))
+			rb = NULL;
 	}
 	rcu_read_unlock();
 
-	return buffer;
+	return rb;
 }
 
-static void perf_buffer_put(struct perf_buffer *buffer)
+static void ring_buffer_put(struct ring_buffer *rb)
 {
-	if (!atomic_dec_and_test(&buffer->refcount))
+	if (!atomic_dec_and_test(&rb->refcount))
 		return;
 
-	call_rcu(&buffer->rcu_head, perf_buffer_free_rcu);
+	call_rcu(&rb->rcu_head, rb_free_rcu);
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -3688,16 +3495,16 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_event *event = vma->vm_file->private_data;
 
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-		unsigned long size = perf_data_size(event->buffer);
+		unsigned long size = perf_data_size(event->rb);
 		struct user_struct *user = event->mmap_user;
-		struct perf_buffer *buffer = event->buffer;
+		struct ring_buffer *rb = event->rb;
 
 		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= event->mmap_locked;
-		rcu_assign_pointer(event->buffer, NULL);
+		rcu_assign_pointer(event->rb, NULL);
 		mutex_unlock(&event->mmap_mutex);
 
-		perf_buffer_put(buffer);
+		ring_buffer_put(rb);
 		free_uid(user);
 	}
 }
@@ -3715,7 +3522,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
-	struct perf_buffer *buffer;
+	struct ring_buffer *rb;
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra, extra;
@@ -3724,7 +3531,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	/*
 	 * Don't allow mmap() of inherited per-task counters. This would
 	 * create a performance issue due to all children writing to the
-	 * same buffer.
+	 * same rb.
 	 */
 	if (event->cpu == -1 && event->attr.inherit)
 		return -EINVAL;
@@ -3736,7 +3543,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	nr_pages = (vma_size / PAGE_SIZE) - 1;
 
 	/*
-	 * If we have buffer pages ensure they're a power-of-two number, so we
+	 * If we have rb pages ensure they're a power-of-two number, so we
 	 * can do bitmasks instead of modulo.
 	 */
 	if (nr_pages != 0 && !is_power_of_2(nr_pages))
@@ -3750,9 +3557,9 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	mutex_lock(&event->mmap_mutex);
-	if (event->buffer) {
-		if (event->buffer->nr_pages == nr_pages)
-			atomic_inc(&event->buffer->refcount);
+	if (event->rb) {
+		if (event->rb->nr_pages == nr_pages)
+			atomic_inc(&event->rb->refcount);
 		else
 			ret = -EINVAL;
 		goto unlock;
@@ -3782,18 +3589,20 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	WARN_ON(event->buffer);
+	WARN_ON(event->rb);
 
 	if (vma->vm_flags & VM_WRITE)
-		flags |= PERF_BUFFER_WRITABLE;
+		flags |= RING_BUFFER_WRITABLE;
+
+	rb = rb_alloc(nr_pages,
+		event->attr.watermark ? event->attr.wakeup_watermark : 0,
+		event->cpu, flags);
 
-	buffer = perf_buffer_alloc(nr_pages, event->attr.wakeup_watermark,
-				   event->cpu, flags);
-	if (!buffer) {
+	if (!rb) {
 		ret = -ENOMEM;
 		goto unlock;
 	}
-	rcu_assign_pointer(event->buffer, buffer);
+	rcu_assign_pointer(event->rb, rb);
 
 	atomic_long_add(user_extra, &user->locked_vm);
 	event->mmap_locked = extra;
@@ -3892,117 +3701,6 @@ int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
 }
 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
 
-/*
- * Output
- */
-static bool perf_output_space(struct perf_buffer *buffer, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long mask;
-
-	if (!buffer->writable)
-		return true;
-
-	mask = perf_data_size(buffer) - 1;
-
-	offset = (offset - tail) & mask;
-	head = (head - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
-static void perf_output_wakeup(struct perf_output_handle *handle)
-{
-	atomic_set(&handle->buffer->poll, POLL_IN);
-
-	if (handle->nmi) {
-		handle->event->pending_wakeup = 1;
-		irq_work_queue(&handle->event->pending);
-	} else
-		perf_event_wakeup(handle->event);
-}
-
-/*
- * We need to ensure a later event_id doesn't publish a head when a former
- * event isn't done writing. However since we need to deal with NMIs we
- * cannot fully serialize things.
- *
- * We only publish the head (and generate a wakeup) when the outer-most
- * event completes.
- */
-static void perf_output_get_handle(struct perf_output_handle *handle)
-{
-	struct perf_buffer *buffer = handle->buffer;
-
-	preempt_disable();
-	local_inc(&buffer->nest);
-	handle->wakeup = local_read(&buffer->wakeup);
-}
-
-static void perf_output_put_handle(struct perf_output_handle *handle)
-{
-	struct perf_buffer *buffer = handle->buffer;
-	unsigned long head;
-
-again:
-	head = local_read(&buffer->head);
-
-	/*
-	 * IRQ/NMI can happen here, which means we can miss a head update.
-	 */
-
-	if (!local_dec_and_test(&buffer->nest))
-		goto out;
-
-	/*
-	 * Publish the known good head. Rely on the full barrier implied
-	 * by atomic_dec_and_test() order the buffer->head read and this
-	 * write.
-	 */
-	buffer->user_page->data_head = head;
-
-	/*
-	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read buffer->head.
-	 */
-	if (unlikely(head != local_read(&buffer->head))) {
-		local_inc(&buffer->nest);
-		goto again;
-	}
-
-	if (handle->wakeup != local_read(&buffer->wakeup))
-		perf_output_wakeup(handle);
-
-out:
-	preempt_enable();
-}
-
-__always_inline void perf_output_copy(struct perf_output_handle *handle,
-		      const void *buf, unsigned int len)
-{
-	do {
-		unsigned long size = min_t(unsigned long, handle->size, len);
-
-		memcpy(handle->addr, buf, size);
-
-		len -= size;
-		handle->addr += size;
-		buf += size;
-		handle->size -= size;
-		if (!handle->size) {
-			struct perf_buffer *buffer = handle->buffer;
-
-			handle->page++;
-			handle->page &= buffer->nr_pages - 1;
-			handle->addr = buffer->data_pages[handle->page];
-			handle->size = PAGE_SIZE << page_order(buffer);
-		}
-	} while (len);
-}
-
 static void __perf_event_header__init_id(struct perf_event_header *header,
 					 struct perf_sample_data *data,
 					 struct perf_event *event)
@@ -4033,9 +3731,9 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_header__init_id(struct perf_event_header *header,
-				       struct perf_sample_data *data,
-				       struct perf_event *event)
+void perf_event_header__init_id(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_event *event)
 {
 	if (event->attr.sample_id_all)
 		__perf_event_header__init_id(header, data, event);
@@ -4062,121 +3760,14 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle,
 	perf_output_put(handle, data->cpu_entry);
 }
 
-static void perf_event__output_id_sample(struct perf_event *event,
-					 struct perf_output_handle *handle,
-					 struct perf_sample_data *sample)
+void perf_event__output_id_sample(struct perf_event *event,
+				  struct perf_output_handle *handle,
+				  struct perf_sample_data *sample)
 {
 	if (event->attr.sample_id_all)
 		__perf_event__output_id_sample(handle, sample);
 }
 
-int perf_output_begin(struct perf_output_handle *handle,
-		      struct perf_event *event, unsigned int size,
-		      int nmi, int sample)
-{
-	struct perf_buffer *buffer;
-	unsigned long tail, offset, head;
-	int have_lost;
-	struct perf_sample_data sample_data;
-	struct {
-		struct perf_event_header header;
-		u64 id;
-		u64 lost;
-	} lost_event;
-
-	rcu_read_lock();
-	/*
-	 * For inherited events we send all the output towards the parent.
-	 */
-	if (event->parent)
-		event = event->parent;
-
-	buffer = rcu_dereference(event->buffer);
-	if (!buffer)
-		goto out;
-
-	handle->buffer = buffer;
-	handle->event = event;
-	handle->nmi = nmi;
-	handle->sample = sample;
-
-	if (!buffer->nr_pages)
-		goto out;
-
-	have_lost = local_read(&buffer->lost);
-	if (have_lost) {
-		lost_event.header.size = sizeof(lost_event);
-		perf_event_header__init_id(&lost_event.header, &sample_data,
-					   event);
-		size += lost_event.header.size;
-	}
-
-	perf_output_get_handle(handle);
-
-	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 */
-		tail = ACCESS_ONCE(buffer->user_page->data_tail);
-		smp_rmb();
-		offset = head = local_read(&buffer->head);
-		head += size;
-		if (unlikely(!perf_output_space(buffer, tail, offset, head)))
-			goto fail;
-	} while (local_cmpxchg(&buffer->head, offset, head) != offset);
-
-	if (head - local_read(&buffer->wakeup) > buffer->watermark)
-		local_add(buffer->watermark, &buffer->wakeup);
-
-	handle->page = offset >> (PAGE_SHIFT + page_order(buffer));
-	handle->page &= buffer->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(buffer)) - 1);
-	handle->addr = buffer->data_pages[handle->page];
-	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(buffer)) - handle->size;
-
-	if (have_lost) {
-		lost_event.header.type = PERF_RECORD_LOST;
-		lost_event.header.misc = 0;
-		lost_event.id = event->id;
-		lost_event.lost = local_xchg(&buffer->lost, 0);
-
-		perf_output_put(handle, lost_event);
-		perf_event__output_id_sample(event, handle, &sample_data);
-	}
-
-	return 0;
-
-fail:
-	local_inc(&buffer->lost);
-	perf_output_put_handle(handle);
-out:
-	rcu_read_unlock();
-
-	return -ENOSPC;
-}
-
-void perf_output_end(struct perf_output_handle *handle)
-{
-	struct perf_event *event = handle->event;
-	struct perf_buffer *buffer = handle->buffer;
-
-	int wakeup_events = event->attr.wakeup_events;
-
-	if (handle->sample && wakeup_events) {
-		int events = local_inc_return(&buffer->events);
-		if (events >= wakeup_events) {
-			local_sub(wakeup_events, &buffer->events);
-			local_inc(&buffer->wakeup);
-		}
-	}
-
-	perf_output_put_handle(handle);
-	rcu_read_unlock();
-}
-
 static void perf_output_read_one(struct perf_output_handle *handle,
 				 struct perf_event *event,
 				 u64 enabled, u64 running)
@@ -4197,7 +3788,7 @@ static void perf_output_read_one(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(event);
 
-	perf_output_copy(handle, values, n * sizeof(u64));
+	__output_copy(handle, values, n * sizeof(u64));
 }
 
 /*
@@ -4227,7 +3818,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 	if (read_format & PERF_FORMAT_ID)
 		values[n++] = primary_event_id(leader);
 
-	perf_output_copy(handle, values, n * sizeof(u64));
+	__output_copy(handle, values, n * sizeof(u64));
 
 	list_for_each_entry(sub, &leader->sibling_list, group_entry) {
 		n = 0;
@@ -4239,7 +3830,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 		if (read_format & PERF_FORMAT_ID)
 			values[n++] = primary_event_id(sub);
 
-		perf_output_copy(handle, values, n * sizeof(u64));
+		__output_copy(handle, values, n * sizeof(u64));
 	}
 }
 
@@ -4249,7 +3840,7 @@ static void perf_output_read_group(struct perf_output_handle *handle,
 static void perf_output_read(struct perf_output_handle *handle,
 			     struct perf_event *event)
 {
-	u64 enabled = 0, running = 0, now, ctx_time;
+	u64 enabled = 0, running = 0;
 	u64 read_format = event->attr.read_format;
 
 	/*
@@ -4261,12 +3852,8 @@ static void perf_output_read(struct perf_output_handle *handle,
 	 * because of locking issue as we are called in
 	 * NMI context
 	 */
-	if (read_format & PERF_FORMAT_TOTAL_TIMES) {
-		now = perf_clock();
-		ctx_time = event->shadow_ctx_time + now;
-		enabled = ctx_time - event->tstamp_enabled;
-		running = ctx_time - event->tstamp_running;
-	}
+	if (read_format & PERF_FORMAT_TOTAL_TIMES)
+		calc_timer_values(event, &enabled, &running);
 
 	if (event->attr.read_format & PERF_FORMAT_GROUP)
 		perf_output_read_group(handle, event, enabled, running);
@@ -4319,7 +3906,7 @@ void perf_output_sample(struct perf_output_handle *handle,
 
 		size *= sizeof(u64);
 
-		perf_output_copy(handle, data->callchain, size);
+		__output_copy(handle, data->callchain, size);
 	} else {
 		u64 nr = 0;
 		perf_output_put(handle, nr);
@@ -4329,8 +3916,8 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_RAW) {
 		if (data->raw) {
 			perf_output_put(handle, data->raw->size);
-			perf_output_copy(handle, data->raw->data,
-					 data->raw->size);
+			__output_copy(handle, data->raw->data,
+				      data->raw->size);
 		} else {
 			struct {
 				u32 size;
@@ -4342,6 +3929,20 @@ void perf_output_sample(struct perf_output_handle *handle,
 			perf_output_put(handle, raw);
 		}
 	}
+
+	if (!event->attr.watermark) {
+		int wakeup_events = event->attr.wakeup_events;
+
+		if (wakeup_events) {
+			struct ring_buffer *rb = handle->rb;
+			int events = local_inc_return(&rb->events);
+
+			if (events >= wakeup_events) {
+				local_sub(wakeup_events, &rb->events);
+				local_inc(&rb->wakeup);
+			}
+		}
+	}
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -4386,7 +3987,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 }
 
-static void perf_event_output(struct perf_event *event, int nmi,
+static void perf_event_output(struct perf_event *event,
 				struct perf_sample_data *data,
 				struct pt_regs *regs)
 {
@@ -4398,7 +3999,7 @@ static void perf_event_output(struct perf_event *event, int nmi,
 
 	perf_prepare_sample(&header, data, event, regs);
 
-	if (perf_output_begin(&handle, event, header.size, nmi, 1))
+	if (perf_output_begin(&handle, event, header.size))
 		goto exit;
 
 	perf_output_sample(&handle, &header, data, event);
@@ -4438,7 +4039,7 @@ perf_event_read_event(struct perf_event *event,
 	int ret;
 
 	perf_event_header__init_id(&read_event.header, &sample, event);
-	ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0);
+	ret = perf_output_begin(&handle, event, read_event.header.size);
 	if (ret)
 		return;
 
@@ -4481,7 +4082,7 @@ static void perf_event_task_output(struct perf_event *event,
 	perf_event_header__init_id(&task_event->event_id.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				task_event->event_id.header.size, 0, 0);
+				task_event->event_id.header.size);
 	if (ret)
 		goto out;
 
@@ -4618,7 +4219,7 @@ static void perf_event_comm_output(struct perf_event *event,
 
 	perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				comm_event->event_id.header.size, 0, 0);
+				comm_event->event_id.header.size);
 
 	if (ret)
 		goto out;
@@ -4627,7 +4228,7 @@ static void perf_event_comm_output(struct perf_event *event,
 	comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
 
 	perf_output_put(&handle, comm_event->event_id);
-	perf_output_copy(&handle, comm_event->comm,
+	__output_copy(&handle, comm_event->comm,
 				   comm_event->comm_size);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -4765,7 +4366,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 
 	perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
 	ret = perf_output_begin(&handle, event,
-				mmap_event->event_id.header.size, 0, 0);
+				mmap_event->event_id.header.size);
 	if (ret)
 		goto out;
 
@@ -4773,7 +4374,7 @@ static void perf_event_mmap_output(struct perf_event *event,
 	mmap_event->event_id.tid = perf_event_tid(event, current);
 
 	perf_output_put(&handle, mmap_event->event_id);
-	perf_output_copy(&handle, mmap_event->file_name,
+	__output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -4829,7 +4430,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 
 	if (file) {
 		/*
-		 * d_path works from the end of the buffer backwards, so we
+		 * d_path works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
@@ -4960,7 +4561,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 	perf_event_header__init_id(&throttle_event.header, &sample, event);
 
 	ret = perf_output_begin(&handle, event,
-				throttle_event.header.size, 1, 0);
+				throttle_event.header.size);
 	if (ret)
 		return;
 
@@ -4973,7 +4574,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
  * Generic event overflow handling, sampling.
  */
 
-static int __perf_event_overflow(struct perf_event *event, int nmi,
+static int __perf_event_overflow(struct perf_event *event,
 				   int throttle, struct perf_sample_data *data,
 				   struct pt_regs *regs)
 {
@@ -5016,34 +4617,28 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
5016 if (events && atomic_dec_and_test(&event->event_limit)) { 4617 if (events && atomic_dec_and_test(&event->event_limit)) {
5017 ret = 1; 4618 ret = 1;
5018 event->pending_kill = POLL_HUP; 4619 event->pending_kill = POLL_HUP;
5019 if (nmi) { 4620 event->pending_disable = 1;
5020 event->pending_disable = 1; 4621 irq_work_queue(&event->pending);
5021 irq_work_queue(&event->pending);
5022 } else
5023 perf_event_disable(event);
5024 } 4622 }
5025 4623
5026 if (event->overflow_handler) 4624 if (event->overflow_handler)
5027 event->overflow_handler(event, nmi, data, regs); 4625 event->overflow_handler(event, data, regs);
5028 else 4626 else
5029 perf_event_output(event, nmi, data, regs); 4627 perf_event_output(event, data, regs);
5030 4628
5031 if (event->fasync && event->pending_kill) { 4629 if (event->fasync && event->pending_kill) {
5032 if (nmi) { 4630 event->pending_wakeup = 1;
5033 event->pending_wakeup = 1; 4631 irq_work_queue(&event->pending);
5034 irq_work_queue(&event->pending);
5035 } else
5036 perf_event_wakeup(event);
5037 } 4632 }
5038 4633
5039 return ret; 4634 return ret;
5040} 4635}
5041 4636
5042int perf_event_overflow(struct perf_event *event, int nmi, 4637int perf_event_overflow(struct perf_event *event,
5043 struct perf_sample_data *data, 4638 struct perf_sample_data *data,
5044 struct pt_regs *regs) 4639 struct pt_regs *regs)
5045{ 4640{
5046 return __perf_event_overflow(event, nmi, 1, data, regs); 4641 return __perf_event_overflow(event, 1, data, regs);
5047} 4642}
5048 4643
5049/* 4644/*
@@ -5092,7 +4687,7 @@ again:
5092} 4687}
5093 4688
5094static void perf_swevent_overflow(struct perf_event *event, u64 overflow, 4689static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5095 int nmi, struct perf_sample_data *data, 4690 struct perf_sample_data *data,
5096 struct pt_regs *regs) 4691 struct pt_regs *regs)
5097{ 4692{
5098 struct hw_perf_event *hwc = &event->hw; 4693 struct hw_perf_event *hwc = &event->hw;
@@ -5106,7 +4701,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5106 return; 4701 return;
5107 4702
5108 for (; overflow; overflow--) { 4703 for (; overflow; overflow--) {
5109 if (__perf_event_overflow(event, nmi, throttle, 4704 if (__perf_event_overflow(event, throttle,
5110 data, regs)) { 4705 data, regs)) {
5111 /* 4706 /*
5112 * We inhibit the overflow from happening when 4707 * We inhibit the overflow from happening when
@@ -5119,7 +4714,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5119} 4714}
5120 4715
5121static void perf_swevent_event(struct perf_event *event, u64 nr, 4716static void perf_swevent_event(struct perf_event *event, u64 nr,
5122 int nmi, struct perf_sample_data *data, 4717 struct perf_sample_data *data,
5123 struct pt_regs *regs) 4718 struct pt_regs *regs)
5124{ 4719{
5125 struct hw_perf_event *hwc = &event->hw; 4720 struct hw_perf_event *hwc = &event->hw;
@@ -5133,12 +4728,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
5133 return; 4728 return;
5134 4729
5135 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4730 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5136 return perf_swevent_overflow(event, 1, nmi, data, regs); 4731 return perf_swevent_overflow(event, 1, data, regs);
5137 4732
5138 if (local64_add_negative(nr, &hwc->period_left)) 4733 if (local64_add_negative(nr, &hwc->period_left))
5139 return; 4734 return;
5140 4735
5141 perf_swevent_overflow(event, 0, nmi, data, regs); 4736 perf_swevent_overflow(event, 0, data, regs);
5142} 4737}
5143 4738
5144static int perf_exclude_event(struct perf_event *event, 4739static int perf_exclude_event(struct perf_event *event,
@@ -5226,7 +4821,7 @@ find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5226} 4821}
5227 4822
5228static void do_perf_sw_event(enum perf_type_id type, u32 event_id, 4823static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5229 u64 nr, int nmi, 4824 u64 nr,
5230 struct perf_sample_data *data, 4825 struct perf_sample_data *data,
5231 struct pt_regs *regs) 4826 struct pt_regs *regs)
5232{ 4827{
@@ -5242,7 +4837,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5242 4837
5243 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 4838 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5244 if (perf_swevent_match(event, type, event_id, data, regs)) 4839 if (perf_swevent_match(event, type, event_id, data, regs))
5245 perf_swevent_event(event, nr, nmi, data, regs); 4840 perf_swevent_event(event, nr, data, regs);
5246 } 4841 }
5247end: 4842end:
5248 rcu_read_unlock(); 4843 rcu_read_unlock();
@@ -5263,8 +4858,7 @@ inline void perf_swevent_put_recursion_context(int rctx)
5263 put_recursion_context(swhash->recursion, rctx); 4858 put_recursion_context(swhash->recursion, rctx);
5264} 4859}
5265 4860
5266void __perf_sw_event(u32 event_id, u64 nr, int nmi, 4861void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5267 struct pt_regs *regs, u64 addr)
5268{ 4862{
5269 struct perf_sample_data data; 4863 struct perf_sample_data data;
5270 int rctx; 4864 int rctx;
@@ -5276,7 +4870,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi,
5276 4870
5277 perf_sample_data_init(&data, addr); 4871 perf_sample_data_init(&data, addr);
5278 4872
5279 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); 4873 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5280 4874
5281 perf_swevent_put_recursion_context(rctx); 4875 perf_swevent_put_recursion_context(rctx);
5282 preempt_enable_notrace(); 4876 preempt_enable_notrace();
@@ -5524,7 +5118,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5524 5118
5525 hlist_for_each_entry_rcu(event, node, head, hlist_entry) { 5119 hlist_for_each_entry_rcu(event, node, head, hlist_entry) {
5526 if (perf_tp_event_match(event, &data, regs)) 5120 if (perf_tp_event_match(event, &data, regs))
5527 perf_swevent_event(event, count, 1, &data, regs); 5121 perf_swevent_event(event, count, &data, regs);
5528 } 5122 }
5529 5123
5530 perf_swevent_put_recursion_context(rctx); 5124 perf_swevent_put_recursion_context(rctx);
@@ -5617,7 +5211,7 @@ void perf_bp_event(struct perf_event *bp, void *data)
5617 perf_sample_data_init(&sample, bp->attr.bp_addr); 5211 perf_sample_data_init(&sample, bp->attr.bp_addr);
5618 5212
5619 if (!bp->hw.state && !perf_exclude_event(bp, regs)) 5213 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5620 perf_swevent_event(bp, 1, 1, &sample, regs); 5214 perf_swevent_event(bp, 1, &sample, regs);
5621} 5215}
5622#endif 5216#endif
5623 5217
@@ -5646,7 +5240,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5646 5240
5647 if (regs && !perf_exclude_event(event, regs)) { 5241 if (regs && !perf_exclude_event(event, regs)) {
5648 if (!(event->attr.exclude_idle && current->pid == 0)) 5242 if (!(event->attr.exclude_idle && current->pid == 0))
5649 if (perf_event_overflow(event, 0, &data, regs)) 5243 if (perf_event_overflow(event, &data, regs))
5650 ret = HRTIMER_NORESTART; 5244 ret = HRTIMER_NORESTART;
5651 } 5245 }
5652 5246
@@ -5986,6 +5580,7 @@ free_dev:
5986} 5580}
5987 5581
5988static struct lock_class_key cpuctx_mutex; 5582static struct lock_class_key cpuctx_mutex;
5583static struct lock_class_key cpuctx_lock;
5989 5584
5990int perf_pmu_register(struct pmu *pmu, char *name, int type) 5585int perf_pmu_register(struct pmu *pmu, char *name, int type)
5991{ 5586{
@@ -6036,6 +5631,7 @@ skip_type:
6036 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); 5631 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6037 __perf_event_init_context(&cpuctx->ctx); 5632 __perf_event_init_context(&cpuctx->ctx);
6038 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); 5633 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
5634 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6039 cpuctx->ctx.type = cpu_context; 5635 cpuctx->ctx.type = cpu_context;
6040 cpuctx->ctx.pmu = pmu; 5636 cpuctx->ctx.pmu = pmu;
6041 cpuctx->jiffies_interval = 1; 5637 cpuctx->jiffies_interval = 1;
@@ -6150,7 +5746,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6150 struct task_struct *task, 5746 struct task_struct *task,
6151 struct perf_event *group_leader, 5747 struct perf_event *group_leader,
6152 struct perf_event *parent_event, 5748 struct perf_event *parent_event,
6153 perf_overflow_handler_t overflow_handler) 5749 perf_overflow_handler_t overflow_handler,
5750 void *context)
6154{ 5751{
6155 struct pmu *pmu; 5752 struct pmu *pmu;
6156 struct perf_event *event; 5753 struct perf_event *event;
@@ -6208,10 +5805,13 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
6208#endif 5805#endif
6209 } 5806 }
6210 5807
6211 if (!overflow_handler && parent_event) 5808 if (!overflow_handler && parent_event) {
6212 overflow_handler = parent_event->overflow_handler; 5809 overflow_handler = parent_event->overflow_handler;
5810 context = parent_event->overflow_handler_context;
5811 }
6213 5812
6214 event->overflow_handler = overflow_handler; 5813 event->overflow_handler = overflow_handler;
5814 event->overflow_handler_context = context;
6215 5815
6216 if (attr->disabled) 5816 if (attr->disabled)
6217 event->state = PERF_EVENT_STATE_OFF; 5817 event->state = PERF_EVENT_STATE_OFF;
@@ -6326,13 +5926,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
6326 if (ret) 5926 if (ret)
6327 return -EFAULT; 5927 return -EFAULT;
6328 5928
6329 /*
6330 * If the type exists, the corresponding creation will verify
6331 * the attr->config.
6332 */
6333 if (attr->type >= PERF_TYPE_MAX)
6334 return -EINVAL;
6335
6336 if (attr->__reserved_1) 5929 if (attr->__reserved_1)
6337 return -EINVAL; 5930 return -EINVAL;
6338 5931
@@ -6354,7 +5947,7 @@ err_size:
6354static int 5947static int
6355perf_event_set_output(struct perf_event *event, struct perf_event *output_event) 5948perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6356{ 5949{
6357 struct perf_buffer *buffer = NULL, *old_buffer = NULL; 5950 struct ring_buffer *rb = NULL, *old_rb = NULL;
6358 int ret = -EINVAL; 5951 int ret = -EINVAL;
6359 5952
6360 if (!output_event) 5953 if (!output_event)
@@ -6371,7 +5964,7 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6371 goto out; 5964 goto out;
6372 5965
6373 /* 5966 /*
6374 * If its not a per-cpu buffer, it must be the same task. 5967 * If its not a per-cpu rb, it must be the same task.
6375 */ 5968 */
6376 if (output_event->cpu == -1 && output_event->ctx != event->ctx) 5969 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6377 goto out; 5970 goto out;
@@ -6383,20 +5976,20 @@ set:
6383 goto unlock; 5976 goto unlock;
6384 5977
6385 if (output_event) { 5978 if (output_event) {
6386 /* get the buffer we want to redirect to */ 5979 /* get the rb we want to redirect to */
6387 buffer = perf_buffer_get(output_event); 5980 rb = ring_buffer_get(output_event);
6388 if (!buffer) 5981 if (!rb)
6389 goto unlock; 5982 goto unlock;
6390 } 5983 }
6391 5984
6392 old_buffer = event->buffer; 5985 old_rb = event->rb;
6393 rcu_assign_pointer(event->buffer, buffer); 5986 rcu_assign_pointer(event->rb, rb);
6394 ret = 0; 5987 ret = 0;
6395unlock: 5988unlock:
6396 mutex_unlock(&event->mmap_mutex); 5989 mutex_unlock(&event->mmap_mutex);
6397 5990
6398 if (old_buffer) 5991 if (old_rb)
6399 perf_buffer_put(old_buffer); 5992 ring_buffer_put(old_rb);
6400out: 5993out:
6401 return ret; 5994 return ret;
6402} 5995}
@@ -6478,7 +6071,8 @@ SYSCALL_DEFINE5(perf_event_open,
6478 } 6071 }
6479 } 6072 }
6480 6073
6481 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL); 6074 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6075 NULL, NULL);
6482 if (IS_ERR(event)) { 6076 if (IS_ERR(event)) {
6483 err = PTR_ERR(event); 6077 err = PTR_ERR(event);
6484 goto err_task; 6078 goto err_task;
@@ -6663,7 +6257,8 @@ err_fd:
6663struct perf_event * 6257struct perf_event *
6664perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, 6258perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6665 struct task_struct *task, 6259 struct task_struct *task,
6666 perf_overflow_handler_t overflow_handler) 6260 perf_overflow_handler_t overflow_handler,
6261 void *context)
6667{ 6262{
6668 struct perf_event_context *ctx; 6263 struct perf_event_context *ctx;
6669 struct perf_event *event; 6264 struct perf_event *event;
@@ -6673,7 +6268,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6673 * Get the target context (task or percpu): 6268 * Get the target context (task or percpu):
6674 */ 6269 */
6675 6270
6676 event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler); 6271 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6272 overflow_handler, context);
6677 if (IS_ERR(event)) { 6273 if (IS_ERR(event)) {
6678 err = PTR_ERR(event); 6274 err = PTR_ERR(event);
6679 goto err; 6275 goto err;
@@ -6780,7 +6376,6 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6780 * our context. 6376 * our context.
6781 */ 6377 */
6782 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); 6378 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
6783 task_ctx_sched_out(child_ctx, EVENT_ALL);
6784 6379
6785 /* 6380 /*
6786 * Take the context lock here so that if find_get_context is 6381 * Take the context lock here so that if find_get_context is
@@ -6788,6 +6383,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
6788 * incremented the context's refcount before we do put_ctx below. 6383 * incremented the context's refcount before we do put_ctx below.
6789 */ 6384 */
6790 raw_spin_lock(&child_ctx->lock); 6385 raw_spin_lock(&child_ctx->lock);
6386 task_ctx_sched_out(child_ctx);
6791 child->perf_event_ctxp[ctxn] = NULL; 6387 child->perf_event_ctxp[ctxn] = NULL;
6792 /* 6388 /*
6793 * If this context is a clone; unclone it so it can't get 6389 * If this context is a clone; unclone it so it can't get
@@ -6957,7 +6553,7 @@ inherit_event(struct perf_event *parent_event,
6957 parent_event->cpu, 6553 parent_event->cpu,
6958 child, 6554 child,
6959 group_leader, parent_event, 6555 group_leader, parent_event,
6960 NULL); 6556 NULL, NULL);
6961 if (IS_ERR(child_event)) 6557 if (IS_ERR(child_event))
6962 return child_event; 6558 return child_event;
6963 get_ctx(child_ctx); 6559 get_ctx(child_ctx);
@@ -6984,6 +6580,8 @@ inherit_event(struct perf_event *parent_event,
6984 6580
6985 child_event->ctx = child_ctx; 6581 child_event->ctx = child_ctx;
6986 child_event->overflow_handler = parent_event->overflow_handler; 6582 child_event->overflow_handler = parent_event->overflow_handler;
6583 child_event->overflow_handler_context
6584 = parent_event->overflow_handler_context;
6987 6585
6988 /* 6586 /*
6989 * Precalculate sample_data sizes 6587 * Precalculate sample_data sizes
@@ -7402,26 +7000,12 @@ static int __perf_cgroup_move(void *info)
7402 return 0; 7000 return 0;
7403} 7001}
7404 7002
7405static void perf_cgroup_move(struct task_struct *task) 7003static void
7004perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task)
7406{ 7005{
7407 task_function_call(task, __perf_cgroup_move, task); 7006 task_function_call(task, __perf_cgroup_move, task);
7408} 7007}
7409 7008
7410static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7411 struct cgroup *old_cgrp, struct task_struct *task,
7412 bool threadgroup)
7413{
7414 perf_cgroup_move(task);
7415 if (threadgroup) {
7416 struct task_struct *c;
7417 rcu_read_lock();
7418 list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
7419 perf_cgroup_move(c);
7420 }
7421 rcu_read_unlock();
7422 }
7423}
7424
7425static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, 7009static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7426 struct cgroup *old_cgrp, struct task_struct *task) 7010 struct cgroup *old_cgrp, struct task_struct *task)
7427{ 7011{
@@ -7433,7 +7017,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
7433 if (!(task->flags & PF_EXITING)) 7017 if (!(task->flags & PF_EXITING))
7434 return; 7018 return;
7435 7019
7436 perf_cgroup_move(task); 7020 perf_cgroup_attach_task(cgrp, task);
7437} 7021}
7438 7022
7439struct cgroup_subsys perf_subsys = { 7023struct cgroup_subsys perf_subsys = {
@@ -7442,6 +7026,6 @@ struct cgroup_subsys perf_subsys = {
7442 .create = perf_cgroup_create, 7026 .create = perf_cgroup_create,
7443 .destroy = perf_cgroup_destroy, 7027 .destroy = perf_cgroup_destroy,
7444 .exit = perf_cgroup_exit, 7028 .exit = perf_cgroup_exit,
7445 .attach = perf_cgroup_attach, 7029 .attach_task = perf_cgroup_attach_task,
7446}; 7030};
7447#endif /* CONFIG_CGROUP_PERF */ 7031#endif /* CONFIG_CGROUP_PERF */
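
The core.c changes above remove the nmi argument from the overflow path (pending disable/wakeup work is now always deferred through irq_work) and thread an opaque context pointer through perf_event_alloc() to in-kernel users. A minimal sketch of a caller of the reworked interface follows; it is not part of this patch, and my_ctx, my_overflow(), create_demo_counter() and the attribute values are illustrative assumptions only:

#include <linux/kernel.h>
#include <linux/perf_event.h>

/* Hypothetical per-counter state, handed in as the new "context" argument. */
struct my_ctx {
        const char *name;
};

/* perf_overflow_handler_t after this series: the nmi flag is gone. */
static void my_overflow(struct perf_event *event,
                        struct perf_sample_data *data,
                        struct pt_regs *regs)
{
        struct my_ctx *mc = event->overflow_handler_context;

        pr_debug("%s overflowed\n", mc->name);
}

static struct my_ctx demo_ctx = { .name = "demo" };

static struct perf_event *create_demo_counter(int cpu)
{
        struct perf_event_attr attr = {
                .type           = PERF_TYPE_SOFTWARE,
                .config         = PERF_COUNT_SW_CPU_CLOCK,
                .size           = sizeof(attr),
                .sample_period  = 1000000,
        };

        /* New signature: overflow handler plus an opaque context pointer. */
        return perf_event_create_kernel_counter(&attr, cpu, NULL,
                                                my_overflow, &demo_ctx);
}
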
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..b7971d6f38bf 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -431,9 +431,11 @@ int register_perf_hw_breakpoint(struct perf_event *bp)
431struct perf_event * 431struct perf_event *
432register_user_hw_breakpoint(struct perf_event_attr *attr, 432register_user_hw_breakpoint(struct perf_event_attr *attr,
433 perf_overflow_handler_t triggered, 433 perf_overflow_handler_t triggered,
434 void *context,
434 struct task_struct *tsk) 435 struct task_struct *tsk)
435{ 436{
436 return perf_event_create_kernel_counter(attr, -1, tsk, triggered); 437 return perf_event_create_kernel_counter(attr, -1, tsk, triggered,
438 context);
437} 439}
438EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); 440EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
439 441
@@ -502,7 +504,8 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint);
502 */ 504 */
503struct perf_event * __percpu * 505struct perf_event * __percpu *
504register_wide_hw_breakpoint(struct perf_event_attr *attr, 506register_wide_hw_breakpoint(struct perf_event_attr *attr,
505 perf_overflow_handler_t triggered) 507 perf_overflow_handler_t triggered,
508 void *context)
506{ 509{
507 struct perf_event * __percpu *cpu_events, **pevent, *bp; 510 struct perf_event * __percpu *cpu_events, **pevent, *bp;
508 long err; 511 long err;
@@ -515,7 +518,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
515 get_online_cpus(); 518 get_online_cpus();
516 for_each_online_cpu(cpu) { 519 for_each_online_cpu(cpu) {
517 pevent = per_cpu_ptr(cpu_events, cpu); 520 pevent = per_cpu_ptr(cpu_events, cpu);
518 bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); 521 bp = perf_event_create_kernel_counter(attr, cpu, NULL,
522 triggered, context);
519 523
520 *pevent = bp; 524 *pevent = bp;
521 525
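
With the extra context argument, in-kernel breakpoint users can pass their private data at registration time instead of stashing it globally. A hedged sketch of the updated register_wide_hw_breakpoint() call; watched_data, hbp_triggered() and install_watchpoint() are illustrative names, and the attribute setup follows the usual hw_breakpoint pattern rather than anything in this patch:

#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/hw_breakpoint.h>
#include <linux/perf_event.h>

static int watched_data;                        /* illustrative watch target */
static struct perf_event * __percpu *hbp;

/* Same three-argument handler as above; the registration-time context is
 * available as bp->overflow_handler_context if the handler needs it. */
static void hbp_triggered(struct perf_event *bp,
                          struct perf_sample_data *data,
                          struct pt_regs *regs)
{
        pr_info("watched_data was written\n");
}

static int install_watchpoint(void)
{
        struct perf_event_attr attr;

        hw_breakpoint_init(&attr);
        attr.bp_addr = (unsigned long)&watched_data;
        attr.bp_len  = HW_BREAKPOINT_LEN_4;
        attr.bp_type = HW_BREAKPOINT_W;

        /* New final argument: an opaque context pointer (unused here). */
        hbp = register_wide_hw_breakpoint(&attr, hbp_triggered, NULL);
        if (IS_ERR((void __force *)hbp))
                return PTR_ERR((void __force *)hbp);
        return 0;
}
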
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
new file mode 100644
index 000000000000..09097dd8116c
--- /dev/null
+++ b/kernel/events/internal.h
@@ -0,0 +1,96 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H
3
4#define RING_BUFFER_WRITABLE 0x01
5
6struct ring_buffer {
7 atomic_t refcount;
8 struct rcu_head rcu_head;
9#ifdef CONFIG_PERF_USE_VMALLOC
10 struct work_struct work;
11 int page_order; /* allocation order */
12#endif
13 int nr_pages; /* nr of data pages */
14 int writable; /* are we writable */
15
16 atomic_t poll; /* POLL_ for wakeups */
17
18 local_t head; /* write position */
19 local_t nest; /* nested writers */
20 local_t events; /* event limit */
21 local_t wakeup; /* wakeup stamp */
22 local_t lost; /* nr records lost */
23
24 long watermark; /* wakeup watermark */
25
26 struct perf_event_mmap_page *user_page;
27 void *data_pages[0];
28};
29
30extern void rb_free(struct ring_buffer *rb);
31extern struct ring_buffer *
32rb_alloc(int nr_pages, long watermark, int cpu, int flags);
33extern void perf_event_wakeup(struct perf_event *event);
34
35extern void
36perf_event_header__init_id(struct perf_event_header *header,
37 struct perf_sample_data *data,
38 struct perf_event *event);
39extern void
40perf_event__output_id_sample(struct perf_event *event,
41 struct perf_output_handle *handle,
42 struct perf_sample_data *sample);
43
44extern struct page *
45perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff);
46
47#ifdef CONFIG_PERF_USE_VMALLOC
48/*
49 * Back perf_mmap() with vmalloc memory.
50 *
51 * Required for architectures that have d-cache aliasing issues.
52 */
53
54static inline int page_order(struct ring_buffer *rb)
55{
56 return rb->page_order;
57}
58
59#else
60
61static inline int page_order(struct ring_buffer *rb)
62{
63 return 0;
64}
65#endif
66
67static unsigned long perf_data_size(struct ring_buffer *rb)
68{
69 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
70}
71
72static inline void
73__output_copy(struct perf_output_handle *handle,
74 const void *buf, unsigned int len)
75{
76 do {
77 unsigned long size = min_t(unsigned long, handle->size, len);
78
79 memcpy(handle->addr, buf, size);
80
81 len -= size;
82 handle->addr += size;
83 buf += size;
84 handle->size -= size;
85 if (!handle->size) {
86 struct ring_buffer *rb = handle->rb;
87
88 handle->page++;
89 handle->page &= rb->nr_pages - 1;
90 handle->addr = rb->data_pages[handle->page];
91 handle->size = PAGE_SIZE << page_order(rb);
92 }
93 } while (len);
94}
95
96#endif /* _KERNEL_EVENTS_INTERNAL_H */
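
__output_copy() above relies on rb->nr_pages being a power of two: when the space left in the current page runs out, the page index is advanced and wrapped with a mask rather than a modulo. A self-contained userspace model of that loop, assuming stand-in NR_PAGES/PAGE_SZ values and an output_copy() helper that are illustrative only:

#include <stdio.h>
#include <string.h>

#define NR_PAGES 4      /* stand-in for rb->nr_pages: must be a power of two */
#define PAGE_SZ  8      /* stand-in for PAGE_SIZE << page_order(rb) */

static char pages[NR_PAGES][PAGE_SZ];

static void output_copy(unsigned int page, unsigned int used,
                        const char *buf, unsigned int len)
{
        unsigned int space = PAGE_SZ - used;    /* room left in this page */
        char *addr = pages[page] + used;

        do {
                unsigned int size = len < space ? len : space;

                memcpy(addr, buf, size);
                len   -= size;
                buf   += size;
                addr  += size;
                space -= size;
                if (!space) {                   /* page full: advance and wrap */
                        page = (page + 1) & (NR_PAGES - 1);
                        addr = pages[page];
                        space = PAGE_SZ;
                }
        } while (len);
}

int main(void)
{
        /* 20 bytes starting 6 bytes into the last page wrap into pages 0..2 */
        output_copy(3, 6, "abcdefghijklmnopqrst", 20);
        printf("page0=%.8s page1=%.8s page2=%.8s\n",
               pages[0], pages[1], pages[2]);
        return 0;
}

The masking trick is safe because the mmap path only accepts a power-of-two number of data pages; perf_output_begin() in ring_buffer.c below uses the same wrap when it computes handle->page.
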
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
new file mode 100644
index 000000000000..a2a29205cc0f
--- /dev/null
+++ b/kernel/events/ring_buffer.c
@@ -0,0 +1,380 @@
1/*
2 * Performance events ring-buffer code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/vmalloc.h>
14#include <linux/slab.h>
15
16#include "internal.h"
17
18static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
19 unsigned long offset, unsigned long head)
20{
21 unsigned long mask;
22
23 if (!rb->writable)
24 return true;
25
26 mask = perf_data_size(rb) - 1;
27
28 offset = (offset - tail) & mask;
29 head = (head - tail) & mask;
30
31 if ((int)(head - offset) < 0)
32 return false;
33
34 return true;
35}
36
37static void perf_output_wakeup(struct perf_output_handle *handle)
38{
39 atomic_set(&handle->rb->poll, POLL_IN);
40
41 handle->event->pending_wakeup = 1;
42 irq_work_queue(&handle->event->pending);
43}
44
45/*
46 * We need to ensure a later event_id doesn't publish a head when a former
47 * event isn't done writing. However since we need to deal with NMIs we
48 * cannot fully serialize things.
49 *
50 * We only publish the head (and generate a wakeup) when the outer-most
51 * event completes.
52 */
53static void perf_output_get_handle(struct perf_output_handle *handle)
54{
55 struct ring_buffer *rb = handle->rb;
56
57 preempt_disable();
58 local_inc(&rb->nest);
59 handle->wakeup = local_read(&rb->wakeup);
60}
61
62static void perf_output_put_handle(struct perf_output_handle *handle)
63{
64 struct ring_buffer *rb = handle->rb;
65 unsigned long head;
66
67again:
68 head = local_read(&rb->head);
69
70 /*
71 * IRQ/NMI can happen here, which means we can miss a head update.
72 */
73
74 if (!local_dec_and_test(&rb->nest))
75 goto out;
76
77 /*
78 * Publish the known good head. Rely on the full barrier implied
79 * by atomic_dec_and_test() to order the rb->head read and this
80 * write.
81 */
82 rb->user_page->data_head = head;
83
84 /*
85 * Now check if we missed an update, rely on the (compiler)
86 * barrier in atomic_dec_and_test() to re-read rb->head.
87 */
88 if (unlikely(head != local_read(&rb->head))) {
89 local_inc(&rb->nest);
90 goto again;
91 }
92
93 if (handle->wakeup != local_read(&rb->wakeup))
94 perf_output_wakeup(handle);
95
96out:
97 preempt_enable();
98}
99
100int perf_output_begin(struct perf_output_handle *handle,
101 struct perf_event *event, unsigned int size)
102{
103 struct ring_buffer *rb;
104 unsigned long tail, offset, head;
105 int have_lost;
106 struct perf_sample_data sample_data;
107 struct {
108 struct perf_event_header header;
109 u64 id;
110 u64 lost;
111 } lost_event;
112
113 rcu_read_lock();
114 /*
115 * For inherited events we send all the output towards the parent.
116 */
117 if (event->parent)
118 event = event->parent;
119
120 rb = rcu_dereference(event->rb);
121 if (!rb)
122 goto out;
123
124 handle->rb = rb;
125 handle->event = event;
126
127 if (!rb->nr_pages)
128 goto out;
129
130 have_lost = local_read(&rb->lost);
131 if (have_lost) {
132 lost_event.header.size = sizeof(lost_event);
133 perf_event_header__init_id(&lost_event.header, &sample_data,
134 event);
135 size += lost_event.header.size;
136 }
137
138 perf_output_get_handle(handle);
139
140 do {
141 /*
142 * Userspace could choose to issue a mb() before updating the
143 * tail pointer. So that all reads will be completed before the
144 * write is issued.
145 */
146 tail = ACCESS_ONCE(rb->user_page->data_tail);
147 smp_rmb();
148 offset = head = local_read(&rb->head);
149 head += size;
150 if (unlikely(!perf_output_space(rb, tail, offset, head)))
151 goto fail;
152 } while (local_cmpxchg(&rb->head, offset, head) != offset);
153
154 if (head - local_read(&rb->wakeup) > rb->watermark)
155 local_add(rb->watermark, &rb->wakeup);
156
157 handle->page = offset >> (PAGE_SHIFT + page_order(rb));
158 handle->page &= rb->nr_pages - 1;
159 handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
160 handle->addr = rb->data_pages[handle->page];
161 handle->addr += handle->size;
162 handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
163
164 if (have_lost) {
165 lost_event.header.type = PERF_RECORD_LOST;
166 lost_event.header.misc = 0;
167 lost_event.id = event->id;
168 lost_event.lost = local_xchg(&rb->lost, 0);
169
170 perf_output_put(handle, lost_event);
171 perf_event__output_id_sample(event, handle, &sample_data);
172 }
173
174 return 0;
175
176fail:
177 local_inc(&rb->lost);
178 perf_output_put_handle(handle);
179out:
180 rcu_read_unlock();
181
182 return -ENOSPC;
183}
184
185void perf_output_copy(struct perf_output_handle *handle,
186 const void *buf, unsigned int len)
187{
188 __output_copy(handle, buf, len);
189}
190
191void perf_output_end(struct perf_output_handle *handle)
192{
193 perf_output_put_handle(handle);
194 rcu_read_unlock();
195}
196
197static void
198ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
199{
200 long max_size = perf_data_size(rb);
201
202 if (watermark)
203 rb->watermark = min(max_size, watermark);
204
205 if (!rb->watermark)
206 rb->watermark = max_size / 2;
207
208 if (flags & RING_BUFFER_WRITABLE)
209 rb->writable = 1;
210
211 atomic_set(&rb->refcount, 1);
212}
213
214#ifndef CONFIG_PERF_USE_VMALLOC
215
216/*
217 * Back perf_mmap() with regular GFP_KERNEL-0 pages.
218 */
219
220struct page *
221perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
222{
223 if (pgoff > rb->nr_pages)
224 return NULL;
225
226 if (pgoff == 0)
227 return virt_to_page(rb->user_page);
228
229 return virt_to_page(rb->data_pages[pgoff - 1]);
230}
231
232static void *perf_mmap_alloc_page(int cpu)
233{
234 struct page *page;
235 int node;
236
237 node = (cpu == -1) ? cpu : cpu_to_node(cpu);
238 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0);
239 if (!page)
240 return NULL;
241
242 return page_address(page);
243}
244
245struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
246{
247 struct ring_buffer *rb;
248 unsigned long size;
249 int i;
250
251 size = sizeof(struct ring_buffer);
252 size += nr_pages * sizeof(void *);
253
254 rb = kzalloc(size, GFP_KERNEL);
255 if (!rb)
256 goto fail;
257
258 rb->user_page = perf_mmap_alloc_page(cpu);
259 if (!rb->user_page)
260 goto fail_user_page;
261
262 for (i = 0; i < nr_pages; i++) {
263 rb->data_pages[i] = perf_mmap_alloc_page(cpu);
264 if (!rb->data_pages[i])
265 goto fail_data_pages;
266 }
267
268 rb->nr_pages = nr_pages;
269
270 ring_buffer_init(rb, watermark, flags);
271
272 return rb;
273
274fail_data_pages:
275 for (i--; i >= 0; i--)
276 free_page((unsigned long)rb->data_pages[i]);
277
278 free_page((unsigned long)rb->user_page);
279
280fail_user_page:
281 kfree(rb);
282
283fail:
284 return NULL;
285}
286
287static void perf_mmap_free_page(unsigned long addr)
288{
289 struct page *page = virt_to_page((void *)addr);
290
291 page->mapping = NULL;
292 __free_page(page);
293}
294
295void rb_free(struct ring_buffer *rb)
296{
297 int i;
298
299 perf_mmap_free_page((unsigned long)rb->user_page);
300 for (i = 0; i < rb->nr_pages; i++)
301 perf_mmap_free_page((unsigned long)rb->data_pages[i]);
302 kfree(rb);
303}
304
305#else
306
307struct page *
308perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff)
309{
310 if (pgoff > (1UL << page_order(rb)))
311 return NULL;
312
313 return vmalloc_to_page((void *)rb->user_page + pgoff * PAGE_SIZE);
314}
315
316static void perf_mmap_unmark_page(void *addr)
317{
318 struct page *page = vmalloc_to_page(addr);
319
320 page->mapping = NULL;
321}
322
323static void rb_free_work(struct work_struct *work)
324{
325 struct ring_buffer *rb;
326 void *base;
327 int i, nr;
328
329 rb = container_of(work, struct ring_buffer, work);
330 nr = 1 << page_order(rb);
331
332 base = rb->user_page;
333 for (i = 0; i < nr + 1; i++)
334 perf_mmap_unmark_page(base + (i * PAGE_SIZE));
335
336 vfree(base);
337 kfree(rb);
338}
339
340void rb_free(struct ring_buffer *rb)
341{
342 schedule_work(&rb->work);
343}
344
345struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
346{
347 struct ring_buffer *rb;
348 unsigned long size;
349 void *all_buf;
350
351 size = sizeof(struct ring_buffer);
352 size += sizeof(void *);
353
354 rb = kzalloc(size, GFP_KERNEL);
355 if (!rb)
356 goto fail;
357
358 INIT_WORK(&rb->work, rb_free_work);
359
360 all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
361 if (!all_buf)
362 goto fail_all_buf;
363
364 rb->user_page = all_buf;
365 rb->data_pages[0] = all_buf + PAGE_SIZE;
366 rb->page_order = ilog2(nr_pages);
367 rb->nr_pages = 1;
368
369 ring_buffer_init(rb, watermark, flags);
370
371 return rb;
372
373fail_all_buf:
374 kfree(rb);
375
376fail:
377 return NULL;
378}
379
380#endif
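
perf_output_begin() above notes that userspace "could choose to issue a mb() before updating the tail pointer". For reference, a hedged sketch of the matching consumer side of the mmap()ed buffer; drain_ring() is an illustrative helper, and __sync_synchronize() merely stands in for the rmb()/mb() pairing the comment describes:

#include <linux/perf_event.h>   /* struct perf_event_mmap_page, perf_event_header */
#include <stdint.h>
#include <unistd.h>

/* Drain the records the kernel writer has already published.  "base" is the
 * start of the mmap()ed area (one user page followed by the data pages);
 * "data_size" is the size of the data area and, like the kernel side, is a
 * power of two so "& (data_size - 1)" wraps. */
static void drain_ring(void *base, uint64_t data_size,
                       void (*consume)(struct perf_event_header *))
{
        struct perf_event_mmap_page *up = base;
        char *data = (char *)base + sysconf(_SC_PAGESIZE);
        uint64_t tail = up->data_tail;
        uint64_t head = up->data_head;

        __sync_synchronize();   /* read the data only after reading data_head */

        while (tail < head) {
                struct perf_event_header *hdr = (struct perf_event_header *)
                        (data + (tail & (data_size - 1)));

                /* A record may itself wrap at the end of the data area; a
                 * real reader copies it out piecewise.  Omitted for brevity. */
                consume(hdr);
                tail += hdr->size;
        }

        __sync_synchronize();   /* complete all reads ...                  */
        up->data_tail = tail;   /* ... before publishing the new tail      */
}

The kernel enforces the other half of the contract: perf_output_put_handle() publishes data_head only once the outermost writer is done, and perf_output_begin() reads data_tail with ACCESS_ONCE()/smp_rmb() before deciding whether a record fits.
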