author    Linus Torvalds <torvalds@linux-foundation.org>  2016-02-28 10:52:00 -0500
committer Linus Torvalds <torvalds@linux-foundation.org>  2016-02-28 10:52:00 -0500
commit    1b9540ce033ad15802e36ad1cd1c36bdad98eeea (patch)
tree      4b6d5484b15a9a9ca8ff64f7444705600d0cbb68
parent    4b696dcb1a55e40648ad0eec4af991c72f945a85 (diff)
parent    0da4cf3e0a68c97ef811569804616a811f786729 (diff)
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Thomas Gleixner:
 "A rather largish series of 12 patches addressing a maze of race
  conditions in the perf core code from Peter Zijlstra"

* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  perf: Robustify task_function_call()
  perf: Fix scaling vs. perf_install_in_context()
  perf: Fix scaling vs. perf_event_enable()
  perf: Fix scaling vs. perf_event_enable_on_exec()
  perf: Fix ctx time tracking by introducing EVENT_TIME
  perf: Cure event->pending_disable race
  perf: Fix race between event install and jump_labels
  perf: Fix cloning
  perf: Only update context time when active
  perf: Allow perf_release() with !event->ctx
  perf: Do not double free
  perf: Close install vs. exit race
-rw-r--r--  include/linux/perf_event.h |   7
-rw-r--r--  kernel/events/core.c       | 368
2 files changed, 244 insertions(+), 131 deletions(-)
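Among the pulled patches, "perf: Fix race between event install and jump_labels" (the account_event()/unaccount_event() hunks further down) replaces the rate-limited static key with a mutex plus reference count: the first enabler flips the key under perf_sched_mutex and waits for every CPU to observe it before publishing the count, and later enablers take a lock-free fast path. Below is a minimal user-space sketch of that accounting pattern, assuming C11 atomics and pthreads; the names (account(), unaccount(), sched_count, sched_enabled) are illustrative stand-ins rather than kernel APIs, and synchronize_sched() has no user-space equivalent, so the slow path simply publishes under the mutex.

/*
 * Sketch of the jump-label accounting pattern, not kernel code.
 * sched_enabled stands in for the static key, sched_count for
 * perf_sched_count.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;
static atomic_int sched_count;      /* number of live "events" */
static atomic_bool sched_enabled;   /* stand-in for the static key */

static void account(void)
{
	int old = atomic_load(&sched_count);

	/* Fast path: someone already published the flag (atomic_inc_not_zero). */
	while (old > 0) {
		if (atomic_compare_exchange_weak(&sched_count, &old, old + 1))
			return;
	}

	/* Slow path: first enabler publishes the flag under the mutex. */
	pthread_mutex_lock(&sched_mutex);
	if (!atomic_load(&sched_count))
		atomic_store(&sched_enabled, true);
	/*
	 * The kernel calls synchronize_sched() here so all CPUs observe the
	 * key before any event relying on it is installed; this sketch only
	 * has the seq_cst store above.
	 */
	atomic_fetch_add(&sched_count, 1);
	pthread_mutex_unlock(&sched_mutex);
}

static void unaccount(void)
{
	int old = atomic_load(&sched_count);

	/* Fast path: decrement unless we are the last user (atomic_add_unless). */
	while (old > 1) {
		if (atomic_compare_exchange_weak(&sched_count, &old, old - 1))
			return;
	}

	/* Last reference clears the flag; the kernel defers this to delayed work. */
	pthread_mutex_lock(&sched_mutex);
	if (atomic_fetch_sub(&sched_count, 1) == 1)
		atomic_store(&sched_enabled, false);
	pthread_mutex_unlock(&sched_mutex);
}

int main(void)
{
	account();
	printf("enabled=%d count=%d\n",
	       (int)atomic_load(&sched_enabled), atomic_load(&sched_count));
	unaccount();
	printf("enabled=%d count=%d\n",
	       (int)atomic_load(&sched_enabled), atomic_load(&sched_count));
	return 0;
}

The CAS loops mirror the patch's atomic_inc_not_zero() and atomic_add_unless(&perf_sched_count, -1, 1) fast paths; the kernel additionally defers the final static_branch_disable() to delayed work, which the sketch does inline for brevity.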
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b35a61a481fa..f5c5a3fa2c81 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -397,6 +397,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_DEAD		= -4,
 	PERF_EVENT_STATE_EXIT		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
@@ -905,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 	}
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
@@ -924,7 +925,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_in(prev, task);
 
 	if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -941,7 +942,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
 	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0d58522103cd..614614821f00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -64,8 +64,17 @@ static void remote_function(void *data)
 	struct task_struct *p = tfc->p;
 
 	if (p) {
-		tfc->ret = -EAGAIN;
-		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+		/* -EAGAIN */
+		if (task_cpu(p) != smp_processor_id())
+			return;
+
+		/*
+		 * Now that we're on right CPU with IRQs disabled, we can test
+		 * if we hit the right task without races.
+		 */
+
+		tfc->ret = -ESRCH; /* No such (running) process */
+		if (p != current)
 			return;
 	}
 
@@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
 		.p	= p,
 		.func	= func,
 		.info	= info,
-		.ret	= -ESRCH, /* No such (running) process */
+		.ret	= -EAGAIN,
 	};
+	int ret;
 
-	if (task_curr(p))
-		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+	do {
+		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+		if (!ret)
+			ret = data.ret;
+	} while (ret == -EAGAIN);
 
-	return data.ret;
+	return ret;
 }
 
 /**
@@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event)
  * rely on ctx->is_active and therefore cannot use event_function_call().
  * See perf_install_in_context().
  *
- * This is because we need a ctx->lock serialized variable (ctx->is_active)
- * to reliably determine if a particular task/context is scheduled in. The
- * task_curr() use in task_function_call() is racy in that a remote context
- * switch is not a single atomic operation.
- *
- * As is, the situation is 'safe' because we set rq->curr before we do the
- * actual context switch. This means that task_curr() will fail early, but
- * we'll continue spinning on ctx->is_active until we've passed
- * perf_event_task_sched_out().
- *
- * Without this ctx->lock serialized variable we could have race where we find
- * the task (and hence the context) would not be active while in fact they are.
- *
  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
  */
 
@@ -212,7 +212,7 @@ static int event_function(void *info)
 	 */
 	if (ctx->task) {
 		if (ctx->task != current) {
-			ret = -EAGAIN;
+			ret = -ESRCH;
 			goto unlock;
 		}
 
@@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 		return;
 	}
 
-again:
 	if (task == TASK_TOMBSTONE)
 		return;
 
+again:
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
@@ -289,13 +289,15 @@ again:
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
-	if (task != TASK_TOMBSTONE) {
-		if (ctx->is_active) {
-			raw_spin_unlock_irq(&ctx->lock);
-			goto again;
-		}
-		func(event, NULL, ctx, data);
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
 	}
+	if (ctx->is_active) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -314,6 +316,7 @@ again:
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
+	EVENT_TIME = 0x4,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -321,7 +324,13 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -1288,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	u64 run_end;
 
+	lockdep_assert_held(&ctx->lock);
+
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
+
 	/*
 	 * in cgroup mode, time_enabled represents
 	 * the time the event was enabled AND active
@@ -1645,7 +1656,7 @@ out:
 
 static bool is_orphaned_event(struct perf_event *event)
 {
-	return event->state == PERF_EVENT_STATE_EXIT;
+	return event->state == PERF_EVENT_STATE_DEAD;
 }
 
 static inline int pmu_filter_match(struct perf_event *event)
@@ -1690,14 +1701,14 @@ event_sched_out(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_stopped = tstamp;
+	event->pmu->del(event, 0);
+	event->oncpu = -1;
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = tstamp;
-	event->pmu->del(event, 0);
-	event->oncpu = -1;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
@@ -1732,7 +1743,6 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP	0x01UL
-#define DETACH_STATE	0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -1752,8 +1762,6 @@ __perf_remove_from_context(struct perf_event *event,
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_STATE)
-		event->state = PERF_EVENT_STATE_EXIT;
 
 	if (!ctx->nr_events && ctx->is_active) {
 		ctx->is_active = 0;
@@ -2063,14 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
 	     struct task_struct *task);
 
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			       struct perf_event_context *ctx)
+{
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
 				struct task_struct *task)
@@ -2097,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int __perf_install_in_context(void *info)
 {
-	struct perf_event_context *ctx = info;
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	bool activate = true;
+	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
 		raw_spin_lock(&ctx->lock);
-		/*
-		 * If we hit the 'wrong' task, we've since scheduled and
-		 * everything should be sorted, nothing to do!
-		 */
 		task_ctx = ctx;
-		if (ctx->task != current)
+
+		/* If we're on the wrong CPU, try again */
+		if (task_cpu(ctx->task) != smp_processor_id()) {
+			ret = -ESRCH;
 			goto unlock;
+		}
 
 		/*
-		 * If task_ctx is set, it had better be to us.
+		 * If we're on the right CPU, see if the task we target is
+		 * current, if not we don't have to activate the ctx, a future
+		 * context switch will do that for us.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
+		if (ctx->task != current)
+			activate = false;
+		else
+			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	ctx_resched(cpuctx, task_ctx);
+	if (activate) {
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		add_event_to_ctx(event, ctx);
+		ctx_resched(cpuctx, task_ctx);
+	} else {
+		add_event_to_ctx(event, ctx);
+	}
+
 unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 /*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
+ *
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
-	struct task_struct *task = NULL;
+	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
 
@@ -2147,40 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx,
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	if (!task) {
+		cpu_function_call(cpu, __perf_install_in_context, event);
+		return;
+	}
+
+	/*
+	 * Should not happen, we validate the ctx is still alive before calling.
+	 */
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+		return;
+
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
-	 *
-	 * So what we do is we add the event to the list here, which will allow
-	 * a future context switch to DTRT and then send a racy IPI. If the IPI
-	 * fails to hit the right task, this means a context switch must have
-	 * happened and that will have taken care of business.
 	 */
-	raw_spin_lock_irq(&ctx->lock);
-	task = ctx->task;
+again:
 	/*
-	 * Worse, we cannot even rely on the ctx actually existing anymore. If
-	 * between find_get_context() and perf_install_in_context() the task
-	 * went through perf_event_exit_task() its dead and we should not be
-	 * adding new events.
+	 * Cannot use task_function_call() because we need to run on the task's
+	 * CPU regardless of whether its current or not.
 	 */
-	if (task == TASK_TOMBSTONE) {
+	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+		/*
+		 * Cannot happen because we already checked above (which also
+		 * cannot happen), and we hold ctx->mutex, which serializes us
+		 * against perf_event_exit_task_context().
+		 */
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	update_context_time(ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 	/*
-	 * Update cgrp time only if current cgrp matches event->cgrp.
-	 * Must be done before calling add_event_to_ctx().
+	 * Since !ctx->is_active doesn't mean anything, we must IPI
+	 * unconditionally.
 	 */
-	update_cgrp_time_from_event(event);
-	add_event_to_ctx(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
-
-	if (task)
-		task_function_call(task, __perf_install_in_context, ctx);
-	else
-		cpu_function_call(cpu, __perf_install_in_context, ctx);
+	goto again;
 }
 
 /*
@@ -2219,17 +2265,18 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	update_context_time(ctx);
+	if (ctx->is_active)
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+
 	__perf_event_mark_enabled(event);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event)) {
-			perf_cgroup_set_timestamp(current, ctx); // XXX ?
+		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
-		}
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
 
@@ -2237,8 +2284,10 @@ static void __perf_event_enable(struct perf_event *event,
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
+	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -2344,24 +2393,33 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	}
 
 	ctx->is_active &= ~event_type;
+	if (!(ctx->is_active & EVENT_ALL))
+		ctx->is_active = 0;
+
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 		if (!ctx->is_active)
 			cpuctx->task_ctx = NULL;
 	}
 
-	update_context_time(ctx);
-	update_cgrp_time_from_cpuctx(cpuctx);
-	if (!ctx->nr_active)
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* update (and stop) ctx time */
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
+	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(ctx->pmu);
-	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+	if (is_active & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
-	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+	if (is_active & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
@@ -2641,18 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_sched_out(task, next);
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
-{
-	if (!cpuctx->task_ctx)
-		return;
-
-	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-		return;
-
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
@@ -2735,7 +2781,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		return;
 
-	ctx->is_active |= event_type;
+	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
 			cpuctx->task_ctx = ctx;
@@ -2743,18 +2789,24 @@ ctx_sched_in(struct perf_event_context *ctx,
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	now = perf_clock();
-	ctx->timestamp = now;
-	perf_cgroup_set_timestamp(task, ctx);
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* start ctx time */
+		now = perf_clock();
+		ctx->timestamp = now;
+		perf_cgroup_set_timestamp(task, ctx);
+	}
+
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+	if (is_active & EVENT_PINNED)
 		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
-	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+	if (is_active & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 }
 
@@ -3120,6 +3172,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		enabled |= event_enable_on_exec(event, ctx);
 
@@ -3537,12 +3590,22 @@ static void unaccount_event(struct perf_event *event)
 	if (has_branch_stack(event))
 		dec = true;
 
-	if (dec)
-		static_key_slow_dec_deferred(&perf_sched_events);
+	if (dec) {
+		if (!atomic_add_unless(&perf_sched_count, -1, 1))
+			schedule_delayed_work(&perf_sched_work, HZ);
+	}
 
 	unaccount_event_cpu(event, event->cpu);
 }
 
+static void perf_sched_delayed(struct work_struct *work)
+{
+	mutex_lock(&perf_sched_mutex);
+	if (atomic_dec_and_test(&perf_sched_count))
+		static_branch_disable(&perf_sched_events);
+	mutex_unlock(&perf_sched_mutex);
+}
+
 /*
  * The following implement mutual exclusion of events on "exclusive" pmus
  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3752,30 +3815,42 @@ static void put_event(struct perf_event *event)
  */
 int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
 
+	/*
+	 * If we got here through err_file: fput(event_file); we will not have
+	 * attached to a context yet.
+	 */
+	if (!ctx) {
+		WARN_ON_ONCE(event->attach_state &
+				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+		goto no_ctx;
+	}
+
 	if (!is_kernel_event(event))
 		perf_remove_from_owner(event);
 
 	ctx = perf_event_ctx_lock(event);
 	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
-	perf_event_ctx_unlock(event, ctx);
+	perf_remove_from_context(event, DETACH_GROUP);
 
+	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
-	 * either from the above perf_remove_from_context() or through
-	 * perf_event_exit_event().
+	 * Mark this even as STATE_DEAD, there is no external reference to it
+	 * anymore.
 	 *
-	 * Therefore, anybody acquiring event->child_mutex after the below
-	 * loop _must_ also see this, most importantly inherit_event() which
-	 * will avoid placing more children on the list.
+	 * Anybody acquiring event->child_mutex after the below loop _must_
+	 * also see this, most importantly inherit_event() which will avoid
+	 * placing more children on the list.
 	 *
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+	event->state = PERF_EVENT_STATE_DEAD;
+	raw_spin_unlock_irq(&ctx->lock);
+
+	perf_event_ctx_unlock(event, ctx);
 
 again:
 	mutex_lock(&event->child_mutex);
@@ -3830,8 +3905,8 @@ again:
 	}
 	mutex_unlock(&event->child_mutex);
 
-	/* Must be the last reference */
-	put_event(event);
+no_ctx:
+	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -3988,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event)
 {
 	bool no_children;
 
-	if (event->state != PERF_EVENT_STATE_EXIT)
+	if (event->state > PERF_EVENT_STATE_EXIT)
 		return false;
 
 	mutex_lock(&event->child_mutex);
@@ -7769,8 +7844,28 @@ static void account_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		inc = true;
 
-	if (inc)
-		static_key_slow_inc(&perf_sched_events.key);
+	if (inc) {
+		if (atomic_inc_not_zero(&perf_sched_count))
+			goto enabled;
+
+		mutex_lock(&perf_sched_mutex);
+		if (!atomic_read(&perf_sched_count)) {
+			static_branch_enable(&perf_sched_events);
+			/*
+			 * Guarantee that all CPUs observe they key change and
+			 * call the perf scheduling hooks before proceeding to
+			 * install events that need them.
+			 */
+			synchronize_sched();
+		}
+		/*
+		 * Now that we have waited for the sync_sched(), allow further
+		 * increments to by-pass the mutex.
+		 */
+		atomic_inc(&perf_sched_count);
+		mutex_unlock(&perf_sched_mutex);
+	}
+enabled:
 
 	account_event_cpu(event, event->cpu);
 }
@@ -8389,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (move_group) {
 		gctx = group_leader->ctx;
 		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		if (gctx->task == TASK_TOMBSTONE) {
+			err = -ESRCH;
+			goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_locked;
+	}
+
 	if (!perf_event_validate_size(event)) {
 		err = -E2BIG;
 		goto err_locked;
@@ -8509,7 +8613,12 @@ err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
-	free_event(event);
+	/*
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (!event_file)
+		free_event(event);
 err_cpus:
 	put_online_cpus();
 err_task:
@@ -8563,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_unlock;
+	}
+
 	if (!exclusive_event_installable(event, ctx)) {
-		mutex_unlock(&ctx->mutex);
-		perf_unpin_context(ctx);
-		put_ctx(ctx);
 		err = -EBUSY;
-		goto err_free;
+		goto err_unlock;
 	}
 
 	perf_install_in_context(ctx, event, cpu);
@@ -8577,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_unlock:
+	mutex_unlock(&ctx->mutex);
+	perf_unpin_context(ctx);
+	put_ctx(ctx);
 err_free:
 	free_event(event);
 err:
@@ -8695,7 +8810,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
@@ -9313,9 +9428,6 @@ void __init perf_event_init(void)
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
-	/* do not patch jump label more than once per second */
-	jump_label_rate_limit(&perf_sched_events, HZ);
-
 	/*
 	 * Build time assertion that we keep the data_head at the intended
 	 * location. IOW, validation we got the __reserved[] size right.