author		Linus Torvalds <torvalds@linux-foundation.org>	2016-02-28 10:52:00 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-02-28 10:52:00 -0500
commit		1b9540ce033ad15802e36ad1cd1c36bdad98eeea (patch)
tree		4b6d5484b15a9a9ca8ff64f7444705600d0cbb68
parent		4b696dcb1a55e40648ad0eec4af991c72f945a85 (diff)
parent		0da4cf3e0a68c97ef811569804616a811f786729 (diff)
Merge branch 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf fixes from Thomas Gleixner:
"A rather largish series of 12 patches addressing a maze of race
conditions in the perf core code from Peter Zijlstra"
* 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
perf: Robustify task_function_call()
perf: Fix scaling vs. perf_install_in_context()
perf: Fix scaling vs. perf_event_enable()
perf: Fix scaling vs. perf_event_enable_on_exec()
perf: Fix ctx time tracking by introducing EVENT_TIME
perf: Cure event->pending_disable race
perf: Fix race between event install and jump_labels
perf: Fix cloning
perf: Only update context time when active
perf: Allow perf_release() with !event->ctx
perf: Do not double free
perf: Close install vs. exit race
-rw-r--r--	include/linux/perf_event.h	7
-rw-r--r--	kernel/events/core.c	368
2 files changed, 244 insertions(+), 131 deletions(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b35a61a481fa..f5c5a3fa2c81 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -397,6 +397,7 @@ struct pmu {
  * enum perf_event_active_state - the states of a event
  */
 enum perf_event_active_state {
+	PERF_EVENT_STATE_DEAD		= -4,
 	PERF_EVENT_STATE_EXIT		= -3,
 	PERF_EVENT_STATE_ERROR		= -2,
 	PERF_EVENT_STATE_OFF		= -1,
@@ -905,7 +906,7 @@ perf_sw_event_sched(u32 event_id, u64 nr, u64 addr)
 	}
 }
 
-extern struct static_key_deferred perf_sched_events;
+extern struct static_key_false perf_sched_events;
 
 static __always_inline bool
 perf_sw_migrate_enabled(void)
@@ -924,7 +925,7 @@ static inline void perf_event_task_migrate(struct task_struct *task)
 static inline void perf_event_task_sched_in(struct task_struct *prev,
 					    struct task_struct *task)
 {
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_in(prev, task);
 
 	if (perf_sw_migrate_enabled() && task->sched_migrated) {
@@ -941,7 +942,7 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 {
 	perf_sw_event_sched(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 0);
 
-	if (static_key_false(&perf_sched_events.key))
+	if (static_branch_unlikely(&perf_sched_events))
 		__perf_event_task_sched_out(prev, next);
 }
 
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0d58522103cd..614614821f00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -64,8 +64,17 @@ static void remote_function(void *data)
 	struct task_struct *p = tfc->p;
 
 	if (p) {
-		tfc->ret = -EAGAIN;
-		if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+		/* -EAGAIN */
+		if (task_cpu(p) != smp_processor_id())
+			return;
+
+		/*
+		 * Now that we're on right CPU with IRQs disabled, we can test
+		 * if we hit the right task without races.
+		 */
+
+		tfc->ret = -ESRCH; /* No such (running) process */
+		if (p != current)
 			return;
 	}
 
@@ -92,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
 		.p	= p,
 		.func	= func,
 		.info	= info,
-		.ret	= -ESRCH, /* No such (running) process */
+		.ret	= -EAGAIN,
 	};
+	int ret;
 
-	if (task_curr(p))
-		smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+	do {
+		ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+		if (!ret)
+			ret = data.ret;
+	} while (ret == -EAGAIN);
 
-	return data.ret;
+	return ret;
 }
 
 /**
@@ -169,19 +182,6 @@ static bool is_kernel_event(struct perf_event *event)
  * rely on ctx->is_active and therefore cannot use event_function_call().
  * See perf_install_in_context().
  *
- * This is because we need a ctx->lock serialized variable (ctx->is_active)
- * to reliably determine if a particular task/context is scheduled in. The
- * task_curr() use in task_function_call() is racy in that a remote context
- * switch is not a single atomic operation.
- *
- * As is, the situation is 'safe' because we set rq->curr before we do the
- * actual context switch. This means that task_curr() will fail early, but
- * we'll continue spinning on ctx->is_active until we've passed
- * perf_event_task_sched_out().
- *
- * Without this ctx->lock serialized variable we could have race where we find
- * the task (and hence the context) would not be active while in fact they are.
- *
  * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
  */
 
@@ -212,7 +212,7 @@ static int event_function(void *info)
 	 */
 	if (ctx->task) {
 		if (ctx->task != current) {
-			ret = -EAGAIN;
+			ret = -ESRCH;
 			goto unlock;
 		}
 
@@ -276,10 +276,10 @@ static void event_function_call(struct perf_event *event, event_f func, void *da
 		return;
 	}
 
-again:
 	if (task == TASK_TOMBSTONE)
 		return;
 
+again:
 	if (!task_function_call(task, event_function, &efs))
 		return;
 
@@ -289,13 +289,15 @@ again:
 	 * a concurrent perf_event_context_sched_out().
 	 */
 	task = ctx->task;
-	if (task != TASK_TOMBSTONE) {
-		if (ctx->is_active) {
-			raw_spin_unlock_irq(&ctx->lock);
-			goto again;
-		}
-		func(event, NULL, ctx, data);
+	if (task == TASK_TOMBSTONE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
 	}
+	if (ctx->is_active) {
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	func(event, NULL, ctx, data);
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
@@ -314,6 +316,7 @@ again:
 enum event_type_t {
 	EVENT_FLEXIBLE = 0x1,
 	EVENT_PINNED = 0x2,
+	EVENT_TIME = 0x4,
 	EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
 };
 
@@ -321,7 +324,13 @@ enum event_type_t {
  * perf_sched_events : >0 events exist
  * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
  */
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
@@ -1288,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
 
 /*
  * Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
  */
 static void update_event_times(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
 	u64 run_end;
 
+	lockdep_assert_held(&ctx->lock);
+
 	if (event->state < PERF_EVENT_STATE_INACTIVE ||
 	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
 		return;
+
 	/*
 	 * in cgroup mode, time_enabled represents
 	 * the time the event was enabled AND active
@@ -1645,7 +1656,7 @@ out:
 
 static bool is_orphaned_event(struct perf_event *event)
 {
-	return event->state == PERF_EVENT_STATE_EXIT;
+	return event->state == PERF_EVENT_STATE_DEAD;
 }
 
 static inline int pmu_filter_match(struct perf_event *event)
@@ -1690,14 +1701,14 @@ event_sched_out(struct perf_event *event,
 
 	perf_pmu_disable(event->pmu);
 
+	event->tstamp_stopped = tstamp;
+	event->pmu->del(event, 0);
+	event->oncpu = -1;
 	event->state = PERF_EVENT_STATE_INACTIVE;
 	if (event->pending_disable) {
 		event->pending_disable = 0;
 		event->state = PERF_EVENT_STATE_OFF;
 	}
-	event->tstamp_stopped = tstamp;
-	event->pmu->del(event, 0);
-	event->oncpu = -1;
 
 	if (!is_software_event(event))
 		cpuctx->active_oncpu--;
@@ -1732,7 +1743,6 @@ group_sched_out(struct perf_event *group_event,
 }
 
 #define DETACH_GROUP	0x01UL
-#define DETACH_STATE	0x02UL
 
 /*
  * Cross CPU call to remove a performance event
@@ -1752,8 +1762,6 @@ __perf_remove_from_context(struct perf_event *event,
 	if (flags & DETACH_GROUP)
 		perf_group_detach(event);
 	list_del_event(event, ctx);
-	if (flags & DETACH_STATE)
-		event->state = PERF_EVENT_STATE_EXIT;
 
 	if (!ctx->nr_events && ctx->is_active) {
 		ctx->is_active = 0;
@@ -2063,14 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
 	event->tstamp_stopped = tstamp;
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+			  struct perf_cpu_context *cpuctx,
+			  enum event_type_t event_type);
 static void
 ctx_sched_in(struct perf_event_context *ctx,
 	     struct perf_cpu_context *cpuctx,
 	     enum event_type_t event_type,
 	     struct task_struct *task);
 
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+			       struct perf_event_context *ctx)
+{
+	if (!cpuctx->task_ctx)
+		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
+	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 				struct perf_event_context *ctx,
 				struct task_struct *task)
@@ -2097,49 +2118,68 @@ static void ctx_resched(struct perf_cpu_context *cpuctx,
 /*
  * Cross CPU call to install and enable a performance event
  *
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
  */
 static int __perf_install_in_context(void *info)
 {
-	struct perf_event_context *ctx = info;
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 	struct perf_event_context *task_ctx = cpuctx->task_ctx;
+	bool activate = true;
+	int ret = 0;
 
 	raw_spin_lock(&cpuctx->ctx.lock);
 	if (ctx->task) {
 		raw_spin_lock(&ctx->lock);
-		/*
-		 * If we hit the 'wrong' task, we've since scheduled and
-		 * everything should be sorted, nothing to do!
-		 */
 		task_ctx = ctx;
-		if (ctx->task != current)
+
+		/* If we're on the wrong CPU, try again */
+		if (task_cpu(ctx->task) != smp_processor_id()) {
+			ret = -ESRCH;
 			goto unlock;
+		}
 
 		/*
-		 * If task_ctx is set, it had better be to us.
+		 * If we're on the right CPU, see if the task we target is
+		 * current, if not we don't have to activate the ctx, a future
+		 * context switch will do that for us.
 		 */
-		WARN_ON_ONCE(cpuctx->task_ctx != ctx && cpuctx->task_ctx);
+		if (ctx->task != current)
+			activate = false;
+		else
+			WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+
 	} else if (task_ctx) {
 		raw_spin_lock(&task_ctx->lock);
 	}
 
-	ctx_resched(cpuctx, task_ctx);
+	if (activate) {
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+		add_event_to_ctx(event, ctx);
+		ctx_resched(cpuctx, task_ctx);
+	} else {
+		add_event_to_ctx(event, ctx);
+	}
+
 unlock:
 	perf_ctx_unlock(cpuctx, task_ctx);
 
-	return 0;
+	return ret;
 }
 
 /*
- * Attach a performance event to a context
+ * Attach a performance event to a context.
+ *
+ * Very similar to event_function_call, see comment there.
  */
 static void
 perf_install_in_context(struct perf_event_context *ctx,
			struct perf_event *event,
			int cpu)
 {
-	struct task_struct *task = NULL;
+	struct task_struct *task = READ_ONCE(ctx->task);
 
 	lockdep_assert_held(&ctx->mutex);
 
@@ -2147,40 +2187,46 @@ perf_install_in_context(struct perf_event_context *ctx,
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	if (!task) {
+		cpu_function_call(cpu, __perf_install_in_context, event);
+		return;
+	}
+
+	/*
+	 * Should not happen, we validate the ctx is still alive before calling.
+	 */
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+		return;
+
 	/*
 	 * Installing events is tricky because we cannot rely on ctx->is_active
 	 * to be set in case this is the nr_events 0 -> 1 transition.
-	 *
-	 * So what we do is we add the event to the list here, which will allow
-	 * a future context switch to DTRT and then send a racy IPI. If the IPI
-	 * fails to hit the right task, this means a context switch must have
-	 * happened and that will have taken care of business.
 	 */
-	raw_spin_lock_irq(&ctx->lock);
-	task = ctx->task;
+again:
 	/*
-	 * Worse, we cannot even rely on the ctx actually existing anymore. If
-	 * between find_get_context() and perf_install_in_context() the task
-	 * went through perf_event_exit_task() its dead and we should not be
-	 * adding new events.
+	 * Cannot use task_function_call() because we need to run on the task's
+	 * CPU regardless of whether its current or not.
 	 */
-	if (task == TASK_TOMBSTONE) {
+	if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	task = ctx->task;
+	if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+		/*
+		 * Cannot happen because we already checked above (which also
+		 * cannot happen), and we hold ctx->mutex, which serializes us
+		 * against perf_event_exit_task_context().
+		 */
 		raw_spin_unlock_irq(&ctx->lock);
 		return;
 	}
-	update_context_time(ctx);
+	raw_spin_unlock_irq(&ctx->lock);
 	/*
-	 * Update cgrp time only if current cgrp matches event->cgrp.
-	 * Must be done before calling add_event_to_ctx().
+	 * Since !ctx->is_active doesn't mean anything, we must IPI
+	 * unconditionally.
 	 */
-	update_cgrp_time_from_event(event);
-	add_event_to_ctx(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
-
-	if (task)
-		task_function_call(task, __perf_install_in_context, ctx);
-	else
-		cpu_function_call(cpu, __perf_install_in_context, ctx);
+	goto again;
 }
 
 /*
@@ -2219,17 +2265,18 @@ static void __perf_event_enable(struct perf_event *event,
 	    event->state <= PERF_EVENT_STATE_ERROR)
 		return;
 
-	update_context_time(ctx);
+	if (ctx->is_active)
+		ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+
 	__perf_event_mark_enabled(event);
 
 	if (!ctx->is_active)
 		return;
 
 	if (!event_filter_match(event)) {
-		if (is_cgroup_event(event)) {
-			perf_cgroup_set_timestamp(current, ctx); // XXX ?
+		if (is_cgroup_event(event))
 			perf_cgroup_defer_enabled(event);
-		}
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
 	}
 
@@ -2237,8 +2284,10 @@ static void __perf_event_enable(struct perf_event *event,
 	 * If the event is in a group and isn't the group leader,
 	 * then don't put it on unless the group is on.
 	 */
-	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
+	if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+		ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
 		return;
+	}
 
 	task_ctx = cpuctx->task_ctx;
 	if (ctx->task)
@@ -2344,24 +2393,33 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 	}
 
 	ctx->is_active &= ~event_type;
+	if (!(ctx->is_active & EVENT_ALL))
+		ctx->is_active = 0;
+
 	if (ctx->task) {
 		WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 		if (!ctx->is_active)
 			cpuctx->task_ctx = NULL;
 	}
 
-	update_context_time(ctx);
-	update_cgrp_time_from_cpuctx(cpuctx);
-	if (!ctx->nr_active)
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* update (and stop) ctx time */
+		update_context_time(ctx);
+		update_cgrp_time_from_cpuctx(cpuctx);
+	}
+
+	if (!ctx->nr_active || !(is_active & EVENT_ALL))
 		return;
 
 	perf_pmu_disable(ctx->pmu);
-	if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+	if (is_active & EVENT_PINNED) {
 		list_for_each_entry(event, &ctx->pinned_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
 
-	if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+	if (is_active & EVENT_FLEXIBLE) {
 		list_for_each_entry(event, &ctx->flexible_groups, group_entry)
 			group_sched_out(event, cpuctx, ctx);
 	}
@@ -2641,18 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
 		perf_cgroup_sched_out(task, next);
 }
 
-static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
-			       struct perf_event_context *ctx)
-{
-	if (!cpuctx->task_ctx)
-		return;
-
-	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
-		return;
-
-	ctx_sched_out(ctx, cpuctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
@@ -2735,7 +2781,7 @@ ctx_sched_in(struct perf_event_context *ctx,
 	if (likely(!ctx->nr_events))
 		return;
 
-	ctx->is_active |= event_type;
+	ctx->is_active |= (event_type | EVENT_TIME);
 	if (ctx->task) {
 		if (!is_active)
 			cpuctx->task_ctx = ctx;
@@ -2743,18 +2789,24 @@ ctx_sched_in(struct perf_event_context *ctx,
 			WARN_ON_ONCE(cpuctx->task_ctx != ctx);
 	}
 
-	now = perf_clock();
-	ctx->timestamp = now;
-	perf_cgroup_set_timestamp(task, ctx);
+	is_active ^= ctx->is_active; /* changed bits */
+
+	if (is_active & EVENT_TIME) {
+		/* start ctx time */
+		now = perf_clock();
+		ctx->timestamp = now;
+		perf_cgroup_set_timestamp(task, ctx);
+	}
+
 	/*
 	 * First go through the list and put on any pinned groups
 	 * in order to give them the best chance of going on.
 	 */
-	if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+	if (is_active & EVENT_PINNED)
 		ctx_pinned_sched_in(ctx, cpuctx);
 
 	/* Then walk through the lower prio flexible groups */
-	if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+	if (is_active & EVENT_FLEXIBLE)
 		ctx_flexible_sched_in(ctx, cpuctx);
 }
 
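Both ctx_sched_out() and ctx_sched_in() above lean on the same idiom: after ctx->is_active has been updated, "is_active ^= ctx->is_active" leaves only the bits that actually flipped, so the EVENT_TIME bookkeeping and the pinned/flexible lists are touched exactly once per real transition. A small standalone illustration of that changed-bits computation follows; the flag values mirror the enum event_type_t hunk earlier, while the helper function and main() are purely illustrative and not kernel code.

#include <stdio.h>

/* Mirrors enum event_type_t from the hunk above. */
#define EVENT_FLEXIBLE 0x1
#define EVENT_PINNED   0x2
#define EVENT_TIME     0x4
#define EVENT_ALL      (EVENT_FLEXIBLE | EVENT_PINNED)

/* Clears @event_type from *is_active_p and returns the bits that flipped. */
static int sched_out_changed_bits(int *is_active_p, int event_type)
{
	int is_active = *is_active_p;      /* old state */

	*is_active_p &= ~event_type;
	if (!(*is_active_p & EVENT_ALL))   /* only EVENT_TIME left? drop it too */
		*is_active_p = 0;

	return is_active ^ *is_active_p;   /* changed bits, as in the diff */
}

int main(void)
{
	int is_active = EVENT_PINNED | EVENT_FLEXIBLE | EVENT_TIME;

	/* Scheduling out only the flexible groups keeps time running: 0x1. */
	printf("changed: %#x\n", sched_out_changed_bits(&is_active, EVENT_FLEXIBLE));

	/* Scheduling out the pinned groups as well stops EVENT_TIME too: 0x6. */
	printf("changed: %#x\n", sched_out_changed_bits(&is_active, EVENT_PINNED));
	return 0;
}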
@@ -3120,6 +3172,7 @@ static void perf_event_enable_on_exec(int ctxn)
 
 	cpuctx = __get_cpu_context(ctx);
 	perf_ctx_lock(cpuctx, ctx);
+	ctx_sched_out(ctx, cpuctx, EVENT_TIME);
 	list_for_each_entry(event, &ctx->event_list, event_entry)
 		enabled |= event_enable_on_exec(event, ctx);
 
@@ -3537,12 +3590,22 @@ static void unaccount_event(struct perf_event *event)
 	if (has_branch_stack(event))
 		dec = true;
 
-	if (dec)
-		static_key_slow_dec_deferred(&perf_sched_events);
+	if (dec) {
+		if (!atomic_add_unless(&perf_sched_count, -1, 1))
+			schedule_delayed_work(&perf_sched_work, HZ);
+	}
 
 	unaccount_event_cpu(event, event->cpu);
 }
 
+static void perf_sched_delayed(struct work_struct *work)
+{
+	mutex_lock(&perf_sched_mutex);
+	if (atomic_dec_and_test(&perf_sched_count))
+		static_branch_disable(&perf_sched_events);
+	mutex_unlock(&perf_sched_mutex);
+}
+
 /*
  * The following implement mutual exclusion of events on "exclusive" pmus
  * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3752,30 +3815,42 @@ static void put_event(struct perf_event *event)
  */
 int perf_event_release_kernel(struct perf_event *event)
 {
-	struct perf_event_context *ctx;
+	struct perf_event_context *ctx = event->ctx;
 	struct perf_event *child, *tmp;
 
+	/*
+	 * If we got here through err_file: fput(event_file); we will not have
+	 * attached to a context yet.
+	 */
+	if (!ctx) {
+		WARN_ON_ONCE(event->attach_state &
+				(PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+		goto no_ctx;
+	}
+
 	if (!is_kernel_event(event))
 		perf_remove_from_owner(event);
 
 	ctx = perf_event_ctx_lock(event);
 	WARN_ON_ONCE(ctx->parent_ctx);
-	perf_remove_from_context(event, DETACH_GROUP | DETACH_STATE);
-	perf_event_ctx_unlock(event, ctx);
+	perf_remove_from_context(event, DETACH_GROUP);
 
+	raw_spin_lock_irq(&ctx->lock);
 	/*
-	 * At this point we must have event->state == PERF_EVENT_STATE_EXIT,
-	 * either from the above perf_remove_from_context() or through
-	 * perf_event_exit_event().
+	 * Mark this even as STATE_DEAD, there is no external reference to it
+	 * anymore.
 	 *
-	 * Therefore, anybody acquiring event->child_mutex after the below
-	 * loop _must_ also see this, most importantly inherit_event() which
-	 * will avoid placing more children on the list.
+	 * Anybody acquiring event->child_mutex after the below loop _must_
+	 * also see this, most importantly inherit_event() which will avoid
+	 * placing more children on the list.
 	 *
 	 * Thus this guarantees that we will in fact observe and kill _ALL_
 	 * child events.
 	 */
-	WARN_ON_ONCE(event->state != PERF_EVENT_STATE_EXIT);
+	event->state = PERF_EVENT_STATE_DEAD;
+	raw_spin_unlock_irq(&ctx->lock);
+
+	perf_event_ctx_unlock(event, ctx);
 
 again:
 	mutex_lock(&event->child_mutex);
@@ -3830,8 +3905,8 @@ again:
 	}
 	mutex_unlock(&event->child_mutex);
 
-	/* Must be the last reference */
-	put_event(event);
+no_ctx:
+	put_event(event); /* Must be the 'last' reference */
 	return 0;
 }
 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -3988,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event)
 {
 	bool no_children;
 
-	if (event->state != PERF_EVENT_STATE_EXIT)
+	if (event->state > PERF_EVENT_STATE_EXIT)
 		return false;
 
 	mutex_lock(&event->child_mutex);
@@ -7769,8 +7844,28 @@ static void account_event(struct perf_event *event)
 	if (is_cgroup_event(event))
 		inc = true;
 
-	if (inc)
-		static_key_slow_inc(&perf_sched_events.key);
+	if (inc) {
+		if (atomic_inc_not_zero(&perf_sched_count))
+			goto enabled;
+
+		mutex_lock(&perf_sched_mutex);
+		if (!atomic_read(&perf_sched_count)) {
+			static_branch_enable(&perf_sched_events);
+			/*
+			 * Guarantee that all CPUs observe they key change and
+			 * call the perf scheduling hooks before proceeding to
+			 * install events that need them.
+			 */
+			synchronize_sched();
+		}
+		/*
+		 * Now that we have waited for the sync_sched(), allow further
+		 * increments to by-pass the mutex.
+		 */
+		atomic_inc(&perf_sched_count);
+		mutex_unlock(&perf_sched_mutex);
+	}
+enabled:
 
 	account_event_cpu(event, event->cpu);
 }
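This account_event() hunk and the earlier unaccount_event()/perf_sched_delayed() hunk form one pattern: a plain atomic counter (perf_sched_count) now guards the static branch, the first user enables the key under perf_sched_mutex and waits for synchronize_sched() before publishing the count, and the last user disables it from deferred work. A rough userspace analogue of that enable/disable pairing is sketched below, with C11 atomics and a pthread mutex standing in for the kernel primitives; all names are illustrative, and synchronize_sched() plus the delayed workqueue have no direct equivalent here, so they are noted in comments and omitted or inlined.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool sched_key_enabled;                    /* stands in for the static branch */
static atomic_int  sched_count;                          /* stands in for perf_sched_count  */
static pthread_mutex_t sched_mutex = PTHREAD_MUTEX_INITIALIZER;

static void account(void)                                /* enable side of the pattern */
{
	/* Fast path, like atomic_inc_not_zero(): piggyback on an existing reference. */
	int old = atomic_load(&sched_count);
	while (old != 0) {
		if (atomic_compare_exchange_weak(&sched_count, &old, old + 1))
			return;
	}

	/* Slow path: the first user flips the key before publishing the count. */
	pthread_mutex_lock(&sched_mutex);
	if (atomic_load(&sched_count) == 0)
		atomic_store(&sched_key_enabled, true);  /* static_branch_enable()  */
	/* The kernel calls synchronize_sched() here; omitted in this sketch.       */
	atomic_fetch_add(&sched_count, 1);               /* later callers skip the mutex */
	pthread_mutex_unlock(&sched_mutex);
}

static void unaccount(void)                              /* disable side of the pattern */
{
	/* The kernel defers this to a workqueue (perf_sched_work); done inline here. */
	pthread_mutex_lock(&sched_mutex);
	if (atomic_fetch_sub(&sched_count, 1) == 1)
		atomic_store(&sched_key_enabled, false); /* static_branch_disable() */
	pthread_mutex_unlock(&sched_mutex);
}

int main(void)
{
	account();
	printf("key enabled: %d\n", (int)atomic_load(&sched_key_enabled));
	unaccount();
	printf("key enabled: %d\n", (int)atomic_load(&sched_key_enabled));
	return 0;
}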
@@ -8389,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (move_group) {
 		gctx = group_leader->ctx;
 		mutex_lock_double(&gctx->mutex, &ctx->mutex);
+		if (gctx->task == TASK_TOMBSTONE) {
+			err = -ESRCH;
+			goto err_locked;
+		}
 	} else {
 		mutex_lock(&ctx->mutex);
 	}
 
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_locked;
+	}
+
 	if (!perf_event_validate_size(event)) {
 		err = -E2BIG;
 		goto err_locked;
@@ -8509,7 +8613,12 @@ err_context:
 	perf_unpin_context(ctx);
 	put_ctx(ctx);
 err_alloc:
-	free_event(event);
+	/*
+	 * If event_file is set, the fput() above will have called ->release()
+	 * and that will take care of freeing the event.
+	 */
+	if (!event_file)
+		free_event(event);
 err_cpus:
 	put_online_cpus();
 err_task:
@@ -8563,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
+	if (ctx->task == TASK_TOMBSTONE) {
+		err = -ESRCH;
+		goto err_unlock;
+	}
+
 	if (!exclusive_event_installable(event, ctx)) {
-		mutex_unlock(&ctx->mutex);
-		perf_unpin_context(ctx);
-		put_ctx(ctx);
 		err = -EBUSY;
-		goto err_free;
+		goto err_unlock;
 	}
 
 	perf_install_in_context(ctx, event, cpu);
@@ -8577,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 
 	return event;
 
+err_unlock:
+	mutex_unlock(&ctx->mutex);
+	perf_unpin_context(ctx);
+	put_ctx(ctx);
 err_free:
 	free_event(event);
 err:
@@ -8695,7 +8810,7 @@ perf_event_exit_event(struct perf_event *child_event,
 	if (parent_event)
 		perf_group_detach(child_event);
 	list_del_event(child_event, child_ctx);
-	child_event->state = PERF_EVENT_STATE_EXIT; /* see perf_event_release_kernel() */
+	child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
 	raw_spin_unlock_irq(&child_ctx->lock);
 
 	/*
@@ -9313,9 +9428,6 @@ void __init perf_event_init(void)
 	ret = init_hw_breakpoint();
 	WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
 
-	/* do not patch jump label more than once per second */
-	jump_label_rate_limit(&perf_sched_events, HZ);
-
 	/*
 	 * Build time assertion that we keep the data_head at the intended
 	 * location. IOW, validation we got the __reserved[] size right.