Diffstat (limited to 'kernel/events/core.c')
 -rw-r--r--  kernel/events/core.c  156
 1 file changed, 97 insertions, 59 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 663f43a20f73..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
 	do_div(tmp, 100);
-	atomic_set(&perf_sample_allowed_ns, tmp);
+	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-	if (atomic_read(&perf_sample_allowed_ns) == 0)
+	if (allowed_ns == 0)
 		return;
 
 	/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 	 */
 	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+	if (avg_local_sample_len <= allowed_ns)
 		return;
 
 	if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %d), lowering "
+			"perf samples too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len,
-			atomic_read(&perf_sample_allowed_ns),
+			avg_local_sample_len, allowed_ns,
 			sysctl_perf_event_sample_rate);
 
 	update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
 		put_ctx(ctx->parent_ctx);
 		ctx->parent_ctx = NULL;
 	}
+	ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+
+	ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		size += sizeof(data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		size += sizeof(data->txn);
+
 	event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+
+	ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
 			 struct perf_event_context *ctx2)
 {
-	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& !ctx1->pin_count && !ctx2->pin_count;
+	/* Pinning disables the swap optimization */
+	if (ctx1->pin_count || ctx2->pin_count)
+		return 0;
+
+	/* If ctx1 is the parent of ctx2 */
+	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+		return 1;
+
+	/* If ctx2 is the parent of ctx1 */
+	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+		return 1;
+
+	/*
+	 * If ctx1 and ctx2 have the same parent; we flatten the parent
+	 * hierarchy, see perf_event_init_context().
+	 */
+	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+	    ctx1->parent_gen == ctx2->parent_gen)
+		return 1;
+
+	/* Unmatched */
+	return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
@@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 {
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
-	struct perf_event_context *parent;
+	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
@@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		return;
 
 	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+	if (!next_ctx)
+		goto unlock;
+
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+	/* If neither context have a parent context; they cannot be clones. */
+	if (!parent && !next_parent)
+		goto unlock;
+
+	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
 		/*
 		 * Looks like the two contexts are clones, so we might be
 		 * able to optimize the context switch. We lock both
@@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_unlock(&next_ctx->lock);
 		raw_spin_unlock(&ctx->lock);
 	}
+unlock:
 	rcu_read_unlock();
 
 	if (do_switch) {
@@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		perf_output_put(handle, data->txn);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	const char *name;
-
-	memset(tmp, 0, sizeof(tmp));
+	char *name;
 
 	if (file) {
 		struct inode *inode;
 		dev_t dev;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = "//enomem";
+			goto cpy_name;
+		}
 		/*
-		 * d_path works from the end of the rb backwards, so we
+		 * d_path() works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-		if (!buf) {
-			name = strncpy(tmp, "//enomem", sizeof(tmp));
-			goto got_name;
-		}
-		name = d_path(&file->f_path, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
-			name = strncpy(tmp, "//toolong", sizeof(tmp));
-			goto got_name;
+			name = "//toolong";
+			goto cpy_name;
 		}
 		inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
@@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		gen = inode->i_generation;
 		maj = MAJOR(dev);
 		min = MINOR(dev);
-
+		goto got_name;
 	} else {
-		if (arch_vma_name(mmap_event->vma)) {
-			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp) - 1);
-			tmp[sizeof(tmp) - 1] = '\0';
-			goto got_name;
-		}
+		name = (char *)arch_vma_name(vma);
+		if (name)
+			goto cpy_name;
 
-		if (!vma->vm_mm) {
-			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+		if (vma->vm_start <= vma->vm_mm->start_brk &&
 				vma->vm_end >= vma->vm_mm->brk) {
-			name = strncpy(tmp, "[heap]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+			name = "[heap]";
+			goto cpy_name;
+		}
+		if (vma->vm_start <= vma->vm_mm->start_stack &&
 				vma->vm_end >= vma->vm_mm->start_stack) {
-			name = strncpy(tmp, "[stack]", sizeof(tmp));
-			goto got_name;
+			name = "[stack]";
+			goto cpy_name;
 		}
 
-		name = strncpy(tmp, "//anon", sizeof(tmp));
-		goto got_name;
+		name = "//anon";
+		goto cpy_name;
 	}
 
+cpy_name:
+	strlcpy(tmp, name, sizeof(tmp));
+	name = tmp;
 got_name:
-	size = ALIGN(strlen(name)+1, sizeof(u64));
+	/*
+	 * Since our buffer works in 8 byte units we need to align our string
+	 * size to a multiple of 8. However, we must guarantee the tail end is
+	 * zero'd out to avoid leaking random bits to userspace.
+	 */
+	size = strlen(name)+1;
+	while (!IS_ALIGNED(size, sizeof(u64)))
+		name[size++] = '\0';
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
@@ -7129,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -7212,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 