path: root/kernel/events/core.c
Diffstat (limited to 'kernel/events/core.c')
 kernel/events/core.c | 156
 1 file changed, 97 insertions(+), 59 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 663f43a20f73..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-        ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+        DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
         tmp *= sysctl_perf_cpu_time_max_percent;
         do_div(tmp, 100);
-        atomic_set(&perf_sample_allowed_ns, tmp);
+        ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
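The two hunks above turn the sample-budget variable from an atomic_t into a plain int: the sysctl write side publishes the new limit with a single ACCESS_ONCE() store, and (as the later hunks show) the sampling path snapshots it once into a local instead of calling atomic_read() repeatedly. A rough userspace sketch of that publish/snapshot pattern, with made-up names and values, not the kernel code itself:

#include <stdio.h>

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static int sample_allowed_ns;                  /* written by a "sysctl"-like path */

static void update_limit(int percent, int period_ns)
{
        /* a single plain store; readers of an int never see a torn value */
        ACCESS_ONCE(sample_allowed_ns) = period_ns * percent / 100;
}

static void hot_path(int avg_sample_ns)
{
        /* read the limit exactly once so both checks see the same value */
        int allowed_ns = ACCESS_ONCE(sample_allowed_ns);

        if (allowed_ns == 0 || avg_sample_ns <= allowed_ns)
                return;
        printf("would throttle: %d > %d\n", avg_sample_ns, allowed_ns);
}

int main(void)
{
        update_limit(25, 4000);         /* 25% of a hypothetical 4000ns period */
        hot_path(800);                  /* under the limit, nothing happens */
        hot_path(1500);                 /* over the limit, would throttle */
        return 0;
}

Snapshotting the limit once means the zero check and the later comparison cannot disagree even if the sysctl is rewritten concurrently.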
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp,
                 loff_t *ppos)
 {
-        int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
         if (ret || !write)
                 return ret;
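Switching the handler from proc_dointvec() to proc_dointvec_minmax() makes the write path honour the bounds attached to the sysctl table entry through .extra1/.extra2. A hedged sketch of what such an entry looks like; the table name and the bound value below are hypothetical, not the actual kernel/sysctl.c registration:

#include <linux/sysctl.h>
#include <linux/perf_event.h>

static int perf_sample_rate_min = 1;    /* assumed lower bound for the example */

static struct ctl_table perf_sample_table[] = {
        {
                .procname       = "perf_event_max_sample_rate",
                .data           = &sysctl_perf_event_sample_rate,
                .maxlen         = sizeof(sysctl_perf_event_sample_rate),
                .mode           = 0644,
                .proc_handler   = perf_proc_update_handler,
                .extra1         = &perf_sample_rate_min,  /* enforced only by _minmax */
        },
        { }
};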
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
         u64 avg_local_sample_len;
         u64 local_samples_len;
+        u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-        if (atomic_read(&perf_sample_allowed_ns) == 0)
+        if (allowed_ns == 0)
                 return;
 
         /* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
          */
         avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-        if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+        if (avg_local_sample_len <= allowed_ns)
                 return;
 
         if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
         perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
         printk_ratelimited(KERN_WARNING
-                        "perf samples too long (%lld > %d), lowering "
+                        "perf samples too long (%lld > %lld), lowering "
                         "kernel.perf_event_max_sample_rate to %d\n",
-                        avg_local_sample_len,
-                        atomic_read(&perf_sample_allowed_ns),
+                        avg_local_sample_len, allowed_ns,
                         sysctl_perf_event_sample_rate);
 
         update_perf_cpu_limits();
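perf_sample_event_took() keeps a per-CPU running sum that is first decayed by one average sample and then incremented by the new sample, so sum/NR_ACCUMULATED_SAMPLES behaves like a moving average without storing a window of samples. A small userspace sketch of just that arithmetic; the sample lengths and the limit are illustrative only:

#include <stdint.h>
#include <stdio.h>

#define NR_ACCUMULATED_SAMPLES 128

static uint64_t running_sample_length;          /* per CPU in the kernel */

static int sample_took(uint64_t sample_len_ns, uint64_t allowed_ns)
{
        uint64_t avg;

        /* decay the counter by one average sample, then accumulate */
        running_sample_length -= running_sample_length / NR_ACCUMULATED_SAMPLES;
        running_sample_length += sample_len_ns;

        avg = running_sample_length / NR_ACCUMULATED_SAMPLES;
        return avg > allowed_ns;        /* nonzero => lower the sample rate */
}

int main(void)
{
        for (int i = 0; i < 512; i++) {
                if (sample_took(2000, 1000)) {
                        printf("limit exceeded after %d samples\n", i + 1);
                        break;
                }
        }
        return 0;
}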
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
                 put_ctx(ctx->parent_ctx);
                 ctx->parent_ctx = NULL;
         }
+        ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
         ctx->nr_events++;
         if (event->attr.inherit_stat)
                 ctx->nr_stat++;
+
+        ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
         if (sample_type & PERF_SAMPLE_DATA_SRC)
                 size += sizeof(data->data_src.val);
 
+        if (sample_type & PERF_SAMPLE_TRANSACTION)
+                size += sizeof(data->txn);
+
         event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
          */
         if (event->state > PERF_EVENT_STATE_OFF)
                 event->state = PERF_EVENT_STATE_OFF;
+
+        ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
                          struct perf_event_context *ctx2)
 {
-        return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-                && ctx1->parent_gen == ctx2->parent_gen
-                && !ctx1->pin_count && !ctx2->pin_count;
+        /* Pinning disables the swap optimization */
+        if (ctx1->pin_count || ctx2->pin_count)
+                return 0;
+
+        /* If ctx1 is the parent of ctx2 */
+        if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+                return 1;
+
+        /* If ctx2 is the parent of ctx1 */
+        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+                return 1;
+
+        /*
+         * If ctx1 and ctx2 have the same parent; we flatten the parent
+         * hierarchy, see perf_event_init_context().
+         */
+        if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+                        ctx1->parent_gen == ctx2->parent_gen)
+                return 1;
+
+        /* Unmatched */
+        return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
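The rewritten context_equiv() depends on the generation counter bumped in unclone_ctx(), list_add_event() and list_del_event(): a clone records its parent's generation in parent_gen at inheritance time, so "equivalent" means "related by cloning and unmodified since". A simplified userspace model of that check; the struct below mirrors only the fields used here and is not the kernel's layout:

#include <stdio.h>

struct ctx {
        struct ctx              *parent_ctx;
        unsigned long long      generation;     /* bumped on every modification */
        unsigned long long      parent_gen;     /* parent's generation when cloned */
        int                     pin_count;
};

static int context_equiv(struct ctx *ctx1, struct ctx *ctx2)
{
        if (ctx1->pin_count || ctx2->pin_count)
                return 0;
        if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
                return 1;                       /* ctx1 is ctx2's parent */
        if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
                return 1;                       /* ctx2 is ctx1's parent */
        if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
            ctx1->parent_gen == ctx2->parent_gen)
                return 1;                       /* same (flattened) parent */
        return 0;
}

int main(void)
{
        struct ctx parent = { .generation = 3 };
        struct ctx child  = { .parent_ctx = &parent, .parent_gen = 3 };

        printf("equiv=%d\n", context_equiv(&parent, &child));  /* 1 */
        parent.generation++;    /* any add/del/unclone invalidates the pairing */
        printf("equiv=%d\n", context_equiv(&parent, &child));  /* 0 */
        return 0;
}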
@@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 {
         struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
         struct perf_event_context *next_ctx;
-        struct perf_event_context *parent;
+        struct perf_event_context *parent, *next_parent;
         struct perf_cpu_context *cpuctx;
         int do_switch = 1;
 
@@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                 return;
 
         rcu_read_lock();
-        parent = rcu_dereference(ctx->parent_ctx);
         next_ctx = next->perf_event_ctxp[ctxn];
-        if (parent && next_ctx &&
-            rcu_dereference(next_ctx->parent_ctx) == parent) {
+        if (!next_ctx)
+                goto unlock;
+
+        parent = rcu_dereference(ctx->parent_ctx);
+        next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+        /* If neither context have a parent context; they cannot be clones. */
+        if (!parent && !next_parent)
+                goto unlock;
+
+        if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
                 /*
                  * Looks like the two contexts are clones, so we might be
                  * able to optimize the context switch. We lock both
@@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
                 raw_spin_unlock(&next_ctx->lock);
                 raw_spin_unlock(&ctx->lock);
         }
+unlock:
         rcu_read_unlock();
 
         if (do_switch) {
@@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle,
         if (sample_type & PERF_SAMPLE_DATA_SRC)
                 perf_output_put(handle, data->data_src.val);
 
+        if (sample_type & PERF_SAMPLE_TRANSACTION)
+                perf_output_put(handle, data->txn);
+
         if (!event->attr.watermark) {
                 int wakeup_events = event->attr.wakeup_events;
 
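The PERF_SAMPLE_TRANSACTION additions have to keep perf_event__header_size() and perf_output_sample() in agreement, because a consumer walks each sample as a packed sequence of u64 words whose presence and order follow the sample_type bits. A hedged consumer-side fragment showing how the new word would be picked up right after data_src; earlier optional fields are omitted, the helper name is made up, and the uapi <linux/perf_event.h> header is assumed to be available:

#include <stdint.h>
#include <linux/perf_event.h>   /* PERF_SAMPLE_* bits */

/* Advance over the optional data_src / transaction words of one sample. */
const uint64_t *parse_src_and_txn(uint64_t sample_type, const uint64_t *p,
                                  uint64_t *data_src, uint64_t *txn)
{
        if (sample_type & PERF_SAMPLE_DATA_SRC)
                *data_src = *p++;
        if (sample_type & PERF_SAMPLE_TRANSACTION)      /* the field added here */
                *txn = *p++;
        return p;
}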
@@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
         unsigned int size;
         char tmp[16];
         char *buf = NULL;
-        const char *name;
-
-        memset(tmp, 0, sizeof(tmp));
+        char *name;
 
         if (file) {
                 struct inode *inode;
                 dev_t dev;
+
+                buf = kmalloc(PATH_MAX, GFP_KERNEL);
+                if (!buf) {
+                        name = "//enomem";
+                        goto cpy_name;
+                }
                 /*
-                 * d_path works from the end of the rb backwards, so we
+                 * d_path() works from the end of the rb backwards, so we
                  * need to add enough zero bytes after the string to handle
                  * the 64bit alignment we do later.
                  */
-                buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-                if (!buf) {
-                        name = strncpy(tmp, "//enomem", sizeof(tmp));
-                        goto got_name;
-                }
-                name = d_path(&file->f_path, buf, PATH_MAX);
+                name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
                 if (IS_ERR(name)) {
-                        name = strncpy(tmp, "//toolong", sizeof(tmp));
-                        goto got_name;
+                        name = "//toolong";
+                        goto cpy_name;
                 }
                 inode = file_inode(vma->vm_file);
                 dev = inode->i_sb->s_dev;
@@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
                 gen = inode->i_generation;
                 maj = MAJOR(dev);
                 min = MINOR(dev);
-
+                goto got_name;
         } else {
-                if (arch_vma_name(mmap_event->vma)) {
-                        name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-                                       sizeof(tmp) - 1);
-                        tmp[sizeof(tmp) - 1] = '\0';
-                        goto got_name;
-                }
+                name = (char *)arch_vma_name(vma);
+                if (name)
+                        goto cpy_name;
 
-                if (!vma->vm_mm) {
-                        name = strncpy(tmp, "[vdso]", sizeof(tmp));
-                        goto got_name;
-                } else if (vma->vm_start <= vma->vm_mm->start_brk &&
+                if (vma->vm_start <= vma->vm_mm->start_brk &&
                                 vma->vm_end >= vma->vm_mm->brk) {
-                        name = strncpy(tmp, "[heap]", sizeof(tmp));
-                        goto got_name;
-                } else if (vma->vm_start <= vma->vm_mm->start_stack &&
+                        name = "[heap]";
+                        goto cpy_name;
+                }
+                if (vma->vm_start <= vma->vm_mm->start_stack &&
                                 vma->vm_end >= vma->vm_mm->start_stack) {
-                        name = strncpy(tmp, "[stack]", sizeof(tmp));
-                        goto got_name;
+                        name = "[stack]";
+                        goto cpy_name;
                 }
 
-                name = strncpy(tmp, "//anon", sizeof(tmp));
-                goto got_name;
+                name = "//anon";
+                goto cpy_name;
         }
 
+cpy_name:
+        strlcpy(tmp, name, sizeof(tmp));
+        name = tmp;
 got_name:
-        size = ALIGN(strlen(name)+1, sizeof(u64));
+        /*
+         * Since our buffer works in 8 byte units we need to align our string
+         * size to a multiple of 8. However, we must guarantee the tail end is
+         * zero'd out to avoid leaking random bits to userspace.
+         */
+        size = strlen(name)+1;
+        while (!IS_ALIGNED(size, sizeof(u64)))
+                name[size++] = '\0';
 
         mmap_event->file_name = name;
         mmap_event->file_size = size;
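Instead of ALIGN()-ing the reported size past the end of the string, the new tail pads the name in place with NUL bytes up to the next multiple of 8, so no stale bytes from the now plain-kmalloc'd buffer reach userspace; the slack exists because d_path() is called with PATH_MAX - sizeof(u64) and the short literal names are first copied into tmp[]. A runnable userspace sketch of that padding step, with an illustrative buffer:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define IS_ALIGNED(x, a) (((x) & ((a) - 1)) == 0)

static size_t pad_name(char *name)
{
        size_t size = strlen(name) + 1;         /* include the terminating NUL */

        while (!IS_ALIGNED(size, sizeof(uint64_t)))
                name[size++] = '\0';            /* explicit zeroes, no leaked bytes */
        return size;
}

int main(void)
{
        char buf[32] = "/lib/ld.so";            /* slack guaranteed by the array */
        size_t size = pad_name(buf);

        printf("padded size = %zu\n", size);    /* 16: 11 bytes rounded up to 8 */
        return 0;
}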
@@ -7129,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
         }
 
         perf_install_in_context(ctx, event, event->cpu);
-        ++ctx->generation;
         perf_unpin_context(ctx);
         mutex_unlock(&ctx->mutex);
 
@@ -7212,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
         WARN_ON_ONCE(ctx->parent_ctx);
         mutex_lock(&ctx->mutex);
         perf_install_in_context(ctx, event, cpu);
-        ++ctx->generation;
         perf_unpin_context(ctx);
         mutex_unlock(&ctx->mutex);
 