Diffstat (limited to 'kernel')

-rw-r--r--   kernel/events/core.c         | 156
-rw-r--r--   kernel/events/internal.h     |  35
-rw-r--r--   kernel/events/ring_buffer.c  | 101
-rw-r--r--   kernel/events/uprobes.c      | 223
-rw-r--r--   kernel/fork.c                |   2
-rw-r--r--   kernel/sysctl.c              |   1

6 files changed, 314 insertions(+), 204 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 663f43a20f73..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
 	do_div(tmp, 100);
-	atomic_set(&perf_sample_allowed_ns, tmp);
+	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-	if (atomic_read(&perf_sample_allowed_ns) == 0)
+	if (allowed_ns == 0)
 		return;
 
 	/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 	 */
 	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+	if (avg_local_sample_len <= allowed_ns)
 		return;
 
 	if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %d), lowering "
+			"perf samples too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len,
-			atomic_read(&perf_sample_allowed_ns),
+			avg_local_sample_len, allowed_ns,
 			sysctl_perf_event_sample_rate);
 
 	update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
 		put_ctx(ctx->parent_ctx);
 		ctx->parent_ctx = NULL;
 	}
+	ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+
+	ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		size += sizeof(data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		size += sizeof(data->txn);
+
 	event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+
+	ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
 			 struct perf_event_context *ctx2)
 {
-	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& !ctx1->pin_count && !ctx2->pin_count;
+	/* Pinning disables the swap optimization */
+	if (ctx1->pin_count || ctx2->pin_count)
+		return 0;
+
+	/* If ctx1 is the parent of ctx2 */
+	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+		return 1;
+
+	/* If ctx2 is the parent of ctx1 */
+	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+		return 1;
+
+	/*
+	 * If ctx1 and ctx2 have the same parent; we flatten the parent
+	 * hierarchy, see perf_event_init_context().
+	 */
+	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+			ctx1->parent_gen == ctx2->parent_gen)
+		return 1;
+
+	/* Unmatched */
+	return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
@@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 {
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
-	struct perf_event_context *parent;
+	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
@@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		return;
 
 	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+	if (!next_ctx)
+		goto unlock;
+
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+	/* If neither context have a parent context; they cannot be clones. */
+	if (!parent && !next_parent)
+		goto unlock;
+
+	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
 		/*
 		 * Looks like the two contexts are clones, so we might be
 		 * able to optimize the context switch. We lock both
@@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_unlock(&next_ctx->lock);
 		raw_spin_unlock(&ctx->lock);
 	}
+unlock:
 	rcu_read_unlock();
 
 	if (do_switch) {
@@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		perf_output_put(handle, data->txn);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	const char *name;
-
-	memset(tmp, 0, sizeof(tmp));
+	char *name;
 
 	if (file) {
 		struct inode *inode;
 		dev_t dev;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = "//enomem";
+			goto cpy_name;
+		}
 		/*
-		 * d_path works from the end of the rb backwards, so we
+		 * d_path() works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-		if (!buf) {
-			name = strncpy(tmp, "//enomem", sizeof(tmp));
-			goto got_name;
-		}
-		name = d_path(&file->f_path, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
-			name = strncpy(tmp, "//toolong", sizeof(tmp));
-			goto got_name;
+			name = "//toolong";
+			goto cpy_name;
 		}
 		inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
@@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		gen = inode->i_generation;
 		maj = MAJOR(dev);
 		min = MINOR(dev);
-
+		goto got_name;
 	} else {
-		if (arch_vma_name(mmap_event->vma)) {
-			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp) - 1);
-			tmp[sizeof(tmp) - 1] = '\0';
-			goto got_name;
-		}
+		name = (char *)arch_vma_name(vma);
+		if (name)
+			goto cpy_name;
 
-		if (!vma->vm_mm) {
-			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+		if (vma->vm_start <= vma->vm_mm->start_brk &&
 				vma->vm_end >= vma->vm_mm->brk) {
-			name = strncpy(tmp, "[heap]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+			name = "[heap]";
+			goto cpy_name;
+		}
+		if (vma->vm_start <= vma->vm_mm->start_stack &&
 				vma->vm_end >= vma->vm_mm->start_stack) {
-			name = strncpy(tmp, "[stack]", sizeof(tmp));
-			goto got_name;
+			name = "[stack]";
+			goto cpy_name;
 		}
 
-		name = strncpy(tmp, "//anon", sizeof(tmp));
-		goto got_name;
+		name = "//anon";
+		goto cpy_name;
 	}
 
+cpy_name:
+	strlcpy(tmp, name, sizeof(tmp));
+	name = tmp;
 got_name:
-	size = ALIGN(strlen(name)+1, sizeof(u64));
+	/*
+	 * Since our buffer works in 8 byte units we need to align our string
+	 * size to a multiple of 8. However, we must guarantee the tail end is
+	 * zero'd out to avoid leaking random bits to userspace.
+	 */
+	size = strlen(name)+1;
+	while (!IS_ALIGNED(size, sizeof(u64)))
+		name[size++] = '\0';
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
@@ -7129,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -7212,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
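
The got_name block above replaces ALIGN() with explicit zero padding, so the 8-byte-aligned tail of the file name never carries stale buffer bytes into the user-visible record. A minimal user-space sketch of that padding step, assuming an ordinary NUL-terminated string with at least 7 spare bytes of buffer (the helper name pad_to_u64 is made up for illustration):

#include <stdio.h>
#include <string.h>

/*
 * Round the string size up to a multiple of 8 and zero-fill the tail,
 * like the while (!IS_ALIGNED(...)) loop after got_name: above.
 * The buffer must have room for up to 7 extra NUL bytes.
 */
static size_t pad_to_u64(char *name)
{
	size_t size = strlen(name) + 1;

	while (size % 8)
		name[size++] = '\0';

	return size;
}

int main(void)
{
	char buf[32] = "/bin/true";

	printf("padded size: %zu\n", pad_to_u64(buf));	/* prints 16 */
	return 0;
}
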
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 }
 
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
-static inline unsigned int						\
+static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
-	  const void *buf, unsigned int len)				\
+	  const void *buf, unsigned long len)				\
 {									\
 	unsigned long size, written;					\
 									\
 	do {								\
-		size = min_t(unsigned long, handle->size, len);		\
-									\
+		size = min(handle->size, len);				\
 		written = memcpy_func(handle->addr, buf, size);		\
+		written = size - written;				\
 									\
 		len -= written;						\
 		handle->addr += written;				\
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
 	return len;							\
 }
 
-static inline int memcpy_common(void *dst, const void *src, size_t n)
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
 {
 	memcpy(dst, src, n);
-	return n;
+	return 0;
 }
 
 DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
 
-#define MEMCPY_SKIP(dst, src, n) (n)
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+	return 0;
+}
 
-DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
 
 #ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user __copy_from_user_inatomic
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+	unsigned long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst, src, n);
+	pagefault_enable();
+
+	return ret;
+}
 #endif
 
 DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
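
The reworked DEFINE_OUTPUT_COPY() assumes every memcpy_func returns the number of bytes it could NOT copy, matching the __copy_from_user_inatomic() convention; that is why memcpy_common() now returns 0 and the macro computes written = size - written. A deliberately simplified user-space sketch of a copy loop built on that convention (names and the single-buffer loop are illustrative, not the kernel API):

#include <stdio.h>
#include <string.h>

/* Copy everything; return the number of bytes NOT copied (0 on success). */
static size_t copy_chunk(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

/* Return how many of "len" bytes could not be stored in dst. */
static size_t output_copy(char *dst, size_t room, const char *src, size_t len)
{
	do {
		size_t size = room < len ? room : len;
		size_t not_copied = copy_chunk(dst, src, size);
		size_t written = size - not_copied;	/* bytes actually copied */

		len -= written;
		dst += written;
		src += written;
		room -= written;
	} while (len && room);

	return len;
}

int main(void)
{
	char out[8];

	/* 11 bytes offered, 8 fit: 3 are reported as left over. */
	printf("left over: %zu\n", output_copy(out, sizeof(out), "hello world", 11));
	return 0;
}
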
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfbf4525..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/circ_buf.h>
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long sz = perf_data_size(rb);
-	unsigned long mask = sz - 1;
-
-	/*
-	 * check if user-writable
-	 * overwrite : over-write its own tail
-	 * !overwrite: buffer possibly drops events.
-	 */
-	if (rb->overwrite)
-		return true;
-
-	/*
-	 * verify that payload is not bigger than buffer
-	 * otherwise masking logic may fail to detect
-	 * the "not enough space" condition
-	 */
-	if ((head - offset) > sz)
-		return false;
-
-	offset = (offset - tail) & mask;
-	head = (head - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,8 @@ again:
 	rb->user_page->data_head = head;
 
 	/*
-	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read rb->head.
+	 * Now check if we missed an update -- rely on previous implied
+	 * compiler barriers to force a re-read.
 	 */
 	if (unlikely(head != local_read(&rb->head))) {
 		local_inc(&rb->nest);
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
-	int have_lost;
-	struct perf_sample_data sample_data;
+	int have_lost, page_shift;
 	struct {
 		struct perf_event_header header;
 		u64 id;
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
 		event = event->parent;
 
 	rb = rcu_dereference(event->rb);
-	if (!rb)
+	if (unlikely(!rb))
 		goto out;
 
-	handle->rb = rb;
-	handle->event = event;
-
-	if (!rb->nr_pages)
+	if (unlikely(!rb->nr_pages))
 		goto out;
 
+	handle->rb = rb;
+	handle->event = event;
+
 	have_lost = local_read(&rb->lost);
-	if (have_lost) {
-		lost_event.header.size = sizeof(lost_event);
-		perf_event_header__init_id(&lost_event.header, &sample_data,
-					   event);
-		size += lost_event.header.size;
+	if (unlikely(have_lost)) {
+		size += sizeof(lost_event);
+		if (event->attr.sample_id_all)
+			size += event->id_header_size;
 	}
 
 	perf_output_get_handle(handle);
 
 	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 *
-		 * See perf_output_put_handle().
-		 */
 		tail = ACCESS_ONCE(rb->user_page->data_tail);
-		smp_mb();
 		offset = head = local_read(&rb->head);
-		head += size;
-		if (unlikely(!perf_output_space(rb, tail, offset, head)))
+		if (!rb->overwrite &&
+		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
 			goto fail;
+		head += size;
 	} while (local_cmpxchg(&rb->head, offset, head) != offset);
 
-	if (head - local_read(&rb->wakeup) > rb->watermark)
+	/*
+	 * Separate the userpage->tail read from the data stores below.
+	 * Matches the MB userspace SHOULD issue after reading the data
+	 * and before storing the new tail position.
+	 *
+	 * See perf_output_put_handle().
+	 */
+	smp_mb();
+
+	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
 		local_add(rb->watermark, &rb->wakeup);
 
-	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
-	handle->page &= rb->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
-	handle->addr = rb->data_pages[handle->page];
-	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+	page_shift = PAGE_SHIFT + page_order(rb);
 
-	if (have_lost) {
+	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+	offset &= (1UL << page_shift) - 1;
+	handle->addr = rb->data_pages[handle->page] + offset;
+	handle->size = (1UL << page_shift) - offset;
+
+	if (unlikely(have_lost)) {
+		struct perf_sample_data sample_data;
+
+		lost_event.header.size = sizeof(lost_event);
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.id = event->id;
 		lost_event.lost = local_xchg(&rb->lost, 0);
 
+		perf_event_header__init_id(&lost_event.header,
+					   &sample_data, event);
 		perf_output_put(handle, lost_event);
 		perf_event__output_id_sample(event, handle, &sample_data);
 	}
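
perf_output_begin() now rejects a record up front with CIRC_SPACE() instead of the removed perf_output_space(); with a power-of-two buffer and free-running head/tail offsets, the masking handles wrap-around for free. A small user-space sketch of the same arithmetic, with the macro bodies reproduced from include/linux/circ_buf.h and made-up example numbers:

#include <stdio.h>

/* As in include/linux/circ_buf.h; size must be a power of two. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned long size = 4096;			/* buffer bytes */
	unsigned long head = 4000, tail = 100;		/* free-running offsets */
	unsigned long record = 512;

	if (CIRC_SPACE(head, tail, size) < record)
		puts("no room: drop the record and count it as lost");	/* taken here */
	else
		puts("record fits");

	return 0;
}
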
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
 #include <linux/kdebug.h>	/* notifier mechanism */
 #include "../../mm/internal.h"	/* munlock_vma_page */
 #include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
 
 #include <linux/uprobes.h>
 
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * the architecture. If an arch has variable length instruction and the
  * breakpoint instruction is not of the smallest length instruction
  * supported by that architecture then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
  */
 
 /*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode - write the opcode at a given virtual address.
  * @mm: the probed process address space.
  * @vaddr: the virtual address to store the opcode.
  * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * For mm @mm, write the opcode at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 			uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
  */
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-	return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+	return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
 }
 
 /**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
 int __weak
 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-	return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+	return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 	return ret;
 }
 
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
-			unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+			void *insn, int nbytes, loff_t offset)
 {
 	struct page *page;
 
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 
 static int copy_insn(struct uprobe *uprobe, struct file *filp)
 {
-	struct address_space *mapping;
-	unsigned long nbytes;
-	int bytes;
-
-	nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
-	mapping = uprobe->inode->i_mapping;
+	struct address_space *mapping = uprobe->inode->i_mapping;
+	loff_t offs = uprobe->offset;
+	void *insn = uprobe->arch.insn;
+	int size = MAX_UINSN_BYTES;
+	int len, err = -EIO;
 
-	/* Instruction at end of binary; copy only available bytes */
-	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
-		bytes = uprobe->inode->i_size - uprobe->offset;
-	else
-		bytes = MAX_UINSN_BYTES;
+	/* Copy only available bytes, -EIO if nothing was read */
+	do {
+		if (offs >= i_size_read(uprobe->inode))
+			break;
 
-	/* Instruction at the page-boundary; copy bytes in second page */
-	if (nbytes < bytes) {
-		int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
-				bytes - nbytes, uprobe->offset + nbytes);
+		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+		err = __copy_insn(mapping, filp, insn, len, offs);
 		if (err)
-			return err;
-		bytes = nbytes;
-	}
-	return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+			break;
+
+		insn += len;
+		offs += len;
+		size -= len;
+	} while (size);
+
+	return err;
 }
 
 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 	if (ret)
 		goto out;
 
-	/* write_opcode() assumes we don't cross page boundary */
+	/* uprobe_write_opcode() assumes we don't cross page boundary */
 	BUG_ON((uprobe->offset & ~PAGE_MASK) +
 			UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
 }
 
 /* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 {
-	struct mm_struct *mm = current->mm;
 	int ret = -EALREADY;
 
 	down_write(&mm->mmap_sem);
 	if (mm->uprobes_state.xol_area)
 		goto fail;
 
-	ret = -ENOMEM;
-	/* Try to map as high as possible, this is only a hint. */
-	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
-	if (area->vaddr & ~PAGE_MASK) {
-		ret = area->vaddr;
-		goto fail;
+	if (!area->vaddr) {
+		/* Try to map as high as possible, this is only a hint. */
+		area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+						PAGE_SIZE, 0, 0);
+		if (area->vaddr & ~PAGE_MASK) {
+			ret = area->vaddr;
+			goto fail;
+		}
 	}
 
 	ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
 
 	smp_wmb();	/* pairs with get_xol_area() */
 	mm->uprobes_state.xol_area = area;
-	ret = 0;
  fail:
 	up_write(&mm->mmap_sem);
 
 	return ret;
 }
 
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	struct xol_area *area;
 	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+	struct xol_area *area;
 
-	area = mm->uprobes_state.xol_area;
-	if (area)
-		goto ret;
-
-	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	area = kmalloc(sizeof(*area), GFP_KERNEL);
 	if (unlikely(!area))
 		goto out;
 
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
 	if (!area->page)
 		goto free_bitmap;
 
-	/* allocate first slot of task's xol_area for the return probes */
+	area->vaddr = vaddr;
+	init_waitqueue_head(&area->wq);
+	/* Reserve the 1st slot for get_trampoline_vaddr() */
 	set_bit(0, area->bitmap);
-	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
 	atomic_set(&area->slot_count, 1);
-	init_waitqueue_head(&area->wq);
+	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
 
-	if (!xol_add_vma(area))
+	if (!xol_add_vma(mm, area))
 		return area;
 
 	__free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
  free_area:
 	kfree(area);
  out:
+	return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct xol_area *area;
+
+	if (!mm->uprobes_state.xol_area)
+		__create_xol_area(0);
+
 	area = mm->uprobes_state.xol_area;
- ret:
-	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */
+	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */
 	return area;
 }
 
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 		return 0;
 
 	/* Initialize the slot */
-	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
+	copy_to_page(area->page, xol_vaddr,
+			uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
 	/*
 	 * We probably need flush_icache_user_range() but it needs vma.
 	 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
 }
 
 /*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
-	t->utask = NULL;
-}
-
-/*
  * Allocate a uprobe_task object for the task if if necessary.
  * Called when the thread hits a breakpoint.
  *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+	struct uprobe_task *n_utask;
+	struct return_instance **p, *o, *n;
+
+	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	if (!n_utask)
+		return -ENOMEM;
+	t->utask = n_utask;
+
+	p = &n_utask->return_instances;
+	for (o = o_utask->return_instances; o; o = o->next) {
+		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+		if (!n)
+			return -ENOMEM;
+
+		*n = *o;
+		atomic_inc(&n->uprobe->ref);
+		n->next = NULL;
+
+		*p = n;
+		p = &n->next;
+		n_utask->depth++;
+	}
+
+	return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+	pr_warn("uprobe: %s:%d failed to %s\n",
+			current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+	kfree(work);
+
+	if (current->flags & PF_EXITING)
+		return;
+
+	if (!__create_xol_area(current->utask->vaddr))
+		uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+	struct uprobe_task *utask = current->utask;
+	struct mm_struct *mm = current->mm;
+	struct callback_head *work;
+	struct xol_area *area;
+
+	t->utask = NULL;
+
+	if (!utask || !utask->return_instances)
+		return;
+
+	if (mm == t->mm && !(flags & CLONE_VFORK))
+		return;
+
+	if (dup_utask(t, utask))
+		return uprobe_warn(t, "dup ret instances");
+
+	/* The task can fork() after dup_xol_work() fails */
+	area = mm->uprobes_state.xol_area;
+	if (!area)
+		return uprobe_warn(t, "dup xol area");
+
+	if (mm == t->mm)
+		return;
+
+	/* TODO: move it into the union in uprobe_task */
+	work = kmalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return uprobe_warn(t, "dup xol area");
+
+	t->utask->vaddr = area->vaddr;
+	init_task_work(work, dup_xol_work);
+	task_work_add(t, work, true);
+}
+
 /*
  * Current area->vaddr notion assume the trampoline address is always
  * equal area->vaddr.
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
 
 	return register_die_notifier(&uprobe_exception_nb);
 }
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);
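
dup_utask() above duplicates the parent's return_instance chain with a tail pointer-to-pointer, so the copy keeps the original order without tracking a separate "last node" variable. A stripped-down user-space sketch of that list-copy idiom (struct node and dup_list are stand-ins, not the kernel types):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Duplicate a singly-linked list, preserving order, via a tail pointer-to-pointer. */
static struct node *dup_list(const struct node *o)
{
	struct node *head = NULL, **p = &head;

	for (; o; o = o->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;			/* partial copy, as in dup_utask()'s -ENOMEM path */

		n->val = o->val;
		n->next = NULL;
		*p = n;				/* hook the copy onto the current tail */
		p = &n->next;			/* the tail is now the new node's next field */
	}

	return head;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *n;

	for (n = dup_list(&a); n; n = n->next)
		printf("%d ", n->val);		/* prints: 1 2 3 */
	printf("\n");

	return 0;
}
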
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..8531609b6a82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1373,7 +1373,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->pi_state_list);
 	p->pi_state_cache = NULL;
 #endif
-	uprobe_copy_process(p);
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
@@ -1490,6 +1489,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
+	uprobe_copy_process(p, clone_flags);
 
 	return p;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b80f1bae21a..5fee859888a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1049,6 +1049,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "perf_cpu_time_max_percent",