author		Frederic Weisbecker <fweisbec@gmail.com>	2013-05-02 11:37:49 -0400
committer	Frederic Weisbecker <fweisbec@gmail.com>	2013-05-02 11:54:19 -0400
commit		c032862fba51a3ca504752d3a25186b324c5ce83 (patch)
tree		955dc2ba4ab3df76ecc2bb780ee84aca04967e8d /kernel/events
parent		fda76e074c7737fc57855dd17c762e50ed526052 (diff)
parent		8700c95adb033843fc163d112b9d21d4fda78018 (diff)
Merge commit '8700c95adb03' into timers/nohz
The full dynticks tree needs the latest RCU and sched upstream updates
in order to fix some dependencies. Merge a common upstream merge point
that has these updates.

Conflicts:
	include/linux/perf_event.h
	kernel/rcutree.h
	kernel/rcutree_plugin.h

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Diffstat (limited to 'kernel/events')
-rw-r--r--	kernel/events/core.c		 70
-rw-r--r--	kernel/events/internal.h	  2
-rw-r--r--	kernel/events/ring_buffer.c	 22
-rw-r--r--	kernel/events/uprobes.c		300
4 files changed, 328 insertions(+), 66 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index ddb993b52190..6b41c1899a8b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -38,6 +38,7 @@
 #include <linux/ftrace_event.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
+#include <linux/cgroup.h>
 
 #include "internal.h"
 
@@ -235,6 +236,20 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
 #ifdef CONFIG_CGROUP_PERF
 
 /*
+ * perf_cgroup_info keeps track of time_enabled for a cgroup.
+ * This is a per-cpu dynamically allocated data structure.
+ */
+struct perf_cgroup_info {
+	u64				time;
+	u64				timestamp;
+};
+
+struct perf_cgroup {
+	struct cgroup_subsys_state	css;
+	struct perf_cgroup_info __percpu *info;
+};
+
+/*
  * Must ensure cgroup is pinned (css_get) before calling
  * this function. In other words, we cannot call this function
  * if there is no cgroup event for the current CPU context.
@@ -252,7 +267,22 @@ perf_cgroup_match(struct perf_event *event)
 	struct perf_event_context *ctx = event->ctx;
 	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
-	return !event->cgrp || event->cgrp == cpuctx->cgrp;
+	/* @event doesn't care about cgroup */
+	if (!event->cgrp)
+		return true;
+
+	/* wants specific cgroup scope but @cpuctx isn't associated with any */
+	if (!cpuctx->cgrp)
+		return false;
+
+	/*
+	 * Cgroup scoping is recursive. An event enabled for a cgroup is
+	 * also enabled for all its descendant cgroups. If @cpuctx's
+	 * cgroup is a descendant of @event's (the test covers identity
+	 * case), it's a match.
+	 */
+	return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
+				    event->cgrp->css.cgroup);
 }
 
 static inline bool perf_tryget_cgroup(struct perf_event *event)
@@ -966,9 +996,15 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_PERIOD)
 		size += sizeof(data->period);
 
+	if (sample_type & PERF_SAMPLE_WEIGHT)
+		size += sizeof(data->weight);
+
 	if (sample_type & PERF_SAMPLE_READ)
 		size += event->read_size;
 
+	if (sample_type & PERF_SAMPLE_DATA_SRC)
+		size += sizeof(data->data_src.val);
+
 	event->header_size = size;
 }
 
@@ -4193,6 +4229,12 @@ void perf_output_sample(struct perf_output_handle *handle,
 		perf_output_sample_ustack(handle,
 					  data->stack_user_size,
 					  data->regs_user.regs);
+
+	if (sample_type & PERF_SAMPLE_WEIGHT)
+		perf_output_put(handle, data->weight);
+
+	if (sample_type & PERF_SAMPLE_DATA_SRC)
+		perf_output_put(handle, data->data_src.val);
 }
 
 void perf_prepare_sample(struct perf_event_header *header,
@@ -4449,12 +4491,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
 			if (ctxn < 0)
 				goto next;
 			ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+			if (ctx)
+				perf_event_task_ctx(ctx, task_event);
 		}
-		if (ctx)
-			perf_event_task_ctx(ctx, task_event);
 next:
 		put_cpu_ptr(pmu->pmu_cpu_context);
 	}
+	if (task_event->task_ctx)
+		perf_event_task_ctx(task_event->task_ctx, task_event);
+
 	rcu_read_unlock();
 }
 
@@ -4608,6 +4653,7 @@ void perf_event_comm(struct task_struct *task)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = task->perf_event_ctxp[ctxn];
 		if (!ctx)
@@ -4615,6 +4661,7 @@ void perf_event_comm(struct task_struct *task)
 
 		perf_event_enable_on_exec(ctx);
 	}
+	rcu_read_unlock();
 
 	if (!atomic_read(&nr_comm_events))
 		return;
@@ -4749,7 +4796,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	} else {
 		if (arch_vma_name(mmap_event->vma)) {
 			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp));
+				       sizeof(tmp) - 1);
+			tmp[sizeof(tmp) - 1] = '\0';
 			goto got_name;
 		}
 
@@ -4776,6 +4824,9 @@ got_name:
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
 
+	if (!(vma->vm_flags & VM_EXEC))
+		mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
+
 	mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
 
 	rcu_read_lock();
@@ -5342,7 +5393,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 static int perf_swevent_init(struct perf_event *event)
 {
-	int event_id = event->attr.config;
+	u64 event_id = event->attr.config;
 
 	if (event->attr.type != PERF_TYPE_SOFTWARE)
 		return -ENOENT;
@@ -5662,6 +5713,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
 		event->attr.sample_period = NSEC_PER_SEC / freq;
 		hwc->sample_period = event->attr.sample_period;
 		local64_set(&hwc->period_left, hwc->sample_period);
+		hwc->last_period = hwc->sample_period;
 		event->attr.freq = 0;
 	}
 }
@@ -5997,6 +6049,7 @@ skip_type:
 	if (pmu->pmu_cpu_context)
 		goto got_cpu_context;
 
+	ret = -ENOMEM;
 	pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
 	if (!pmu->pmu_cpu_context)
 		goto free_dev;
@@ -7524,12 +7577,5 @@ struct cgroup_subsys perf_subsys = {
 	.css_free	= perf_cgroup_css_free,
 	.exit		= perf_cgroup_exit,
 	.attach		= perf_cgroup_attach,
-
-	/*
-	 * perf_event cgroup doesn't handle nesting correctly.
-	 * ctx->nr_cgroups adjustments should be propagated through the
-	 * cgroup hierarchy. Fix it and remove the following.
-	 */
-	.broken_hierarchy = true,
 };
 #endif /* CONFIG_CGROUP_PERF */
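
The perf_cgroup_match() change above makes cgroup scoping recursive: an event attached to a cgroup now also counts for every descendant cgroup, and a cgroup matches itself, which is why the broken_hierarchy flag can be dropped. As an illustration only (hypothetical user-space types, not the kernel's struct cgroup), the descendant test behaves like this:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for a cgroup hierarchy node. */
struct cg {
	const char *name;
	struct cg *parent;		/* NULL for the root */
};

/* Mirrors the semantics relied on above: identity counts as a match. */
static bool cg_is_descendant(const struct cg *cg, const struct cg *ancestor)
{
	for (; cg; cg = cg->parent)
		if (cg == ancestor)
			return true;
	return false;
}

int main(void)
{
	struct cg root  = { "/",          NULL  };
	struct cg web   = { "/web",       &root };
	struct cg httpd = { "/web/httpd", &web  };

	/* an event scoped to /web matches a CPU context running in /web/httpd */
	printf("%d\n", cg_is_descendant(&httpd, &web));	/* 1 */
	/* ...and one running in /web itself (identity case) */
	printf("%d\n", cg_is_descendant(&web, &web));	/* 1 */
	/* ...but not one running outside /web, e.g. at the root */
	printf("%d\n", cg_is_descendant(&root, &web));	/* 0 */
	return 0;
}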
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
 	int				page_order;	/* allocation order  */
 #endif
 	int				nr_pages;	/* nr of data pages  */
-	int				writable;	/* are we writable */
+	int				overwrite;	/* can overwrite itself */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..97fddb09762b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
 static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
-	unsigned long mask;
+	unsigned long sz = perf_data_size(rb);
+	unsigned long mask = sz - 1;
 
-	if (!rb->writable)
+	/*
+	 * check if user-writable
+	 * overwrite : over-write its own tail
+	 * !overwrite: buffer possibly drops events.
+	 */
+	if (rb->overwrite)
 		return true;
 
-	mask = perf_data_size(rb) - 1;
+	/*
+	 * verify that payload is not bigger than buffer
+	 * otherwise masking logic may fail to detect
+	 * the "not enough space" condition
+	 */
+	if ((head - offset) > sz)
+		return false;
 
 	offset = (offset - tail) & mask;
 	head   = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
 		rb->watermark = max_size / 2;
 
 	if (flags & RING_BUFFER_WRITABLE)
-		rb->writable = 1;
+		rb->overwrite = 0;
+	else
+		rb->overwrite = 1;
 
 	atomic_set(&rb->refcount, 1);
 
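
The rewritten perf_output_space() above relies on the buffer size being a power of two and on rejecting any payload larger than the whole buffer before the masking step. The following stand-alone user-space sketch models that arithmetic (the final fit test is a simplified stand-in, not the kernel function itself):

#include <stdbool.h>
#include <stdio.h>

/*
 * User-space sketch of the reworked space check: "size" must be a power
 * of two, as the perf data area always is.
 */
static bool output_space(unsigned long size, bool overwrite,
			 unsigned long tail, unsigned long offset,
			 unsigned long head)
{
	unsigned long mask = size - 1;

	if (overwrite)			/* writer may overwrite its own tail */
		return true;

	if ((head - offset) > size)	/* payload bigger than the whole buffer */
		return false;

	/* distances from the reader's tail, modulo the buffer size */
	offset = (offset - tail) & mask;
	head   = (head - tail) & mask;

	/* the write must not wrap past the reader's tail */
	return head >= offset;
}

int main(void)
{
	/* 4 KiB buffer, reader at 0, writer at 100: a 512-byte record fits */
	printf("%d\n", output_space(4096, false, 0, 100, 100 + 512));	/* 1 */
	/* an 8 KiB record can never fit; it is caught before the masking step */
	printf("%d\n", output_space(4096, false, 0, 100, 100 + 8192));	/* 0 */
	return 0;
}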
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index a567c8c7ef31..f3569747d629 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -75,6 +75,15 @@ struct uprobe {
 	struct arch_uprobe	arch;
 };
 
+struct return_instance {
+	struct uprobe		*uprobe;
+	unsigned long		func;
+	unsigned long		orig_ret_vaddr; /* original return address */
+	bool			chained;	/* true, if instance is nested */
+
+	struct return_instance	*next;		/* keep as stack */
+};
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 	return *insn == UPROBE_SWBP_INSN;
 }
 
-static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+	return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 {
 	void *kaddr = kmap_atomic(page);
-	memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+	kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+	void *kaddr = kmap_atomic(page);
+	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
 	kunmap_atomic(kaddr);
 }
 
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
 
-	copy_opcode(page, vaddr, &old_opcode);
+	/*
+	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+	 * We do not check if it is any other 'trap variant' which could
+	 * be conditional trap instruction such as the one powerpc supports.
+	 *
+	 * The logic is that we do not care if the underlying instruction
+	 * is a trap variant; uprobes always wins over any other (gdb)
+	 * breakpoint.
+	 */
+	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 	is_swbp = is_swbp_insn(&old_opcode);
 
 	if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * Expect the breakpoint instruction to be the smallest size instruction for
  * the architecture. If an arch has variable length instruction and the
  * breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify is_swbp_at_addr and
+ * supported by that architecture then we need to modify is_trap_at_addr and
  * write_opcode accordingly. This would never be a problem for archs that
  * have fixed length instructions.
  */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
 			uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
-	void *vaddr_old, *vaddr_new;
 	struct vm_area_struct *vma;
 	int ret;
 
@@ -246,15 +284,8 @@ retry:
 
 	__SetPageUptodate(new_page);
 
-	/* copy the page now that we've got it stable */
-	vaddr_old = kmap_atomic(old_page);
-	vaddr_new = kmap_atomic(new_page);
-
-	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
-	memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-
-	kunmap_atomic(vaddr_new);
-	kunmap_atomic(vaddr_old);
+	copy_highpage(new_page, old_page);
+	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	ret = anon_vma_prepare(vma);
 	if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 		unsigned long nbytes, loff_t offset)
 {
 	struct page *page;
-	void *vaddr;
-	unsigned long off;
-	pgoff_t idx;
-
-	if (!filp)
-		return -EINVAL;
 
 	if (!mapping->a_ops->readpage)
 		return -EIO;
-
-	idx = offset >> PAGE_CACHE_SHIFT;
-	off = offset & ~PAGE_MASK;
-
 	/*
 	 * Ensure that the page that has the original instruction is
 	 * populated and in page-cache.
 	 */
-	page = read_mapping_page(mapping, idx, filp);
+	page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
-	vaddr = kmap_atomic(page);
-	memcpy(insn, vaddr + off, nbytes);
-	kunmap_atomic(vaddr);
+	copy_from_page(page, offset, insn, nbytes);
 	page_cache_release(page);
 
 	return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 		goto out;
 
 	ret = -ENOTSUPP;
-	if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+	if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
 		goto out;
 
 	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 		down_write(&mm->mmap_sem);
 		vma = find_vma(mm, info->vaddr);
 		if (!vma || !valid_vma(vma, is_register) ||
-		    vma->vm_file->f_mapping->host != uprobe->inode)
+		    file_inode(vma->vm_file) != uprobe->inode)
 			goto unlock;
 
 		if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 	struct uprobe *uprobe;
 	int ret;
 
+	/* Uprobe must have at least one set consumer */
+	if (!uc->handler && !uc->ret_handler)
+		return -EINVAL;
+
 	/* Racy, just to catch the obvious mistakes */
 	if (offset > i_size_read(inode))
 		return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
 		loff_t offset;
 
 		if (!valid_vma(vma, false) ||
-		    vma->vm_file->f_mapping->host != uprobe->inode)
+		    file_inode(vma->vm_file) != uprobe->inode)
 			continue;
 
 		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 	if (no_uprobe_events() || !valid_vma(vma, true))
 		return 0;
 
-	inode = vma->vm_file->f_mapping->host;
+	inode = file_inode(vma->vm_file);
 	if (!inode)
 		return 0;
 
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
 	struct inode *inode;
 	struct rb_node *n;
 
-	inode = vma->vm_file->f_mapping->host;
+	inode = file_inode(vma->vm_file);
 
 	min = vaddr_to_offset(vma, start);
 	max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
 {
 	struct mm_struct *mm = current->mm;
 	struct xol_area *area;
+	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
 
 	area = mm->uprobes_state.xol_area;
 	if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
 	if (!area->page)
 		goto free_bitmap;
 
+	/* allocate first slot of task's xol_area for the return probes */
+	set_bit(0, area->bitmap);
+	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+	atomic_set(&area->slot_count, 1);
 	init_waitqueue_head(&area->wq);
+
 	if (!xol_add_vma(area))
 		return area;
 
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
 static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 {
 	struct xol_area *area;
-	unsigned long offset;
 	unsigned long xol_vaddr;
-	void *vaddr;
 
 	area = get_xol_area();
 	if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 		return 0;
 
 	/* Initialize the slot */
-	offset = xol_vaddr & ~PAGE_MASK;
-	vaddr = kmap_atomic(area->page);
-	memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
-	kunmap_atomic(vaddr);
+	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
 	/*
 	 * We probably need flush_icache_user_range() but it needs vma.
 	 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
 void uprobe_free_utask(struct task_struct *t)
 {
 	struct uprobe_task *utask = t->utask;
+	struct return_instance *ri, *tmp;
 
 	if (!utask)
 		return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
 	if (utask->active_uprobe)
 		put_uprobe(utask->active_uprobe);
 
+	ri = utask->return_instances;
+	while (ri) {
+		tmp = ri;
+		ri = ri->next;
+
+		put_uprobe(tmp->uprobe);
+		kfree(tmp);
+	}
+
 	xol_free_insn_slot(t);
 	kfree(utask);
 	t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+	struct xol_area *area;
+	unsigned long trampoline_vaddr = -1;
+
+	area = current->mm->uprobes_state.xol_area;
+	smp_read_barrier_depends();
+	if (area)
+		trampoline_vaddr = area->vaddr;
+
+	return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	struct return_instance *ri;
+	struct uprobe_task *utask;
+	unsigned long orig_ret_vaddr, trampoline_vaddr;
+	bool chained = false;
+
+	if (!get_xol_area())
+		return;
+
+	utask = get_utask();
+	if (!utask)
+		return;
+
+	if (utask->depth >= MAX_URETPROBE_DEPTH) {
+		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+				" nestedness limit pid/tgid=%d/%d\n",
+				current->pid, current->tgid);
+		return;
+	}
+
+	ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+	if (!ri)
+		goto fail;
+
+	trampoline_vaddr = get_trampoline_vaddr();
+	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+	if (orig_ret_vaddr == -1)
+		goto fail;
+
+	/*
+	 * We don't want to keep trampoline address in stack, rather keep the
+	 * original return address of first caller thru all the consequent
+	 * instances. This also makes breakpoint unwrapping easier.
+	 */
+	if (orig_ret_vaddr == trampoline_vaddr) {
+		if (!utask->return_instances) {
+			/*
+			 * This situation is not possible. Likely we have an
+			 * attack from user-space.
+			 */
+			pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+						current->pid, current->tgid);
+			goto fail;
+		}
+
+		chained = true;
+		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+	}
+
+	atomic_inc(&uprobe->ref);
+	ri->uprobe = uprobe;
+	ri->func = instruction_pointer(regs);
+	ri->orig_ret_vaddr = orig_ret_vaddr;
+	ri->chained = chained;
+
+	utask->depth++;
+
+	/* add instance to the stack */
+	ri->next = utask->return_instances;
+	utask->return_instances = ri;
+
+	return;
+
+ fail:
+	kfree(ri);
+}
+
 /* Prepare to single-step probed instruction out of line. */
 static int
 pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
 	clear_bit(MMF_HAS_UPROBES, &mm->flags);
 }
 
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 {
 	struct page *page;
 	uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	if (result < 0)
 		return result;
 
-	copy_opcode(page, vaddr, &opcode);
+	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 	put_page(page);
  out:
-	return is_swbp_insn(&opcode);
+	/* This needs to return true for any variant of the trap insn */
+	return is_trap_insn(&opcode);
 }
 
 static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
 	vma = find_vma(mm, bp_vaddr);
 	if (vma && vma->vm_start <= bp_vaddr) {
 		if (valid_vma(vma, false)) {
-			struct inode *inode = vma->vm_file->f_mapping->host;
+			struct inode *inode = file_inode(vma->vm_file);
 			loff_t offset = vaddr_to_offset(vma, bp_vaddr);
 
 			uprobe = find_uprobe(inode, offset);
 		}
 
 		if (!uprobe)
-			*is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
 	} else {
 		*is_swbp = -EFAULT;
 	}
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 {
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
+	bool need_prep = false; /* prepare return uprobe, when needed */
 
 	down_read(&uprobe->register_rwsem);
 	for (uc = uprobe->consumers; uc; uc = uc->next) {
-		int rc = uc->handler(uc, regs);
+		int rc = 0;
+
+		if (uc->handler) {
+			rc = uc->handler(uc, regs);
+			WARN(rc & ~UPROBE_HANDLER_MASK,
+				"bad rc=0x%x from %pf()\n", rc, uc->handler);
+		}
+
+		if (uc->ret_handler)
+			need_prep = true;
 
-		WARN(rc & ~UPROBE_HANDLER_MASK,
-			"bad rc=0x%x from %pf()\n", rc, uc->handler);
 		remove &= rc;
 	}
 
+	if (need_prep && !remove)
+		prepare_uretprobe(uprobe, regs); /* put bp at return */
+
 	if (remove && uprobe->consumers) {
 		WARN_ON(!uprobe_is_active(uprobe));
 		unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 	up_read(&uprobe->register_rwsem);
 }
 
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+	struct uprobe *uprobe = ri->uprobe;
+	struct uprobe_consumer *uc;
+
+	down_read(&uprobe->register_rwsem);
+	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		if (uc->ret_handler)
+			uc->ret_handler(uc, ri->func, regs);
+	}
+	up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+	struct uprobe_task *utask;
+	struct return_instance *ri, *tmp;
+	bool chained;
+
+	utask = current->utask;
+	if (!utask)
+		return false;
+
+	ri = utask->return_instances;
+	if (!ri)
+		return false;
+
+	/*
+	 * TODO: we should throw out return_instance's invalidated by
+	 * longjmp(), currently we assume that the probed function always
+	 * returns.
+	 */
+	instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+	for (;;) {
+		handle_uretprobe_chain(ri, regs);
+
+		chained = ri->chained;
+		put_uprobe(ri->uprobe);
+
+		tmp = ri;
+		ri = ri->next;
+		kfree(tmp);
+
+		if (!chained)
+			break;
+
+		utask->depth--;
+
+		BUG_ON(!ri);
+	}
+
+	utask->return_instances = ri;
+
+	return true;
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
 	int uninitialized_var(is_swbp);
 
 	bp_vaddr = uprobe_get_swbp_addr(regs);
-	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+	if (bp_vaddr == get_trampoline_vaddr()) {
+		if (handle_trampoline(regs))
+			return;
+
+		pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+				current->pid, current->tgid);
+	}
 
+	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
 	if (!uprobe) {
 		if (is_swbp > 0) {
 			/* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
  */
 int uprobe_pre_sstep_notifier(struct pt_regs *regs)
 {
-	if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
+	if (!current->mm)
+		return 0;
+
+	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+	    (!current->utask || !current->utask->return_instances))
 		return 0;
 
 	set_thread_flag(TIF_UPROBE);
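
Taken together, the uprobes changes add return probes: a consumer may supply a ret_handler in addition to (or instead of) handler, uprobe_register() now rejects consumers that set neither, and handler_chain()/handle_trampoline() call the two hooks at function entry and at return through the trampoline slot carved out of the xol area. A rough sketch of how an in-kernel user might wire this up (illustrative only; inode/offset lookup, module init/exit and error handling are omitted):

#include <linux/module.h>
#include <linux/sched.h>
#include <linux/uprobes.h>

/* entry hook: runs when the probed instruction is hit */
static int sample_handler(struct uprobe_consumer *uc, struct pt_regs *regs)
{
	pr_info("uprobe hit in %s\n", current->comm);
	return 0;		/* 0: keep the probe installed */
}

/* return hook: runs via the uretprobe trampoline when the function returns */
static int sample_ret_handler(struct uprobe_consumer *uc, unsigned long func,
			      struct pt_regs *regs)
{
	pr_info("return from function at 0x%lx\n", func);
	return 0;
}

static struct uprobe_consumer sample_consumer = {
	.handler	= sample_handler,
	.ret_handler	= sample_ret_handler,
};

/*
 * inode/offset identify the probed instruction inside the executable file;
 * with neither .handler nor .ret_handler set this would now fail with -EINVAL.
 */
static int sample_install(struct inode *inode, loff_t offset)
{
	return uprobe_register(inode, offset, &sample_consumer);
}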