Diffstat (limited to 'kernel')

-rw-r--r--   kernel/events/core.c         | 156
-rw-r--r--   kernel/events/internal.h     |  35
-rw-r--r--   kernel/events/ring_buffer.c  | 101
-rw-r--r--   kernel/events/uprobes.c      | 223
-rw-r--r--   kernel/fork.c                |   2
-rw-r--r--   kernel/sysctl.c              |   1

6 files changed, 314 insertions(+), 204 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 663f43a20f73..8c875ef6e120 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
 
-static atomic_t perf_sample_allowed_ns __read_mostly =
-	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+static int perf_sample_allowed_ns __read_mostly =
+	DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
 
 void update_perf_cpu_limits(void)
 {
@@ -184,7 +184,7 @@ void update_perf_cpu_limits(void)
 
 	tmp *= sysctl_perf_cpu_time_max_percent;
 	do_div(tmp, 100);
-	atomic_set(&perf_sample_allowed_ns, tmp);
+	ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
 }
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
@@ -193,7 +193,7 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
 	if (ret || !write)
 		return ret;
@@ -228,14 +228,15 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
  * we detect that events are taking too long.
  */
 #define NR_ACCUMULATED_SAMPLES 128
-DEFINE_PER_CPU(u64, running_sample_length);
+static DEFINE_PER_CPU(u64, running_sample_length);
 
 void perf_sample_event_took(u64 sample_len_ns)
 {
 	u64 avg_local_sample_len;
 	u64 local_samples_len;
+	u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
 
-	if (atomic_read(&perf_sample_allowed_ns) == 0)
+	if (allowed_ns == 0)
 		return;
 
 	/* decay the counter by 1 average sample */
@@ -251,7 +252,7 @@ void perf_sample_event_took(u64 sample_len_ns)
 	 */
 	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
 
-	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+	if (avg_local_sample_len <= allowed_ns)
 		return;
 
 	if (max_samples_per_tick <= 1)
@@ -262,10 +263,9 @@ void perf_sample_event_took(u64 sample_len_ns)
 	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
 
 	printk_ratelimited(KERN_WARNING
-			"perf samples too long (%lld > %d), lowering "
+			"perf samples too long (%lld > %lld), lowering "
 			"kernel.perf_event_max_sample_rate to %d\n",
-			avg_local_sample_len,
-			atomic_read(&perf_sample_allowed_ns),
+			avg_local_sample_len, allowed_ns,
 			sysctl_perf_event_sample_rate);
 
 	update_perf_cpu_limits();
@@ -899,6 +899,7 @@ static void unclone_ctx(struct perf_event_context *ctx)
 		put_ctx(ctx->parent_ctx);
 		ctx->parent_ctx = NULL;
 	}
+	ctx->generation++;
 }
 
 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
@@ -1136,6 +1137,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
+
+	ctx->generation++;
 }
 
 /*
@@ -1201,6 +1204,9 @@ static void perf_event__header_size(struct perf_event *event)
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		size += sizeof(data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		size += sizeof(data->txn);
+
 	event->header_size = size;
 }
 
@@ -1310,6 +1316,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	 */
 	if (event->state > PERF_EVENT_STATE_OFF)
 		event->state = PERF_EVENT_STATE_OFF;
+
+	ctx->generation++;
 }
 
 static void perf_group_detach(struct perf_event *event)
@@ -2146,22 +2154,38 @@ static void ctx_sched_out(struct perf_event_context *ctx,
 }
 
 /*
- * Test whether two contexts are equivalent, i.e. whether they
- * have both been cloned from the same version of the same context
- * and they both have the same number of enabled events.
- * If the number of enabled events is the same, then the set
- * of enabled events should be the same, because these are both
- * inherited contexts, therefore we can't access individual events
- * in them directly with an fd; we can only enable/disable all
- * events via prctl, or enable/disable all events in a family
- * via ioctl, which will have the same effect on both contexts.
+ * Test whether two contexts are equivalent, i.e. whether they have both been
+ * cloned from the same version of the same context.
+ *
+ * Equivalence is measured using a generation number in the context that is
+ * incremented on each modification to it; see unclone_ctx(), list_add_event()
+ * and list_del_event().
  */
 static int context_equiv(struct perf_event_context *ctx1,
 			 struct perf_event_context *ctx2)
 {
-	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& !ctx1->pin_count && !ctx2->pin_count;
+	/* Pinning disables the swap optimization */
+	if (ctx1->pin_count || ctx2->pin_count)
+		return 0;
+
+	/* If ctx1 is the parent of ctx2 */
+	if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
+		return 1;
+
+	/* If ctx2 is the parent of ctx1 */
+	if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
+		return 1;
+
+	/*
+	 * If ctx1 and ctx2 have the same parent; we flatten the parent
+	 * hierarchy, see perf_event_init_context().
+	 */
+	if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
+			ctx1->parent_gen == ctx2->parent_gen)
+		return 1;
+
+	/* Unmatched */
+	return 0;
 }
 
 static void __perf_event_sync_stat(struct perf_event *event,
@@ -2244,7 +2268,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 {
 	struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
 	struct perf_event_context *next_ctx;
-	struct perf_event_context *parent;
+	struct perf_event_context *parent, *next_parent;
 	struct perf_cpu_context *cpuctx;
 	int do_switch = 1;
 
@@ -2256,10 +2280,18 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		return;
 
 	rcu_read_lock();
-	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_event_ctxp[ctxn];
-	if (parent && next_ctx &&
-	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+	if (!next_ctx)
+		goto unlock;
+
+	parent = rcu_dereference(ctx->parent_ctx);
+	next_parent = rcu_dereference(next_ctx->parent_ctx);
+
+	/* If neither context have a parent context; they cannot be clones. */
+	if (!parent && !next_parent)
+		goto unlock;
+
+	if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
 		/*
 		 * Looks like the two contexts are clones, so we might be
 		 * able to optimize the context switch. We lock both
@@ -2287,6 +2319,7 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
 		raw_spin_unlock(&next_ctx->lock);
 		raw_spin_unlock(&ctx->lock);
 	}
+unlock:
 	rcu_read_unlock();
 
 	if (do_switch) {
@@ -4572,6 +4605,9 @@ void perf_output_sample(struct perf_output_handle *handle,
 	if (sample_type & PERF_SAMPLE_DATA_SRC)
 		perf_output_put(handle, data->data_src.val);
 
+	if (sample_type & PERF_SAMPLE_TRANSACTION)
+		perf_output_put(handle, data->txn);
+
 	if (!event->attr.watermark) {
 		int wakeup_events = event->attr.wakeup_events;
 
@@ -5100,27 +5136,26 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	const char *name;
-
-	memset(tmp, 0, sizeof(tmp));
+	char *name;
 
 	if (file) {
 		struct inode *inode;
 		dev_t dev;
+
+		buf = kmalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = "//enomem";
+			goto cpy_name;
+		}
 		/*
-		 * d_path works from the end of the rb backwards, so we
+		 * d_path() works from the end of the rb backwards, so we
 		 * need to add enough zero bytes after the string to handle
 		 * the 64bit alignment we do later.
 		 */
-		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
-		if (!buf) {
-			name = strncpy(tmp, "//enomem", sizeof(tmp));
-			goto got_name;
-		}
-		name = d_path(&file->f_path, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
 		if (IS_ERR(name)) {
-			name = strncpy(tmp, "//toolong", sizeof(tmp));
-			goto got_name;
+			name = "//toolong";
+			goto cpy_name;
 		}
 		inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
@@ -5128,34 +5163,39 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
 		gen = inode->i_generation;
 		maj = MAJOR(dev);
 		min = MINOR(dev);
-
+		goto got_name;
 	} else {
-		if (arch_vma_name(mmap_event->vma)) {
-			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
-				       sizeof(tmp) - 1);
-			tmp[sizeof(tmp) - 1] = '\0';
-			goto got_name;
-		}
+		name = (char *)arch_vma_name(vma);
+		if (name)
+			goto cpy_name;
 
-		if (!vma->vm_mm) {
-			name = strncpy(tmp, "[vdso]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_brk &&
+		if (vma->vm_start <= vma->vm_mm->start_brk &&
 				vma->vm_end >= vma->vm_mm->brk) {
-			name = strncpy(tmp, "[heap]", sizeof(tmp));
-			goto got_name;
-		} else if (vma->vm_start <= vma->vm_mm->start_stack &&
+			name = "[heap]";
+			goto cpy_name;
+		}
+		if (vma->vm_start <= vma->vm_mm->start_stack &&
 				vma->vm_end >= vma->vm_mm->start_stack) {
-			name = strncpy(tmp, "[stack]", sizeof(tmp));
-			goto got_name;
+			name = "[stack]";
+			goto cpy_name;
 		}
 
-		name = strncpy(tmp, "//anon", sizeof(tmp));
-		goto got_name;
+		name = "//anon";
+		goto cpy_name;
 	}
 
+cpy_name:
+	strlcpy(tmp, name, sizeof(tmp));
+	name = tmp;
 got_name:
-	size = ALIGN(strlen(name)+1, sizeof(u64));
+	/*
+	 * Since our buffer works in 8 byte units we need to align our string
+	 * size to a multiple of 8. However, we must guarantee the tail end is
+	 * zero'd out to avoid leaking random bits to userspace.
+	 */
+	size = strlen(name)+1;
+	while (!IS_ALIGNED(size, sizeof(u64)))
+		name[size++] = '\0';
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
@@ -7129,7 +7169,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	}
 
 	perf_install_in_context(ctx, event, event->cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
@@ -7212,7 +7251,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, event, cpu);
-	++ctx->generation;
 	perf_unpin_context(ctx);
 	mutex_unlock(&ctx->mutex);
 
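
The got_name block above replaces ALIGN() with explicit zero padding, so the 8-byte-aligned tail of the file name never carries stale buffer bytes into the user-visible record. A minimal user-space sketch of that padding step, assuming an ordinary NUL-terminated string with at least 7 spare bytes of buffer (the helper name pad_to_u64 is made up for illustration):

#include <stdio.h>
#include <string.h>

/*
 * Round the string size up to a multiple of 8 and zero-fill the tail,
 * like the while (!IS_ALIGNED(...)) loop after got_name: above.
 * The buffer must have room for up to 7 extra NUL bytes.
 */
static size_t pad_to_u64(char *name)
{
	size_t size = strlen(name) + 1;

	while (size % 8)
		name[size++] = '\0';

	return size;
}

int main(void)
{
	char buf[32] = "/bin/true";

	printf("padded size: %zu\n", pad_to_u64(buf));	/* prints 16 */
	return 0;
}
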
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index ca6599723be5..569b218782ad 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -82,16 +82,16 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb)
 }
 
 #define DEFINE_OUTPUT_COPY(func_name, memcpy_func)			\
-static inline unsigned int						\
+static inline unsigned long						\
 func_name(struct perf_output_handle *handle,				\
-	  const void *buf, unsigned int len)				\
+	  const void *buf, unsigned long len)				\
 {									\
 	unsigned long size, written;					\
 									\
 	do {								\
-		size = min_t(unsigned long, handle->size, len);		\
-									\
+		size = min(handle->size, len);				\
 		written = memcpy_func(handle->addr, buf, size);		\
+		written = size - written;				\
 									\
 		len -= written;						\
 		handle->addr += written;				\
@@ -110,20 +110,37 @@ func_name(struct perf_output_handle *handle, \
 	return len;							\
 }
 
-static inline int memcpy_common(void *dst, const void *src, size_t n)
+static inline unsigned long
+memcpy_common(void *dst, const void *src, unsigned long n)
 {
 	memcpy(dst, src, n);
-	return n;
+	return 0;
 }
 
 DEFINE_OUTPUT_COPY(__output_copy, memcpy_common)
 
-#define MEMCPY_SKIP(dst, src, n) (n)
+static inline unsigned long
+memcpy_skip(void *dst, const void *src, unsigned long n)
+{
+	return 0;
+}
 
-DEFINE_OUTPUT_COPY(__output_skip, MEMCPY_SKIP)
+DEFINE_OUTPUT_COPY(__output_skip, memcpy_skip)
 
 #ifndef arch_perf_out_copy_user
-#define arch_perf_out_copy_user __copy_from_user_inatomic
+#define arch_perf_out_copy_user arch_perf_out_copy_user
+
+static inline unsigned long
+arch_perf_out_copy_user(void *dst, const void *src, unsigned long n)
+{
+	unsigned long ret;
+
+	pagefault_disable();
+	ret = __copy_from_user_inatomic(dst, src, n);
+	pagefault_enable();
+
+	return ret;
+}
 #endif
 
 DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
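
The reworked DEFINE_OUTPUT_COPY() assumes every memcpy_func returns the number of bytes it could NOT copy, matching the __copy_from_user_inatomic() convention; that is why memcpy_common() now returns 0 and the macro computes written = size - written. A deliberately simplified user-space sketch of a copy loop built on that convention (names and the single-buffer loop are illustrative, not the kernel API):

#include <stdio.h>
#include <string.h>

/* Copy everything; return the number of bytes NOT copied (0 on success). */
static size_t copy_chunk(void *dst, const void *src, size_t n)
{
	memcpy(dst, src, n);
	return 0;
}

/* Return how many of "len" bytes could not be stored in dst. */
static size_t output_copy(char *dst, size_t room, const char *src, size_t len)
{
	do {
		size_t size = room < len ? room : len;
		size_t not_copied = copy_chunk(dst, src, size);
		size_t written = size - not_copied;	/* bytes actually copied */

		len -= written;
		dst += written;
		src += written;
		room -= written;
	} while (len && room);

	return len;
}

int main(void)
{
	char out[8];

	/* 11 bytes offered, 8 fit: 3 are reported as left over. */
	printf("left over: %zu\n", output_copy(out, sizeof(out), "hello world", 11));
	return 0;
}
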
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 9c2ddfbf4525..e8b168af135b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -12,40 +12,10 @@
 #include <linux/perf_event.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
+#include <linux/circ_buf.h>
 
 #include "internal.h"
 
-static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
-			      unsigned long offset, unsigned long head)
-{
-	unsigned long sz = perf_data_size(rb);
-	unsigned long mask = sz - 1;
-
-	/*
-	 * check if user-writable
-	 * overwrite : over-write its own tail
-	 * !overwrite: buffer possibly drops events.
-	 */
-	if (rb->overwrite)
-		return true;
-
-	/*
-	 * verify that payload is not bigger than buffer
-	 * otherwise masking logic may fail to detect
-	 * the "not enough space" condition
-	 */
-	if ((head - offset) > sz)
-		return false;
-
-	offset = (offset - tail) & mask;
-	head = (head - tail) & mask;
-
-	if ((int)(head - offset) < 0)
-		return false;
-
-	return true;
-}
-
 static void perf_output_wakeup(struct perf_output_handle *handle)
 {
 	atomic_set(&handle->rb->poll, POLL_IN);
@@ -115,8 +85,8 @@ again:
 	rb->user_page->data_head = head;
 
 	/*
-	 * Now check if we missed an update, rely on the (compiler)
-	 * barrier in atomic_dec_and_test() to re-read rb->head.
+	 * Now check if we missed an update -- rely on previous implied
+	 * compiler barriers to force a re-read.
 	 */
 	if (unlikely(head != local_read(&rb->head))) {
 		local_inc(&rb->nest);
@@ -135,8 +105,7 @@ int perf_output_begin(struct perf_output_handle *handle,
 {
 	struct ring_buffer *rb;
 	unsigned long tail, offset, head;
-	int have_lost;
-	struct perf_sample_data sample_data;
+	int have_lost, page_shift;
 	struct {
 		struct perf_event_header header;
 		u64 id;
@@ -151,57 +120,63 @@ int perf_output_begin(struct perf_output_handle *handle,
 		event = event->parent;
 
 	rb = rcu_dereference(event->rb);
-	if (!rb)
+	if (unlikely(!rb))
 		goto out;
 
-	handle->rb = rb;
-	handle->event = event;
-
-	if (!rb->nr_pages)
+	if (unlikely(!rb->nr_pages))
 		goto out;
 
+	handle->rb = rb;
+	handle->event = event;
+
 	have_lost = local_read(&rb->lost);
-	if (have_lost) {
-		lost_event.header.size = sizeof(lost_event);
-		perf_event_header__init_id(&lost_event.header, &sample_data,
-					   event);
-		size += lost_event.header.size;
+	if (unlikely(have_lost)) {
+		size += sizeof(lost_event);
+		if (event->attr.sample_id_all)
+			size += event->id_header_size;
 	}
 
 	perf_output_get_handle(handle);
 
 	do {
-		/*
-		 * Userspace could choose to issue a mb() before updating the
-		 * tail pointer. So that all reads will be completed before the
-		 * write is issued.
-		 *
-		 * See perf_output_put_handle().
-		 */
 		tail = ACCESS_ONCE(rb->user_page->data_tail);
-		smp_mb();
 		offset = head = local_read(&rb->head);
-		head += size;
-		if (unlikely(!perf_output_space(rb, tail, offset, head)))
+		if (!rb->overwrite &&
+		    unlikely(CIRC_SPACE(head, tail, perf_data_size(rb)) < size))
 			goto fail;
+		head += size;
 	} while (local_cmpxchg(&rb->head, offset, head) != offset);
 
-	if (head - local_read(&rb->wakeup) > rb->watermark)
+	/*
+	 * Separate the userpage->tail read from the data stores below.
+	 * Matches the MB userspace SHOULD issue after reading the data
+	 * and before storing the new tail position.
+	 *
+	 * See perf_output_put_handle().
+	 */
+	smp_mb();
+
+	if (unlikely(head - local_read(&rb->wakeup) > rb->watermark))
 		local_add(rb->watermark, &rb->wakeup);
 
-	handle->page = offset >> (PAGE_SHIFT + page_order(rb));
-	handle->page &= rb->nr_pages - 1;
-	handle->size = offset & ((PAGE_SIZE << page_order(rb)) - 1);
-	handle->addr = rb->data_pages[handle->page];
-	handle->addr += handle->size;
-	handle->size = (PAGE_SIZE << page_order(rb)) - handle->size;
+	page_shift = PAGE_SHIFT + page_order(rb);
 
-	if (have_lost) {
+	handle->page = (offset >> page_shift) & (rb->nr_pages - 1);
+	offset &= (1UL << page_shift) - 1;
+	handle->addr = rb->data_pages[handle->page] + offset;
+	handle->size = (1UL << page_shift) - offset;
+
+	if (unlikely(have_lost)) {
+		struct perf_sample_data sample_data;
+
+		lost_event.header.size = sizeof(lost_event);
 		lost_event.header.type = PERF_RECORD_LOST;
 		lost_event.header.misc = 0;
 		lost_event.id = event->id;
 		lost_event.lost = local_xchg(&rb->lost, 0);
 
+		perf_event_header__init_id(&lost_event.header,
+					   &sample_data, event);
 		perf_output_put(handle, lost_event);
 		perf_event__output_id_sample(event, handle, &sample_data);
 	}
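
perf_output_begin() now rejects a record up front with CIRC_SPACE() instead of the removed perf_output_space(); with a power-of-two buffer and free-running head/tail offsets, the masking handles wrap-around for free. A small user-space sketch of the same arithmetic, with the macro bodies reproduced from include/linux/circ_buf.h and made-up example numbers:

#include <stdio.h>

/* As in include/linux/circ_buf.h; size must be a power of two. */
#define CIRC_CNT(head, tail, size)   (((head) - (tail)) & ((size) - 1))
#define CIRC_SPACE(head, tail, size) CIRC_CNT((tail), ((head) + 1), (size))

int main(void)
{
	unsigned long size = 4096;			/* buffer bytes */
	unsigned long head = 4000, tail = 100;		/* free-running offsets */
	unsigned long record = 512;

	if (CIRC_SPACE(head, tail, size) < record)
		puts("no room: drop the record and count it as lost");	/* taken here */
	else
		puts("record fits");

	return 0;
}
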
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index ad8e1bdca70e..24b7d6ca871b 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -35,6 +35,7 @@
 #include <linux/kdebug.h>	/* notifier mechanism */
 #include "../../mm/internal.h"	/* munlock_vma_page */
 #include <linux/percpu-rwsem.h>
+#include <linux/task_work.h>
 
 #include <linux/uprobes.h>
 
@@ -244,12 +245,12 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * the architecture. If an arch has variable length instruction and the
  * breakpoint instruction is not of the smallest length instruction
  * supported by that architecture then we need to modify is_trap_at_addr and
- * write_opcode accordingly. This would never be a problem for archs that
- * have fixed length instructions.
+ * uprobe_write_opcode accordingly. This would never be a problem for archs
+ * that have fixed length instructions.
  */
 
 /*
- * write_opcode - write the opcode at a given virtual address.
+ * uprobe_write_opcode - write the opcode at a given virtual address.
  * @mm: the probed process address space.
  * @vaddr: the virtual address to store the opcode.
  * @opcode: opcode to be written at @vaddr.
@@ -260,7 +261,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * For mm @mm, write the opcode at @vaddr.
  * Return 0 (success) or a negative errno.
  */
-static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
+int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
 			uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
@@ -314,7 +315,7 @@ put_old:
  */
 int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-	return write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
+	return uprobe_write_opcode(mm, vaddr, UPROBE_SWBP_INSN);
 }
 
 /**
@@ -329,7 +330,7 @@ int __weak set_swbp(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned
 int __weak
 set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long vaddr)
 {
-	return write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
+	return uprobe_write_opcode(mm, vaddr, *(uprobe_opcode_t *)auprobe->insn);
 }
 
 static int match_uprobe(struct uprobe *l, struct uprobe *r)
@@ -503,9 +504,8 @@ static bool consumer_del(struct uprobe *uprobe, struct uprobe_consumer *uc)
 	return ret;
 }
 
-static int
-__copy_insn(struct address_space *mapping, struct file *filp, char *insn,
-			unsigned long nbytes, loff_t offset)
+static int __copy_insn(struct address_space *mapping, struct file *filp,
+			void *insn, int nbytes, loff_t offset)
 {
 	struct page *page;
 
@@ -527,28 +527,28 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 
 static int copy_insn(struct uprobe *uprobe, struct file *filp)
 {
-	struct address_space *mapping;
-	unsigned long nbytes;
-	int bytes;
-
-	nbytes = PAGE_SIZE - (uprobe->offset & ~PAGE_MASK);
-	mapping = uprobe->inode->i_mapping;
+	struct address_space *mapping = uprobe->inode->i_mapping;
+	loff_t offs = uprobe->offset;
+	void *insn = uprobe->arch.insn;
+	int size = MAX_UINSN_BYTES;
+	int len, err = -EIO;
 
-	/* Instruction at end of binary; copy only available bytes */
-	if (uprobe->offset + MAX_UINSN_BYTES > uprobe->inode->i_size)
-		bytes = uprobe->inode->i_size - uprobe->offset;
-	else
-		bytes = MAX_UINSN_BYTES;
+	/* Copy only available bytes, -EIO if nothing was read */
+	do {
+		if (offs >= i_size_read(uprobe->inode))
+			break;
 
-	/* Instruction at the page-boundary; copy bytes in second page */
-	if (nbytes < bytes) {
-		int err = __copy_insn(mapping, filp, uprobe->arch.insn + nbytes,
-				bytes - nbytes, uprobe->offset + nbytes);
+		len = min_t(int, size, PAGE_SIZE - (offs & ~PAGE_MASK));
+		err = __copy_insn(mapping, filp, insn, len, offs);
 		if (err)
-			return err;
-		bytes = nbytes;
-	}
-	return __copy_insn(mapping, filp, uprobe->arch.insn, bytes, uprobe->offset);
+			break;
+
+		insn += len;
+		offs += len;
+		size -= len;
+	} while (size);
+
+	return err;
 }
 
 static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
@@ -576,7 +576,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 	if (ret)
 		goto out;
 
-	/* write_opcode() assumes we don't cross page boundary */
+	/* uprobe_write_opcode() assumes we don't cross page boundary */
 	BUG_ON((uprobe->offset & ~PAGE_MASK) +
 			UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
 
@@ -1096,21 +1096,22 @@ void uprobe_munmap(struct vm_area_struct *vma, unsigned long start, unsigned lon
 }
 
 /* Slot allocation for XOL */
-static int xol_add_vma(struct xol_area *area)
+static int xol_add_vma(struct mm_struct *mm, struct xol_area *area)
 {
-	struct mm_struct *mm = current->mm;
 	int ret = -EALREADY;
 
 	down_write(&mm->mmap_sem);
 	if (mm->uprobes_state.xol_area)
 		goto fail;
 
-	ret = -ENOMEM;
-	/* Try to map as high as possible, this is only a hint. */
-	area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE, PAGE_SIZE, 0, 0);
-	if (area->vaddr & ~PAGE_MASK) {
-		ret = area->vaddr;
-		goto fail;
+	if (!area->vaddr) {
+		/* Try to map as high as possible, this is only a hint. */
+		area->vaddr = get_unmapped_area(NULL, TASK_SIZE - PAGE_SIZE,
+						PAGE_SIZE, 0, 0);
+		if (area->vaddr & ~PAGE_MASK) {
+			ret = area->vaddr;
+			goto fail;
+		}
 	}
 
 	ret = install_special_mapping(mm, area->vaddr, PAGE_SIZE,
@@ -1120,30 +1121,19 @@ static int xol_add_vma(struct xol_area *area)
 
 	smp_wmb();	/* pairs with get_xol_area() */
 	mm->uprobes_state.xol_area = area;
-	ret = 0;
  fail:
 	up_write(&mm->mmap_sem);
 
 	return ret;
 }
 
-/*
- * get_xol_area - Allocate process's xol_area if necessary.
- * This area will be used for storing instructions for execution out of line.
- *
- * Returns the allocated area or NULL.
- */
-static struct xol_area *get_xol_area(void)
+static struct xol_area *__create_xol_area(unsigned long vaddr)
 {
 	struct mm_struct *mm = current->mm;
-	struct xol_area *area;
 	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
+	struct xol_area *area;
 
-	area = mm->uprobes_state.xol_area;
-	if (area)
-		goto ret;
-
-	area = kzalloc(sizeof(*area), GFP_KERNEL);
+	area = kmalloc(sizeof(*area), GFP_KERNEL);
 	if (unlikely(!area))
 		goto out;
 
@@ -1155,13 +1145,14 @@ static struct xol_area *get_xol_area(void)
 	if (!area->page)
 		goto free_bitmap;
 
-	/* allocate first slot of task's xol_area for the return probes */
+	area->vaddr = vaddr;
+	init_waitqueue_head(&area->wq);
+	/* Reserve the 1st slot for get_trampoline_vaddr() */
 	set_bit(0, area->bitmap);
-	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
 	atomic_set(&area->slot_count, 1);
-	init_waitqueue_head(&area->wq);
+	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
 
-	if (!xol_add_vma(area))
+	if (!xol_add_vma(mm, area))
 		return area;
 
 	__free_page(area->page);
@@ -1170,9 +1161,25 @@ static struct xol_area *get_xol_area(void)
  free_area:
 	kfree(area);
  out:
+	return NULL;
+}
+
+/*
+ * get_xol_area - Allocate process's xol_area if necessary.
+ * This area will be used for storing instructions for execution out of line.
+ *
+ * Returns the allocated area or NULL.
+ */
+static struct xol_area *get_xol_area(void)
+{
+	struct mm_struct *mm = current->mm;
+	struct xol_area *area;
+
+	if (!mm->uprobes_state.xol_area)
+		__create_xol_area(0);
+
 	area = mm->uprobes_state.xol_area;
- ret:
-	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */
+	smp_read_barrier_depends();	/* pairs with wmb in xol_add_vma() */
 	return area;
 }
 
@@ -1256,7 +1263,8 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 		return 0;
 
 	/* Initialize the slot */
-	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
+	copy_to_page(area->page, xol_vaddr,
+			uprobe->arch.ixol, sizeof(uprobe->arch.ixol));
 	/*
 	 * We probably need flush_icache_user_range() but it needs vma.
 	 * This should work on supported architectures too.
@@ -1345,14 +1353,6 @@ void uprobe_free_utask(struct task_struct *t)
 }
 
 /*
- * Called in context of a new clone/fork from copy_process.
- */
-void uprobe_copy_process(struct task_struct *t)
-{
-	t->utask = NULL;
-}
-
-/*
  * Allocate a uprobe_task object for the task if if necessary.
  * Called when the thread hits a breakpoint.
  *
@@ -1367,6 +1367,90 @@ static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+static int dup_utask(struct task_struct *t, struct uprobe_task *o_utask)
+{
+	struct uprobe_task *n_utask;
+	struct return_instance **p, *o, *n;
+
+	n_utask = kzalloc(sizeof(struct uprobe_task), GFP_KERNEL);
+	if (!n_utask)
+		return -ENOMEM;
+	t->utask = n_utask;
+
+	p = &n_utask->return_instances;
+	for (o = o_utask->return_instances; o; o = o->next) {
+		n = kmalloc(sizeof(struct return_instance), GFP_KERNEL);
+		if (!n)
+			return -ENOMEM;
+
+		*n = *o;
+		atomic_inc(&n->uprobe->ref);
+		n->next = NULL;
+
+		*p = n;
+		p = &n->next;
+		n_utask->depth++;
+	}
+
+	return 0;
+}
+
+static void uprobe_warn(struct task_struct *t, const char *msg)
+{
+	pr_warn("uprobe: %s:%d failed to %s\n",
+			current->comm, current->pid, msg);
+}
+
+static void dup_xol_work(struct callback_head *work)
+{
+	kfree(work);
+
+	if (current->flags & PF_EXITING)
+		return;
+
+	if (!__create_xol_area(current->utask->vaddr))
+		uprobe_warn(current, "dup xol area");
+}
+
+/*
+ * Called in context of a new clone/fork from copy_process.
+ */
+void uprobe_copy_process(struct task_struct *t, unsigned long flags)
+{
+	struct uprobe_task *utask = current->utask;
+	struct mm_struct *mm = current->mm;
+	struct callback_head *work;
+	struct xol_area *area;
+
+	t->utask = NULL;
+
+	if (!utask || !utask->return_instances)
+		return;
+
+	if (mm == t->mm && !(flags & CLONE_VFORK))
+		return;
+
+	if (dup_utask(t, utask))
+		return uprobe_warn(t, "dup ret instances");
+
+	/* The task can fork() after dup_xol_work() fails */
+	area = mm->uprobes_state.xol_area;
+	if (!area)
+		return uprobe_warn(t, "dup xol area");
+
+	if (mm == t->mm)
+		return;
+
+	/* TODO: move it into the union in uprobe_task */
+	work = kmalloc(sizeof(*work), GFP_KERNEL);
+	if (!work)
+		return uprobe_warn(t, "dup xol area");
+
+	t->utask->vaddr = area->vaddr;
+	init_task_work(work, dup_xol_work);
+	task_work_add(t, work, true);
+}
+
 /*
  * Current area->vaddr notion assume the trampoline address is always
  * equal area->vaddr.
@@ -1857,9 +1941,4 @@ static int __init init_uprobes(void)
 
 	return register_die_notifier(&uprobe_exception_nb);
 }
-module_init(init_uprobes);
-
-static void __exit exit_uprobes(void)
-{
-}
-module_exit(exit_uprobes);
+__initcall(init_uprobes);
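
dup_utask() above duplicates the parent's return_instance chain with a tail pointer-to-pointer, so the copy keeps the original order without tracking a separate "last node" variable. A stripped-down user-space sketch of that list-copy idiom (struct node and dup_list are stand-ins, not the kernel types):

#include <stdio.h>
#include <stdlib.h>

struct node {
	int val;
	struct node *next;
};

/* Duplicate a singly-linked list, preserving order, via a tail pointer-to-pointer. */
static struct node *dup_list(const struct node *o)
{
	struct node *head = NULL, **p = &head;

	for (; o; o = o->next) {
		struct node *n = malloc(sizeof(*n));

		if (!n)
			break;			/* partial copy, as in dup_utask()'s -ENOMEM path */

		n->val = o->val;
		n->next = NULL;
		*p = n;				/* hook the copy onto the current tail */
		p = &n->next;			/* the tail is now the new node's next field */
	}

	return head;
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *n;

	for (n = dup_list(&a); n; n = n->next)
		printf("%d ", n->val);		/* prints: 1 2 3 */
	printf("\n");

	return 0;
}
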
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..8531609b6a82 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1373,7 +1373,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->pi_state_list);
 	p->pi_state_cache = NULL;
 #endif
-	uprobe_copy_process(p);
 	/*
 	 * sigaltstack should be cleared when sharing the same VM
 	 */
@@ -1490,6 +1489,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);
+	uprobe_copy_process(p, clone_flags);
 
 	return p;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8b80f1bae21a..5fee859888a4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1049,6 +1049,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(sysctl_perf_event_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
+		.extra1		= &one,
 	},
 	{
 		.procname	= "perf_cpu_time_max_percent",