Diffstat (limited to 'kernel/events/core.c')
-rw-r--r-- | kernel/events/core.c | 511 |
1 file changed, 418 insertions, 93 deletions
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c0..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' | |||
165 | /* | 165 | /* |
166 | * max perf event sample rate | 166 | * max perf event sample rate |
167 | */ | 167 | */ |
168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 | 168 | #define DEFAULT_MAX_SAMPLE_RATE 100000 |
169 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | 169 | #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE) |
170 | static int max_samples_per_tick __read_mostly = | 170 | #define DEFAULT_CPU_TIME_MAX_PERCENT 25 |
171 | DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | 171 | |
172 | int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE; | ||
173 | |||
174 | static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ); | ||
175 | static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS; | ||
176 | |||
177 | static atomic_t perf_sample_allowed_ns __read_mostly = | ||
178 | ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100); | ||
179 | |||
180 | void update_perf_cpu_limits(void) | ||
181 | { | ||
182 | u64 tmp = perf_sample_period_ns; | ||
183 | |||
184 | tmp *= sysctl_perf_cpu_time_max_percent; | ||
185 | do_div(tmp, 100); | ||
186 | atomic_set(&perf_sample_allowed_ns, tmp); | ||
187 | } | ||
188 | |||
189 | static int perf_rotate_context(struct perf_cpu_context *cpuctx); | ||
172 | 190 | ||
173 | int perf_proc_update_handler(struct ctl_table *table, int write, | 191 | int perf_proc_update_handler(struct ctl_table *table, int write, |
174 | void __user *buffer, size_t *lenp, | 192 | void __user *buffer, size_t *lenp, |
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write, | |||
180 | return ret; | 198 | return ret; |
181 | 199 | ||
182 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); | 200 | max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); |
201 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
202 | update_perf_cpu_limits(); | ||
183 | 203 | ||
184 | return 0; | 204 | return 0; |
185 | } | 205 | } |
186 | 206 | ||
207 | int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT; | ||
208 | |||
209 | int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, | ||
210 | void __user *buffer, size_t *lenp, | ||
211 | loff_t *ppos) | ||
212 | { | ||
213 | int ret = proc_dointvec(table, write, buffer, lenp, ppos); | ||
214 | |||
215 | if (ret || !write) | ||
216 | return ret; | ||
217 | |||
218 | update_perf_cpu_limits(); | ||
219 | |||
220 | return 0; | ||
221 | } | ||
222 | |||
223 | /* | ||
224 | * perf samples are done in some very critical code paths (NMIs). | ||
225 | * If they take too much CPU time, the system can lock up and not | ||
226 | * get any real work done. This will drop the sample rate when | ||
227 | * we detect that events are taking too long. | ||
228 | */ | ||
229 | #define NR_ACCUMULATED_SAMPLES 128 | ||
230 | DEFINE_PER_CPU(u64, running_sample_length); | ||
231 | |||
232 | void perf_sample_event_took(u64 sample_len_ns) | ||
233 | { | ||
234 | u64 avg_local_sample_len; | ||
235 | u64 local_samples_len = __get_cpu_var(running_sample_length); | ||
236 | |||
237 | if (atomic_read(&perf_sample_allowed_ns) == 0) | ||
238 | return; | ||
239 | |||
240 | /* decay the counter by 1 average sample */ | ||
241 | local_samples_len = __get_cpu_var(running_sample_length); | ||
242 | local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
243 | local_samples_len += sample_len_ns; | ||
244 | __get_cpu_var(running_sample_length) = local_samples_len; | ||
245 | |||
246 | /* | ||
247 | * note: this will be biased artificially low until we have | ||
248 | * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us | ||
249 | * from having to maintain a count. | ||
250 | */ | ||
251 | avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; | ||
252 | |||
253 | if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns)) | ||
254 | return; | ||
255 | |||
256 | if (max_samples_per_tick <= 1) | ||
257 | return; | ||
258 | |||
259 | max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); | ||
260 | sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; | ||
261 | perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; | ||
262 | |||
263 | printk_ratelimited(KERN_WARNING | ||
264 | "perf samples too long (%lld > %d), lowering " | ||
265 | "kernel.perf_event_max_sample_rate to %d\n", | ||
266 | avg_local_sample_len, | ||
267 | atomic_read(&perf_sample_allowed_ns), | ||
268 | sysctl_perf_event_sample_rate); | ||
269 | |||
270 | update_perf_cpu_limits(); | ||
271 | } | ||
272 | |||
187 | static atomic64_t perf_event_id; | 273 | static atomic64_t perf_event_id; |
188 | 274 | ||
189 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, | 275 | static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx, |
@@ -196,9 +282,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, | |||
196 | static void update_context_time(struct perf_event_context *ctx); | 282 | static void update_context_time(struct perf_event_context *ctx); |
197 | static u64 perf_event_time(struct perf_event *event); | 283 | static u64 perf_event_time(struct perf_event *event); |
198 | 284 | ||
199 | static void ring_buffer_attach(struct perf_event *event, | ||
200 | struct ring_buffer *rb); | ||
201 | |||
202 | void __weak perf_event_print_debug(void) { } | 285 | void __weak perf_event_print_debug(void) { } |
203 | 286 | ||
204 | extern __weak const char *perf_pmu_name(void) | 287 | extern __weak const char *perf_pmu_name(void) |
@@ -658,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event, | |||
658 | } | 741 | } |
659 | #endif | 742 | #endif |
660 | 743 | ||
744 | /* | ||
745 | * set default to be dependent on timer tick just | ||
746 | * like original code | ||
747 | */ | ||
748 | #define PERF_CPU_HRTIMER (1000 / HZ) | ||
749 | /* | ||
750 | * function must be called with interrupts disabled | ||
751 | */ | ||
752 | static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) | ||
753 | { | ||
754 | struct perf_cpu_context *cpuctx; | ||
755 | enum hrtimer_restart ret = HRTIMER_NORESTART; | ||
756 | int rotations = 0; | ||
757 | |||
758 | WARN_ON(!irqs_disabled()); | ||
759 | |||
760 | cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); | ||
761 | |||
762 | rotations = perf_rotate_context(cpuctx); | ||
763 | |||
764 | /* | ||
765 | * arm timer if needed | ||
766 | */ | ||
767 | if (rotations) { | ||
768 | hrtimer_forward_now(hr, cpuctx->hrtimer_interval); | ||
769 | ret = HRTIMER_RESTART; | ||
770 | } | ||
771 | |||
772 | return ret; | ||
773 | } | ||
774 | |||
775 | /* CPU is going down */ | ||
776 | void perf_cpu_hrtimer_cancel(int cpu) | ||
777 | { | ||
778 | struct perf_cpu_context *cpuctx; | ||
779 | struct pmu *pmu; | ||
780 | unsigned long flags; | ||
781 | |||
782 | if (WARN_ON(cpu != smp_processor_id())) | ||
783 | return; | ||
784 | |||
785 | local_irq_save(flags); | ||
786 | |||
787 | rcu_read_lock(); | ||
788 | |||
789 | list_for_each_entry_rcu(pmu, &pmus, entry) { | ||
790 | cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); | ||
791 | |||
792 | if (pmu->task_ctx_nr == perf_sw_context) | ||
793 | continue; | ||
794 | |||
795 | hrtimer_cancel(&cpuctx->hrtimer); | ||
796 | } | ||
797 | |||
798 | rcu_read_unlock(); | ||
799 | |||
800 | local_irq_restore(flags); | ||
801 | } | ||
802 | |||
803 | static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) | ||
804 | { | ||
805 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
806 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
807 | int timer; | ||
808 | |||
809 | /* no multiplexing needed for SW PMU */ | ||
810 | if (pmu->task_ctx_nr == perf_sw_context) | ||
811 | return; | ||
812 | |||
813 | /* | ||
814 | * check that the default is sane; if not set, force to the | ||
815 | * default interval (1/tick) | ||
816 | */ | ||
817 | timer = pmu->hrtimer_interval_ms; | ||
818 | if (timer < 1) | ||
819 | timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; | ||
820 | |||
821 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
822 | |||
823 | hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); | ||
824 | hr->function = perf_cpu_hrtimer_handler; | ||
825 | } | ||
826 | |||
827 | static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) | ||
828 | { | ||
829 | struct hrtimer *hr = &cpuctx->hrtimer; | ||
830 | struct pmu *pmu = cpuctx->ctx.pmu; | ||
831 | |||
832 | /* not for SW PMU */ | ||
833 | if (pmu->task_ctx_nr == perf_sw_context) | ||
834 | return; | ||
835 | |||
836 | if (hrtimer_active(hr)) | ||
837 | return; | ||
838 | |||
839 | if (!hrtimer_callback_running(hr)) | ||
840 | __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, | ||
841 | 0, HRTIMER_MODE_REL_PINNED, 0); | ||
842 | } | ||
843 | |||
661 | void perf_pmu_disable(struct pmu *pmu) | 844 | void perf_pmu_disable(struct pmu *pmu) |
662 | { | 845 | { |
663 | int *count = this_cpu_ptr(pmu->pmu_disable_count); | 846 | int *count = this_cpu_ptr(pmu->pmu_disable_count); |
@@ -1506,6 +1689,7 @@ group_sched_in(struct perf_event *group_event, | |||
1506 | 1689 | ||
1507 | if (event_sched_in(group_event, cpuctx, ctx)) { | 1690 | if (event_sched_in(group_event, cpuctx, ctx)) { |
1508 | pmu->cancel_txn(pmu); | 1691 | pmu->cancel_txn(pmu); |
1692 | perf_cpu_hrtimer_restart(cpuctx); | ||
1509 | return -EAGAIN; | 1693 | return -EAGAIN; |
1510 | } | 1694 | } |
1511 | 1695 | ||
@@ -1552,6 +1736,8 @@ group_error: | |||
1552 | 1736 | ||
1553 | pmu->cancel_txn(pmu); | 1737 | pmu->cancel_txn(pmu); |
1554 | 1738 | ||
1739 | perf_cpu_hrtimer_restart(cpuctx); | ||
1740 | |||
1555 | return -EAGAIN; | 1741 | return -EAGAIN; |
1556 | } | 1742 | } |
1557 | 1743 | ||
@@ -1807,8 +1993,10 @@ static int __perf_event_enable(void *info) | |||
1807 | * If this event can't go on and it's part of a | 1993 | * If this event can't go on and it's part of a |
1808 | * group, then the whole group has to come off. | 1994 | * group, then the whole group has to come off. |
1809 | */ | 1995 | */ |
1810 | if (leader != event) | 1996 | if (leader != event) { |
1811 | group_sched_out(leader, cpuctx, ctx); | 1997 | group_sched_out(leader, cpuctx, ctx); |
1998 | perf_cpu_hrtimer_restart(cpuctx); | ||
1999 | } | ||
1812 | if (leader->attr.pinned) { | 2000 | if (leader->attr.pinned) { |
1813 | update_group_times(leader); | 2001 | update_group_times(leader); |
1814 | leader->state = PERF_EVENT_STATE_ERROR; | 2002 | leader->state = PERF_EVENT_STATE_ERROR; |
@@ -2555,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx) | |||
2555 | * because they're strictly cpu affine and rotate_start is called with IRQs | 2743 | * because they're strictly cpu affine and rotate_start is called with IRQs |
2556 | * disabled, while rotate_context is called from IRQ context. | 2744 | * disabled, while rotate_context is called from IRQ context. |
2557 | */ | 2745 | */ |
2558 | static void perf_rotate_context(struct perf_cpu_context *cpuctx) | 2746 | static int perf_rotate_context(struct perf_cpu_context *cpuctx) |
2559 | { | 2747 | { |
2560 | struct perf_event_context *ctx = NULL; | 2748 | struct perf_event_context *ctx = NULL; |
2561 | int rotate = 0, remove = 1; | 2749 | int rotate = 0, remove = 1; |
@@ -2594,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx) | |||
2594 | done: | 2782 | done: |
2595 | if (remove) | 2783 | if (remove) |
2596 | list_del_init(&cpuctx->rotation_list); | 2784 | list_del_init(&cpuctx->rotation_list); |
2785 | |||
2786 | return rotate; | ||
2597 | } | 2787 | } |
2598 | 2788 | ||
2599 | #ifdef CONFIG_NO_HZ_FULL | 2789 | #ifdef CONFIG_NO_HZ_FULL |
@@ -2625,10 +2815,6 @@ void perf_event_task_tick(void) | |||
2625 | ctx = cpuctx->task_ctx; | 2815 | ctx = cpuctx->task_ctx; |
2626 | if (ctx) | 2816 | if (ctx) |
2627 | perf_adjust_freq_unthr_context(ctx, throttled); | 2817 | perf_adjust_freq_unthr_context(ctx, throttled); |
2628 | |||
2629 | if (cpuctx->jiffies_interval == 1 || | ||
2630 | !(jiffies % cpuctx->jiffies_interval)) | ||
2631 | perf_rotate_context(cpuctx); | ||
2632 | } | 2818 | } |
2633 | } | 2819 | } |
2634 | 2820 | ||
@@ -2918,6 +3104,7 @@ static void free_event_rcu(struct rcu_head *head) | |||
2918 | } | 3104 | } |
2919 | 3105 | ||
2920 | static void ring_buffer_put(struct ring_buffer *rb); | 3106 | static void ring_buffer_put(struct ring_buffer *rb); |
3107 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); | ||
2921 | 3108 | ||
2922 | static void free_event(struct perf_event *event) | 3109 | static void free_event(struct perf_event *event) |
2923 | { | 3110 | { |
@@ -2942,15 +3129,30 @@ static void free_event(struct perf_event *event) | |||
2942 | if (has_branch_stack(event)) { | 3129 | if (has_branch_stack(event)) { |
2943 | static_key_slow_dec_deferred(&perf_sched_events); | 3130 | static_key_slow_dec_deferred(&perf_sched_events); |
2944 | /* is system-wide event */ | 3131 | /* is system-wide event */ |
2945 | if (!(event->attach_state & PERF_ATTACH_TASK)) | 3132 | if (!(event->attach_state & PERF_ATTACH_TASK)) { |
2946 | atomic_dec(&per_cpu(perf_branch_stack_events, | 3133 | atomic_dec(&per_cpu(perf_branch_stack_events, |
2947 | event->cpu)); | 3134 | event->cpu)); |
3135 | } | ||
2948 | } | 3136 | } |
2949 | } | 3137 | } |
2950 | 3138 | ||
2951 | if (event->rb) { | 3139 | if (event->rb) { |
2952 | ring_buffer_put(event->rb); | 3140 | struct ring_buffer *rb; |
2953 | event->rb = NULL; | 3141 | |
3142 | /* | ||
3143 | * Can happen when we close an event with re-directed output. | ||
3144 | * | ||
3145 | * Since we have a 0 refcount, perf_mmap_close() will skip | ||
3146 | * over us; possibly making our ring_buffer_put() the last. | ||
3147 | */ | ||
3148 | mutex_lock(&event->mmap_mutex); | ||
3149 | rb = event->rb; | ||
3150 | if (rb) { | ||
3151 | rcu_assign_pointer(event->rb, NULL); | ||
3152 | ring_buffer_detach(event, rb); | ||
3153 | ring_buffer_put(rb); /* could be last */ | ||
3154 | } | ||
3155 | mutex_unlock(&event->mmap_mutex); | ||
2954 | } | 3156 | } |
2955 | 3157 | ||
2956 | if (is_cgroup_event(event)) | 3158 | if (is_cgroup_event(event)) |
@@ -3188,30 +3390,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) | |||
3188 | unsigned int events = POLL_HUP; | 3390 | unsigned int events = POLL_HUP; |
3189 | 3391 | ||
3190 | /* | 3392 | /* |
3191 | * Race between perf_event_set_output() and perf_poll(): perf_poll() | 3393 | * Pin the event->rb by taking event->mmap_mutex; otherwise |
3192 | * grabs the rb reference but perf_event_set_output() overrides it. | 3394 | * perf_event_set_output() can swizzle our rb and make us miss wakeups. |
3193 | * Here is the timeline for two threads T1, T2: | ||
3194 | * t0: T1, rb = rcu_dereference(event->rb) | ||
3195 | * t1: T2, old_rb = event->rb | ||
3196 | * t2: T2, event->rb = new rb | ||
3197 | * t3: T2, ring_buffer_detach(old_rb) | ||
3198 | * t4: T1, ring_buffer_attach(rb1) | ||
3199 | * t5: T1, poll_wait(event->waitq) | ||
3200 | * | ||
3201 | * To avoid this problem, we grab mmap_mutex in perf_poll() | ||
3202 | * thereby ensuring that the assignment of the new ring buffer | ||
3203 | * and the detachment of the old buffer appear atomic to perf_poll() | ||
3204 | */ | 3395 | */ |
3205 | mutex_lock(&event->mmap_mutex); | 3396 | mutex_lock(&event->mmap_mutex); |
3206 | 3397 | rb = event->rb; | |
3207 | rcu_read_lock(); | 3398 | if (rb) |
3208 | rb = rcu_dereference(event->rb); | ||
3209 | if (rb) { | ||
3210 | ring_buffer_attach(event, rb); | ||
3211 | events = atomic_xchg(&rb->poll, 0); | 3399 | events = atomic_xchg(&rb->poll, 0); |
3212 | } | ||
3213 | rcu_read_unlock(); | ||
3214 | |||
3215 | mutex_unlock(&event->mmap_mutex); | 3400 | mutex_unlock(&event->mmap_mutex); |
3216 | 3401 | ||
3217 | poll_wait(file, &event->waitq, wait); | 3402 | poll_wait(file, &event->waitq, wait); |
@@ -3521,16 +3706,12 @@ static void ring_buffer_attach(struct perf_event *event, | |||
3521 | return; | 3706 | return; |
3522 | 3707 | ||
3523 | spin_lock_irqsave(&rb->event_lock, flags); | 3708 | spin_lock_irqsave(&rb->event_lock, flags); |
3524 | if (!list_empty(&event->rb_entry)) | 3709 | if (list_empty(&event->rb_entry)) |
3525 | goto unlock; | 3710 | list_add(&event->rb_entry, &rb->event_list); |
3526 | |||
3527 | list_add(&event->rb_entry, &rb->event_list); | ||
3528 | unlock: | ||
3529 | spin_unlock_irqrestore(&rb->event_lock, flags); | 3711 | spin_unlock_irqrestore(&rb->event_lock, flags); |
3530 | } | 3712 | } |
3531 | 3713 | ||
3532 | static void ring_buffer_detach(struct perf_event *event, | 3714 | static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) |
3533 | struct ring_buffer *rb) | ||
3534 | { | 3715 | { |
3535 | unsigned long flags; | 3716 | unsigned long flags; |
3536 | 3717 | ||
@@ -3549,13 +3730,10 @@ static void ring_buffer_wakeup(struct perf_event *event) | |||
3549 | 3730 | ||
3550 | rcu_read_lock(); | 3731 | rcu_read_lock(); |
3551 | rb = rcu_dereference(event->rb); | 3732 | rb = rcu_dereference(event->rb); |
3552 | if (!rb) | 3733 | if (rb) { |
3553 | goto unlock; | 3734 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) |
3554 | 3735 | wake_up_all(&event->waitq); | |
3555 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) | 3736 | } |
3556 | wake_up_all(&event->waitq); | ||
3557 | |||
3558 | unlock: | ||
3559 | rcu_read_unlock(); | 3737 | rcu_read_unlock(); |
3560 | } | 3738 | } |
3561 | 3739 | ||
@@ -3584,18 +3762,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) | |||
3584 | 3762 | ||
3585 | static void ring_buffer_put(struct ring_buffer *rb) | 3763 | static void ring_buffer_put(struct ring_buffer *rb) |
3586 | { | 3764 | { |
3587 | struct perf_event *event, *n; | ||
3588 | unsigned long flags; | ||
3589 | |||
3590 | if (!atomic_dec_and_test(&rb->refcount)) | 3765 | if (!atomic_dec_and_test(&rb->refcount)) |
3591 | return; | 3766 | return; |
3592 | 3767 | ||
3593 | spin_lock_irqsave(&rb->event_lock, flags); | 3768 | WARN_ON_ONCE(!list_empty(&rb->event_list)); |
3594 | list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { | ||
3595 | list_del_init(&event->rb_entry); | ||
3596 | wake_up_all(&event->waitq); | ||
3597 | } | ||
3598 | spin_unlock_irqrestore(&rb->event_lock, flags); | ||
3599 | 3769 | ||
3600 | call_rcu(&rb->rcu_head, rb_free_rcu); | 3770 | call_rcu(&rb->rcu_head, rb_free_rcu); |
3601 | } | 3771 | } |
@@ -3605,26 +3775,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) | |||
3605 | struct perf_event *event = vma->vm_file->private_data; | 3775 | struct perf_event *event = vma->vm_file->private_data; |
3606 | 3776 | ||
3607 | atomic_inc(&event->mmap_count); | 3777 | atomic_inc(&event->mmap_count); |
3778 | atomic_inc(&event->rb->mmap_count); | ||
3608 | } | 3779 | } |
3609 | 3780 | ||
3781 | /* | ||
3782 | * A buffer can be mmap()ed multiple times; either directly through the same | ||
3783 | * event, or through other events by use of perf_event_set_output(). | ||
3784 | * | ||
3785 | * In order to undo the VM accounting done by perf_mmap() we need to destroy | ||
3786 | * the buffer here, where we still have a VM context. This means we need | ||
3787 | * to detach all events redirecting to us. | ||
3788 | */ | ||
3610 | static void perf_mmap_close(struct vm_area_struct *vma) | 3789 | static void perf_mmap_close(struct vm_area_struct *vma) |
3611 | { | 3790 | { |
3612 | struct perf_event *event = vma->vm_file->private_data; | 3791 | struct perf_event *event = vma->vm_file->private_data; |
3613 | 3792 | ||
3614 | if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { | 3793 | struct ring_buffer *rb = event->rb; |
3615 | unsigned long size = perf_data_size(event->rb); | 3794 | struct user_struct *mmap_user = rb->mmap_user; |
3616 | struct user_struct *user = event->mmap_user; | 3795 | int mmap_locked = rb->mmap_locked; |
3617 | struct ring_buffer *rb = event->rb; | 3796 | unsigned long size = perf_data_size(rb); |
3797 | |||
3798 | atomic_dec(&rb->mmap_count); | ||
3799 | |||
3800 | if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) | ||
3801 | return; | ||
3618 | 3802 | ||
3619 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); | 3803 | /* Detach current event from the buffer. */ |
3620 | vma->vm_mm->pinned_vm -= event->mmap_locked; | 3804 | rcu_assign_pointer(event->rb, NULL); |
3621 | rcu_assign_pointer(event->rb, NULL); | 3805 | ring_buffer_detach(event, rb); |
3622 | ring_buffer_detach(event, rb); | 3806 | mutex_unlock(&event->mmap_mutex); |
3807 | |||
3808 | /* If there are still other mmap()s of this buffer, we're done. */ | ||
3809 | if (atomic_read(&rb->mmap_count)) { | ||
3810 | ring_buffer_put(rb); /* can't be last */ | ||
3811 | return; | ||
3812 | } | ||
3813 | |||
3814 | /* | ||
3815 | * No other mmap()s, detach from all other events that might redirect | ||
3816 | * into the now unreachable buffer. Somewhat complicated by the | ||
3817 | * fact that rb::event_lock otherwise nests inside mmap_mutex. | ||
3818 | */ | ||
3819 | again: | ||
3820 | rcu_read_lock(); | ||
3821 | list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { | ||
3822 | if (!atomic_long_inc_not_zero(&event->refcount)) { | ||
3823 | /* | ||
3824 | * This event is en-route to free_event() which will | ||
3825 | * detach it and remove it from the list. | ||
3826 | */ | ||
3827 | continue; | ||
3828 | } | ||
3829 | rcu_read_unlock(); | ||
3830 | |||
3831 | mutex_lock(&event->mmap_mutex); | ||
3832 | /* | ||
3833 | * Check we didn't race with perf_event_set_output() which can | ||
3834 | * swizzle the rb from under us while we were waiting to | ||
3835 | * acquire mmap_mutex. | ||
3836 | * | ||
3837 | * If we find a different rb, ignore this event; the next | ||
3838 | * iteration will no longer find it on the list. We have to | ||
3839 | * still restart the iteration to make sure we're not now | ||
3840 | * iterating the wrong list. | ||
3841 | */ | ||
3842 | if (event->rb == rb) { | ||
3843 | rcu_assign_pointer(event->rb, NULL); | ||
3844 | ring_buffer_detach(event, rb); | ||
3845 | ring_buffer_put(rb); /* can't be last, we still have one */ | ||
3846 | } | ||
3623 | mutex_unlock(&event->mmap_mutex); | 3847 | mutex_unlock(&event->mmap_mutex); |
3848 | put_event(event); | ||
3624 | 3849 | ||
3625 | ring_buffer_put(rb); | 3850 | /* |
3626 | free_uid(user); | 3851 | * Restart the iteration; either we're on the wrong list or |
3852 | * destroyed its integrity by doing a deletion. | ||
3853 | */ | ||
3854 | goto again; | ||
3627 | } | 3855 | } |
3856 | rcu_read_unlock(); | ||
3857 | |||
3858 | /* | ||
3859 | * It could be that there are still a few 0-ref events on the list; they'll | ||
3860 | * get cleaned up by free_event() -- they'll also still have their | ||
3861 | * ref on the rb and will free it whenever they are done with it. | ||
3862 | * | ||
3863 | * Aside from that, this buffer is 'fully' detached and unmapped, | ||
3864 | * undo the VM accounting. | ||
3865 | */ | ||
3866 | |||
3867 | atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); | ||
3868 | vma->vm_mm->pinned_vm -= mmap_locked; | ||
3869 | free_uid(mmap_user); | ||
3870 | |||
3871 | ring_buffer_put(rb); /* could be last */ | ||
3628 | } | 3872 | } |
3629 | 3873 | ||
3630 | static const struct vm_operations_struct perf_mmap_vmops = { | 3874 | static const struct vm_operations_struct perf_mmap_vmops = { |
@@ -3674,12 +3918,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3674 | return -EINVAL; | 3918 | return -EINVAL; |
3675 | 3919 | ||
3676 | WARN_ON_ONCE(event->ctx->parent_ctx); | 3920 | WARN_ON_ONCE(event->ctx->parent_ctx); |
3921 | again: | ||
3677 | mutex_lock(&event->mmap_mutex); | 3922 | mutex_lock(&event->mmap_mutex); |
3678 | if (event->rb) { | 3923 | if (event->rb) { |
3679 | if (event->rb->nr_pages == nr_pages) | 3924 | if (event->rb->nr_pages != nr_pages) { |
3680 | atomic_inc(&event->rb->refcount); | ||
3681 | else | ||
3682 | ret = -EINVAL; | 3925 | ret = -EINVAL; |
3926 | goto unlock; | ||
3927 | } | ||
3928 | |||
3929 | if (!atomic_inc_not_zero(&event->rb->mmap_count)) { | ||
3930 | /* | ||
3931 | * Raced against perf_mmap_close() through | ||
3932 | * perf_event_set_output(). Try again, hope for better | ||
3933 | * luck. | ||
3934 | */ | ||
3935 | mutex_unlock(&event->mmap_mutex); | ||
3936 | goto again; | ||
3937 | } | ||
3938 | |||
3683 | goto unlock; | 3939 | goto unlock; |
3684 | } | 3940 | } |
3685 | 3941 | ||
@@ -3720,12 +3976,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) | |||
3720 | ret = -ENOMEM; | 3976 | ret = -ENOMEM; |
3721 | goto unlock; | 3977 | goto unlock; |
3722 | } | 3978 | } |
3723 | rcu_assign_pointer(event->rb, rb); | 3979 | |
3980 | atomic_set(&rb->mmap_count, 1); | ||
3981 | rb->mmap_locked = extra; | ||
3982 | rb->mmap_user = get_current_user(); | ||
3724 | 3983 | ||
3725 | atomic_long_add(user_extra, &user->locked_vm); | 3984 | atomic_long_add(user_extra, &user->locked_vm); |
3726 | event->mmap_locked = extra; | 3985 | vma->vm_mm->pinned_vm += extra; |
3727 | event->mmap_user = get_current_user(); | 3986 | |
3728 | vma->vm_mm->pinned_vm += event->mmap_locked; | 3987 | ring_buffer_attach(event, rb); |
3988 | rcu_assign_pointer(event->rb, rb); | ||
3729 | 3989 | ||
3730 | perf_event_update_userpage(event); | 3990 | perf_event_update_userpage(event); |
3731 | 3991 | ||
@@ -3734,7 +3994,11 @@ unlock: | |||
3734 | atomic_inc(&event->mmap_count); | 3994 | atomic_inc(&event->mmap_count); |
3735 | mutex_unlock(&event->mmap_mutex); | 3995 | mutex_unlock(&event->mmap_mutex); |
3736 | 3996 | ||
3737 | vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; | 3997 | /* |
3998 | * Since pinned accounting is per vm we cannot allow fork() to copy our | ||
3999 | * vma. | ||
4000 | */ | ||
4001 | vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; | ||
3738 | vma->vm_ops = &perf_mmap_vmops; | 4002 | vma->vm_ops = &perf_mmap_vmops; |
3739 | 4003 | ||
3740 | return ret; | 4004 | return ret; |
@@ -4961,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); | |||
4961 | * sign as trigger. | 5225 | * sign as trigger. |
4962 | */ | 5226 | */ |
4963 | 5227 | ||
4964 | static u64 perf_swevent_set_period(struct perf_event *event) | 5228 | u64 perf_swevent_set_period(struct perf_event *event) |
4965 | { | 5229 | { |
4966 | struct hw_perf_event *hwc = &event->hw; | 5230 | struct hw_perf_event *hwc = &event->hw; |
4967 | u64 period = hwc->last_period; | 5231 | u64 period = hwc->last_period; |
@@ -5904,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page) | |||
5904 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); | 6168 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type); |
5905 | } | 6169 | } |
5906 | 6170 | ||
6171 | static ssize_t | ||
6172 | perf_event_mux_interval_ms_show(struct device *dev, | ||
6173 | struct device_attribute *attr, | ||
6174 | char *page) | ||
6175 | { | ||
6176 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6177 | |||
6178 | return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); | ||
6179 | } | ||
6180 | |||
6181 | static ssize_t | ||
6182 | perf_event_mux_interval_ms_store(struct device *dev, | ||
6183 | struct device_attribute *attr, | ||
6184 | const char *buf, size_t count) | ||
6185 | { | ||
6186 | struct pmu *pmu = dev_get_drvdata(dev); | ||
6187 | int timer, cpu, ret; | ||
6188 | |||
6189 | ret = kstrtoint(buf, 0, &timer); | ||
6190 | if (ret) | ||
6191 | return ret; | ||
6192 | |||
6193 | if (timer < 1) | ||
6194 | return -EINVAL; | ||
6195 | |||
6196 | /* same value, nothing to do */ | ||
6197 | if (timer == pmu->hrtimer_interval_ms) | ||
6198 | return count; | ||
6199 | |||
6200 | pmu->hrtimer_interval_ms = timer; | ||
6201 | |||
6202 | /* update all cpuctx for this PMU */ | ||
6203 | for_each_possible_cpu(cpu) { | ||
6204 | struct perf_cpu_context *cpuctx; | ||
6205 | cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); | ||
6206 | cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); | ||
6207 | |||
6208 | if (hrtimer_active(&cpuctx->hrtimer)) | ||
6209 | hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); | ||
6210 | } | ||
6211 | |||
6212 | return count; | ||
6213 | } | ||
6214 | |||
6215 | #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) | ||
6216 | |||
5907 | static struct device_attribute pmu_dev_attrs[] = { | 6217 | static struct device_attribute pmu_dev_attrs[] = { |
5908 | __ATTR_RO(type), | 6218 | __ATTR_RO(type), |
5909 | __ATTR_NULL, | 6219 | __ATTR_RW(perf_event_mux_interval_ms), |
6220 | __ATTR_NULL, | ||
5910 | }; | 6221 | }; |
5911 | 6222 | ||
5912 | static int pmu_bus_running; | 6223 | static int pmu_bus_running; |
@@ -5952,7 +6263,7 @@ free_dev: | |||
5952 | static struct lock_class_key cpuctx_mutex; | 6263 | static struct lock_class_key cpuctx_mutex; |
5953 | static struct lock_class_key cpuctx_lock; | 6264 | static struct lock_class_key cpuctx_lock; |
5954 | 6265 | ||
5955 | int perf_pmu_register(struct pmu *pmu, char *name, int type) | 6266 | int perf_pmu_register(struct pmu *pmu, const char *name, int type) |
5956 | { | 6267 | { |
5957 | int cpu, ret; | 6268 | int cpu, ret; |
5958 | 6269 | ||
@@ -6001,7 +6312,9 @@ skip_type: | |||
6001 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); | 6312 | lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); |
6002 | cpuctx->ctx.type = cpu_context; | 6313 | cpuctx->ctx.type = cpu_context; |
6003 | cpuctx->ctx.pmu = pmu; | 6314 | cpuctx->ctx.pmu = pmu; |
6004 | cpuctx->jiffies_interval = 1; | 6315 | |
6316 | __perf_cpu_hrtimer_init(cpuctx, cpu); | ||
6317 | |||
6005 | INIT_LIST_HEAD(&cpuctx->rotation_list); | 6318 | INIT_LIST_HEAD(&cpuctx->rotation_list); |
6006 | cpuctx->unique_pmu = pmu; | 6319 | cpuctx->unique_pmu = pmu; |
6007 | } | 6320 | } |
@@ -6327,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6327 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) | 6640 | if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL)) |
6328 | return -EINVAL; | 6641 | return -EINVAL; |
6329 | 6642 | ||
6330 | /* kernel level capture: check permissions */ | ||
6331 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6332 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6333 | return -EACCES; | ||
6334 | |||
6335 | /* propagate priv level, when not set for branch */ | 6643 | /* propagate priv level, when not set for branch */ |
6336 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { | 6644 | if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) { |
6337 | 6645 | ||
@@ -6349,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, | |||
6349 | */ | 6657 | */ |
6350 | attr->branch_sample_type = mask; | 6658 | attr->branch_sample_type = mask; |
6351 | } | 6659 | } |
6660 | /* privileged levels capture (kernel, hv): check permissions */ | ||
6661 | if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM) | ||
6662 | && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) | ||
6663 | return -EACCES; | ||
6352 | } | 6664 | } |
6353 | 6665 | ||
6354 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { | 6666 | if (attr->sample_type & PERF_SAMPLE_REGS_USER) { |
@@ -6412,6 +6724,8 @@ set: | |||
6412 | if (atomic_read(&event->mmap_count)) | 6724 | if (atomic_read(&event->mmap_count)) |
6413 | goto unlock; | 6725 | goto unlock; |
6414 | 6726 | ||
6727 | old_rb = event->rb; | ||
6728 | |||
6415 | if (output_event) { | 6729 | if (output_event) { |
6416 | /* get the rb we want to redirect to */ | 6730 | /* get the rb we want to redirect to */ |
6417 | rb = ring_buffer_get(output_event); | 6731 | rb = ring_buffer_get(output_event); |
@@ -6419,16 +6733,28 @@ set: | |||
6419 | goto unlock; | 6733 | goto unlock; |
6420 | } | 6734 | } |
6421 | 6735 | ||
6422 | old_rb = event->rb; | ||
6423 | rcu_assign_pointer(event->rb, rb); | ||
6424 | if (old_rb) | 6736 | if (old_rb) |
6425 | ring_buffer_detach(event, old_rb); | 6737 | ring_buffer_detach(event, old_rb); |
6738 | |||
6739 | if (rb) | ||
6740 | ring_buffer_attach(event, rb); | ||
6741 | |||
6742 | rcu_assign_pointer(event->rb, rb); | ||
6743 | |||
6744 | if (old_rb) { | ||
6745 | ring_buffer_put(old_rb); | ||
6746 | /* | ||
6747 | * Since we detached the old rb before attaching the new one, | ||
6748 | * we could have missed a wakeup. | ||
6749 | * Provide it now. | ||
6750 | */ | ||
6751 | wake_up_all(&event->waitq); | ||
6752 | } | ||
6753 | |||
6426 | ret = 0; | 6754 | ret = 0; |
6427 | unlock: | 6755 | unlock: |
6428 | mutex_unlock(&event->mmap_mutex); | 6756 | mutex_unlock(&event->mmap_mutex); |
6429 | 6757 | ||
6430 | if (old_rb) | ||
6431 | ring_buffer_put(old_rb); | ||
6432 | out: | 6758 | out: |
6433 | return ret; | 6759 | return ret; |
6434 | } | 6760 | } |
@@ -7387,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) | |||
7387 | case CPU_DOWN_PREPARE: | 7713 | case CPU_DOWN_PREPARE: |
7388 | perf_event_exit_cpu(cpu); | 7714 | perf_event_exit_cpu(cpu); |
7389 | break; | 7715 | break; |
7390 | |||
7391 | default: | 7716 | default: |
7392 | break; | 7717 | break; |
7393 | } | 7718 | } |