author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 19:15:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 19:15:23 -0400
commit	f0bb4c0ab064a8aeeffbda1cee380151a594eaab (patch)
tree	14d55a89c5db455aa10ff9a96ca14c474a9c4d55 /kernel
parent	a4883ef6af5e513a1e8c2ab9aab721604aa3a4f5 (diff)
parent	983433b5812c5cf33a9008fa38c6f9b407fedb76 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
 "Kernel improvements:

   - watchdog driver improvements by Li Zefan
   - Power7 CPI stack events related improvements by Sukadev Bhattiprolu
   - event multiplexing via hrtimers and other improvements by Stephane Eranian
   - kernel stack use optimization by Andrew Hunter
   - AMD IOMMU uncore PMU support by Suravee Suthikulpanit
   - NMI handling rate-limits by Dave Hansen
   - various hw_breakpoint fixes by Oleg Nesterov
   - hw_breakpoint overflow period sampling and related signal handling fixes by Jiri Olsa
   - Intel Haswell PMU support by Andi Kleen

  Tooling improvements:

   - Reset SIGTERM handler in workload child process, fix from David Ahern.
   - Makefile reorganization, prep work for Kconfig patches, from Jiri Olsa.
   - Add automated make test suite, from Jiri Olsa.
   - Add --percent-limit option to 'top' and 'report', from Namhyung Kim.
   - Sorting improvements, from Namhyung Kim.
   - Expand definition of sysfs format attribute, from Michael Ellerman.

  Tooling fixes:

   - 'perf tests' fixes from Jiri Olsa.
   - Make Power7 CPI stack events available in sysfs, from Sukadev Bhattiprolu.
   - Handle death by SIGTERM in 'perf record', fix from David Ahern.
   - Fix printing of perf_event_paranoid message, from David Ahern.
   - Handle realloc failures in 'perf kvm', from David Ahern.
   - Fix divide by 0 in variance, from David Ahern.
   - Save parent pid in thread struct, from David Ahern.
   - Handle JITed code in shared memory, from Andi Kleen.
   - Fixes for 'perf diff', from Jiri Olsa.
   - Remove some unused struct members, from Jiri Olsa.
   - Add missing liblk.a dependency for python/perf.so, fix from Jiri Olsa.
   - Respect CROSS_COMPILE in liblk.a, from Rabin Vincent.
   - No need to do locking when adding hists in perf report, only 'top' needs that, from Namhyung Kim.
   - Fix alignment of symbol column in the hists browser (top, report) when -v is given, from Namhyung Kim.
   - Fix 'perf top' -E option behavior, from Namhyung Kim.
   - Fix bug in isupper() and islower(), from Sukadev Bhattiprolu.
   - Fix compile errors in bp_signal 'perf test', from Sukadev Bhattiprolu.

  ... and more things"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (102 commits)
  perf/x86: Disable PEBS-LL in intel_pmu_pebs_disable()
  perf/x86: Fix shared register mutual exclusion enforcement
  perf/x86/intel: Support full width counting
  x86: Add NMI duration tracepoints
  perf: Drop sample rate when sampling is too slow
  x86: Warn when NMI handlers take large amounts of time
  hw_breakpoint: Introduce "struct bp_cpuinfo"
  hw_breakpoint: Simplify *register_wide_hw_breakpoint()
  hw_breakpoint: Introduce cpumask_of_bp()
  hw_breakpoint: Simplify the "weight" usage in toggle_bp_slot() paths
  hw_breakpoint: Simplify list/idx mess in toggle_bp_slot() paths
  perf/x86/intel: Add mem-loads/stores support for Haswell
  perf/x86/intel: Support Haswell/v4 LBR format
  perf/x86/intel: Move NMI clearing to end of PMI handler
  perf/x86/intel: Add Haswell PEBS support
  perf/x86/intel: Add simple Haswell PMU support
  perf/x86/intel: Add Haswell PEBS record support
  perf/x86/intel: Fix sparse warning
  perf/x86/amd: AMD IOMMU Performance Counter PERF uncore PMU implementation
  perf/x86/amd: Add IOMMU Performance Counter resource management
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c           | 278
-rw-r--r--  kernel/events/hw_breakpoint.c  | 191
-rw-r--r--  kernel/sysctl.c                |  12
3 files changed, 350 insertions(+), 131 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d5352..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
 #define DEFAULT_MAX_SAMPLE_RATE	100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT	25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+	u64 tmp = perf_sample_period_ns;
+
+	tmp *= sysctl_perf_cpu_time_max_percent;
+	tmp = do_div(tmp, 100);
+	atomic_set(&perf_sample_allowed_ns, tmp);
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
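In round numbers (an illustration, not part of the patch): the default 100000 samples/sec gives a 10000 ns sample period, and 25% of that is a 2500 ns CPU-time allowance per sample. A standalone C sketch of the same arithmetic:

/* Standalone illustration of the default perf CPU-time budget computed
 * by update_perf_cpu_limits() above; the constants mirror the patch. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC			1000000000ULL
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int main(void)
{
	uint64_t period_ns  = DEFAULT_SAMPLE_PERIOD_NS;				/* 10000 ns */
	uint64_t allowed_ns = period_ns * DEFAULT_CPU_TIME_MAX_PERCENT / 100;	/* 2500 ns */

	printf("sample period %llu ns, allowed CPU time per sample %llu ns\n",
	       (unsigned long long)period_ns, (unsigned long long)allowed_ns);
	return 0;
}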
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		return ret;
 
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+	update_perf_cpu_limits();
 
 	return 0;
 }
 
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	update_perf_cpu_limits();
+
+	return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+	u64 avg_local_sample_len;
+	u64 local_samples_len = __get_cpu_var(running_sample_length);
+
+	if (atomic_read(&perf_sample_allowed_ns) == 0)
+		return;
+
+	/* decay the counter by 1 average sample */
+	local_samples_len = __get_cpu_var(running_sample_length);
+	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+	local_samples_len += sample_len_ns;
+	__get_cpu_var(running_sample_length) = local_samples_len;
+
+	/*
+	 * note: this will be biased artifically low until we have
+	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+	 * from having to maintain a count.
+	 */
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+		return;
+
+	if (max_samples_per_tick <= 1)
+		return;
+
+	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+	printk_ratelimited(KERN_WARNING
+			"perf samples too long (%lld > %d), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len,
+			atomic_read(&perf_sample_allowed_ns),
+			sysctl_perf_event_sample_rate);
+
+	update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
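For reference (not part of the patch), the decay above is an exponential moving average with weight 1/NR_ACCUMULATED_SAMPLES: the accumulator converges to roughly 128 times the per-sample cost, so dividing by 128 recovers the average without maintaining a count. A userspace sketch of that convergence, using an assumed 3000 ns cost per sample:

/* Userspace model of the running_sample_length decay used above.
 * The 3000 ns per-sample cost is an arbitrary, assumed figure. */
#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
	uint64_t running = 0;
	uint64_t sample_len_ns = 3000;	/* assumed cost of one sampling NMI */

	for (int i = 0; i < 2000; i++) {
		running -= running / NR_ACCUMULATED_SAMPLES;	/* decay one average sample */
		running += sample_len_ns;
		if (i % 500 == 0)
			printf("iteration %4d: avg = %llu ns\n", i,
			       (unsigned long long)(running / NR_ACCUMULATED_SAMPLES));
	}
	/* the average climbs toward ~3000 ns, matching avg_local_sample_len */
	return 0;
}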
@@ -655,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+	struct perf_cpu_context *cpuctx;
+	enum hrtimer_restart ret = HRTIMER_NORESTART;
+	int rotations = 0;
+
+	WARN_ON(!irqs_disabled());
+
+	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+	rotations = perf_rotate_context(cpuctx);
+
+	/*
+	 * arm timer if needed
+	 */
+	if (rotations) {
+		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+		ret = HRTIMER_RESTART;
+	}
+
+	return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (WARN_ON(cpu != smp_processor_id()))
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		if (pmu->task_ctx_nr == perf_sw_context)
+			continue;
+
+		hrtimer_cancel(&cpuctx->hrtimer);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+	int timer;
+
+	/* no multiplexing needed for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	/*
+	 * check default is sane, if not set then force to
+	 * default interval (1/tick)
+	 */
+	timer = pmu->hrtimer_interval_ms;
+	if (timer < 1)
+		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* not for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	if (hrtimer_active(hr))
+		return;
+
+	if (!hrtimer_callback_running(hr))
+		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+					 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1503,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
+		perf_cpu_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1549,6 +1736,8 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
+	perf_cpu_hrtimer_restart(cpuctx);
+
 	return -EAGAIN;
 }
 
@@ -1804,8 +1993,10 @@ static int __perf_event_enable(void *info)
 	 * If this event can't go on and it's part of a
 	 * group, then the whole group has to come off.
 	 */
-	if (leader != event)
+	if (leader != event) {
 		group_sched_out(leader, cpuctx, ctx);
+		perf_cpu_hrtimer_restart(cpuctx);
+	}
 	if (leader->attr.pinned) {
 		update_group_times(leader);
 		leader->state = PERF_EVENT_STATE_ERROR;
@@ -2552,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
@@ -2591,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
+
+	return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2622,10 +2815,6 @@ void perf_event_task_tick(void)
 		ctx = cpuctx->task_ctx;
 		if (ctx)
 			perf_adjust_freq_unthr_context(ctx, throttled);
-
-		if (cpuctx->jiffies_interval == 1 ||
-				!(jiffies % cpuctx->jiffies_interval))
-			perf_rotate_context(cpuctx);
 	}
 }
 
@@ -5036,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  * sign as trigger.
  */
 
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period = hwc->last_period;
@@ -5979,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
 
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+				struct device_attribute *attr,
+				char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int timer, cpu, ret;
+
+	ret = kstrtoint(buf, 0, &timer);
+	if (ret)
+		return ret;
+
+	if (timer < 1)
+		return -EINVAL;
+
+	/* same value, noting to do */
+	if (timer == pmu->hrtimer_interval_ms)
+		return count;
+
+	pmu->hrtimer_interval_ms = timer;
+
+	/* update all cpuctx for this PMU */
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+		if (hrtimer_active(&cpuctx->hrtimer))
+			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+	}
+
+	return count;
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
 static struct device_attribute pmu_dev_attrs[] = {
 	__ATTR_RO(type),
-	__ATTR_NULL,
+	__ATTR_RW(perf_event_mux_interval_ms),
+	__ATTR_NULL,
 };
 
 static int pmu_bus_running;
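The attribute added above appears under each PMU's device directory on the event_source bus. A minimal userspace sketch that reads and then updates it (the 'cpu' PMU and the 4 ms value are arbitrary example choices):

/* Sketch: query and adjust one PMU's multiplexing interval through the
 * perf_event_mux_interval_ms sysfs attribute introduced above. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
	char buf[32];
	FILE *f;

	f = fopen(path, "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("current interval: %s", buf);
		fclose(f);
	}

	f = fopen(path, "w");			/* typically requires root */
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "4\n");			/* the store handler rejects values < 1 */
	fclose(f);
	return 0;
}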
@@ -6027,7 +6263,7 @@ free_dev:
 static struct lock_class_key cpuctx_mutex;
 static struct lock_class_key cpuctx_lock;
 
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 {
 	int cpu, ret;
 
@@ -6076,7 +6312,9 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
-		cpuctx->jiffies_interval = 1;
+
+		__perf_cpu_hrtimer_init(cpuctx, cpu);
+
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
 		cpuctx->unique_pmu = pmu;
 	}
@@ -6402,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
 			return -EINVAL;
 
-		/* kernel level capture: check permissions */
-		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-
 		/* propagate priv level, when not set for branch */
 		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
 
@@ -6424,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			 */
 			attr->branch_sample_type = mask;
 		}
+		/* privileged levels capture (kernel, hv): check permissions */
+		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 	}
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -7476,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 		perf_event_exit_cpu(cpu);
 		break;
-
 	default:
 		break;
 	}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 20185ea64aa6..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
 #include <linux/smp.h>
 
 #include <linux/hw_breakpoint.h>
-
-
 /*
  * Constraints data
  */
+struct bp_cpuinfo {
+	/* Number of pinned cpu breakpoints in a cpu */
+	unsigned int	cpu_pinned;
+	/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+	unsigned int	*tsk_pinned;
+	/* Number of non-pinned cpu/task breakpoints in a cpu */
+	unsigned int	flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
 
-/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
-
-/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
-
-/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
 static int nr_slots[TYPE_MAX];
 
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+	return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
 /* Keep track of the breakpoints attached to tasks */
 static LIST_HEAD(bp_task_head);
 
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
  */
 static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 {
+	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
 	int i;
-	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
 	for (i = nr_slots[type] - 1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
@@ -127,6 +130,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 	return count;
 }
 
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+	if (bp->cpu >= 0)
+		return cpumask_of(bp->cpu);
+	return cpu_possible_mask;
+}
+
 /*
  * Report the number of pinned/un-pinned breakpoints we have in
  * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
 fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		    enum bp_type_idx type)
 {
-	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->hw.bp_target;
-
-	if (cpu >= 0) {
-		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
-		if (!tsk)
-			slots->pinned += max_task_bp_pinned(cpu, type);
-		else
-			slots->pinned += task_bp_pinned(cpu, bp, type);
-		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
-
-		return;
-	}
+	const struct cpumask *cpumask = cpumask_of_bp(bp);
+	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		unsigned int nr;
+	for_each_cpu(cpu, cpumask) {
+		struct bp_cpuinfo *info = get_bp_info(cpu, type);
+		int nr;
 
-		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
-		if (!tsk)
+		nr = info->cpu_pinned;
+		if (!bp->hw.bp_target)
 			nr += max_task_bp_pinned(cpu, type);
 		else
 			nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (nr > slots->pinned)
 			slots->pinned = nr;
 
-		nr = per_cpu(nr_bp_flexible[type], cpu);
-
+		nr = info->flexible;
 		if (nr > slots->flexible)
 			slots->flexible = nr;
 	}
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
-				enum bp_type_idx type, int weight)
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
+				enum bp_type_idx type, int weight)
 {
-	unsigned int *tsk_pinned;
-	int old_count = 0;
-	int old_idx = 0;
-	int idx = 0;
-
-	old_count = task_bp_pinned(cpu, bp, type);
-	old_idx = old_count - 1;
-	idx = old_idx + weight;
-
-	/* tsk_pinned[n] is the number of tasks having n breakpoints */
-	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
-	if (enable) {
-		tsk_pinned[idx]++;
-		if (old_count > 0)
-			tsk_pinned[old_idx]--;
-	} else {
-		tsk_pinned[idx]--;
-		if (old_count > 0)
-			tsk_pinned[old_idx]++;
-	}
+	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+	int old_idx, new_idx;
+
+	old_idx = task_bp_pinned(cpu, bp, type) - 1;
+	new_idx = old_idx + weight;
+
+	if (old_idx >= 0)
+		tsk_pinned[old_idx]--;
+	if (new_idx >= 0)
+		tsk_pinned[new_idx]++;
 }
 
 /*
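As a worked example of the index arithmetic above (not from the patch): tsk_pinned[n] counts tasks owning n+1 breakpoints, so a task growing from two to three pinned breakpoints moves one unit from slot 1 to slot 2, and weight -1 reverses the move. A small standalone simulation:

/* Standalone simulation of the tsk_pinned[] bookkeeping performed by
 * toggle_bp_task_slot(); slot count and the call sequence are made up. */
#include <stdio.h>

#define NR_SLOTS 4	/* assumed number of breakpoint slots */

static unsigned int tsk_pinned[NR_SLOTS];

/* nr_owned: how many breakpoints the task owns before this call */
static void toggle(int nr_owned, int weight)
{
	int old_idx = nr_owned - 1;
	int new_idx = old_idx + weight;

	if (old_idx >= 0)
		tsk_pinned[old_idx]--;
	if (new_idx >= 0)
		tsk_pinned[new_idx]++;
}

int main(void)
{
	toggle(0, +1);	/* task installs its first breakpoint -> tsk_pinned[0]++ */
	toggle(1, +1);	/* installs a second one              -> moves slot 0 to 1 */
	toggle(2, -1);	/* removes one again                  -> moves slot 1 to 0 */

	for (int i = 0; i < NR_SLOTS; i++)
		printf("tsk_pinned[%d] = %u (tasks owning %d breakpoints)\n",
		       i, tsk_pinned[i], i + 1);
	return 0;
}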
@@ -214,33 +203,26 @@ static void
 toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
-	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->hw.bp_target;
+	const struct cpumask *cpumask = cpumask_of_bp(bp);
+	int cpu;
 
-	/* Pinned counter cpu profiling */
-	if (!tsk) {
-
-		if (enable)
-			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
-		else
-			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+	if (!enable)
+		weight = -weight;
+
+	/* Pinned counter cpu profiling */
+	if (!bp->hw.bp_target) {
+		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
 
 	/* Pinned counter task profiling */
-
-	if (!enable)
-		list_del(&bp->hw.bp_list);
-
-	if (cpu >= 0) {
-		toggle_bp_task_slot(bp, cpu, enable, type, weight);
-	} else {
-		for_each_possible_cpu(cpu)
-			toggle_bp_task_slot(bp, cpu, enable, type, weight);
-	}
+	for_each_cpu(cpu, cpumask)
+		toggle_bp_task_slot(bp, cpu, type, weight);
 
 	if (enable)
 		list_add_tail(&bp->hw.bp_list, &bp_task_head);
+	else
+		list_del(&bp->hw.bp_list);
 }
 
 /*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to a single cpu, check:
  *
- *   (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- *    + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ *   (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ *    + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
  *
  * -> If there are already non-pinned counters in this cpu, it means
  *    there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to every cpus, check:
  *
- *   (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- *    + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ *   (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ *    + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
  *
  * -> This is roughly the same, except we check the number of per cpu
  *    bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to a single cpu, check:
  *
- *   ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- *    + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ *   ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ *    + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
  *
- * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * -> Same checks as before. But now the info->flexible, if any, must keep
  *    one register at least (or they will never be fed).
  *
  * - If attached to every cpus, check:
  *
- *   ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- *    + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ *   ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ *    + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
  */
 static int __reserve_bp_slot(struct perf_event *bp)
 {
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 					perf_overflow_handler_t triggered,
 					void *context)
 {
-	struct perf_event * __percpu *cpu_events, **pevent, *bp;
-	long err;
+	struct perf_event * __percpu *cpu_events, *bp;
+	long err = 0;
 	int cpu;
 
 	cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
 		bp = perf_event_create_kernel_counter(attr, cpu, NULL,
 						      triggered, context);
-
-		*pevent = bp;
-
 		if (IS_ERR(bp)) {
 			err = PTR_ERR(bp);
-			goto fail;
+			break;
 		}
-	}
-	put_online_cpus();
 
-	return cpu_events;
-
-fail:
-	for_each_online_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
-		if (IS_ERR(*pevent))
-			break;
-		unregister_hw_breakpoint(*pevent);
+		per_cpu(*cpu_events, cpu) = bp;
 	}
 	put_online_cpus();
 
-	free_percpu(cpu_events);
+	if (likely(!err))
+		return cpu_events;
+
+	unregister_wide_hw_breakpoint(cpu_events);
 	return (void __percpu __force *)ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
 {
 	int cpu;
-	struct perf_event **pevent;
 
-	for_each_possible_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
-		unregister_hw_breakpoint(*pevent);
-	}
+	for_each_possible_cpu(cpu)
+		unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
 	free_percpu(cpu_events);
 }
 EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
 	if (!(flags & PERF_EF_START))
 		bp->hw.state = PERF_HES_STOPPED;
 
+	if (is_sampling_event(bp)) {
+		bp->hw.last_period = bp->hw.sample_period;
+		perf_swevent_set_period(bp);
+	}
+
 	return arch_install_hw_breakpoint(bp);
 }
 
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
 
 int __init init_hw_breakpoint(void)
 {
-	unsigned int **task_bp_pinned;
 	int cpu, err_cpu;
 	int i;
 
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
 
 	for_each_possible_cpu(cpu) {
 		for (i = 0; i < TYPE_MAX; i++) {
-			task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
-			*task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
-						  GFP_KERNEL);
-			if (!*task_bp_pinned)
+			struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+			info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+						   GFP_KERNEL);
+			if (!info->tsk_pinned)
 				goto err_alloc;
 		}
 	}
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
 err_alloc:
 	for_each_possible_cpu(err_cpu) {
 		for (i = 0; i < TYPE_MAX; i++)
-			kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
+			kfree(get_bp_info(err_cpu, i)->tsk_pinned);
 		if (err_cpu == cpu)
 			break;
 	}
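The register_wide_hw_breakpoint() rework above keeps the same external contract; a kernel-side usage sketch in the spirit of samples/hw_breakpoint/data_breakpoint.c (the watched symbol and the handler body are placeholders, not taken from this patch):

/* Sketch of a caller of register_wide_hw_breakpoint(); the watched
 * symbol ("jiffies") and the printed message are illustrative placeholders. */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event * __percpu *wide_bp;

static void wide_bp_handler(struct perf_event *bp,
			    struct perf_sample_data *data,
			    struct pt_regs *regs)
{
	pr_info("write to watched address detected\n");
}

static int __init wide_bp_init(void)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = kallsyms_lookup_name("jiffies");	/* placeholder target */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wide_bp = register_wide_hw_breakpoint(&attr, wide_bp_handler, NULL);
	if (IS_ERR((void __force *)wide_bp))
		return PTR_ERR((void __force *)wide_bp);
	return 0;
}

static void __exit wide_bp_exit(void)
{
	unregister_wide_hw_breakpoint(wide_bp);
}

module_init(wide_bp_init);
module_exit(wide_bp_exit);
MODULE_LICENSE("GPL");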
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..4ce13c3cedb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
 /* Constants used for minimum and maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
-static int neg_one = -1;
 #endif
 
 static int zero;
@@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dowatchdog,
-		.extra1		= &neg_one,
+		.extra1		= &zero,
 		.extra2		= &sixty,
 	},
 	{
@@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
 	},
+	{
+		.procname	= "perf_cpu_time_max_percent",
+		.data		= &sysctl_perf_cpu_time_max_percent,
+		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
+		.mode		= 0644,
+		.proc_handler	= perf_cpu_time_max_percent_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{