author	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 19:15:23 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2013-07-02 19:15:23 -0400
commit	f0bb4c0ab064a8aeeffbda1cee380151a594eaab (patch)
tree	14d55a89c5db455aa10ff9a96ca14c474a9c4d55 /kernel
parent	a4883ef6af5e513a1e8c2ab9aab721604aa3a4f5 (diff)
parent	983433b5812c5cf33a9008fa38c6f9b407fedb76 (diff)
Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull perf updates from Ingo Molnar:
 "Kernel improvements:

   - watchdog driver improvements by Li Zefan
   - Power7 CPI stack events related improvements by Sukadev Bhattiprolu
   - event multiplexing via hrtimers and other improvements by Stephane Eranian
   - kernel stack use optimization by Andrew Hunter
   - AMD IOMMU uncore PMU support by Suravee Suthikulpanit
   - NMI handling rate-limits by Dave Hansen
   - various hw_breakpoint fixes by Oleg Nesterov
   - hw_breakpoint overflow period sampling and related signal handling fixes by Jiri Olsa
   - Intel Haswell PMU support by Andi Kleen

  Tooling improvements:

   - Reset SIGTERM handler in workload child process, fix from David Ahern.
   - Makefile reorganization, prep work for Kconfig patches, from Jiri Olsa.
   - Add automated make test suite, from Jiri Olsa.
   - Add --percent-limit option to 'top' and 'report', from Namhyung Kim.
   - Sorting improvements, from Namhyung Kim.
   - Expand definition of sysfs format attribute, from Michael Ellerman.

  Tooling fixes:

   - 'perf tests' fixes from Jiri Olsa.
   - Make Power7 CPI stack events available in sysfs, from Sukadev Bhattiprolu.
   - Handle death by SIGTERM in 'perf record', fix from David Ahern.
   - Fix printing of perf_event_paranoid message, from David Ahern.
   - Handle realloc failures in 'perf kvm', from David Ahern.
   - Fix divide by 0 in variance, from David Ahern.
   - Save parent pid in thread struct, from David Ahern.
   - Handle JITed code in shared memory, from Andi Kleen.
   - Fixes for 'perf diff', from Jiri Olsa.
   - Remove some unused struct members, from Jiri Olsa.
   - Add missing liblk.a dependency for python/perf.so, fix from Jiri Olsa.
   - Respect CROSS_COMPILE in liblk.a, from Rabin Vincent.
   - No need to do locking when adding hists in perf report, only 'top' needs that, from Namhyung Kim.
   - Fix alignment of symbol column in the hists browser (top, report) when -v is given, from Namhyung Kim.
   - Fix 'perf top' -E option behavior, from Namhyung Kim.
   - Fix bug in isupper() and islower(), from Sukadev Bhattiprolu.
   - Fix compile errors in bp_signal 'perf test', from Sukadev Bhattiprolu.

  ... and more things"

* 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (102 commits)
  perf/x86: Disable PEBS-LL in intel_pmu_pebs_disable()
  perf/x86: Fix shared register mutual exclusion enforcement
  perf/x86/intel: Support full width counting
  x86: Add NMI duration tracepoints
  perf: Drop sample rate when sampling is too slow
  x86: Warn when NMI handlers take large amounts of time
  hw_breakpoint: Introduce "struct bp_cpuinfo"
  hw_breakpoint: Simplify *register_wide_hw_breakpoint()
  hw_breakpoint: Introduce cpumask_of_bp()
  hw_breakpoint: Simplify the "weight" usage in toggle_bp_slot() paths
  hw_breakpoint: Simplify list/idx mess in toggle_bp_slot() paths
  perf/x86/intel: Add mem-loads/stores support for Haswell
  perf/x86/intel: Support Haswell/v4 LBR format
  perf/x86/intel: Move NMI clearing to end of PMI handler
  perf/x86/intel: Add Haswell PEBS support
  perf/x86/intel: Add simple Haswell PMU support
  perf/x86/intel: Add Haswell PEBS record support
  perf/x86/intel: Fix sparse warning
  perf/x86/amd: AMD IOMMU Performance Counter PERF uncore PMU implementation
  perf/x86/amd: Add IOMMU Performance Counter resource management
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/events/core.c           | 278
-rw-r--r--  kernel/events/hw_breakpoint.c  | 191
-rw-r--r--  kernel/sysctl.c                |  12
3 files changed, 350 insertions(+), 131 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d5352..1db3af933704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -165,10 +165,28 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
 #define DEFAULT_MAX_SAMPLE_RATE	100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-	DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT	25
+
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+	ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+	u64 tmp = perf_sample_period_ns;
+
+	tmp *= sysctl_perf_cpu_time_max_percent;
+	tmp = do_div(tmp, 100);
+	atomic_set(&perf_sample_allowed_ns, tmp);
+}
+
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
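In round numbers (an illustration, not part of the patch): the default 100000 samples/sec gives a 10000 ns sample period, and 25% of that is a 2500 ns CPU-time allowance per sample. A standalone C sketch of the same arithmetic:

/* Standalone illustration of the default perf CPU-time budget computed
 * by update_perf_cpu_limits() above; the constants mirror the patch. */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC			1000000000ULL
#define DEFAULT_MAX_SAMPLE_RATE		100000
#define DEFAULT_SAMPLE_PERIOD_NS	(NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
#define DEFAULT_CPU_TIME_MAX_PERCENT	25

int main(void)
{
	uint64_t period_ns  = DEFAULT_SAMPLE_PERIOD_NS;				/* 10000 ns */
	uint64_t allowed_ns = period_ns * DEFAULT_CPU_TIME_MAX_PERCENT / 100;	/* 2500 ns */

	printf("sample period %llu ns, allowed CPU time per sample %llu ns\n",
	       (unsigned long long)period_ns, (unsigned long long)allowed_ns);
	return 0;
}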
@@ -180,10 +198,78 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
 		return ret;
 
 	max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+	update_perf_cpu_limits();
 
 	return 0;
 }
 
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+				void __user *buffer, size_t *lenp,
+				loff_t *ppos)
+{
+	int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+	if (ret || !write)
+		return ret;
+
+	update_perf_cpu_limits();
+
+	return 0;
+}
+
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done. This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+	u64 avg_local_sample_len;
+	u64 local_samples_len = __get_cpu_var(running_sample_length);
+
+	if (atomic_read(&perf_sample_allowed_ns) == 0)
+		return;
+
+	/* decay the counter by 1 average sample */
+	local_samples_len = __get_cpu_var(running_sample_length);
+	local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+	local_samples_len += sample_len_ns;
+	__get_cpu_var(running_sample_length) = local_samples_len;
+
+	/*
+	 * note: this will be biased artifically low until we have
+	 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
+	 * from having to maintain a count.
+	 */
+	avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+	if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+		return;
+
+	if (max_samples_per_tick <= 1)
+		return;
+
+	max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+	sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+	perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+	printk_ratelimited(KERN_WARNING
+			"perf samples too long (%lld > %d), lowering "
+			"kernel.perf_event_max_sample_rate to %d\n",
+			avg_local_sample_len,
+			atomic_read(&perf_sample_allowed_ns),
+			sysctl_perf_event_sample_rate);
+
+	update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
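For reference (not part of the patch), the decay above is an exponential moving average with weight 1/NR_ACCUMULATED_SAMPLES: the accumulator converges to roughly 128 times the per-sample cost, so dividing by 128 recovers the average without maintaining a count. A userspace sketch of that convergence, using an assumed 3000 ns cost per sample:

/* Userspace model of the running_sample_length decay used above.
 * The 3000 ns per-sample cost is an arbitrary, assumed figure. */
#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
	uint64_t running = 0;
	uint64_t sample_len_ns = 3000;	/* assumed cost of one sampling NMI */

	for (int i = 0; i < 2000; i++) {
		running -= running / NR_ACCUMULATED_SAMPLES;	/* decay one average sample */
		running += sample_len_ns;
		if (i % 500 == 0)
			printf("iteration %4d: avg = %llu ns\n", i,
			       (unsigned long long)(running / NR_ACCUMULATED_SAMPLES));
	}
	/* the average climbs toward ~3000 ns, matching avg_local_sample_len */
	return 0;
}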
@@ -655,6 +741,106 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disbled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+	struct perf_cpu_context *cpuctx;
+	enum hrtimer_restart ret = HRTIMER_NORESTART;
+	int rotations = 0;
+
+	WARN_ON(!irqs_disabled());
+
+	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+	rotations = perf_rotate_context(cpuctx);
+
+	/*
+	 * arm timer if needed
+	 */
+	if (rotations) {
+		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+		ret = HRTIMER_RESTART;
+	}
+
+	return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (WARN_ON(cpu != smp_processor_id()))
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+		if (pmu->task_ctx_nr == perf_sw_context)
+			continue;
+
+		hrtimer_cancel(&cpuctx->hrtimer);
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+	int timer;
+
+	/* no multiplexing needed for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	/*
+	 * check default is sane, if not set then force to
+	 * default interval (1/tick)
+	 */
+	timer = pmu->hrtimer_interval_ms;
+	if (timer < 1)
+		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct pmu *pmu = cpuctx->ctx.pmu;
+
+	/* not for SW PMU */
+	if (pmu->task_ctx_nr == perf_sw_context)
+		return;
+
+	if (hrtimer_active(hr))
+		return;
+
+	if (!hrtimer_callback_running(hr))
+		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+					 0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
 	int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1503,6 +1689,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
+		perf_cpu_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1549,6 +1736,8 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
+	perf_cpu_hrtimer_restart(cpuctx);
+
 	return -EAGAIN;
 }
 
@@ -1804,8 +1993,10 @@ static int __perf_event_enable(void *info)
 	 * If this event can't go on and it's part of a
 	 * group, then the whole group has to come off.
 	 */
-	if (leader != event)
+	if (leader != event) {
 		group_sched_out(leader, cpuctx, ctx);
+		perf_cpu_hrtimer_restart(cpuctx);
+	}
 	if (leader->attr.pinned) {
 		update_group_times(leader);
 		leader->state = PERF_EVENT_STATE_ERROR;
@@ -2552,7 +2743,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
 	struct perf_event_context *ctx = NULL;
 	int rotate = 0, remove = 1;
@@ -2591,6 +2782,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
 	if (remove)
 		list_del_init(&cpuctx->rotation_list);
+
+	return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2622,10 +2815,6 @@ void perf_event_task_tick(void)
 		ctx = cpuctx->task_ctx;
 		if (ctx)
 			perf_adjust_freq_unthr_context(ctx, throttled);
-
-		if (cpuctx->jiffies_interval == 1 ||
-				!(jiffies % cpuctx->jiffies_interval))
-			perf_rotate_context(cpuctx);
 	}
 }
 
@@ -5036,7 +5225,7 @@ static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
  * sign as trigger.
  */
 
-static u64 perf_swevent_set_period(struct perf_event *event)
+u64 perf_swevent_set_period(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	u64 period = hwc->last_period;
@@ -5979,9 +6168,56 @@ type_show(struct device *dev, struct device_attribute *attr, char *page)
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
 }
 
+static ssize_t
+perf_event_mux_interval_ms_show(struct device *dev,
+				struct device_attribute *attr,
+				char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
+}
+
+static ssize_t
+perf_event_mux_interval_ms_store(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int timer, cpu, ret;
+
+	ret = kstrtoint(buf, 0, &timer);
+	if (ret)
+		return ret;
+
+	if (timer < 1)
+		return -EINVAL;
+
+	/* same value, noting to do */
+	if (timer == pmu->hrtimer_interval_ms)
+		return count;
+
+	pmu->hrtimer_interval_ms = timer;
+
+	/* update all cpuctx for this PMU */
+	for_each_possible_cpu(cpu) {
+		struct perf_cpu_context *cpuctx;
+		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
+		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+
+		if (hrtimer_active(&cpuctx->hrtimer))
+			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+	}
+
+	return count;
+}
+
+#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
+
 static struct device_attribute pmu_dev_attrs[] = {
 	__ATTR_RO(type),
-	__ATTR_NULL,
+	__ATTR_RW(perf_event_mux_interval_ms),
+	__ATTR_NULL,
 };
 
 static int pmu_bus_running;
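The attribute added above appears under each PMU's device directory on the event_source bus. A minimal userspace sketch that reads and then updates it (the 'cpu' PMU and the 4 ms value are arbitrary example choices):

/* Sketch: query and adjust one PMU's multiplexing interval through the
 * perf_event_mux_interval_ms sysfs attribute introduced above. */
#include <stdio.h>

int main(void)
{
	const char *path =
		"/sys/bus/event_source/devices/cpu/perf_event_mux_interval_ms";
	char buf[32];
	FILE *f;

	f = fopen(path, "r");
	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("current interval: %s", buf);
		fclose(f);
	}

	f = fopen(path, "w");			/* typically requires root */
	if (!f) {
		perror(path);
		return 1;
	}
	fprintf(f, "4\n");			/* the store handler rejects values < 1 */
	fclose(f);
	return 0;
}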
@@ -6027,7 +6263,7 @@ free_dev:
 static struct lock_class_key cpuctx_mutex;
 static struct lock_class_key cpuctx_lock;
 
-int perf_pmu_register(struct pmu *pmu, char *name, int type)
+int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 {
 	int cpu, ret;
 
@@ -6076,7 +6312,9 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.type = cpu_context;
 		cpuctx->ctx.pmu = pmu;
-		cpuctx->jiffies_interval = 1;
+
+		__perf_cpu_hrtimer_init(cpuctx, cpu);
+
 		INIT_LIST_HEAD(&cpuctx->rotation_list);
 		cpuctx->unique_pmu = pmu;
 	}
@@ -6402,11 +6640,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 		if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
 			return -EINVAL;
 
-		/* kernel level capture: check permissions */
-		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-
 		/* propagate priv level, when not set for branch */
 		if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
 
@@ -6424,6 +6657,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			 */
 			attr->branch_sample_type = mask;
 		}
+		/* privileged levels capture (kernel, hv): check permissions */
+		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
+		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 	}
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -7476,7 +7713,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	case CPU_DOWN_PREPARE:
 		perf_event_exit_cpu(cpu);
 		break;
-
 	default:
 		break;
 	}
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 20185ea64aa6..1559fb0b9296 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -46,23 +46,26 @@
 #include <linux/smp.h>
 
 #include <linux/hw_breakpoint.h>
-
-
 /*
  * Constraints data
  */
+struct bp_cpuinfo {
+	/* Number of pinned cpu breakpoints in a cpu */
+	unsigned int	cpu_pinned;
+	/* tsk_pinned[n] is the number of tasks having n+1 breakpoints */
+	unsigned int	*tsk_pinned;
+	/* Number of non-pinned cpu/task breakpoints in a cpu */
+	unsigned int	flexible; /* XXX: placeholder, see fetch_this_slot() */
+};
 
-/* Number of pinned cpu breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned[TYPE_MAX]);
-
-/* Number of pinned task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int *, nr_task_bp_pinned[TYPE_MAX]);
-
-/* Number of non-pinned cpu/task breakpoints in a cpu */
-static DEFINE_PER_CPU(unsigned int, nr_bp_flexible[TYPE_MAX]);
-
+static DEFINE_PER_CPU(struct bp_cpuinfo, bp_cpuinfo[TYPE_MAX]);
 static int nr_slots[TYPE_MAX];
 
+static struct bp_cpuinfo *get_bp_info(int cpu, enum bp_type_idx type)
+{
+	return per_cpu_ptr(bp_cpuinfo + type, cpu);
+}
+
 /* Keep track of the breakpoints attached to tasks */
 static LIST_HEAD(bp_task_head);
 
@@ -96,8 +99,8 @@ static inline enum bp_type_idx find_slot_idx(struct perf_event *bp)
  */
 static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
 {
+	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
 	int i;
-	unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
 
 	for (i = nr_slots[type] - 1; i >= 0; i--) {
 		if (tsk_pinned[i] > 0)
@@ -127,6 +130,13 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 	return count;
 }
 
+static const struct cpumask *cpumask_of_bp(struct perf_event *bp)
+{
+	if (bp->cpu >= 0)
+		return cpumask_of(bp->cpu);
+	return cpu_possible_mask;
+}
+
 /*
  * Report the number of pinned/un-pinned breakpoints we have in
  * a given cpu (cpu > -1) or in all of them (cpu = -1).
@@ -135,25 +145,15 @@ static void
 fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		    enum bp_type_idx type)
 {
-	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->hw.bp_target;
-
-	if (cpu >= 0) {
-		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
-		if (!tsk)
-			slots->pinned += max_task_bp_pinned(cpu, type);
-		else
-			slots->pinned += task_bp_pinned(cpu, bp, type);
-		slots->flexible = per_cpu(nr_bp_flexible[type], cpu);
-
-		return;
-	}
+	const struct cpumask *cpumask = cpumask_of_bp(bp);
+	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		unsigned int nr;
+	for_each_cpu(cpu, cpumask) {
+		struct bp_cpuinfo *info = get_bp_info(cpu, type);
+		int nr;
 
-		nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
-		if (!tsk)
+		nr = info->cpu_pinned;
+		if (!bp->hw.bp_target)
 			nr += max_task_bp_pinned(cpu, type);
 		else
 			nr += task_bp_pinned(cpu, bp, type);
@@ -161,8 +161,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		if (nr > slots->pinned)
 			slots->pinned = nr;
 
-		nr = per_cpu(nr_bp_flexible[type], cpu);
-
+		nr = info->flexible;
 		if (nr > slots->flexible)
 			slots->flexible = nr;
 	}
@@ -182,29 +181,19 @@ fetch_this_slot(struct bp_busy_slots *slots, int weight)
 /*
  * Add a pinned breakpoint for the given task in our constraint table
  */
-static void toggle_bp_task_slot(struct perf_event *bp, int cpu, bool enable,
-				enum bp_type_idx type, int weight)
+static void toggle_bp_task_slot(struct perf_event *bp, int cpu,
+				enum bp_type_idx type, int weight)
 {
-	unsigned int *tsk_pinned;
-	int old_count = 0;
-	int old_idx = 0;
-	int idx = 0;
-
-	old_count = task_bp_pinned(cpu, bp, type);
-	old_idx = old_count - 1;
-	idx = old_idx + weight;
-
-	/* tsk_pinned[n] is the number of tasks having n breakpoints */
-	tsk_pinned = per_cpu(nr_task_bp_pinned[type], cpu);
-	if (enable) {
-		tsk_pinned[idx]++;
-		if (old_count > 0)
-			tsk_pinned[old_idx]--;
-	} else {
-		tsk_pinned[idx]--;
-		if (old_count > 0)
-			tsk_pinned[old_idx]++;
-	}
+	unsigned int *tsk_pinned = get_bp_info(cpu, type)->tsk_pinned;
+	int old_idx, new_idx;
+
+	old_idx = task_bp_pinned(cpu, bp, type) - 1;
+	new_idx = old_idx + weight;
+
+	if (old_idx >= 0)
+		tsk_pinned[old_idx]--;
+	if (new_idx >= 0)
+		tsk_pinned[new_idx]++;
 }
 
 /*
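As a worked example of the index arithmetic above (not from the patch): tsk_pinned[n] counts tasks owning n+1 breakpoints, so a task growing from two to three pinned breakpoints moves one unit from slot 1 to slot 2, and weight -1 reverses the move. A small standalone simulation:

/* Standalone simulation of the tsk_pinned[] bookkeeping performed by
 * toggle_bp_task_slot(); slot count and the call sequence are made up. */
#include <stdio.h>

#define NR_SLOTS 4	/* assumed number of breakpoint slots */

static unsigned int tsk_pinned[NR_SLOTS];

/* nr_owned: how many breakpoints the task owns before this call */
static void toggle(int nr_owned, int weight)
{
	int old_idx = nr_owned - 1;
	int new_idx = old_idx + weight;

	if (old_idx >= 0)
		tsk_pinned[old_idx]--;
	if (new_idx >= 0)
		tsk_pinned[new_idx]++;
}

int main(void)
{
	toggle(0, +1);	/* task installs its first breakpoint -> tsk_pinned[0]++ */
	toggle(1, +1);	/* installs a second one              -> moves slot 0 to 1 */
	toggle(2, -1);	/* removes one again                  -> moves slot 1 to 0 */

	for (int i = 0; i < NR_SLOTS; i++)
		printf("tsk_pinned[%d] = %u (tasks owning %d breakpoints)\n",
		       i, tsk_pinned[i], i + 1);
	return 0;
}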
@@ -214,33 +203,26 @@ static void
 toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
-	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->hw.bp_target;
+	const struct cpumask *cpumask = cpumask_of_bp(bp);
+	int cpu;
 
-	/* Pinned counter cpu profiling */
-	if (!tsk) {
-
-		if (enable)
-			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) += weight;
-		else
-			per_cpu(nr_cpu_bp_pinned[type], bp->cpu) -= weight;
+	if (!enable)
+		weight = -weight;
+
+	/* Pinned counter cpu profiling */
+	if (!bp->hw.bp_target) {
+		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
 
 	/* Pinned counter task profiling */
-
-	if (!enable)
-		list_del(&bp->hw.bp_list);
-
-	if (cpu >= 0) {
-		toggle_bp_task_slot(bp, cpu, enable, type, weight);
-	} else {
-		for_each_possible_cpu(cpu)
-			toggle_bp_task_slot(bp, cpu, enable, type, weight);
-	}
+	for_each_cpu(cpu, cpumask)
+		toggle_bp_task_slot(bp, cpu, type, weight);
 
 	if (enable)
 		list_add_tail(&bp->hw.bp_list, &bp_task_head);
+	else
+		list_del(&bp->hw.bp_list);
 }
 
 /*
@@ -261,8 +243,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to a single cpu, check:
  *
- *   (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu)
- *    + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM
+ *   (per_cpu(info->flexible, cpu) || (per_cpu(info->cpu_pinned, cpu)
+ *    + max(per_cpu(info->tsk_pinned, cpu)))) < HBP_NUM
  *
  * -> If there are already non-pinned counters in this cpu, it means
  *    there is already a free slot for them.
@@ -272,8 +254,8 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to every cpus, check:
  *
- *   (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *))
- *    + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM
+ *   (per_cpu(info->flexible, *) || (max(per_cpu(info->cpu_pinned, *))
+ *    + max(per_cpu(info->tsk_pinned, *)))) < HBP_NUM
  *
  * -> This is roughly the same, except we check the number of per cpu
  *    bp for every cpu and we keep the max one. Same for the per tasks
@@ -284,16 +266,16 @@ __weak void arch_unregister_hw_breakpoint(struct perf_event *bp)
  *
  * - If attached to a single cpu, check:
  *
- *   ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu)
- *    + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM
+ *   ((per_cpu(info->flexible, cpu) > 1) + per_cpu(info->cpu_pinned, cpu)
+ *    + max(per_cpu(info->tsk_pinned, cpu))) < HBP_NUM
  *
- * -> Same checks as before. But now the nr_bp_flexible, if any, must keep
+ * -> Same checks as before. But now the info->flexible, if any, must keep
  *    one register at least (or they will never be fed).
  *
  * - If attached to every cpus, check:
  *
- *   ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *))
- *    + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM
+ *   ((per_cpu(info->flexible, *) > 1) + max(per_cpu(info->cpu_pinned, *))
+ *    + max(per_cpu(info->tsk_pinned, *))) < HBP_NUM
  */
 static int __reserve_bp_slot(struct perf_event *bp)
 {
@@ -518,8 +500,8 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 					perf_overflow_handler_t triggered,
 					void *context)
 {
-	struct perf_event * __percpu *cpu_events, **pevent, *bp;
-	long err;
+	struct perf_event * __percpu *cpu_events, *bp;
+	long err = 0;
 	int cpu;
 
 	cpu_events = alloc_percpu(typeof(*cpu_events));
@@ -528,31 +510,21 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
 
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
 		bp = perf_event_create_kernel_counter(attr, cpu, NULL,
 						      triggered, context);
-
-		*pevent = bp;
-
 		if (IS_ERR(bp)) {
 			err = PTR_ERR(bp);
-			goto fail;
+			break;
 		}
-	}
-	put_online_cpus();
 
-	return cpu_events;
-
-fail:
-	for_each_online_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
-		if (IS_ERR(*pevent))
-			break;
-		unregister_hw_breakpoint(*pevent);
+		per_cpu(*cpu_events, cpu) = bp;
 	}
 	put_online_cpus();
 
-	free_percpu(cpu_events);
+	if (likely(!err))
+		return cpu_events;
+
+	unregister_wide_hw_breakpoint(cpu_events);
 	return (void __percpu __force *)ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
@@ -564,12 +536,10 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint);
 void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events)
 {
 	int cpu;
-	struct perf_event **pevent;
 
-	for_each_possible_cpu(cpu) {
-		pevent = per_cpu_ptr(cpu_events, cpu);
-		unregister_hw_breakpoint(*pevent);
-	}
+	for_each_possible_cpu(cpu)
+		unregister_hw_breakpoint(per_cpu(*cpu_events, cpu));
+
 	free_percpu(cpu_events);
 }
 EXPORT_SYMBOL_GPL(unregister_wide_hw_breakpoint);
@@ -612,6 +582,11 @@ static int hw_breakpoint_add(struct perf_event *bp, int flags)
 	if (!(flags & PERF_EF_START))
 		bp->hw.state = PERF_HES_STOPPED;
 
+	if (is_sampling_event(bp)) {
+		bp->hw.last_period = bp->hw.sample_period;
+		perf_swevent_set_period(bp);
+	}
+
 	return arch_install_hw_breakpoint(bp);
 }
 
@@ -650,7 +625,6 @@ static struct pmu perf_breakpoint = {
 
 int __init init_hw_breakpoint(void)
 {
-	unsigned int **task_bp_pinned;
 	int cpu, err_cpu;
 	int i;
 
@@ -659,10 +633,11 @@ int __init init_hw_breakpoint(void)
 
 	for_each_possible_cpu(cpu) {
 		for (i = 0; i < TYPE_MAX; i++) {
-			task_bp_pinned = &per_cpu(nr_task_bp_pinned[i], cpu);
-			*task_bp_pinned = kzalloc(sizeof(int) * nr_slots[i],
-						  GFP_KERNEL);
-			if (!*task_bp_pinned)
+			struct bp_cpuinfo *info = get_bp_info(cpu, i);
+
+			info->tsk_pinned = kcalloc(nr_slots[i], sizeof(int),
+						   GFP_KERNEL);
+			if (!info->tsk_pinned)
 				goto err_alloc;
 		}
 	}
@@ -676,7 +651,7 @@ int __init init_hw_breakpoint(void)
 err_alloc:
 	for_each_possible_cpu(err_cpu) {
 		for (i = 0; i < TYPE_MAX; i++)
-			kfree(per_cpu(nr_task_bp_pinned[i], err_cpu));
+			kfree(get_bp_info(err_cpu, i)->tsk_pinned);
 		if (err_cpu == cpu)
 			break;
 	}
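The register_wide_hw_breakpoint() rework above keeps the same external contract; a kernel-side usage sketch in the spirit of samples/hw_breakpoint/data_breakpoint.c (the watched symbol and the handler body are placeholders, not taken from this patch):

/* Sketch of a caller of register_wide_hw_breakpoint(); the watched
 * symbol ("jiffies") and the printed message are illustrative placeholders. */
#include <linux/module.h>
#include <linux/kallsyms.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>

static struct perf_event * __percpu *wide_bp;

static void wide_bp_handler(struct perf_event *bp,
			    struct perf_sample_data *data,
			    struct pt_regs *regs)
{
	pr_info("write to watched address detected\n");
}

static int __init wide_bp_init(void)
{
	struct perf_event_attr attr;

	hw_breakpoint_init(&attr);
	attr.bp_addr = kallsyms_lookup_name("jiffies");	/* placeholder target */
	attr.bp_len  = HW_BREAKPOINT_LEN_4;
	attr.bp_type = HW_BREAKPOINT_W;

	wide_bp = register_wide_hw_breakpoint(&attr, wide_bp_handler, NULL);
	if (IS_ERR((void __force *)wide_bp))
		return PTR_ERR((void __force *)wide_bp);
	return 0;
}

static void __exit wide_bp_exit(void)
{
	unregister_wide_hw_breakpoint(wide_bp);
}

module_init(wide_bp_init);
module_exit(wide_bp_exit);
MODULE_LICENSE("GPL");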
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 9edcf456e0fc..4ce13c3cedb9 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -120,7 +120,6 @@ extern int blk_iopoll_enabled;
 /* Constants used for minimum and maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
-static int neg_one = -1;
 #endif
 
 static int zero;
@@ -814,7 +813,7 @@ static struct ctl_table kern_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dowatchdog,
-		.extra1		= &neg_one,
+		.extra1		= &zero,
 		.extra2		= &sixty,
 	},
 	{
@@ -1044,6 +1043,15 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= perf_proc_update_handler,
 	},
+	{
+		.procname	= "perf_cpu_time_max_percent",
+		.data		= &sysctl_perf_cpu_time_max_percent,
+		.maxlen		= sizeof(sysctl_perf_cpu_time_max_percent),
+		.mode		= 0644,
+		.proc_handler	= perf_cpu_time_max_percent_handler,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #endif
 #ifdef CONFIG_KMEMCHECK
 	{